feat: finally able to overcome the base of the base of this f*cked up mess

2025-03-01 17:06:56 +01:00
parent eed28168ad
commit 6ee8051004
20 changed files with 518 additions and 2306 deletions
--- a/InkBlot/InkBlotAntlrLexer.g4
+++ b/InkBlot/InkBlotAntlrLexer.g4
@@ -1,47 +1,52 @@
 lexer grammar InkBlotAntlrLexer;

-// classic "white space" and "new line" - ink's new line also allows for some whitespace at start
-WS: [ \t]+ ;
-NL: WS? '\r'? '\n' ;
-
-// one or more (potential whitespace followed by) newline(s); used e.g. for block sequencing
-MULTILINE_WS: NL+ ;
-
-// see InkParser_Content.cs, ContentTextNoEscape and ContentTextAllowingEcapeChar for the escape case
-// this works for the base case where we're not parsing a string, nor a choice 
-CONTENT_TEXT_NO_ESCAPE_SIMPLE:
-    (
-      // any character is valid, except for:
-      // - {}   ==> identifies embedded logic
-      // - |    ==> text alternatives, is forbidden even in non-logic text for some reason
-      // - \n\r ==> a new line of content
-      // - #    ==> a tag
-      // - \, < and - with exceptions (see below)
-      ~[{}|\n\r\\#-< ]
-      // any character can be escaped
-    | '\\' [\u0000-\uFFFF] // TODO: is there a better way to say "any character"?
-      // accept a - only if not followed by a > (->, a divert)
-    | '-' { InputStream.LA(1) != '>' }?
-      // same for threads (<-) and glue (<>)
-//    | '<' { InputStream.LA(1) != '-' && InputStream.LA(1) != '>' }?
-    )+ ;
-
-INLINE_LOGIC_START: '{' ;
-INLINE_LOGIC_END: '}' ;
-
-// All symbols for sequencing: either using the short-hand symbols (https://github.com/inkle/ink/blob/master/Documentation/WritingWithInk.md#types-of-alternatives)
-// or using the multiline blocks (https://github.com/inkle/ink/blob/master/Documentation/WritingWithInk.md#multiline-blocks)
-SEQUENCE_TYPE_SYMBOL_ANNOTATION: [!&~$ ] ;
-ONCE: 'once' WS? ':' ;
-CYCLE: 'cycle' WS? ':' ;
-SHUFFLE: 'shuffle' WS? ':' ;
-STOPPING: 'stopping' WS? ':' ;
-SHUFFLE_ONCE: 'shuffle' WS 'once' WS? ':' ;
-SHUFFLE_STOPPING: 'shuffle' WS 'stopping' WS? ':' ;
-
 THREAD_ARROW: '<-' ;
 DIVERT_ARROW: '->' ;
 TUNNEL_ARROW: '->->' ;

-// TODO: add all extra character ranges from InkParser_CharacterRanges (LatinBasic, LatinExtendedA, ...)
-IDENTIFIER: [A-Za-z0-9_]+;
+IDENTIFIER: [a-zA-Z0-9_]+;
+
+//// classic "white space" and "new line" - ink's new line also allows for some whitespace at start
+WS: [ \t]+ ;
+NL: WS? '\r'? '\n' ;
+//
+//// one or more (potential whitespace followed by) newline(s); used e.g. for block sequencing
+//MULTILINE_WS: NL+ ;
+//
+//// see InkParser_Content.cs, ContentTextNoEscape and ContentTextAllowingEcapeChar for the escape case
+//// this works for the base case where we're not parsing a string, nor a choice
+//// We ALSO have to exclude the characters used by all other tokens from this rule; otherwise it would gobble them
+//// all up, since the lexer always prefers the longest-matching token
+//CONTENT_TEXT_NO_ESCAPE_NO_IDENT_SIMPLE:
+//    (
+//      // any character is valid, except for:
+//      // - {}                   ==> identifies embedded logic
+//      // - |                    ==> text alternatives, is forbidden even in non-logic text for some reason
+//      // - \n\r                 ==> a new line of content
+//      // - #                    ==> a tag
+//      // - \, < and - with exceptions (see below)
+//      // - space and \t         ==> these are used to parse spaces 
+//      // a-z, A-Z, 0-9 and _    ==> these are used to parse identifiers TODO: add missing characters
+//      // !&$                   ==> these are used as sequence type symbol annotations; "~" too, but that one is special
+//      ~[{}|\n\r\\#\-< \ta-zA-Z0-9_!&$]
+//      // any character can be escaped
+//    | '\\' [\u0000-\uFFFF] // TODO: is there a better way to say "any character"?
+//      // accept a - only if not followed by a > (->, a divert)
+//    | '-' { InputStream.LA(1) != '>' }?
+//      // same for threads (<-) and glue (<>)
+//    | '<' { InputStream.LA(1) != '-' && InputStream.LA(1) != '>' }?
+//    )+ ;
+//
+//INLINE_LOGIC_START: '{' ;
+//INLINE_LOGIC_END: '}' ;
+//
+//// All symbols for sequencing: either using the short-hand symbols (https://github.com/inkle/ink/blob/master/Documentation/WritingWithInk.md#types-of-alternatives)
+//// or using the multiline blocks (https://github.com/inkle/ink/blob/master/Documentation/WritingWithInk.md#multiline-blocks)
+//SEQUENCE_TYPE_SYMBOL_ANNOTATION: [!&~$ ] ;
+//
+//THREAD_ARROW: '<-' ;
+//DIVERT_ARROW: '->' ;
+//TUNNEL_ARROW: '->->' ;
+//
+//// TODO: add all extra character ranges from InkParser_CharacterRanges (LatinBasic, LatinExtendedA, ...), and also remove them from CONTENT_TEXT_NO_ESCAPE_NO_IDENT_SIMPLE
+//IDENTIFIER: [a-zA-Z0-9_]+;