User:OrenBochman/ParserNG/Preprocessor Antlr

The Preprocessor

This is a Lexer for the Preprocessor in ANTLR

grammar wikiParser;

// Grammar-wide ANTLR options.
options
{
   // Target language of the generated code (Java is the default).
   language = Java;

   // The generated recognizer class extends the named class, which is
   // where supporting code and variables are meant to live.
   // NOTE(review): the original comment says the generated *lexer* gets
   // this superclass, but this is declared as a combined grammar; confirm
   // which generated class the option actually applies to.
   superClass = AbstractTLexer;

   // Disabled: charVocabulary='\u0000'..'\uFFFE';

   output = AST;
   backtrack = true;
   k = 6;
}


// Entry rule: a (possibly empty) sequence of WIKITEXT_L1 items.
wikiParser
    : WIKITEXT_L1*
    ;



// $<EBNS

// Letters A-Z and a-z.
ALPHA
    : 'A'..'Z'
    | 'a'..'z'
    ;

// A single binary digit.
BIT
    : '0'
    | '1'
    ;

// Any 7-bit US-ASCII character, excluding NUL.
CHAR
    : '\u0001'..'\u007F'
    ;

// Carriage return (U+000D).
CR
    : '\r'
    ;

// Internet standard newline: carriage return followed by linefeed.
CRLF
    : CR LF
    ;

// Control characters: U+0000..U+001F and DEL (U+007F).
CTL
    : '\u0000'..'\u001F'
    | '\u007F'
    ;

// Decimal digits 0-9.
DIGIT
    : '0'..'9'
    ;



HEXDIG
    // One hexadecimal digit.
    // This section appears to be a port of the ABNF core rules, where
    // quoted strings are case-insensitive; ANTLR literals are
    // case-sensitive, so the lowercase letters a-f must be listed
    // explicitly to accept the same input.
    : DIGIT
    | 'A'..'F'
    | 'a'..'f'
    ;

// Horizontal tab (U+0009).
HTAB
    : '\t'
    ;

// Linefeed (U+000A).
LF
    : '\n'
    ;

// 8 bits of data: any character in U+0000..U+00FF.
OCTET
    : '\u0000'..'\u00FF'
    ;

// Space (U+0020).
SP
    : ' '
    ;

// Visible (printing) characters: U+0021..U+007E.
VCHAR
    : '!'..'~'
    ;
    
// $>


// Reserved characters
GT      : '>' ;
LT      : '<' ;
EQ      : '=' ;
QUOTE   : '\'' ;

// " (double quote)
DQUOTE  : '\u0022' ;

PIPE    : '|' ;
HASH    : '#' ;

// Note: despite the *BRACE names, these two match square brackets.
RBRACE  : ']' ;
LBRACE  : '[' ;

M_RESERVED
	// Reserved characters that are neither left (opening) nor right
	// (closing) delimiters.
	// The alternative list must not end in '|': a trailing bar declares an
	// extra empty alternative, which lets the rule match the empty string.
	: EQ
	| QUOTE
	| DQUOTE
	| HASH
	| PIPE
	;
// Reserved characters that open a construct: '<' and '['.
L_RESERVED
	: LT
	| LBRACE
	;

// Reserved characters that close a construct: '>' and ']'.
R_RESERVED
	: GT
	| RBRACE
	;

// Any reserved character: left, middle or right.
RESERVED
	: L_RESERVED
	| M_RESERVED
	| R_RESERVED
	;
 
// Whitespace: space, tab, linefeed, carriage return, or a CR LF pair.
// NOTE(review): CR is listed before CRLF; confirm that the generated
// lexer can still select the CRLF alternative for a CR LF sequence.
WS
	: SP
	| HTAB
	| LF
	| CR
	| CRLF
	;

// Line space: whitespace that does not end the line (space or tab).
LS
	: SP
	| HTAB
	;
 
// Any single character that is neither a left-reserved character
// ('<' or '[') nor whitespace.
// NOTE(review): WS includes the two-character CRLF alternative; confirm
// the ANTLR version in use accepts it inside a '~' set.
L_LIT
	: ~( L_RESERVED | WS )
	;

// Any single character that is neither reserved nor whitespace.
// Original intent: any Unicode character excluding the surrogate blocks,
// FFFE and FFFF ('\u0000'..'\uFFFD'); that behaves like '.', so it had to
// be narrowed.
fragment
UNI_CHAR
	: ~( RESERVED | WS )
	;

// A literal is either
//   - a single UNI_CHAR, or
//   - one character that is not left-reserved or whitespace, followed by
//     one or more non-whitespace characters.
LITERAL
	: UNI_CHAR
	| ~( L_RESERVED | WS ) ~( WS )+
	;

// Optional attribute text. Because the LITERAL is optional this rule can
// match nothing; open question from the author: can it be factored out?
ATTR
	: LITERAL?
	;

// TITLE, PART_NAME and PART_VALUE are currently plain aliases for
// WIKITEXT_L3.

TITLE
	: WIKITEXT_L3
	;

PART_NAME
	: WIKITEXT_L3
	;

PART_VALUE
	: WIKITEXT_L3
	;

// A literal that does not begin with '=': either a single UNI_CHAR, or one
// character that is not left-reserved, '=' or whitespace, followed by one
// or more non-whitespace characters (so a reserved character may appear,
// but not on its own).
PART_LITERAL
	: UNI_CHAR
	| ~( L_RESERVED | EQ | WS ) ~( WS )+
	;

// One part: an optional "name =" prefix followed by one or more literals.
PART
	: ( PART_LITERAL+ EQ )? LITERAL+
	;

// A title followed by zero or more pipe-separated parts.
PARTS
	: TITLE ( PIPE PART )*
	;
	
// Template argument: {{{ parts }}}
TPL_ARG
	: '{{{' PARTS '}}}'
	;

// Template invocation: {{ parts }}
// TODO: the closing token needs to match the inner token
TEMPLATE
	: '{{' PARTS '}}'
	;

// Internal link: [[ wikitext ]]
LINK
	: '[[' WIKITEXT_L3 ']]'
	;
COMMENT
	// HTML-style comment: '<!--' ... '-->'.
	// The body is matched with a non-greedy wildcard loop so that the rule
	// stops at the first '-->'. The earlier greedy (LITERAL | WS)* body
	// could swallow the closing delimiter, because LITERAL accepts any run
	// of non-whitespace characters, and it rejected words starting with a
	// left-reserved character ('<' or '[').
	: '<!--'! ( options { greedy=false; } : . )* '-->'!
	;
    	
//UNCLOSED_COMMENT :	 '<!--' LITERAL EOF 	;	

//LF_REWRITE		: LF -> LINE_END LF  LINE_START;					
//LINE_EATING_COMMENT:	LF SP* comment:COMMENT SP* LF!;
// A comment that sits alone on its line: linefeed, optional line space,
// the comment, optional line space, linefeed. After the match the action
// calls emit("LF").
LINE_EATING_COMMENT
	: LF LS* COMMENT LS* LF
	  {emit("LF");}
	;

				
NOWIKI_ELEMENT
	// A nowiki element: either self-closing ("<nowiki ... />") or an opening
	// tag followed by a literal that ends at "</nowiki>" or at end of file.
	// The tag literals use a real '<'. They previously contained the HTML
	// entity "&lt;", which would only match that five-character entity text
	// and never an actual tag; the other tag rules in this grammar already
	// use '<'.
	: '<nowiki' ATTR
	  ( '/>'
	  | '>' LITERAL
	    ( '</nowiki>'
	    | EOF
	    )
	  )
	;
// Extension tags: opening or closing <ref> / <references>, plus the
// self-closing <references/> form. Spaces are allowed around the tag name.
EXTENTIONS
	: LT '/'? SP* ( 'ref' | 'references' ) SP* GT
	| LT SP* 'references' SP* '/' SP* GT
	;

// Further rules of this kind are added by XML-style EXTENTIONS.
XMLiS_ELEMENT
	: NOWIKI_ELEMENT
	| EXTENTIONS
	;
 
// A heading starts after a linefeed and may be followed by one comment
// (with optional line space before it) and trailing line space.
HEADING
	: LF HEADING_INNER ( LS* COMMENT )? LS*
	;
 

 
// Heading body for levels 6 down to 1: a run of '=' characters, optional
// line space, one or more literals (each optionally followed by line
// space), and a closing run of '=' of the same length.
// The longest markers are listed first.
// Every level uses (LITERAL LS*)+; level 4 previously used (LITERAL LS)+,
// which required exactly one space or tab after every literal, including
// the last one before the closing '===='.
HEADING_INNER   : '======' LS*	h6=(LITERAL LS*)+'======'!
                | '=====' LS*	h5=(LITERAL LS*)+'====='!
                | '====' LS*	h4=(LITERAL LS*)+'===='!
                | '===' LS*	h3=(LITERAL LS*)+'==='!
                | '==' LS*	h2=(LITERAL LS*)+'=='!
                | '=' LS*	h1=(LITERAL LS*)+'='!
                ;
 
// WIKITEXT_L1 simply forwards to WIKITEXT_L2, except in INCLUSION mode,
// where it plays a part in the <onlyinclude> syntax (see below).
WIKITEXT_L1
	: WIKITEXT_L2
	;

// Second level: a heading, or any WIKITEXT_L3 item.
WIKITEXT_L2
	: HEADING
	| WIKITEXT_L3
	;
				
// Third level: literals, templates, template arguments, links, comments
// and XML-style elements.
WIKITEXT_L3
	: LITERAL
	| TEMPLATE
	| TPL_ARG
	| LINK
	| COMMENT
	| LINE_EATING_COMMENT
//	| UNCLOSED_COMMENT	// disabled: the UNCLOSED_COMMENT rule is commented out in this grammar, so referencing it is an undefined-rule error
	| XMLiS_ELEMENT
	;
                  
////INC MODE/////////////////////////////////////////////////////////////////////////////////////////

NOINCLUDE_ELEMENT
	// A noinclude element: either self-closing ("<noinclude ... />") or an
	// opening tag followed by a literal that ends at "</noinclude>" or at
	// end of file.
	// The tag literals use a real '<'. They previously contained the HTML
	// entity "&lt;", which would never match an actual tag.
	: '<noinclude' ATTR
	  ( '/>'
	  | '>' LITERAL
	    ( '</noinclude>'
	    | EOF
	    )
	  )
	;
						    
// Opening or closing <includeonly> tag.
// NOTE(review): the '!' suffixes are placed unevenly (the opening '<' has
// none); confirm which delimiters are meant to be dropped from the output.
INCLUSION_IGNORED_TAG
	: '<' 'includeonly' '>'!
	| '</'! 'includeonly' '>'!
	;

// Ignored text, then <onlyinclude> wikitext </onlyinclude>.
CLOSED_ONLYINCLUDE_ITEM
	: IGNORED_TEXT
	  '<'! 'onlyinclude' '>'!
	  WIKITEXT_L2
	  '</'! 'onlyinclude' '>'
	;

// Ignored text, then <onlyinclude> wikitext with no closing tag.
UNCLOSED_ONLYINCLUDE_ITEM
	: IGNORED_TEXT
	  '<' 'onlyinclude' '>'
	  WIKITEXT_L2
	;

// Text in front of an <onlyinclude> section.
IGNORED_TEXT
	: LITERAL
	;

// Zero or more closed items followed by zero or more unclosed items.
ONLY_INCLUDE_SEQUENCE
	: CLOSED_ONLYINCLUDE_ITEM* UNCLOSED_ONLYINCLUDE_ITEM*
	;

// Inclusion-mode variant of XMLiS_ELEMENT: also accepts noinclude elements.
INC_XMLiS_ELEMENT
	: XMLiS_ELEMENT
	| NOINCLUDE_ELEMENT
	;

// Inclusion-mode variant of WIKITEXT_L1.
INC_WIKITEXT_L1
	: WIKITEXT_L1
	| ONLY_INCLUDE_SEQUENCE
	;

// Inclusion-mode variant of WIKITEXT_L3.
INC_WIKITEXT_L3
	: WIKITEXT_L3
	| INCLUSION_IGNORED_TAG
	| ONLY_INCLUDE_SEQUENCE
	;

////NO INC MODE/////////////////////////////////////////////////////////////////////////////////////

INCLUDE_ONLY_ELEMENT
	// An includeonly element: either self-closing ("<includeonly ... />") or
	// an opening tag followed by a literal that ends at "</includeonly>" or
	// at end of file.
	// The tag literals use a real '<'. They previously contained the HTML
	// entity "&lt;", which would never match an actual tag.
	: '<includeonly' ATTR
	  ( '/>'
	  | '>' LITERAL
	    ( '</includeonly>'
	    | EOF
	    )
	  )
	;
                            				
// Opening or closing <noinclude> / <onlyinclude> tag; both angle brackets
// carry the '!' suffix.
NONINCLUSION_IGNORED_TAG
	: '<'! '/'? 'noinclude' '>'!
	| '<'! '/'? 'onlyinclude' '>'!
	;

// Non-inclusion-mode variant of XMLiS_ELEMENT: also accepts includeonly
// elements.
NI_XMLiS_ELEMENT
	: XMLiS_ELEMENT
	| INCLUDE_ONLY_ELEMENT
	;

// Non-inclusion-mode variant of WIKITEXT_L3.
NI_WIKITEXT_L3
	: NONINCLUSION_IGNORED_TAG
	| WIKITEXT_L3
	;
//associativity of sequences/////////////////////////////////////////////////////////////////////////////

//P1 ANGLE_BRACKET expressions have associativity from the Left
//i.e. left most opening tag in a sequence is the sequence root 


// Every construct that is introduced by an angle bracket.
ANGLE_BRACKET
	: COMMENT
	| ONLY_INCLUDE_SEQUENCE
//	| UNCLOSED_COMMENT	(disabled)
	| LINE_EATING_COMMENT
	| INCLUSION_IGNORED_TAG
	| NONINCLUSION_IGNORED_TAG
	| XMLiS_ELEMENT
	;

//based on [1]

Test Sets

EOF and broken xxx rules

ANTLR offers some remedies; however, the issue is not well explained.

The example {{ [[ }} ]] is a poor choice for illustrating the problem, since it looks like a syntax error. If we consider {{a [[b | {{c|d}} ]] }}, it is easier to see that the innermost element should be evaluated before the outer ones. The innermost element also starts furthest to the right. If we consider {{a [[b | {{c|d}} | {{e|f}} ]] }}, it seems that the parse tree should become something like:

Parse Tree

   TMPL:e|f
    /\
null  |
      /\
  null  TMPL:c|d
        /\
     null |  
          /\
      null  LNK:b
             /\
         null  TMPL:a

where:
- | are pipes
- /\ are branches
- null are empty place holders
Once parsed, it should then be processed (and inverted) into an AST by further tree-building rules.

EOF rules

As mentioned in the EBNF spec, it is inefficient to run a test to the end of the file for unclosed tokens. However, if the parse tree is built as above, it should take a single pass.

Premature eof

In the case of {{ [[ }} ]]EOF, the EOF is premature and the {{ should either be treated as a syntax error or as a literal (the former makes more sense).

The End Of File Condition

A method is available for reacting to the end of file condition as if it were an event; e.g., you might want to pop the lexer state at the end of an include file. This method, CharScanner.uponEOF(), is called from nextToken() right before the scanner returns an EOF_TYPE token object to parser:

public void uponEOF()
    throws TokenStreamException, CharStreamException;

This event is not generated during a syntactic predicate evaluation (i.e., when the parser is guessing) nor in the middle of the recognition of a lexical rule (that would be an IO exception). This event is generated only after the complete evaluation of the last token and upon the next request from the parser for a token.

You can throw exceptions from this method like "Heh, premature eof" or a retry stream exception. See the includeFile/P.g for an example usage.

Tests

Parser tests are in the maintenance tests.

- empty syntax
[[{{]]}} is parsed as [[{{]}}
{{[[}}]] is parsed as {{[[}}]]
a{{b[[c}}d]]e is parsed as a{{b[[c}}d]]e
using real link and template