/* Markup spec/flex */

/* Scanner for Wikipedia language.  Built with flex.  */


/* Named patterns used by the rules below.  Octal escapes: \040 is a
   space, \133 is '[' and \135 is ']'.  */

/* Line endings. */
CARRIAGERETURN                       \r
CARRIAGERETURN_DOUBLE                \r\n\r
/* Characters accepted in a URL after the "scheme://" prefix. */
VALIDURLCHARS                        [a-z0-9\%\/\?\:\@\=\&\$\_\-\+\!\*\'\(\)\,\.]
NEWPARAGRAPH                         \n\n
/* Tags that open and close the verbatim regions. */
MATH                                 <math>
MATH_END                             <\/math>
NOWIKI                               <nowiki>
NOWIKI_END                           <\/nowiki>
/* Links.  A titled link is "[URL title]"; the title excludes both '['
   and ']' so that the match always ends at the first closing bracket
   and no text following it is swallowed by the link rule.  */
GENERICLINK                          [a-z]+:\/\/{VALIDURLCHARS}+
TITLEDLINK                           \133{GENERICLINK}\ [^\133\135]*\135
WIKILINK                             \133{2}[^\135]+\135{2}
/* Date and time substitutions. */
CURRENTDAY                           \{\{CURRENTDAY\}\}
CURRENTMONTH                         \{\{CURRENTMONTH\}\}
CURRENTTIME                          \{\{CURRENTTIME\}\}
CURRENTYEAR                          \{\{CURRENTYEAR\}\}
/* A newline followed by one of the list marker characters : # ; * */
LIST                                 \n[\:\#\;\*]
/* Preformatted text: a space at the start of a line opens it, a newline
   followed by anything other than a space ends it.  */
PRE                                  ^\040
PRE_END                              \n[^\040]
/* Four dashes at the start of a line. */
HR                                   ^----
/* Heading markers: a run of one to six '=' characters. */
H1                                   ={1}
H2                                   ={2}
H3                                   ={3}
H4                                   ={4}
H5                                   ={5}
H6                                   ={6}
/* Emphasis markers: runs of two, three or five apostrophes. */
EMPHASIZE                            '{2}
SEMPHASIZE                           '{3}
VSEMPHASIZE                          '{5}
LESSERTHAN                           <
GREATERTHAN                          >

/* caseless: patterns match regardless of letter case.
   stack:    enables yy_push_state() / yy_pop_state(), used by the rules.  */
%option caseless stack
/* Inclusive start conditions.  NOTE(review): no rule visible in this file
   ever enters "list".  */
%s list pre
/* Exclusive start conditions: only rules explicitly tagged <math> or
   <nowiki> are active inside them.  */
%x math nowiki

%{
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>

#define MAXLIST 32
%}

%%

%{

/* Open/closed flags for the toggling constructs, one slot each
   (int state[10]); every slot starts at 0:
0 = pre
1 = h1
2 = h2
3 = h3
4 = h4
5 = h5
6 = h6
7 = emphasis
8 = strong emphasis
9 = very strong emphasis */
int state[10] = { 0 };

/* Scratch variables for use inside rule actions. */
int i;
char j;

/* Holds the text of the most recently matched list marker (like *#*);
   initially the empty string. */
char listtext[MAXLIST] = "";

/* Time backing the CURRENTTIME-like substitutions.  It is sampled here,
   when the scanner starts, rather than each time a substitution is hit. */
time_t time_since_epoch = time(NULL);
struct tm cur_time;

/* Break the timestamp down into UTC calendar fields. */
gmtime_r(&time_since_epoch, &cur_time);

%}

{CARRIAGERETURN_DOUBLE}              {
                                     /* CR LF CR is replaced by a single newline that is pushed
                                        back for rescanning (presumably so that the LF which
                                        follows completes a paragraph break). */
                                     unput('\n');
                                     }
{CARRIAGERETURN}                     { /* Any other carriage return is discarded. */ }

{WIKILINK}                           {
                                     /* [[...]] links are copied to the output unchanged. */
                                     ECHO;
                                     }

{NOWIKI}                             {
                                     /* Enter the exclusive nowiki state; the opening tag itself
                                        is not printed. */
                                     yy_push_state(nowiki);
                                     }
<nowiki>{NOWIKI_END}                 { yy_pop_state(); }
<nowiki>{LESSERTHAN}                 {
                                     /* Emit angle brackets as HTML entities so that markup
                                        inside a nowiki region is displayed as literal text
                                        instead of being interpreted by the browser. */
                                     printf("&lt;");
                                     }
<nowiki>{GREATERTHAN}                { printf("&gt;"); }

{MATH}                               {
                                     /* Enter the exclusive math state.  No rule other than the
                                        closing tag is active there, so the content falls through
                                        to flex's default rule and is echoed as-is. */
                                     yy_push_state(math);
                                     }
<math>{MATH_END}                     { yy_pop_state(); }

{PRE}                                {
                                     /* A space at the start of a line.  The first one opens a
                                        <pre> block and enters the "pre" start condition; while a
                                        block is already open (state[0] != 0) the space is simply
                                        consumed and nothing is printed. */
                                     if (state[0] == 0) { printf("\n<pre>"); state[0]++; yy_push_state(pre); }
                                     }
<pre>{PRE_END}                       { /* A newline followed by a non-space character closes the
                                        block.  yyless(0) returns the whole match to the input so
                                        that it is scanned again after the state has been popped. */
                                     printf("</pre>"); state[0]--; yyless(0); yy_pop_state(); }

{HR}                                 {
                                     /* Four dashes at the start of a line become a rule. */
                                     fputs("\n<hr>", stdout);
                                     }
{NEWPARAGRAPH}                       {
                                     /* Two consecutive newlines start a paragraph.  One newline
                                        is pushed back so that it is scanned again. */
                                     fputs("\n<p>", stdout);
                                     unput('\n');
                                     }

{VSEMPHASIZE}                        {
  /* Five apostrophes toggle combined strong + emphasis.  The closing tags
     are emitted in the reverse order of the opening ones so that the
     elements nest properly. */
  if (state[9] == 0) { printf("<strong><em>");   state[9]++; }
  else               { printf("</em></strong>"); state[9]--; }
                                     }
{SEMPHASIZE}                         {
  /* Three apostrophes toggle strong emphasis. */
  if (state[8] == 0) { printf("<strong>");  state[8]++; }
  else               { printf("</strong>"); state[8]--; }
                                     }
{EMPHASIZE}                          {
  /* Two apostrophes toggle emphasis. */
  if (state[7] == 0) { printf("<em>");  state[7]++; }
  else               { printf("</em>"); state[7]--; }
                                     }

{H6}                                 {
  /* Each heading rule toggles its flag in state[]: the first marker of a
     given length prints the opening tag, the next one the closing tag. */
  fputs(state[6] ? "</h6>" : "<h6>", stdout);
  state[6] = !state[6];
                                     }
{H5}                                 {
  fputs(state[5] ? "</h5>" : "<h5>", stdout);
  state[5] = !state[5];
                                     }
{H4}                                 {
  fputs(state[4] ? "</h4>" : "<h4>", stdout);
  state[4] = !state[4];
                                     }
{H3}                                 {
  fputs(state[3] ? "</h3>" : "<h3>", stdout);
  state[3] = !state[3];
                                     }
{H2}                                 {
  fputs(state[2] ? "</h2>" : "<h2>", stdout);
  state[2] = !state[2];
                                     }
{H1}                                 {
  fputs(state[1] ? "</h1>" : "<h1>", stdout);
  state[1] = !state[1];
                                     }

{TITLEDLINK}                         {
                                     /* yytext is "[" URL " " TITLE "]".  The URL cannot contain a
                                        space, so the first space ends it; the title runs up to
                                        the first ']'.  Local pointers are used so that the
                                        scanner's own yytext pointer is left untouched. */
                                     const char *url   = yytext + 1;
                                     const char *space = strchr(url, ' ');
                                     const char *title = space + 1;
                                     const char *close = strchr(title, ']');

                                     printf("<a href=\"%.*s\">%.*s</a>",
                                            (int) (space - url), url,
                                            (int) (close - title), title);

                                     /* If the pattern matched beyond the first ']', hand the
                                        remainder back to the scanner instead of dropping it. */
                                     yyless((int) (close - yytext) + 1);
                                     }
{GENERICLINK}                        {
                                     /* A bare URL becomes a link whose text is the URL itself. */
                                     char *last = yytext + yyleng - 1;

                                     j = *last;
                                     printf("<a href=\"");
                                     if ((j != '.') && (j != ','))
                                       {
                                       printf("%s\">%s</a>", yytext, yytext);
                                       }
                                     else
                                       {
                                       /* A trailing '.' or ',' is taken to be sentence
                                          punctuation: cut it off the URL and print it after
                                          the link instead. */
                                       *last = '\0';
                                       printf("%s\">%s</a>%c", yytext, yytext, j);
                                       }
                                     }
{CURRENTTIME}                        {
                                     /* HH:MM, both fields zero-padded to two digits (cur_time is
                                        filled in by gmtime_r, i.e. it is UTC). */
                                     printf("%.2d:%.2d", cur_time.tm_hour, cur_time.tm_min);
                                     }
{CURRENTDAY}                         { printf("%d", cur_time.tm_mday); }
{CURRENTMONTH}                       { /* tm_mon counts from 0. */ printf("%.2d", (cur_time.tm_mon + 1)); }
{CURRENTYEAR}                        { /* tm_year counts from 1900. */ printf("%d", (cur_time.tm_year + 1900)); }
{LIST}                               {
                                     /* A newline followed by a list marker.  The match is saved
                                        in listtext when it fits; nothing is printed, so the
                                        newline and the marker do not reach the output. */
                                     if (strlen(yytext) < MAXLIST)
                                       {
                                       strcpy(listtext, yytext);
                                       /* NOTE(review): list rendering looks unfinished.  A
                                          disabled loop over listtext used to sit here; it never
                                          advanced its index, so it must not be re-enabled as it
                                          was written. */
                                       }
                                     }


%%

/* Entry point.  Scans the file named by the first command-line argument,
   or standard input when no argument is given, and runs the scanner over
   it.  Returns 0 on success and 1 if the input file cannot be opened.  */
int main (int argc, char **argv)
  {
  if (argc > 1)
    {
    yyin = fopen(argv[1], "r");
    if (yyin == NULL)
      {
      perror(argv[1]);
      return 1;
      }
    }
  /* When yyin is still unset here, flex reads from stdin by default. */
  yylex();
  if ((yyin != NULL) && (yyin != stdin)) { fclose(yyin); }
  return 0;
  }