/*
 * Flex scanner specification for PHP source (generated symbols use the
 * xhpast prefix). The scanner is reentrant, case-insensitive, keeps a
 * start-condition stack, and passes semantic values through the
 * bison-bridge yylval pointer.
 *
 * How tokens flow:
 *   - pttok/ptok record every matched token in yyextra->token_list and point
 *     *yylval at a new xhpast::Node that carries the index of that token.
 *   - tok additionally returns the token through yy_token(), which is where
 *     the token-driven start-condition changes happen.
 *   - Whitespace and comment rules use ptok without returning, so those
 *     tokens are recorded in the token list but the action does not return
 *     them to the caller.
 *   - Comments and heredocs are accumulated with yymore() across several
 *     rules; yyextra->heredoc_yyleng remembers how much of yytext was already
 *     matched so the heredoc label can be located inside the accumulated text.
 *   - yyextra->lineno is advanced by the rules themselves (directly or via
 *     yy_scan_newlines()).
 *
 * NOTE(review): in this copy several rule groups are not preceded by a
 * start-condition name (bare brace groups, and empty angle brackets where an
 * end-of-file pattern would be expected), and the last %s declaration looks
 * truncated. Compare against the original scanner before regenerating.
 */
%{ #include "ast.hpp" #define push_state(s) xhp_new_push_state(s, yyg) #define pop_state() xhp_new_pop_state(yyg) #define set_state(s) xhp_set_state(s, yyg) /* Record token t with text txt in the token list, then set *yylval to a Node built from 0 and the index of that token. */ #define pttok(t, txt) \ yyextra->token_list.push_back( \ new xhpast::Token(t, txt, yyextra->list_size++)); \ *yylval = new xhpast::Node(0, yyextra->list_size - 1); #define ptok(t) \ pttok(t, yytext); /* Record the token and return it to the caller via yy_token(). */ #define tok(t) \ ptok(t); \ return yy_token(t, yyg) /* If yyextra->insert_token is nonzero, clear it and return it as a token before any input is matched; yy_init is reset to 0 first (presumably so flex runs its initialization again on the next call -- TODO confirm). */ #define YY_USER_INIT \ if (yyextra->insert_token) { \ yyg->yy_init = 0; \ int ft = yyextra->insert_token; \ yyextra->insert_token = 0; \ return yy_token(ft, yyg); \ } using namespace std; const char* yytokname(int tok); static int yy_token(int tok, struct yyguts_t* yyg); static void yy_scan_newlines(const char* text, struct yyguts_t* yyg); %} %option prefix="xhpast" %option reentrant /* PHP allows IF or if */ %option case-insensitive %option noyywrap nodefault %option stack %option bison-bridge %option 8bit /* The different lexing states. Note that the transitions are done either * in the lex actions, or in a generic manner in yy_token(). */ %s PHP %s PHP_COMMENT %s PHP_EOL_COMMENT %s PHP_DOC_COMMENT %s PHP_HEREDOC_START %s PHP_HEREDOC_NSTART %s PHP_HEREDOC_NEWLINE %s PHP_NO_RESERVED_WORDS %s PHP_NO_RESERVED_WORDS_PERSIST %s PHP_ LNUM [0-9]+ DNUM ([0-9]*"."[0-9]+)|([0-9]+"."[0-9]*) EXPONENT_DNUM (({LNUM}|{DNUM})[eE][+-]?{LNUM}) HNUM "0x"[0-9a-fA-F]+ BNUM "0b"[01]+ LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]* BYTE (.|\n) WHITESPACE [ \n\r\t]+ TABS_AND_SPACES [ \t]* NEWLINE ("\r\n"|"\n"|"\r") %% /* Open / close PHP + inline HTML */ { "{ ("?>"|""){NEWLINE}? 
{ yy_scan_newlines(yytext + 2, yyg); tok(T_CLOSE_TAG); } } /* Comments and whitespace */ { "#"|"//" { push_state(PHP_EOL_COMMENT); yymore(); } "/**"{WHITESPACE} { yy_scan_newlines(yytext + 3, yyg); push_state(PHP_DOC_COMMENT); yymore(); } "/*" { push_state(PHP_COMMENT); yymore(); } {WHITESPACE}+ { yy_scan_newlines(yytext, yyg); ptok(T_WHITESPACE); } } <> { ptok(T_COMMENT); pop_state(); } { {NEWLINE} { ++yyextra->lineno; ptok(T_COMMENT); pop_state(); } [^\r\n?]+ yymore(); "?>" { yyless(yyleng - 2); ptok(T_COMMENT); pop_state(); } . yymore(); } { {NEWLINE} { ++yyextra->lineno; yymore(); } [^*\r\n]+|"*" yymore(); } "*/" { ptok(T_DOC_COMMENT); pop_state(); } <> { ptok(T_DOC_COMMENT); pop_state(); } "*/" { ptok(T_COMMENT); pop_state(); } <> { ptok(T_COMMENT); pop_state(); } /* Reserved words */ { include tok(T_INCLUDE); include_once tok(T_INCLUDE_ONCE); eval tok(T_EVAL); require tok(T_REQUIRE); require_once tok(T_REQUIRE_ONCE); or tok(T_LOGICAL_OR); xor tok(T_LOGICAL_XOR); and tok(T_LOGICAL_AND); print tok(T_PRINT); instanceof tok(T_INSTANCEOF); new tok(T_NEW); clone tok(T_CLONE); exit tok(T_EXIT); if tok(T_IF); elseif tok(T_ELSEIF); else tok(T_ELSE); endif tok(T_ENDIF); echo tok(T_ECHO); do tok(T_DO); while tok(T_WHILE); endwhile tok(T_ENDWHILE); for tok(T_FOR); endfor tok(T_ENDFOR); foreach tok(T_FOREACH); endforeach tok(T_ENDFOREACH); declare tok(T_DECLARE); enddeclare tok(T_ENDDECLARE); as tok(T_AS); switch tok(T_SWITCH); endswitch tok(T_ENDSWITCH); case tok(T_CASE); default tok(T_DEFAULT); break tok(T_BREAK); continue tok(T_CONTINUE); goto tok(T_GOTO); function tok(T_FUNCTION); const tok(T_CONST); return tok(T_RETURN); try tok(T_TRY); catch tok(T_CATCH); throw tok(T_THROW); use tok(T_USE); global tok(T_GLOBAL); static tok(T_STATIC); abstract tok(T_ABSTRACT); final tok(T_FINAL); private tok(T_PRIVATE); protected tok(T_PROTECTED); public tok(T_PUBLIC); var tok(T_VAR); unset tok(T_UNSET); isset tok(T_ISSET); empty tok(T_EMPTY); __halt_compiler tok(T_HALT_COMPILER); 
class tok(T_CLASS); interface tok(T_INTERFACE); extends tok(T_EXTENDS); implements tok(T_IMPLEMENTS); list tok(T_LIST); array tok(T_ARRAY); __class__ tok(T_CLASS_C); __method__ tok(T_METHOD_C); __function__ tok(T_FUNC_C); __line__ tok(T_LINE); __file__ tok(T_FILE); namespace tok(T_NAMESPACE); __namespace__ tok(T_NS_C); __dir__ tok(T_DIR); insteadof tok(T_INSTEADOF); callable tok(T_CALLABLE); trait tok(T_TRAIT); __trait__ tok(T_TRAIT_C); yield tok(T_YIELD); finally tok(T_FINALLY); } /* Operators */ { "+=" tok(T_PLUS_EQUAL); "-=" tok(T_MINUS_EQUAL); "*=" tok(T_MUL_EQUAL); "/=" tok(T_DIV_EQUAL); ".=" tok(T_CONCAT_EQUAL); "%=" tok(T_MOD_EQUAL); "&=" tok(T_AND_EQUAL); "|=" tok(T_OR_EQUAL); "^=" tok(T_XOR_EQUAL); "<<=" tok(T_SL_EQUAL); ">>=" tok(T_SR_EQUAL); "||" tok(T_BOOLEAN_OR); "&&" tok(T_BOOLEAN_AND); "==" tok(T_IS_EQUAL); "!="|"<>" tok(T_IS_NOT_EQUAL); "===" tok(T_IS_IDENTICAL); "!==" tok(T_IS_NOT_IDENTICAL); "<=" tok(T_IS_SMALLER_OR_EQUAL); ">=" tok(T_IS_GREATER_OR_EQUAL); "<<" tok(T_SL); ">>" tok(T_SR); "++" tok(T_INC); "--" tok(T_DEC); "->" tok(T_OBJECT_OPERATOR); "=>" tok(T_DOUBLE_ARROW); "::" tok(T_PAAMAYIM_NEKUDOTAYIM); "\\" tok(T_NS_SEPARATOR); "..." tok(T_ELLIPSIS); "??" 
tok(T_COALESCE); "<=>" tok(T_SPACESHIP); } /* Casts */ { "("{TABS_AND_SPACES}(int|integer){TABS_AND_SPACES}")" tok(T_INT_CAST); "("{TABS_AND_SPACES}(real|double|float){TABS_AND_SPACES}")" tok(T_DOUBLE_CAST); "("{TABS_AND_SPACES}(string|binary){TABS_AND_SPACES}")" tok(T_STRING_CAST); "("{TABS_AND_SPACES}array{TABS_AND_SPACES}")" tok(T_ARRAY_CAST); "("{TABS_AND_SPACES}object{TABS_AND_SPACES}")" tok(T_OBJECT_CAST); "("{TABS_AND_SPACES}(bool|boolean){TABS_AND_SPACES}")" tok(T_BOOL_CAST); "("{TABS_AND_SPACES}unset{TABS_AND_SPACES}")" tok(T_UNSET_CAST); } /* Scalars (parsing these doesn't really matter since we just pass them through literally) */ { {LNUM}|{HNUM}|{BNUM} tok(T_LNUMBER); {DNUM}|{EXPONENT_DNUM} tok(T_DNUMBER); {LABEL} tok(T_STRING); "$"{LABEL} tok(T_VARIABLE); b?'(\\.|\\\n|[^\\']+)*'|b?\"(\\.|\\\n|[^\\\"]+)*\" { yy_scan_newlines(yytext, yyg); tok(T_CONSTANT_ENCAPSED_STRING); } `[^`]*` { yy_scan_newlines(yytext, yyg); tok(T_BACKTICKS_EXPR); } } /* (HERE|NOW)DOC's */ b?"<<<"{TABS_AND_SPACES} { push_state(PHP_HEREDOC_START); yyextra->heredoc_yyleng = yyleng; yymore(); } { "'"{LABEL}"'"|\"{LABEL}\" { // Create a new string for the heredoc label. Since we're using yymore above // yytext will actually start at the "<<<" and not the label. Use of // heredoc_yyleng jumps past that. Then we add 1 to get past the " or '. The // match is similar to calculate length. 
yyextra->heredoc_label = string( yytext + yyextra->heredoc_yyleng + 1, yyleng - yyextra->heredoc_yyleng - 2); set_state(PHP_HEREDOC_NSTART); yyextra->heredoc_yyleng = yyleng; yymore(); } {LABEL} { yyextra->heredoc_label = string(yytext + yyextra->heredoc_yyleng); set_state(PHP_HEREDOC_NSTART); yyextra->heredoc_yyleng = yyleng; yymore(); } } {NEWLINE} { yyextra->heredoc_yyleng = yyleng; set_state(PHP_HEREDOC_NEWLINE); yymore(); } { {LABEL};?{NEWLINE} { if (strncmp( yyextra->heredoc_label.c_str(), yytext + yyextra->heredoc_yyleng, yyextra->heredoc_label.size()) == 0) { switch (yytext[yyextra->heredoc_yyleng + yyextra->heredoc_label.size()]) { case ';': case '\n': case '\r': yyless( yyleng - ( yyleng - yyextra->heredoc_yyleng - yyextra->heredoc_label.size())); pop_state(); tok(T_HEREDOC); } } ++yyextra->lineno; yyextra->heredoc_yyleng = yyleng; yymore(); } [^\r\n]+ { yyextra->heredoc_yyleng = yyleng; yymore(); } {NEWLINE} { ++yyextra->lineno; yyextra->heredoc_yyleng = yyleng; yymore(); } } /* Other */ <*>{BYTE} { tok(yytext[0]); // fix unused function warnings yy_top_state(NULL); yyunput(0, 0, NULL); } %% #ifdef DEBUG static const char* yy_state_name(int state) { switch (state) { case INITIAL: return "INITIAL"; case PHP: return "PHP"; case PHP_COMMENT: return "PHP_COMMENT"; case PHP_EOL_COMMENT: return "PHP_EOL_COMMENT"; case PHP_DOC_COMMENT: return "PHP_DOC_COMMENT"; case PHP_HEREDOC_START: return "PHP_HEREDOC_START"; case PHP_HEREDOC_NSTART: return "PHP_HEREDOC_NSTART"; case PHP_HEREDOC_NEWLINE: return "PHP_HEREDOC_NEWLINE"; case PHP_NO_RESERVED_WORDS: return "PHP_NO_RESERVED_WORDS"; case PHP_NO_RESERVED_WORDS_PERSIST: return "PHP_NO_RESERVED_WORDS_PERSIST"; default: return "???"; } } static void yy_log_token(int tok) { const char* tokname = yytokname(tok); if (tokname) { fprintf(stderr, "--> %s\n", tokname); } else { fprintf(stderr, "--> '%c'\n", tok); } } #endif static int yy_token(int tok, yyguts_t* yyg) { if (YY_START == PHP_NO_RESERVED_WORDS) { pop_state(); } 
/* Start-condition changes driven by the token being returned. */ switch (tok) { case T_OPEN_TAG: case T_OPEN_TAG_WITH_ECHO: case T_OPEN_TAG_FAKE: push_state(PHP); break; case T_CLOSE_TAG: pop_state(); // We need to return a ';', not a T_CLOSE_TAG, because a construct like // "" is valid and there are about a billion parser rules // which terminate with ';' so making a new rule like // "semicolon_or_close_tag" would be hard. The token in yylval has the // correct type and value, we just don't generate a node. return ';'; // In PHP it's ok to use keywords such as 'if' as field names // or function names. case T_OBJECT_OPERATOR: case T_FUNCTION: push_state(PHP_NO_RESERVED_WORDS); break; case T_PAAMAYIM_NEKUDOTAYIM: push_state(PHP_NO_RESERVED_WORDS); break; } #ifdef DEBUG yy_log_token(tok); #endif return yyextra->last_token = tok; } static inline void yy_scan_newlines(const char* text, struct yyguts_t* yyg) { for (; *text; ++text) { if (*text == '\r') { if (text[1] == '\n') { ++text; } ++yyextra->lineno; } else if (*text == '\n') { ++yyextra->lineno; } } } void xhp_new_push_state(int s, struct yyguts_t* yyg) { #ifdef DEBUG fprintf( stderr, "--> PUSH(%s -> %s)\n", yy_state_name(YY_START), yy_state_name(s)); #endif yy_push_state(s, yyg); } void xhp_new_pop_state(struct yyguts_t* yyg) { #ifdef DEBUG int s = YY_START; #endif yy_pop_state(yyg); #ifdef DEBUG fprintf( stderr, "--> POP(%s -> %s)\n", yy_state_name(s), yy_state_name(YY_START)); #endif } void xhp_set_state(int s, struct yyguts_t* yyg) { #ifdef DEBUG fprintf(stderr, "--> SET(%s)\n", yy_state_name(s)); #endif BEGIN(s); }