/*
 * Copyright 2011 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

%{
#include "ast.hpp"

/* Start-condition helpers; all of them need the scanner globals `yyg`. */
#define push_state(s) xhp_new_push_state(s, yyg)
#define pop_state() xhp_new_pop_state(yyg)
#define set_state(s) xhp_set_state(s, yyg)

/* Last token handed to the parser through yy_token(). */
#define last_token() yyextra->last_token

/* Runs before every action: remember the line the current token starts on,
 * unless this match extends an earlier one (yy_more_len is non-zero). */
#define YY_USER_ACTION \
  if (!yyg->yy_more_len) \
    yyextra->first_lineno = yyextra->lineno;

/* Append a Token(t, txt) to the token list and point *yylval at a new
 * Node(0, index-of-that-token). */
#define pttok(t, txt) \
  yyextra->token_list.push_back(new xhpast::Token(t, txt, yyextra->list_size++)); \
  *yylval = new xhpast::Node(0, yyextra->list_size - 1);

/* Record the current match as a token of type t without returning. */
#define ptok(t) \
  pttok(t, yytext);

/* Record the current match and return it to the parser via yy_token(). */
#define tok(t) \
  ptok(t); \
  return yy_token(t, yyg)

/* Record an XHP entity and switch to XHP_AFTER_ENT. Note that the argument
 * `t` is not used: the token text is always the raw yytext. */
#define tokt(t) pttok(T_XHP_ENTITY, yytext); push_state(XHP_AFTER_ENT); return yyextra->last_token = T_XHP_ENTITY;

/* Runs once on the first call: if a token was queued in insert_token, clear
 * it, reset yy_init to 0 and return that token through yy_token(). */
#define YY_USER_INIT \
  if (yyextra->insert_token) { \
    yyg->yy_init = 0; \
    int ft = yyextra->insert_token; \
    yyextra->insert_token = 0; \
    return yy_token(ft, yyg); \
  }

using namespace std;

const char* yytokname(int tok);
static int yy_token(int tok, struct yyguts_t* yyg);
static void yy_scan_newlines(const char* text, struct yyguts_t* yyg);

/*
 * Encode the code point `v` as a NUL-terminated UTF-8 sequence in `buf`,
 * which must have room for at least 5 bytes.
 *
 * Returns true on success. Returns false, leaving `buf` as an empty string,
 * when `v` is not a Unicode scalar value (a UTF-16 surrogate, or anything
 * above U+10FFFF), since those cannot be represented in well-formed UTF-8.
 */
static bool utf8ize(uint32_t v, char* buf /* [5] */) {
  if ((v >= 0xd800 && v <= 0xdfff) || v > 0x10ffff) {
    // The result of this function may be ignored by callers, so never hand
    // back an uninitialized buffer.
    buf[0] = 0;
    return false;
  }

  if (v <= 0x7f) { // 0xxxxxxx
    buf[0] = static_cast<char>(v);
    buf[1] = 0;
  } else if (v <= 0x7ff) { // 110yyyxx 10xxxxxx
    buf[0] = static_cast<char>(0xc0 | (v >> 6));
    buf[1] = static_cast<char>(0x80 | (v & 0x3f));
    buf[2] = 0;
  } else if (v <= 0xffff) { // 1110yyyy 10yyyyxx 10xxxxxx
    buf[0] = static_cast<char>(0xe0 | (v >> 12));
    buf[1] = static_cast<char>(0x80 | ((v >> 6) & 0x3f));
    buf[2] = static_cast<char>(0x80 | (v & 0x3f));
    buf[3] = 0;
  } else { // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
    buf[0] = static_cast<char>(0xf0 | (v >> 18));
    buf[1] = static_cast<char>(0x80 | ((v >> 12) & 0x3f));
    buf[2] = static_cast<char>(0x80 | ((v >> 6) & 0x3f));
    buf[3] = static_cast<char>(0x80 | (v & 0x3f));
    buf[4] = 0;
  }
  return true;
}
%}

%option prefix="xhpast"
%option reentrant
 /* PHP allows IF or if */
%option case-insensitive
%option noyywrap nodefault
%option stack
%option bison-bridge
%option 8bit

 /* I think an interactive scanner is required because of the bison state
  * pushing we do. I'm putting an explicit interactive declaration here in case
  * someone tries adding -CF or whatever to the make flags. */
%option interactive

 /* The different lexing states. Note that the transitions are done either
  * in the lex actions, or in a generic manner in yy_token(). */
%s PHP
%s PHP_COMMENT
%s PHP_EOL_COMMENT
%s PHP_DOC_COMMENT
%s PHP_HEREDOC_START
%s PHP_HEREDOC_NSTART
%s PHP_HEREDOC_NEWLINE
%s PHP_HEREDOC_DATA
%s PHP_NO_RESERVED_WORDS
%s PHP_NO_RESERVED_WORDS_PERSIST
%s PHP_
%s XHP_LABEL
%s XHP_LABEL_WHITESPACE
%s XHP_ATTRS
%s XHP_ATTR_VAL
%s XHP_AFTER_ENT
%s XHP_CHILD
%s XHP_CHILD_START
%s XHP_INVALID_ENTITY
%s XHP_ATTR_TYPE_DECL
%s XHP_CHILDREN_DECL

LNUM [0-9]+
DNUM ([0-9]*"."[0-9]+)|([0-9]+"."[0-9]*)
EXPONENT_DNUM (({LNUM}|{DNUM})[eE][+-]?{LNUM})
HNUM "0x"[0-9a-fA-F]+
LABEL [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
BYTE (.|\n)
WHITESPACE [ \n\r\t]+
TABS_AND_SPACES [ \t]*
NEWLINE ("\r\n"|"\n"|"\r")

%%

 /* NOTE(review): every rule scope in this section opens with a bare "{".
  * The <START_CONDITION> prefixes appear to have been stripped from this
  * copy and must be restored from a pristine source before flex will accept
  * the file. The per-scope guesses below are hints only. */

 /* XHP attribute type keywords.
  * NOTE(review): presumably the XHP_ATTR_TYPE_DECL scope -- confirm. */
{
  "bool" tok(T_XHP_BOOLEAN);
  "int" tok(T_XHP_NUMBER);
  "float" tok(T_XHP_FLOAT);
  "var" tok(T_VAR);
  "array" tok(T_XHP_ARRAY);
  "string" tok(T_XHP_STRING);
  "enum" tok(T_XHP_ENUM);
  @required tok(T_XHP_REQUIRED);
  "(" tok('(');
  ":" tok(T_XHP_COLON);
}

 /* Open / close PHP + inline HTML */
 /* NOTE(review): presumably the INITIAL scope. The first two rule heads
  * below are truncated in this copy. By analogy with the asp_tags rules that
  * follow they most likely tested yyextra->short_tags for the short open
  * tags, and a rule for the long open tag may be missing entirely. They are
  * left exactly as found; restore them from a pristine source. */
{
  "short_tags) {
      tok(T_OPEN_TAG);
    } else {
      tok(T_INLINE_HTML);
    }
  }
  "short_tags) {
      tok(T_OPEN_TAG_WITH_ECHO);
    } else {
      tok(T_INLINE_HTML);
    }
  }
  "<%" {
    if (yyextra->asp_tags) {
      tok(T_OPEN_TAG);
    } else {
      tok(T_INLINE_HTML);
    }
  }
  "<%=" {
    if (yyextra->asp_tags) {
      tok(T_OPEN_TAG_WITH_ECHO);
    } else {
      tok(T_INLINE_HTML);
    }
  }
"<"|[^<]* {
    yy_scan_newlines(yytext, yyg);
    tok(T_INLINE_HTML);
  }
}
 /* Close tags.
  * NOTE(review): the second alternative of the first pattern is an empty
  * string in this copy; another close-tag spelling was probably lost. */
{
  ("?>"|""){NEWLINE}? {
    yy_scan_newlines(yytext + 2, yyg);
    tok(T_CLOSE_TAG);
  }
  "%>" {
    if (yyextra->asp_tags) {
      tok(T_CLOSE_TAG);
    } else {
      /* Not a close tag: give back the ">" and emit "%" on its own. */
      yyless(1);
      tok(yytext[0]);
    }
  }
}

 /* Comments and whitespace */
{
  "#"|"//" {
    push_state(PHP_EOL_COMMENT);
    yymore();
  }
  "/**"{WHITESPACE} {
    yy_scan_newlines(yytext + 3, yyg);
    push_state(PHP_DOC_COMMENT);
    yymore();
  }
  "/*" {
    push_state(PHP_COMMENT);
    yymore();
  }
  {WHITESPACE}+ {
    yy_scan_newlines(yytext, yyg);
    ptok(T_WHITESPACE);
  }
}
 /* Body of a "#" or "//" comment: ends at a newline or just before "?>".
  * NOTE(review): presumably the PHP_EOL_COMMENT scope -- confirm. */
{
  {NEWLINE} {
    ++yyextra->lineno;
    ptok(T_COMMENT);
    pop_state();
  }
  [^\r\n?]+ yymore();
  "?>" {
    /* Leave the close tag in the input for the enclosing state. */
    yyless(yyleng - 2);
    ptok(T_COMMENT);
    pop_state();
  }
  . yymore();
}
 /* Body shared by block and doc comments.
  * NOTE(review): presumably PHP_DOC_COMMENT and PHP_COMMENT -- confirm. */
{
  {NEWLINE} {
    ++yyextra->lineno;
    yymore();
  }
  [^*\r\n]+|"*" yymore();
}
 /* NOTE(review): the next two rules are textually identical in this copy;
  * they presumably carried PHP_DOC_COMMENT and PHP_COMMENT prefixes. */
"*/" {
  ptok(T_DOC_COMMENT);
  pop_state();
}
"*/" {
  ptok(T_COMMENT);
  pop_state();
}

 /* Reserved words */
{
  include tok(T_INCLUDE);
  include_once tok(T_INCLUDE_ONCE);
  eval tok(T_EVAL);
  require tok(T_REQUIRE);
  require_once tok(T_REQUIRE_ONCE);
  or tok(T_LOGICAL_OR);
  xor tok(T_LOGICAL_XOR);
  and tok(T_LOGICAL_AND);
  print tok(T_PRINT);
  instanceof tok(T_INSTANCEOF);
  new tok(T_NEW);
  clone tok(T_CLONE);
  exit tok(T_EXIT);
  if tok(T_IF);
  elseif tok(T_ELSEIF);
  else tok(T_ELSE);
  endif tok(T_ENDIF);
  echo tok(T_ECHO);
  do tok(T_DO);
  while tok(T_WHILE);
  endwhile tok(T_ENDWHILE);
  for tok(T_FOR);
  endfor tok(T_ENDFOR);
  foreach tok(T_FOREACH);
  endforeach tok(T_ENDFOREACH);
  declare tok(T_DECLARE);
  enddeclare tok(T_ENDDECLARE);
  as tok(T_AS);
  switch tok(T_SWITCH);
  endswitch tok(T_ENDSWITCH);
  case tok(T_CASE);
  default tok(T_DEFAULT);
  break tok(T_BREAK);
  continue tok(T_CONTINUE);
  goto tok(T_GOTO);
  function tok(T_FUNCTION);
  const tok(T_CONST);
  return tok(T_RETURN);
  try tok(T_TRY);
  catch tok(T_CATCH);
  throw tok(T_THROW);
  use tok(T_USE);
  global tok(T_GLOBAL);
  static tok(T_STATIC);
  abstract tok(T_ABSTRACT);
  final tok(T_FINAL);
  private tok(T_PRIVATE);
  protected tok(T_PROTECTED);
  public tok(T_PUBLIC);
  var tok(T_VAR);
  unset tok(T_UNSET);
  isset
tok(T_ISSET);
  empty tok(T_EMPTY);
  __halt_compiler tok(T_HALT_COMPILER);
  class tok(T_CLASS);
  interface tok(T_INTERFACE);
  extends tok(T_EXTENDS);
  implements tok(T_IMPLEMENTS);
  list tok(T_LIST);
  array tok(T_ARRAY);
  __class__ tok(T_CLASS_C);
  __method__ tok(T_METHOD_C);
  __function__ tok(T_FUNC_C);
  __line__ tok(T_LINE);
  __file__ tok(T_FILE);
  namespace tok(T_NAMESPACE);
  __namespace__ tok(T_NS_C);
  __dir__ tok(T_DIR);
  /* The three words below are XHP keywords only at the start of a statement
   * (previous token is '{', '}' or ';') while the parser has flagged that it
   * expects XHP class statements; otherwise they are ordinary identifiers. */
  attribute {
    // expecting_xhp_class_statements is set in some actions in the grammar.
    // This means the lexer and parser are interdependent.
    if ((last_token() == '{' || last_token() == '}' || last_token() == ';') &&
        (yyextra->expecting_xhp_class_statements)) {
      tok(T_XHP_ATTRIBUTE);
    } else {
      tok(T_STRING);
    }
  }
  category {
    if ((last_token() == '{' || last_token() == '}' || last_token() == ';') &&
        (yyextra->expecting_xhp_class_statements)) {
      tok(T_XHP_CATEGORY);
    } else {
      tok(T_STRING);
    }
  }
  children {
    if ((last_token() == '{' || last_token() == '}' || last_token() == ';') &&
        (yyextra->expecting_xhp_class_statements)) {
      tok(T_XHP_CHILDREN);
    } else {
      tok(T_STRING);
    }
  }
}

 /* Operators */
 /* NOTE(review): this scope, like every other bare "{" scope below, has no
  * <START_CONDITION> prefix in this copy; the prefixes appear to have been
  * stripped and must be restored from a pristine source. */
{
  "+=" tok(T_PLUS_EQUAL);
  "-=" tok(T_MINUS_EQUAL);
  "*=" tok(T_MUL_EQUAL);
  "/=" tok(T_DIV_EQUAL);
  ".=" tok(T_CONCAT_EQUAL);
  "%=" tok(T_MOD_EQUAL);
  "&=" tok(T_AND_EQUAL);
  "|=" tok(T_OR_EQUAL);
  "^=" tok(T_XOR_EQUAL);
  "<<=" tok(T_SL_EQUAL);
  ">>=" tok(T_SR_EQUAL);
  "||" tok(T_BOOLEAN_OR);
  "&&" tok(T_BOOLEAN_AND);
  "==" tok(T_IS_EQUAL);
  "!="|"<>" tok(T_IS_NOT_EQUAL);
  "===" tok(T_IS_IDENTICAL);
  "!==" tok(T_IS_NOT_IDENTICAL);
  "<=" tok(T_IS_SMALLER_OR_EQUAL);
  ">=" tok(T_IS_GREATER_OR_EQUAL);
  "<<" tok(T_SL);
  ">>" tok(T_SR);
  "++" tok(T_INC);
  "--" tok(T_DEC);
  "->" tok(T_OBJECT_OPERATOR);
  "=>" tok(T_DOUBLE_ARROW);
  "::" tok(T_PAAMAYIM_NEKUDOTAYIM);
  "\\" tok(T_NS_SEPARATOR);
  ":" {
    // A colon can either mean the start (or component) of an XHP class,
    // a ternary expression (as in 1?false:null), the colon of a 'case',
    // or finally the start of a block in the old PHP syntax. The following
    // disambiguates between the XHP case, which requires a special token,
    // and the other cases, based on the previously returned token.
    switch (yyextra->last_token) {
      // In a ternary expression, the colon must follow a full-fledged
      // expression so seeing for instance a binary operator means
      // it must be an XHP class.
      case ',': case '=': case '|': case '^': case '&': case '<': case '>':
      case '+': case '-': case '%': case '!': case '~': case '[': case '(':
      case '{': case '.':
      case T_LOGICAL_OR: case T_LOGICAL_XOR: case T_LOGICAL_AND:
      case T_PLUS_EQUAL: case T_MINUS_EQUAL: case T_MUL_EQUAL:
      case T_DIV_EQUAL: case T_CONCAT_EQUAL: case T_MOD_EQUAL:
      case T_AND_EQUAL: case T_OR_EQUAL: case T_XOR_EQUAL:
      case T_SL_EQUAL: case T_SR_EQUAL:
      case T_BOOLEAN_OR: case T_BOOLEAN_AND:
      case T_IS_EQUAL: case T_IS_NOT_EQUAL:
      case T_IS_IDENTICAL: case T_IS_NOT_IDENTICAL:
      case T_IS_SMALLER_OR_EQUAL: case T_IS_GREATER_OR_EQUAL:
      // An XHP class can also occur after certain keywords. Not sure
      // we got them all covered though.
      case T_ECHO: case T_RETURN: case T_EXTENDS: case T_INSTANCEOF:
      case T_DOUBLE_ARROW: case T_XHP_ATTRIBUTE:
        tok(T_XHP_COLON);
        break;
      default:
        tok(':');
        break;
    }
  }
}

 /* Casts */
{
  "("{TABS_AND_SPACES}(int|integer){TABS_AND_SPACES}")" tok(T_INT_CAST);
  "("{TABS_AND_SPACES}(real|double|float){TABS_AND_SPACES}")" tok(T_DOUBLE_CAST);
  "("{TABS_AND_SPACES}string{TABS_AND_SPACES}")" tok(T_STRING_CAST);
  "("{TABS_AND_SPACES}unicode{TABS_AND_SPACES}")" tok(T_UNICODE_CAST);
  "("{TABS_AND_SPACES}binary{TABS_AND_SPACES}")" tok(T_BINARY_CAST);
  "("{TABS_AND_SPACES}array{TABS_AND_SPACES}")" tok(T_ARRAY_CAST);
  "("{TABS_AND_SPACES}object{TABS_AND_SPACES}")" tok(T_OBJECT_CAST);
  "("{TABS_AND_SPACES}(bool|boolean){TABS_AND_SPACES}")" tok(T_BOOL_CAST);
  "("{TABS_AND_SPACES}unset{TABS_AND_SPACES}")" tok(T_UNSET_CAST);
}

 /* Scalars (parsing these doesn't really matter since we just pass them
  * through literally) */
{
  {LNUM}|{HNUM} tok(T_LNUMBER);
  {DNUM}|{EXPONENT_DNUM} tok(T_DNUMBER);
  {LABEL} tok(T_STRING);
  "$"{LABEL} tok(T_VARIABLE);
  b?'(\\.|\\\n|[^\\']+)*'|b?\"(\\.|\\\n|[^\\\"]+)*\" {
    /* Quoted strings may span lines, so keep the line counter in sync. */
    yy_scan_newlines(yytext, yyg);
    tok(T_CONSTANT_ENCAPSED_STRING);
  }
  `[^`]*` {
    yy_scan_newlines(yytext, yyg);
    tok(T_BACKTICKS_EXPR);
  }
}

 /* (HERE|NOW)DOC's */
 /* The whole heredoc is accumulated into one token with yymore();
  * heredoc_yyleng remembers how much of yytext was already matched so each
  * action can find the part that is new.
  * NOTE(review): the rules and scopes below presumably belonged, in order, to
  * the PHP, PHP_HEREDOC_START, PHP_HEREDOC_NSTART, PHP_HEREDOC_DATA and
  * PHP_HEREDOC_NEWLINE states (judging by the push_state/set_state calls) --
  * confirm against a pristine source. */
b?"<<<"{TABS_AND_SPACES} {
  push_state(PHP_HEREDOC_START);
  yyextra->heredoc_yyleng = yyleng;
  yymore();
}
{
  "'"{LABEL}"'"|\"{LABEL}\" {
    // Create a new string for the heredoc label. Since we're using yymore above
    // yytext will actually start at the "<<<" and not the label. Use of
    // heredoc_yyleng jumps past that. Then we add 1 to get past the " or '. The
    // length is computed the same way, minus the two quote characters.
    yyextra->heredoc_label = string(yytext + yyextra->heredoc_yyleng + 1, yyleng - yyextra->heredoc_yyleng - 2);
    set_state(PHP_HEREDOC_NSTART);
    yyextra->heredoc_yyleng = yyleng;
    yymore();
  }
  {LABEL} {
    /* Unquoted label: everything after the already-matched prefix. */
    yyextra->heredoc_label = string(yytext + yyextra->heredoc_yyleng);
    set_state(PHP_HEREDOC_NSTART);
    yyextra->heredoc_yyleng = yyleng;
    yymore();
  }
}
{NEWLINE} {
  ++yyextra->lineno;
  /* Points just past the newline that ends the opening line. */
  yyextra->heredoc_data = yytext + yyleng;
  set_state(PHP_HEREDOC_DATA);
  yymore();
}
{
  [^\r\n]*{NEWLINE} {
    ++yyextra->lineno;
    set_state(PHP_HEREDOC_NEWLINE);
    yyextra->heredoc_yyleng = yyleng;
    yymore();
  }
}
{
  {LABEL};?{NEWLINE} {
    /* A line that starts with a label: it closes the heredoc only if the
     * label equals the opening one and is directly followed by ';' or a
     * newline. */
    if (strncmp(yyextra->heredoc_label.c_str(), yytext + yyextra->heredoc_yyleng, yyextra->heredoc_label.size()) == 0) {
      switch (yytext[yyextra->heredoc_yyleng + yyextra->heredoc_label.size()]) {
        case ';': case '\n': case '\r':
          /* Keep everything up to and including the label; the ';' and the
           * newline go back to the input. */
          yyless(yyleng - (yyleng - yyextra->heredoc_yyleng - yyextra->heredoc_label.size()));
          pop_state();
          tok(T_HEREDOC);
      }
    }
    /* Not the terminator: treat the line as more heredoc data. */
    ++yyextra->lineno;
    yyextra->heredoc_yyleng = yyleng;
    yymore();
  }
  [^\r\n]+ {
    set_state(PHP_HEREDOC_DATA);
    yyextra->heredoc_yyleng = yyleng;
    yymore();
  }
  {NEWLINE} {
    ++yyextra->lineno;
    yyextra->heredoc_yyleng = yyleng;
    yymore();
  }
}

 /* XHP */
 /* NOTE(review): the first scope presumably belonged to XHP_LABEL_WHITESPACE
  * and the second to the XHP label state(s) -- confirm. */
{
  {WHITESPACE}+ {
    yy_scan_newlines(yytext, yyg);
    ptok(T_WHITESPACE);
  }
}
{
  ":" tok(T_XHP_COLON);
  "-" tok(T_XHP_HYPHEN);
  "::" {
    pop_state();
    /* Consumed by yy_token() when it sees T_PAAMAYIM_NEKUDOTAYIM. */
    yyextra->colon_hack = true;
tok(T_PAAMAYIM_NEKUDOTAYIM);
  }
  "--" {
    pop_state();
    tok(T_DEC);
  }
  {WHITESPACE} {
    yy_scan_newlines(yytext, yyg);
    pop_state();
    tok(T_XHP_WHITESPACE);
  }
  {LABEL} tok(T_STRING);
  . {
    pop_state();
    tok(yytext[0]);
  }
}

 /* NOTE(review): every bare "{" scope below has lost its <START_CONDITION>
  * prefix in this copy; restore the prefixes from a pristine source. The
  * state names mentioned in the notes are guesses based on the tokens and
  * state transitions visible in the actions. */

 /* Inside an XHP tag, between attributes (presumably XHP_ATTRS). */
{
  "="|"/"|">" tok(yytext[0]);
  {WHITESPACE}+ {
    yy_scan_newlines(yytext, yyg);
    ptok(T_WHITESPACE);
  }
  {LABEL} tok(T_STRING);
}
 /* Inside a quoted attribute value (presumably XHP_ATTR_VAL). */
{
  [^&'\\"]+ tok(T_XHP_TEXT);
  \" {
    pop_state();
    tok('"');
  }
}
 /* Right after an opening tag (presumably XHP_CHILD_START). */
{
  {WHITESPACE}+ {
    /* ignore whitespace at the start */
    yy_scan_newlines(yytext, yyg);
    ptok(T_WHITESPACE);
    set_state(XHP_CHILD);
  }
  . {
    /* Anything else is rescanned in XHP_CHILD. */
    yyless(0);
    set_state(XHP_CHILD);
  }
}

 /* Below we use tokt() (and not tok) which internally transits to
  * the XHP_AFTER_ENT state.
  *
  * The entity names are matched case-sensitively ((?-i:...)) because names
  * such as &Alpha; and &alpha; differ only by case. The names follow the XML
  * predefined entities and the HTML 4.01 character entity list; each one is
  * paired with the code point given in its tokt() argument. */
{
  /* xml entities */
  (?-i:&quot;) tokt("\"");
  (?-i:&amp;) tokt("&");
  (?-i:&apos;) tokt("\\'");
  (?-i:&lt;) tokt("<")
  (?-i:&gt;) tokt(">");

  /* html entities */
  (?-i:&nbsp;) tokt("\u00A0");
  (?-i:&iexcl;) tokt("\u00A1");
  (?-i:&cent;) tokt("\u00A2");
  (?-i:&pound;) tokt("\u00A3");
  (?-i:&curren;) tokt("\u00A4");
  (?-i:&yen;) tokt("\u00A5");
  (?-i:&brvbar;) tokt("\u00A6");
  (?-i:&sect;) tokt("\u00A7");
  (?-i:&uml;) tokt("\u00A8");
  (?-i:&copy;) tokt("\u00A9");
  (?-i:&ordf;) tokt("\u00AA");
  (?-i:&laquo;) tokt("\u00AB");
  (?-i:&not;) tokt("\u00AC");
  (?-i:&shy;) tokt("\u00AD");
  (?-i:&reg;) tokt("\u00AE");
  (?-i:&macr;) tokt("\u00AF");
  (?-i:&deg;) tokt("\u00B0");
  (?-i:&plusmn;) tokt("\u00B1");
  (?-i:&sup2;) tokt("\u00B2");
  (?-i:&sup3;) tokt("\u00B3");
  (?-i:&acute;) tokt("\u00B4");
  (?-i:&micro;) tokt("\u00B5");
  (?-i:&para;) tokt("\u00B6");
  (?-i:&middot;) tokt("\u00B7");
  (?-i:&cedil;) tokt("\u00B8");
  (?-i:&sup1;) tokt("\u00B9");
  (?-i:&ordm;) tokt("\u00BA");
  (?-i:&raquo;) tokt("\u00BB");
  (?-i:&frac14;) tokt("\u00BC");
  (?-i:&frac12;) tokt("\u00BD");
  (?-i:&frac34;) tokt("\u00BE");
  (?-i:&iquest;) tokt("\u00BF");
  (?-i:&Agrave;) tokt("\u00C0");
  (?-i:&Aacute;) tokt("\u00C1");
  (?-i:&Acirc;) tokt("\u00C2");
  (?-i:&Atilde;) tokt("\u00C3");
  (?-i:&Auml;) tokt("\u00C4");
  (?-i:&Aring;) tokt("\u00C5");
  (?-i:&AElig;) tokt("\u00C6");
  (?-i:&Ccedil;) tokt("\u00C7");
  (?-i:&Egrave;) tokt("\u00C8");
  (?-i:&Eacute;) tokt("\u00C9");
  (?-i:&Ecirc;) tokt("\u00CA");
  (?-i:&Euml;) tokt("\u00CB");
  (?-i:&Igrave;) tokt("\u00CC");
  (?-i:&Iacute;) tokt("\u00CD");
  (?-i:&Icirc;) tokt("\u00CE");
  (?-i:&Iuml;) tokt("\u00CF");
  (?-i:&ETH;) tokt("\u00D0");
  (?-i:&Ntilde;) tokt("\u00D1");
  (?-i:&Ograve;) tokt("\u00D2");
  (?-i:&Oacute;) tokt("\u00D3");
  (?-i:&Ocirc;) tokt("\u00D4");
  (?-i:&Otilde;) tokt("\u00D5");
  (?-i:&Ouml;) tokt("\u00D6");
  (?-i:&times;) tokt("\u00D7");
  (?-i:&Oslash;) tokt("\u00D8");
  (?-i:&Ugrave;) tokt("\u00D9");
  (?-i:&Uacute;) tokt("\u00DA");
  (?-i:&Ucirc;) tokt("\u00DB");
  (?-i:&Uuml;) tokt("\u00DC");
  (?-i:&Yacute;) tokt("\u00DD");
  (?-i:&THORN;) tokt("\u00DE");
  (?-i:&szlig;) tokt("\u00DF");
  (?-i:&agrave;) tokt("\u00E0");
  (?-i:&aacute;) tokt("\u00E1");
  (?-i:&acirc;) tokt("\u00E2");
  (?-i:&atilde;) tokt("\u00E3");
  (?-i:&auml;) tokt("\u00E4");
  (?-i:&aring;) tokt("\u00E5");
  (?-i:&aelig;) tokt("\u00E6");
  (?-i:&ccedil;) tokt("\u00E7");
  (?-i:&egrave;) tokt("\u00E8");
  (?-i:&eacute;) tokt("\u00E9");
  (?-i:&ecirc;) tokt("\u00EA");
  (?-i:&euml;) tokt("\u00EB");
  (?-i:&igrave;) tokt("\u00EC");
  (?-i:&iacute;) tokt("\u00ED");
  (?-i:&icirc;) tokt("\u00EE");
  (?-i:&iuml;) tokt("\u00EF");
  (?-i:&eth;) tokt("\u00F0");
  (?-i:&ntilde;) tokt("\u00F1");
  (?-i:&ograve;) tokt("\u00F2");
  (?-i:&oacute;) tokt("\u00F3");
  (?-i:&ocirc;) tokt("\u00F4");
  (?-i:&otilde;) tokt("\u00F5");
  (?-i:&ouml;) tokt("\u00F6");
  (?-i:&divide;) tokt("\u00F7");
  (?-i:&oslash;) tokt("\u00F8");
  (?-i:&ugrave;) tokt("\u00F9");
  (?-i:&uacute;) tokt("\u00FA");
  (?-i:&ucirc;) tokt("\u00FB");
  (?-i:&uuml;) tokt("\u00FC");
  (?-i:&yacute;) tokt("\u00FD");
  (?-i:&thorn;) tokt("\u00FE");
  (?-i:&yuml;) tokt("\u00FF");
  (?-i:&OElig;) tokt("\u0152");
  (?-i:&oelig;) tokt("\u0153");
  (?-i:&Scaron;) tokt("\u0160");
  (?-i:&scaron;) tokt("\u0161");
  (?-i:&Yuml;) tokt("\u0178");
  (?-i:&fnof;) tokt("\u0192");
  (?-i:&circ;) tokt("\u02C6");
  (?-i:&tilde;) tokt("\u02DC");
  (?-i:&Alpha;) tokt("\u0391");
  (?-i:&Beta;) tokt("\u0392");
  (?-i:&Gamma;) tokt("\u0393");
  (?-i:&Delta;) tokt("\u0394");
  (?-i:&Epsilon;) tokt("\u0395");
  (?-i:&Zeta;) tokt("\u0396");
  (?-i:&Eta;) tokt("\u0397");
  (?-i:&Theta;) tokt("\u0398");
  (?-i:&Iota;) tokt("\u0399");
  (?-i:&Kappa;) tokt("\u039A");
  (?-i:&Lambda;) tokt("\u039B");
  (?-i:&Mu;) tokt("\u039C");
  (?-i:&Nu;) tokt("\u039D");
  (?-i:&Xi;) tokt("\u039E");
  (?-i:&Omicron;) tokt("\u039F");
  (?-i:&Pi;) tokt("\u03A0");
  (?-i:&Rho;) tokt("\u03A1");
  (?-i:&Sigma;) tokt("\u03A3");
  (?-i:&Tau;) tokt("\u03A4");
  (?-i:&Upsilon;) tokt("\u03A5");
  (?-i:&Phi;) tokt("\u03A6");
  (?-i:&Chi;) tokt("\u03A7");
  (?-i:&Psi;) tokt("\u03A8");
  (?-i:&Omega;) tokt("\u03A9");
  (?-i:&alpha;) tokt("\u03B1");
  (?-i:&beta;) tokt("\u03B2");
  (?-i:&gamma;) tokt("\u03B3");
  (?-i:&delta;) tokt("\u03B4");
  (?-i:&epsilon;) tokt("\u03B5");
  (?-i:&zeta;) tokt("\u03B6");
  (?-i:&eta;) tokt("\u03B7");
  (?-i:&theta;) tokt("\u03B8");
  (?-i:&iota;) tokt("\u03B9");
  (?-i:&kappa;) tokt("\u03BA");
  (?-i:&lambda;) tokt("\u03BB");
  (?-i:&mu;) tokt("\u03BC");
  (?-i:&nu;) tokt("\u03BD");
  (?-i:&xi;) tokt("\u03BE");
  (?-i:&omicron;) tokt("\u03BF");
  (?-i:&pi;) tokt("\u03C0");
  (?-i:&rho;) tokt("\u03C1");
  (?-i:&sigmaf;) tokt("\u03C2");
  (?-i:&sigma;) tokt("\u03C3");
  (?-i:&tau;) tokt("\u03C4");
  (?-i:&upsilon;) tokt("\u03C5");
  (?-i:&phi;) tokt("\u03C6");
  (?-i:&chi;) tokt("\u03C7");
  (?-i:&psi;) tokt("\u03C8");
  (?-i:&omega;) tokt("\u03C9");
  (?-i:&thetasym;) tokt("\u03D1");
  (?-i:&upsih;) tokt("\u03D2");
  (?-i:&piv;) tokt("\u03D6");
  (?-i:&ensp;) tokt("\u2002");
  (?-i:&emsp;) tokt("\u2003");
  (?-i:&thinsp;) tokt("\u2009");
  (?-i:&zwnj;) tokt("\u200C");
  (?-i:&zwj;) tokt("\u200D");
  (?-i:&lrm;) tokt("\u200E");
  (?-i:&rlm;) tokt("\u200F");
  (?-i:&ndash;) tokt("\u2013");
  (?-i:&mdash;) tokt("\u2014");
  (?-i:&lsquo;) tokt("\u2018");
  (?-i:&rsquo;) tokt("\u2019");
  (?-i:&sbquo;) tokt("\u201A");
  (?-i:&ldquo;) tokt("\u201C");
  (?-i:&rdquo;) tokt("\u201D");
  (?-i:&bdquo;) tokt("\u201E");
  (?-i:&dagger;) tokt("\u2020");
  (?-i:&Dagger;) tokt("\u2021");
  (?-i:&bull;) tokt("\u2022");
  (?-i:&hellip;) tokt("\u2026");
  (?-i:&permil;) tokt("\u2030");
  (?-i:&prime;) tokt("\u2032");
  (?-i:&Prime;) tokt("\u2033");
  (?-i:&lsaquo;) tokt("\u2039");
  (?-i:&rsaquo;) tokt("\u203A");
  (?-i:&oline;) tokt("\u203E");
  (?-i:&frasl;) tokt("\u2044");
  (?-i:&euro;) tokt("\u20AC");
  (?-i:&image;) tokt("\u2111");
  (?-i:&weierp;) tokt("\u2118");
  (?-i:&real;) tokt("\u211C");
  (?-i:&trade;) tokt("\u2122");
  (?-i:&alefsym;) tokt("\u2135");
  (?-i:&larr;) tokt("\u2190");
  (?-i:&uarr;) tokt("\u2191");
  (?-i:&rarr;) tokt("\u2192");
  (?-i:&darr;) tokt("\u2193");
  (?-i:&harr;) tokt("\u2194");
  (?-i:&crarr;) tokt("\u21B5");
  (?-i:&lArr;) tokt("\u21D0");
  (?-i:&uArr;) tokt("\u21D1");
  (?-i:&rArr;) tokt("\u21D2");
  (?-i:&dArr;) tokt("\u21D3");
  (?-i:&hArr;) tokt("\u21D4");
  (?-i:&forall;) tokt("\u2200");
  (?-i:&part;) tokt("\u2202");
  (?-i:&exist;) tokt("\u2203");
  (?-i:&empty;) tokt("\u2205");
  (?-i:&nabla;) tokt("\u2207");
  (?-i:&isin;) tokt("\u2208");
  (?-i:&notin;) tokt("\u2209");
  (?-i:&ni;) tokt("\u220B");
  (?-i:&prod;) tokt("\u220F");
  (?-i:&sum;) tokt("\u2211");
  (?-i:&minus;) tokt("\u2212");
  (?-i:&lowast;) tokt("\u2217");
  (?-i:&radic;) tokt("\u221A");
  (?-i:&prop;) tokt("\u221D");
  (?-i:&infin;) tokt("\u221E");
  (?-i:&ang;) tokt("\u2220");
  (?-i:&and;) tokt("\u2227");
  (?-i:&or;) tokt("\u2228");
  (?-i:&cap;) tokt("\u2229");
  (?-i:&cup;) tokt("\u222A");
  (?-i:&int;) tokt("\u222B");
  (?-i:&there4;) tokt("\u2234");
  (?-i:&sim;) tokt("\u223C");
  (?-i:&cong;) tokt("\u2245");
  (?-i:&asymp;) tokt("\u2248");
  (?-i:&ne;) tokt("\u2260");
  (?-i:&equiv;) tokt("\u2261");
  (?-i:&le;) tokt("\u2264");
  (?-i:&ge;) tokt("\u2265");
  (?-i:&sub;) tokt("\u2282");
  (?-i:&sup;) tokt("\u2283");
  (?-i:&nsub;) tokt("\u2284");
  (?-i:&sube;) tokt("\u2286");
  (?-i:&supe;) tokt("\u2287");
  (?-i:&oplus;) tokt("\u2295");
  (?-i:&otimes;) tokt("\u2297");
  (?-i:&perp;) tokt("\u22A5");
  (?-i:&sdot;) tokt("\u22C5");
  (?-i:&lceil;) tokt("\u2308");
  (?-i:&rceil;) tokt("\u2309");
  (?-i:&lfloor;) tokt("\u230A");
  (?-i:&rfloor;) tokt("\u230B");
  (?-i:&lang;) tokt("\u2329");
  (?-i:&rang;) tokt("\u232A");
  (?-i:&loz;) tokt("\u25CA");
  (?-i:&spades;) tokt("\u2660");
  (?-i:&clubs;) tokt("\u2663");
  (?-i:&hearts;) tokt("\u2665");
  (?-i:&diams;) tokt("\u2666");

  /* awesome entities */
  (?-i:&cloud;) tokt("\u2601");
  (?-i:&umbrella;) tokt("\u2602");
  (?-i:&snowman;) tokt("\u2603");
  (?-i:&snowflake;) tokt("\u2745");
  (?-i:&comet;) tokt("\u2604");
  (?-i:&thunderstorm;) tokt("\u2608");

  /* pseudo entities */
  ' tokt("\\'");
  "\\" tokt("\\\\");

  /* meta entities */
  /* The decoded bytes in buf are handed to tokt(), whose argument is
   * currently unused, so the token text stays the raw "&#...;" match. */
  (?-i:&#[0-9]+;) {
    char buf[5];
    utf8ize(atoi(yytext + 2), buf);
    tokt(buf);
  }
  (?-i:&#x)[A-F0-9]+; {
    char buf[5];
    char *_;
    utf8ize(strtol(yytext + 3, &_, 16), buf);
    tokt(buf);
  }

  /* not entities */
  & {
    yymore();
    BEGIN(XHP_INVALID_ENTITY);
  }
}

 /* An "&" that starts no known entity: report it, once.
  * NOTE(review): presumably the XHP_INVALID_ENTITY scope -- confirm. */
{
  {BYTE}{1,10} {
    /* Cut the message off right after the first ';', if there is one. */
    for (char* ii = yytext; *ii; ++ii) {
      if (*ii == ';') {
        ii[1] = 0;
        break;
      }
    }
    if (!yyextra->terminated) {
      yyextra->error = string("Invalid entity: (") + yytext + ")";
      yyextra->terminated = true;
    }
  }
}

 /* Directly after an entity: a single whitespace character becomes its own
  * text token; anything else is rescanned in the previous state.
  * NOTE(review): presumably the XHP_AFTER_ENT scope -- confirm. */
{
  [ \t\x0b\x0c\xa0\r\n]|\r\n {
    if (*yytext == '\r' || *yytext == '\n') {
      // Since we rewrite newlines into space we need to increment both line
      // counters. The first_lineno increment is quite a hack, and makes it so
      // that this ent is on the wrong line but it doesn't mess up the rest of
      // the file.
      ++yyextra->lineno;
      ++yyextra->first_lineno;
    }
    pop_state();
    tok(T_XHP_TEXT);
  }
  . {
    pop_state();
    yyless(0);
  }
}

 /* Text between XHP tags (presumably the XHP_CHILD scope -- confirm). */
{
  [^&'<>\\{ \t\x0b\x0c\xa0\r\n]+{WHITESPACE}?
{
    yy_scan_newlines(yytext, yyg);
    tok(T_XHP_TEXT);
  }
  {WHITESPACE}* {
    yy_scan_newlines(yytext, yyg);
    tok(T_XHP_TEXT);
  }
  /* TODO: I removed {WHITESPACE}* from all of these but it needs to be
     restored if the tree is unflattened. */
  "{" {
    yy_scan_newlines(yytext, yyg);
    tok('{');
  }
  "<" {
    yy_scan_newlines(yytext, yyg);
    tok('<');
  }
  /* The pattern was an empty string in the damaged copy; the token name
   * T_XHP_LT_DIV_GT spells out the three characters it stands for. */
  "</>" {
    yy_scan_newlines(yytext, yyg);
    tok(T_XHP_LT_DIV_GT);
  }
}

 /* XHP "children" declarations.
  * NOTE(review): this scope opens with a bare "{"; its <START_CONDITION>
  * prefix (presumably XHP_CHILDREN_DECL) is missing from this copy and must
  * be restored from a pristine source. */
{
  any tok(T_XHP_ANY);
  pcdata tok(T_XHP_PCDATA);
  empty tok(T_XHP_EMPTY);
  {LABEL} tok(T_STRING);
  ";" {
    pop_state();
    tok(';');
  }
  ":" {
    tok(T_XHP_COLON);
  }
}

 /* Other */
<*>{BYTE} {
  tok(yytext[0]);

  // fix unused function warnings
  // (never executed: tok() returns; the calls only reference the functions)
  yy_top_state(NULL);
  yyunput(0, 0, NULL);
}

%%

#ifdef DEBUG
/* Human-readable name of a start condition, for the DEBUG trace output. */
static const char* yy_state_name(int state) {
  switch (state) {
    case INITIAL:
      return "INITIAL";
    case PHP:
      return "PHP";
    case PHP_COMMENT:
      return "PHP_COMMENT";
    case PHP_EOL_COMMENT:
      return "PHP_EOL_COMMENT";
    case PHP_DOC_COMMENT:
      return "PHP_DOC_COMMENT";
    case PHP_HEREDOC_START:
      return "PHP_HEREDOC_START";
    case PHP_HEREDOC_NSTART:
      return "PHP_HEREDOC_NSTART";
    case PHP_HEREDOC_NEWLINE:
      return "PHP_HEREDOC_NEWLINE";
    case PHP_HEREDOC_DATA:
      return "PHP_HEREDOC_DATA";
    case PHP_NO_RESERVED_WORDS:
      return "PHP_NO_RESERVED_WORDS";
    case PHP_NO_RESERVED_WORDS_PERSIST:
      return "PHP_NO_RESERVED_WORDS_PERSIST";
    case PHP_:
      // Declared with the other start conditions, so name it here too.
      return "PHP_";
    case XHP_LABEL:
      return "XHP_LABEL";
    case XHP_LABEL_WHITESPACE:
      return "XHP_LABEL_WHITESPACE";
    case XHP_ATTRS:
      return "XHP_ATTRS";
    case XHP_ATTR_VAL:
      return "XHP_ATTR_VAL";
    case XHP_AFTER_ENT:
      return "XHP_AFTER_ENT";
    case XHP_CHILD:
      return "XHP_CHILD";
    case XHP_CHILD_START:
      return "XHP_CHILD_START";
    case XHP_INVALID_ENTITY:
      return "XHP_INVALID_ENTITY";
    case XHP_ATTR_TYPE_DECL:
      return "XHP_ATTR_TYPE_DECL";
    case XHP_CHILDREN_DECL:
      return "XHP_CHILDREN_DECL";
    default:
      return "???";
  }
}

/* Print a token to stderr: by name when yytokname() knows it, otherwise as
 * the literal character. */
static void yy_log_token(int tok) {
  const char* tokname = yytokname(tok);
  if (tokname) {
    fprintf(stderr, "--> %s\n", tokname);
  } else {
    fprintf(stderr, "--> '%c'\n", tok);
  }
}
#endif

/*
 * Common exit path for tokens returned to the parser (see the tok() macro).
 * Applies the start-condition changes that depend only on the token type,
 * remembers the token in yyextra->last_token and returns the value the
 * parser should see (which is ';' instead of T_CLOSE_TAG).
 */
static int yy_token(int tok, yyguts_t* yyg) {
  // PHP_NO_RESERVED_WORDS only lasts for a single token.
  if (YY_START == PHP_NO_RESERVED_WORDS) {
    pop_state();
  }

  switch (tok) {
    case T_OPEN_TAG:
    case T_OPEN_TAG_WITH_ECHO:
    case T_OPEN_TAG_FAKE:
      push_state(PHP);
      break;

    case T_CLOSE_TAG:
      pop_state();
      // We need to return a ';', not a T_CLOSE_TAG, because a statement may
      // be ended by the close tag instead of an explicit ';' (the example
      // snippet originally quoted here is missing from this copy), and there
      // are about a billion parser rules which terminate with ';' so making
      // a new rule like "semicolon_or_close_tag" would be hard. The token in
      // yylval has the correct type and value, we just don't generate a node.
      // Note that last_token is left unchanged on this path.
      return ';';

    // In PHP it's ok to use keywords such as 'if' as field names
    // or function names.
    case T_OBJECT_OPERATOR:
    case T_FUNCTION:
      push_state(PHP_NO_RESERVED_WORDS);
      break;

    case T_PAAMAYIM_NEKUDOTAYIM:
      // colon_hack is set by the rule that matched "::" after popping an
      // XHP label state; in that case no state is pushed.
      if (yyextra->colon_hack) {
        yyextra->colon_hack = false;
      } else {
        push_state(PHP_NO_RESERVED_WORDS);
      }
      break;

    case '{':
      // not used anymore
      yyextra->curly_stack.push(tok);
      break;
  }

#ifdef DEBUG
  yy_log_token(tok);
#endif
  return yyextra->last_token = tok;
}

/*
 * Advance yyextra->lineno by the number of line breaks in the NUL-terminated
 * string `text`. "\r\n" counts as one break; a lone "\r" or "\n" counts as
 * one each.
 */
static inline void yy_scan_newlines(const char* text, struct yyguts_t* yyg) {
  for (; *text; ++text) {
    if (*text == '\r') {
      if (text[1] == '\n') {
        ++text;
      }
      ++yyextra->lineno;
    } else if (*text == '\n') {
      ++yyextra->lineno;
    }
  }
}

/* Push start condition `s`, tracing the transition when DEBUG is defined. */
void xhp_new_push_state(int s, struct yyguts_t* yyg) {
#ifdef DEBUG
  fprintf(stderr, "--> PUSH(%s -> %s)\n", yy_state_name(YY_START), yy_state_name(s));
#endif
  yy_push_state(s, yyg);
}

/* Pop the start-condition stack, tracing the transition when DEBUG is defined. */
void xhp_new_pop_state(struct yyguts_t* yyg) {
#ifdef DEBUG
  int s = YY_START;
#endif
  yy_pop_state(yyg);
#ifdef DEBUG
  fprintf(stderr, "--> POP(%s -> %s)\n", yy_state_name(s), yy_state_name(YY_START));
#endif
}

/* Replace the current start condition with `s` (no stack change). */
void xhp_set_state(int s, struct yyguts_t* yyg) {
#ifdef DEBUG
  fprintf(stderr, "--> SET(%s)\n", yy_state_name(s));
#endif
  BEGIN(s);
}