aleph1.co.uk Git - yaffs-website/blob - vendor/nikic/php-parser/lib/PhpParser/Lexer.php

   1 <?php
   2
   3 namespace PhpParser;
   4
   5 use PhpParser\Parser\Tokens;
   6
   7 class Lexer
   8 {
   9     protected $code;
  10     protected $tokens;
  11     protected $pos;
  12     protected $line;
  13     protected $filePos;
  14     protected $prevCloseTagHasNewline;
  15
  16     protected $tokenMap;
  17     protected $dropTokens;
  18
  19     protected $usedAttributes;
  20
  21     /**
  22      * Creates a Lexer.
  23      *
  24      * @param array $options Options array. Currently only the 'usedAttributes' option is supported,
  25      *                       which is an array of attributes to add to the AST nodes. Possible
  26      *                       attributes are: 'comments', 'startLine', 'endLine', 'startTokenPos',
  27      *                       'endTokenPos', 'startFilePos', 'endFilePos'. The option defaults to the
  28      *                       first three. For more info see getNextToken() docs.
  29      */
  30     public function __construct(array $options = array()) {
  31         // map from internal tokens to PhpParser tokens
  32         $this->tokenMap = $this->createTokenMap();
  33
  34         // map of tokens to drop while lexing (the map is only used for isset lookup,
  35         // that's why the value is simply set to 1; the value is never actually used.)
  36         $this->dropTokens = array_fill_keys(
  37             array(T_WHITESPACE, T_OPEN_TAG, T_COMMENT, T_DOC_COMMENT), 1
  38         );
  39
  40         // the usedAttributes member is a map of the used attribute names to a dummy
  41         // value (here "true")
  42         $options += array(
  43             'usedAttributes' => array('comments', 'startLine', 'endLine'),
  44         );
  45         $this->usedAttributes = array_fill_keys($options['usedAttributes'], true);
  46     }
  47
  48     /**
  49      * Initializes the lexer for lexing the provided source code.
  50      *
  51      * This function does not throw if lexing errors occur. Instead, errors may be retrieved using
  52      * the getErrors() method.
  53      *
  54      * @param string $code The source code to lex
  55      * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
  56      *                                        ErrorHandler\Throwing
  57      */
  58     public function startLexing($code, ErrorHandler $errorHandler = null) {
  59         if (null === $errorHandler) {
  60             $errorHandler = new ErrorHandler\Throwing();
  61         }
  62
  63         $this->code = $code; // keep the code around for __halt_compiler() handling
  64         $this->pos  = -1;
  65         $this->line =  1;
  66         $this->filePos = 0;
  67
  68         // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
  69         // This ensures proper composability, because having a newline is the "safe" assumption.
  70         $this->prevCloseTagHasNewline = true;
  71
  72         $scream = ini_set('xdebug.scream', '0');
  73
  74         $this->resetErrors();
  75         $this->tokens = @token_get_all($code);
  76         $this->handleErrors($errorHandler);
  77
  78         if (false !== $scream) {
  79             ini_set('xdebug.scream', $scream);
  80         }
  81     }
  82
  83     protected function resetErrors() {
  84         if (function_exists('error_clear_last')) {
  85             error_clear_last();
  86         } else {
  87             // set error_get_last() to defined state by forcing an undefined variable error
  88             set_error_handler(function() { return false; }, 0);
  89             @$undefinedVariable;
  90             restore_error_handler();
  91         }
  92     }
  93
  94     private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
  95         for ($i = $start; $i < $end; $i++) {
  96             $chr = $this->code[$i];
  97             if ($chr === 'b' || $chr === 'B') {
  98                 // HHVM does not treat b" tokens correctly, so ignore these
  99                 continue;
 100             }
 101
 102             if ($chr === "\0") {
 103                 // PHP cuts error message after null byte, so need special case
 104                 $errorMsg = 'Unexpected null byte';
 105             } else {
 106                 $errorMsg = sprintf(
 107                     'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
 108                 );
 109             }
 110
 111             $errorHandler->handleError(new Error($errorMsg, [
 112                 'startLine' => $line,
 113                 'endLine' => $line,
 114                 'startFilePos' => $i,
 115                 'endFilePos' => $i,
 116             ]));
 117         }
 118     }
 119
 120     private function isUnterminatedComment($token) {
 121         return ($token[0] === T_COMMENT || $token[0] === T_DOC_COMMENT)
 122             && substr($token[1], 0, 2) === '/*'
 123             && substr($token[1], -2) !== '*/';
 124     }
 125
 126     private function errorMayHaveOccurred() {
 127         if (defined('HHVM_VERSION')) {
 128             // In HHVM token_get_all() does not throw warnings, so we need to conservatively
 129             // assume that an error occurred
 130             return true;
 131         }
 132
 133         $error = error_get_last();
 134         return null !== $error
 135             && false === strpos($error['message'], 'Undefined variable');
 136     }
 137
 138     protected function handleErrors(ErrorHandler $errorHandler) {
 139         if (!$this->errorMayHaveOccurred()) {
 140             return;
 141         }
 142
 143         // PHP's error handling for token_get_all() is rather bad, so if we want detailed
 144         // error information we need to compute it ourselves. Invalid character errors are
 145         // detected by finding "gaps" in the token array. Unterminated comments are detected
 146         // by checking if a trailing comment has a "*/" at the end.
 147
 148         $filePos = 0;
 149         $line = 1;
 150         foreach ($this->tokens as $i => $token) {
 151             $tokenValue = \is_string($token) ? $token : $token[1];
 152             $tokenLen = \strlen($tokenValue);
 153
 154             if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
 155                 // Something is missing, must be an invalid character
 156                 $nextFilePos = strpos($this->code, $tokenValue, $filePos);
 157                 $this->handleInvalidCharacterRange(
 158                     $filePos, $nextFilePos, $line, $errorHandler);
 159                 $filePos = $nextFilePos;
 160             }
 161
 162             $filePos += $tokenLen;
 163             $line += substr_count($tokenValue, "\n");
 164         }
 165
 166         if ($filePos !== \strlen($this->code)) {
 167             if (substr($this->code, $filePos, 2) === '/*') {
 168                 // Unlike PHP, HHVM will drop unterminated comments entirely
 169                 $comment = substr($this->code, $filePos);
 170                 $errorHandler->handleError(new Error('Unterminated comment', [
 171                     'startLine' => $line,
 172                     'endLine' => $line + substr_count($comment, "\n"),
 173                     'startFilePos' => $filePos,
 174                     'endFilePos' => $filePos + \strlen($comment),
 175                 ]));
 176
 177                 // Emulate the PHP behavior
 178                 $isDocComment = isset($comment[3]) && $comment[3] === '*';
 179                 $this->tokens[] = [$isDocComment ? T_DOC_COMMENT : T_COMMENT, $comment, $line];
 180             } else {
 181                 // Invalid characters at the end of the input
 182                 $this->handleInvalidCharacterRange(
 183                     $filePos, \strlen($this->code), $line, $errorHandler);
 184             }
 185             return;
 186         }
 187
 188         // Check for unterminated comment
 189         $lastToken = $this->tokens[count($this->tokens) - 1];
 190         if ($this->isUnterminatedComment($lastToken)) {
 191             $errorHandler->handleError(new Error('Unterminated comment', [
 192                 'startLine' => $line - substr_count($lastToken[1], "\n"),
 193                 'endLine' => $line,
 194                 'startFilePos' => $filePos - \strlen($lastToken[1]),
 195                 'endFilePos' => $filePos,
 196             ]));
 197         }
 198     }
 199
 200     /**
 201      * Fetches the next token.
 202      *
 203      * The available attributes are determined by the 'usedAttributes' option, which can
 204      * be specified in the constructor. The following attributes are supported:
 205      *
 206      *  * 'comments'      => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
 207      *                       representing all comments that occurred between the previous
 208      *                       non-discarded token and the current one.
 209      *  * 'startLine'     => Line in which the node starts.
 210      *  * 'endLine'       => Line in which the node ends.
 211      *  * 'startTokenPos' => Offset into the token array of the first token in the node.
 212      *  * 'endTokenPos'   => Offset into the token array of the last token in the node.
 213      *  * 'startFilePos'  => Offset into the code string of the first character that is part of the node.
 214      *  * 'endFilePos'    => Offset into the code string of the last character that is part of the node.
 215      *
 216      * @param mixed $value           Variable to store token content in
 217      * @param mixed $startAttributes Variable to store start attributes in
 218      * @param mixed $endAttributes   Variable to store end attributes in
 219      *
 220      * @return int Token id
 221      */
 222     public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) {
 223         $startAttributes = array();
 224         $endAttributes   = array();
 225
 226         while (1) {
 227             if (isset($this->tokens[++$this->pos])) {
 228                 $token = $this->tokens[$this->pos];
 229             } else {
 230                 // EOF token with ID 0
 231                 $token = "\0";
 232             }
 233
 234             if (isset($this->usedAttributes['startLine'])) {
 235                 $startAttributes['startLine'] = $this->line;
 236             }
 237             if (isset($this->usedAttributes['startTokenPos'])) {
 238                 $startAttributes['startTokenPos'] = $this->pos;
 239             }
 240             if (isset($this->usedAttributes['startFilePos'])) {
 241                 $startAttributes['startFilePos'] = $this->filePos;
 242             }
 243
 244             if (\is_string($token)) {
 245                 $value = $token;
 246                 if (isset($token[1])) {
 247                     // bug in token_get_all
 248                     $this->filePos += 2;
 249                     $id = ord('"');
 250                 } else {
 251                     $this->filePos += 1;
 252                     $id = ord($token);
 253                 }
 254             } elseif (!isset($this->dropTokens[$token[0]])) {
 255                 $value = $token[1];
 256                 $id = $this->tokenMap[$token[0]];
 257                 if (T_CLOSE_TAG === $token[0]) {
 258                     $this->prevCloseTagHasNewline = false !== strpos($token[1], "\n");
 259                 } else if (T_INLINE_HTML === $token[0]) {
 260                     $startAttributes['hasLeadingNewline'] = $this->prevCloseTagHasNewline;
 261                 }
 262
 263                 $this->line += substr_count($value, "\n");
 264                 $this->filePos += \strlen($value);
 265             } else {
 266                 if (T_COMMENT === $token[0] || T_DOC_COMMENT === $token[0]) {
 267                     if (isset($this->usedAttributes['comments'])) {
 268                         $comment = T_DOC_COMMENT === $token[0]
 269                             ? new Comment\Doc($token[1], $this->line, $this->filePos)
 270                             : new Comment($token[1], $this->line, $this->filePos);
 271                         $startAttributes['comments'][] = $comment;
 272                     }
 273                 }
 274
 275                 $this->line += substr_count($token[1], "\n");
 276                 $this->filePos += \strlen($token[1]);
 277                 continue;
 278             }
 279
 280             if (isset($this->usedAttributes['endLine'])) {
 281                 $endAttributes['endLine'] = $this->line;
 282             }
 283             if (isset($this->usedAttributes['endTokenPos'])) {
 284                 $endAttributes['endTokenPos'] = $this->pos;
 285             }
 286             if (isset($this->usedAttributes['endFilePos'])) {
 287                 $endAttributes['endFilePos'] = $this->filePos - 1;
 288             }
 289
 290             return $id;
 291         }
 292
 293         throw new \RuntimeException('Reached end of lexer loop');
 294     }
 295
 296     /**
 297      * Returns the token array for current code.
 298      *
 299      * The token array is in the same format as provided by the
 300      * token_get_all() function and does not discard tokens (i.e.
 301      * whitespace and comments are included). The token position
 302      * attributes are against this token array.
 303      *
 304      * @return array Array of tokens in token_get_all() format
 305      */
 306     public function getTokens() {
 307         return $this->tokens;
 308     }
 309
 310     /**
 311      * Handles __halt_compiler() by returning the text after it.
 312      *
 313      * @return string Remaining text
 314      */
 315     public function handleHaltCompiler() {
 316         // text after T_HALT_COMPILER, still including ();
 317         $textAfter = substr($this->code, $this->filePos);
 318
 319         // ensure that it is followed by ();
 320         // this simplifies the situation, by not allowing any comments
 321         // in between of the tokens.
 322         if (!preg_match('~^\s*\(\s*\)\s*(?:;|\?>\r?\n?)~', $textAfter, $matches)) {
 323             throw new Error('__HALT_COMPILER must be followed by "();"');
 324         }
 325
 326         // prevent the lexer from returning any further tokens
 327         $this->pos = count($this->tokens);
 328
 329         // return with (); removed
 330         return (string) substr($textAfter, strlen($matches[0])); // (string) converts false to ''
 331     }
 332
 333     /**
 334      * Creates the token map.
 335      *
 336      * The token map maps the PHP internal token identifiers
 337      * to the identifiers used by the Parser. Additionally it
 338      * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.
 339      *
 340      * @return array The token map
 341      */
 342     protected function createTokenMap() {
 343         $tokenMap = array();
 344
 345         // 256 is the minimum possible token number, as everything below
 346         // it is an ASCII value
 347         for ($i = 256; $i < 1000; ++$i) {
 348             if (T_DOUBLE_COLON === $i) {
 349                 // T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM
 350                 $tokenMap[$i] = Tokens::T_PAAMAYIM_NEKUDOTAYIM;
 351             } elseif(T_OPEN_TAG_WITH_ECHO === $i) {
 352                 // T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO
 353                 $tokenMap[$i] = Tokens::T_ECHO;
 354             } elseif(T_CLOSE_TAG === $i) {
 355                 // T_CLOSE_TAG is equivalent to ';'
 356                 $tokenMap[$i] = ord(';');
 357             } elseif ('UNKNOWN' !== $name = token_name($i)) {
 358                 if ('T_HASHBANG' === $name) {
 359                     // HHVM uses a special token for #! hashbang lines
 360                     $tokenMap[$i] = Tokens::T_INLINE_HTML;
 361                 } else if (defined($name = 'PhpParser\Parser\Tokens::' . $name)) {
 362                     // Other tokens can be mapped directly
 363                     $tokenMap[$i] = constant($name);
 364                 }
 365             }
 366         }
 367
 368         // HHVM uses a special token for numbers that overflow to double
 369         if (defined('T_ONUMBER')) {
 370             $tokenMap[T_ONUMBER] = Tokens::T_DNUMBER;
 371         }
 372         // HHVM also has a separate token for the __COMPILER_HALT_OFFSET__ constant
 373         if (defined('T_COMPILER_HALT_OFFSET')) {
 374             $tokenMap[T_COMPILER_HALT_OFFSET] = Tokens::T_STRING;
 375         }
 376
 377         return $tokenMap;
 378     }
 379 }