aleph1.co.uk Git - yaffs-website/blob - vendor/nikic/php-parser/lib/PhpParser/Lexer.php

   1 <?php declare(strict_types=1);
   2
   3 namespace PhpParser;
   4
   5 use PhpParser\Parser\Tokens;
   6
   7 class Lexer
   8 {
   9     protected $code;
  10     protected $tokens;
  11     protected $pos;
  12     protected $line;
  13     protected $filePos;
  14     protected $prevCloseTagHasNewline;
  15
  16     protected $tokenMap;
  17     protected $dropTokens;
  18
  19     protected $usedAttributes;
  20
  21     /**
  22      * Creates a Lexer.
  23      *
  24      * @param array $options Options array. Currently only the 'usedAttributes' option is supported,
  25      *                       which is an array of attributes to add to the AST nodes. Possible
  26      *                       attributes are: 'comments', 'startLine', 'endLine', 'startTokenPos',
  27      *                       'endTokenPos', 'startFilePos', 'endFilePos'. The option defaults to the
  28      *                       first three. For more info see getNextToken() docs.
  29      */
  30     public function __construct(array $options = []) {
  31         // map from internal tokens to PhpParser tokens
  32         $this->tokenMap = $this->createTokenMap();
  33
  34         // map of tokens to drop while lexing (the map is only used for isset lookup,
  35         // that's why the value is simply set to 1; the value is never actually used.)
  36         $this->dropTokens = array_fill_keys(
  37             [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT], 1
  38         );
  39
  40         // the usedAttributes member is a map of the used attribute names to a dummy
  41         // value (here "true")
  42         $options += [
  43             'usedAttributes' => ['comments', 'startLine', 'endLine'],
  44         ];
  45         $this->usedAttributes = array_fill_keys($options['usedAttributes'], true);
  46     }
  47
  48     /**
  49      * Initializes the lexer for lexing the provided source code.
  50      *
  51      * This function does not throw if lexing errors occur. Instead, errors may be retrieved using
  52      * the getErrors() method.
  53      *
  54      * @param string $code The source code to lex
  55      * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
  56      *                                        ErrorHandler\Throwing
  57      */
  58     public function startLexing(string $code, ErrorHandler $errorHandler = null) {
  59         if (null === $errorHandler) {
  60             $errorHandler = new ErrorHandler\Throwing();
  61         }
  62
  63         $this->code = $code; // keep the code around for __halt_compiler() handling
  64         $this->pos  = -1;
  65         $this->line =  1;
  66         $this->filePos = 0;
  67
  68         // If inline HTML occurs without preceding code, treat it as if it had a leading newline.
  69         // This ensures proper composability, because having a newline is the "safe" assumption.
  70         $this->prevCloseTagHasNewline = true;
  71
  72         $scream = ini_set('xdebug.scream', '0');
  73
  74         error_clear_last();
  75         $this->tokens = @token_get_all($code);
  76         $this->handleErrors($errorHandler);
  77
  78         if (false !== $scream) {
  79             ini_set('xdebug.scream', $scream);
  80         }
  81     }
  82
  83     private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
  84         for ($i = $start; $i < $end; $i++) {
  85             $chr = $this->code[$i];
  86             if ($chr === 'b' || $chr === 'B') {
  87                 // HHVM does not treat b" tokens correctly, so ignore these
  88                 continue;
  89             }
  90
  91             if ($chr === "\0") {
  92                 // PHP cuts error message after null byte, so need special case
  93                 $errorMsg = 'Unexpected null byte';
  94             } else {
  95                 $errorMsg = sprintf(
  96                     'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
  97                 );
  98             }
  99
 100             $errorHandler->handleError(new Error($errorMsg, [
 101                 'startLine' => $line,
 102                 'endLine' => $line,
 103                 'startFilePos' => $i,
 104                 'endFilePos' => $i,
 105             ]));
 106         }
 107     }
 108
 109     /**
 110      * Check whether comment token is unterminated.
 111      *
 112      * @return bool
 113      */
 114     private function isUnterminatedComment($token) : bool {
 115         return ($token[0] === \T_COMMENT || $token[0] === \T_DOC_COMMENT)
 116             && substr($token[1], 0, 2) === '/*'
 117             && substr($token[1], -2) !== '*/';
 118     }
 119
 120     /**
 121      * Check whether an error *may* have occurred during tokenization.
 122      *
 123      * @return bool
 124      */
 125     private function errorMayHaveOccurred() : bool {
 126         if (defined('HHVM_VERSION')) {
 127             // In HHVM token_get_all() does not throw warnings, so we need to conservatively
 128             // assume that an error occurred
 129             return true;
 130         }
 131
 132         return null !== error_get_last();
 133     }
 134
 135     protected function handleErrors(ErrorHandler $errorHandler) {
 136         if (!$this->errorMayHaveOccurred()) {
 137             return;
 138         }
 139
 140         // PHP's error handling for token_get_all() is rather bad, so if we want detailed
 141         // error information we need to compute it ourselves. Invalid character errors are
 142         // detected by finding "gaps" in the token array. Unterminated comments are detected
 143         // by checking if a trailing comment has a "*/" at the end.
 144
 145         $filePos = 0;
 146         $line = 1;
 147         foreach ($this->tokens as $token) {
 148             $tokenValue = \is_string($token) ? $token : $token[1];
 149             $tokenLen = \strlen($tokenValue);
 150
 151             if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
 152                 // Something is missing, must be an invalid character
 153                 $nextFilePos = strpos($this->code, $tokenValue, $filePos);
 154                 $this->handleInvalidCharacterRange(
 155                     $filePos, $nextFilePos, $line, $errorHandler);
 156                 $filePos = (int) $nextFilePos;
 157             }
 158
 159             $filePos += $tokenLen;
 160             $line += substr_count($tokenValue, "\n");
 161         }
 162
 163         if ($filePos !== \strlen($this->code)) {
 164             if (substr($this->code, $filePos, 2) === '/*') {
 165                 // Unlike PHP, HHVM will drop unterminated comments entirely
 166                 $comment = substr($this->code, $filePos);
 167                 $errorHandler->handleError(new Error('Unterminated comment', [
 168                     'startLine' => $line,
 169                     'endLine' => $line + substr_count($comment, "\n"),
 170                     'startFilePos' => $filePos,
 171                     'endFilePos' => $filePos + \strlen($comment),
 172                 ]));
 173
 174                 // Emulate the PHP behavior
 175                 $isDocComment = isset($comment[3]) && $comment[3] === '*';
 176                 $this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
 177             } else {
 178                 // Invalid characters at the end of the input
 179                 $this->handleInvalidCharacterRange(
 180                     $filePos, \strlen($this->code), $line, $errorHandler);
 181             }
 182             return;
 183         }
 184
 185         if (count($this->tokens) > 0) {
 186             // Check for unterminated comment
 187             $lastToken = $this->tokens[count($this->tokens) - 1];
 188             if ($this->isUnterminatedComment($lastToken)) {
 189                 $errorHandler->handleError(new Error('Unterminated comment', [
 190                     'startLine' => $line - substr_count($lastToken[1], "\n"),
 191                     'endLine' => $line,
 192                     'startFilePos' => $filePos - \strlen($lastToken[1]),
 193                     'endFilePos' => $filePos,
 194                 ]));
 195             }
 196         }
 197     }
 198
 199     /**
 200      * Fetches the next token.
 201      *
 202      * The available attributes are determined by the 'usedAttributes' option, which can
 203      * be specified in the constructor. The following attributes are supported:
 204      *
 205      *  * 'comments'      => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
 206      *                       representing all comments that occurred between the previous
 207      *                       non-discarded token and the current one.
 208      *  * 'startLine'     => Line in which the node starts.
 209      *  * 'endLine'       => Line in which the node ends.
 210      *  * 'startTokenPos' => Offset into the token array of the first token in the node.
 211      *  * 'endTokenPos'   => Offset into the token array of the last token in the node.
 212      *  * 'startFilePos'  => Offset into the code string of the first character that is part of the node.
 213      *  * 'endFilePos'    => Offset into the code string of the last character that is part of the node.
 214      *
 215      * @param mixed $value           Variable to store token content in
 216      * @param mixed $startAttributes Variable to store start attributes in
 217      * @param mixed $endAttributes   Variable to store end attributes in
 218      *
 219      * @return int Token id
 220      */
 221     public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) : int {
 222         $startAttributes = [];
 223         $endAttributes   = [];
 224
 225         while (1) {
 226             if (isset($this->tokens[++$this->pos])) {
 227                 $token = $this->tokens[$this->pos];
 228             } else {
 229                 // EOF token with ID 0
 230                 $token = "\0";
 231             }
 232
 233             if (isset($this->usedAttributes['startLine'])) {
 234                 $startAttributes['startLine'] = $this->line;
 235             }
 236             if (isset($this->usedAttributes['startTokenPos'])) {
 237                 $startAttributes['startTokenPos'] = $this->pos;
 238             }
 239             if (isset($this->usedAttributes['startFilePos'])) {
 240                 $startAttributes['startFilePos'] = $this->filePos;
 241             }
 242
 243             if (\is_string($token)) {
 244                 $value = $token;
 245                 if (isset($token[1])) {
 246                     // bug in token_get_all
 247                     $this->filePos += 2;
 248                     $id = ord('"');
 249                 } else {
 250                     $this->filePos += 1;
 251                     $id = ord($token);
 252                 }
 253             } elseif (!isset($this->dropTokens[$token[0]])) {
 254                 $value = $token[1];
 255                 $id = $this->tokenMap[$token[0]];
 256                 if (\T_CLOSE_TAG === $token[0]) {
 257                     $this->prevCloseTagHasNewline = false !== strpos($token[1], "\n");
 258                 } elseif (\T_INLINE_HTML === $token[0]) {
 259                     $startAttributes['hasLeadingNewline'] = $this->prevCloseTagHasNewline;
 260                 }
 261
 262                 $this->line += substr_count($value, "\n");
 263                 $this->filePos += \strlen($value);
 264             } else {
 265                 if (\T_COMMENT === $token[0] || \T_DOC_COMMENT === $token[0]) {
 266                     if (isset($this->usedAttributes['comments'])) {
 267                         $comment = \T_DOC_COMMENT === $token[0]
 268                             ? new Comment\Doc($token[1], $this->line, $this->filePos, $this->pos)
 269                             : new Comment($token[1], $this->line, $this->filePos, $this->pos);
 270                         $startAttributes['comments'][] = $comment;
 271                     }
 272                 }
 273
 274                 $this->line += substr_count($token[1], "\n");
 275                 $this->filePos += \strlen($token[1]);
 276                 continue;
 277             }
 278
 279             if (isset($this->usedAttributes['endLine'])) {
 280                 $endAttributes['endLine'] = $this->line;
 281             }
 282             if (isset($this->usedAttributes['endTokenPos'])) {
 283                 $endAttributes['endTokenPos'] = $this->pos;
 284             }
 285             if (isset($this->usedAttributes['endFilePos'])) {
 286                 $endAttributes['endFilePos'] = $this->filePos - 1;
 287             }
 288
 289             return $id;
 290         }
 291
 292         throw new \RuntimeException('Reached end of lexer loop');
 293     }
 294
 295     /**
 296      * Returns the token array for current code.
 297      *
 298      * The token array is in the same format as provided by the
 299      * token_get_all() function and does not discard tokens (i.e.
 300      * whitespace and comments are included). The token position
 301      * attributes are against this token array.
 302      *
 303      * @return array Array of tokens in token_get_all() format
 304      */
 305     public function getTokens() : array {
 306         return $this->tokens;
 307     }
 308
 309     /**
 310      * Handles __halt_compiler() by returning the text after it.
 311      *
 312      * @return string Remaining text
 313      */
 314     public function handleHaltCompiler() : string {
 315         // text after T_HALT_COMPILER, still including ();
 316         $textAfter = substr($this->code, $this->filePos);
 317
 318         // ensure that it is followed by ();
 319         // this simplifies the situation, by not allowing any comments
 320         // in between of the tokens.
 321         if (!preg_match('~^\s*\(\s*\)\s*(?:;|\?>\r?\n?)~', $textAfter, $matches)) {
 322             throw new Error('__HALT_COMPILER must be followed by "();"');
 323         }
 324
 325         // prevent the lexer from returning any further tokens
 326         $this->pos = count($this->tokens);
 327
 328         // return with (); removed
 329         return substr($textAfter, strlen($matches[0]));
 330     }
 331
 332     /**
 333      * Creates the token map.
 334      *
 335      * The token map maps the PHP internal token identifiers
 336      * to the identifiers used by the Parser. Additionally it
 337      * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.
 338      *
 339      * @return array The token map
 340      */
 341     protected function createTokenMap() : array {
 342         $tokenMap = [];
 343
 344         // 256 is the minimum possible token number, as everything below
 345         // it is an ASCII value
 346         for ($i = 256; $i < 1000; ++$i) {
 347             if (\T_DOUBLE_COLON === $i) {
 348                 // T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM
 349                 $tokenMap[$i] = Tokens::T_PAAMAYIM_NEKUDOTAYIM;
 350             } elseif(\T_OPEN_TAG_WITH_ECHO === $i) {
 351                 // T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO
 352                 $tokenMap[$i] = Tokens::T_ECHO;
 353             } elseif(\T_CLOSE_TAG === $i) {
 354                 // T_CLOSE_TAG is equivalent to ';'
 355                 $tokenMap[$i] = ord(';');
 356             } elseif ('UNKNOWN' !== $name = token_name($i)) {
 357                 if ('T_HASHBANG' === $name) {
 358                     // HHVM uses a special token for #! hashbang lines
 359                     $tokenMap[$i] = Tokens::T_INLINE_HTML;
 360                 } elseif (defined($name = Tokens::class . '::' . $name)) {
 361                     // Other tokens can be mapped directly
 362                     $tokenMap[$i] = constant($name);
 363                 }
 364             }
 365         }
 366
 367         // HHVM uses a special token for numbers that overflow to double
 368         if (defined('T_ONUMBER')) {
 369             $tokenMap[\T_ONUMBER] = Tokens::T_DNUMBER;
 370         }
 371         // HHVM also has a separate token for the __COMPILER_HALT_OFFSET__ constant
 372         if (defined('T_COMPILER_HALT_OFFSET')) {
 373             $tokenMap[\T_COMPILER_HALT_OFFSET] = Tokens::T_STRING;
 374         }
 375
 376         return $tokenMap;
 377     }
 378 }