Same name and namespace in other branches
  1. 5.0.x advagg_js_minify/jsminplus.inc \JSTokenizer 1 comment
  2. 7.x-1.x advagg_js_compress/jsminplus.inc \JSTokenizer 1 comment
  3. 7.x-2.x advagg_js_compress/jsminplus.inc \JSTokenizer 1 comment
  4. 8.x-2.x advagg_js_minify/jsminplus.inc \JSTokenizer 1 comment
  5. 8.x-3.x advagg_js_minify/jsminplus.inc \JSTokenizer 1 comment
  6. 8.x-4.x advagg_js_minify/jsminplus.inc \JSTokenizer 1 comment

Hierarchy

Expanded class hierarchy of JSTokenizer

File

advagg_js_minify/jsminplus.inc, line 1856

View source
/**
 * On-demand tokenizer for JavaScript source text.
 *
 * Tokens are kept in a four-slot ring buffer ($tokens, addressed with
 * "index & 3"). unget() steps the buffer back and get() replays buffered
 * tokens, so at most three tokens can be pushed back at any time.
 *
 * The class relies on the TOKEN_* and OP_* constants and on the JSToken class,
 * all of which are defined outside this class.
 */
class JSTokenizer {
    /** Byte offset into $source of the next unread character. */
    private $cursor = 0;
    /** The complete source text being tokenized. */
    private $source;
    /** Ring buffer of token objects (slots 0..3). */
    public $tokens = array();
    /** Slot of the current token inside $tokens. */
    public $tokenIndex = 0;
    /** Number of tokens that were pushed back with unget() and not yet re-read. */
    public $lookahead = 0;
    /** When true, a line feed is reported as TOKEN_NEWLINE instead of being skipped. */
    public $scanNewlines = false;
    /** When true, "/" may start a regexp literal and "+"/"-" are reported as unary. */
    public $scanOperand = true;
    /** File name used in syntax error messages. */
    public $filename;
    /** Current line number, advanced while whitespace and comments are skipped. */
    public $lineno;
    /** Words that are reported as their own token type rather than TOKEN_IDENTIFIER. */
    private $keywords = array(
        'break',
        'case',
        'catch',
        'const',
        'continue',
        'debugger',
        'default',
        'delete',
        'do',
        'else',
        'enum',
        'false',
        'finally',
        'for',
        'function',
        'if',
        'in',
        'instanceof',
        'new',
        'null',
        'return',
        'switch',
        'this',
        'throw',
        'true',
        'try',
        'typeof',
        'var',
        'void',
        'while',
        'with',
    );
    /**
     * Operator and punctuator spellings used to build $opRegExp.
     *
     * Order matters: the regexp is an alternation, so longer spellings must be
     * listed before their prefixes (e.g. '===' before '==' before '=').
     */
    private $opTypeNames = array(
        ';',
        ',',
        '?',
        ':',
        '||',
        '&&',
        '|',
        '^',
        '&',
        '===',
        '==',
        '=',
        '!==',
        '!=',
        '<<',
        '<=',
        '<',
        '>>>',
        '>>',
        '>=',
        '>',
        '++',
        '--',
        '+',
        '-',
        '*',
        '/',
        '%',
        '!',
        '~',
        '.',
        '[',
        ']',
        '{',
        '}',
        '(',
        ')',
        '@*/',
    );
    /** Operators that form a compound assignment when directly followed by "=". */
    private $assignOps = array(
        '|',
        '^',
        '&',
        '<<',
        '>>',
        '>>>',
        '+',
        '-',
        '*',
        '/',
        '%',
    );
    /** Anchored regexp matching any entry of $opTypeNames; built in the constructor. */
    private $opRegExp;

    /**
     * Builds the operator regexp from $opTypeNames.
     */
    public function __construct() {
        $this->opRegExp = '#^(' . implode('|', array_map('preg_quote', $this->opTypeNames)) . ')#';
    }

    /**
     * Resets the tokenizer so that it starts reading a new source text.
     *
     * @param string $source   Source text to tokenize.
     * @param string $filename Name used in error messages; '[inline]' when empty.
     * @param int    $lineno   Line number assigned to the first line.
     */
    public function init($source, $filename = '', $lineno = 1) {
        $this->source = $source;
        $this->filename = $filename ? $filename : '[inline]';
        $this->lineno = $lineno;
        $this->cursor = 0;
        $this->tokens = array();
        $this->tokenIndex = 0;
        $this->lookahead = 0;
        $this->scanNewlines = false;
        $this->scanOperand = true;
    }

    /**
     * Returns unread source text starting at the cursor.
     *
     * @param int|null $chunksize Maximum number of bytes to return; a falsy
     *                            value returns everything up to the end.
     *
     * @return string
     */
    public function getInput($chunksize) {
        if ($chunksize) {
            return substr($this->source, $this->cursor, $chunksize);
        }
        return substr($this->source, $this->cursor);
    }

    /**
     * Tells whether the next token is TOKEN_END.
     *
     * @return bool
     */
    public function isDone() {
        return $this->peek() == TOKEN_END;
    }

    /**
     * Consumes the next token if it has the given type.
     *
     * @param mixed $tt     Expected token type.
     * @param bool  $op_dot Passed through to get().
     *
     * @return bool True when the token matched; otherwise the token is pushed
     *              back with unget() and false is returned.
     */
    public function match($tt, $op_dot = false) {
        // unget() returns nothing, so a mismatch evaluates to false.
        return $this->get(1000, $op_dot) == $tt || $this->unget();
    }

    /**
     * Consumes the next token and requires it to have the given type.
     *
     * @param mixed $tt     Expected token type.
     * @param bool  $op_dot Passed through to get().
     *
     * @return object The matched token.
     *
     * @throws Exception When the next token has a different type.
     */
    public function mustMatch($tt, $op_dot = false) {
        if (!$this->match($tt, $op_dot)) {
            throw $this->newSyntaxError('Unexpected token; token ' . $tt . ' expected');
        }
        return $this->currentToken();
    }

    /**
     * Returns the type of the next token without consuming it.
     *
     * @return mixed Token type. With $scanNewlines set, a buffered token on a
     *               different line than $lineno is reported as TOKEN_NEWLINE.
     */
    public function peek() {
        if ($this->lookahead) {
            // A pushed-back token is available; inspect it in place.
            $next = $this->tokens[($this->tokenIndex + $this->lookahead) & 3];
            if ($this->scanNewlines && $next->lineno != $this->lineno) {
                $tt = TOKEN_NEWLINE;
            }
            else {
                $tt = $next->type;
            }
        }
        else {
            // Nothing buffered: read one token and push it straight back.
            $tt = $this->get();
            $this->unget();
        }
        return $tt;
    }

    /**
     * Like peek(), but with newline scanning enabled for the duration of the call.
     *
     * @return mixed Token type; TOKEN_NEWLINE can be returned.
     */
    public function peekOnSameLine() {
        $this->scanNewlines = true;
        $tt = $this->peek();
        $this->scanNewlines = false;
        return $tt;
    }

    /**
     * Returns the token in the current ring buffer slot.
     *
     * @return object|null The token, or null when that slot has not been
     *                     filled yet.
     */
    public function currentToken() {
        // isset() instead of !empty($this->tokens): the buffer may be non-empty
        // while the current slot itself was never written.
        if (isset($this->tokens[$this->tokenIndex])) {
            return $this->tokens[$this->tokenIndex];
        }
        return null;
    }

    /**
     * Reads the next token and makes it the current token.
     *
     * Buffered (pushed back) tokens are replayed first. Otherwise whitespace
     * and comments are skipped and one token is scanned from the source.
     *
     * @param int|null $chunksize Number of bytes examined at once; a falsy
     *                            value examines the whole remaining source.
     * @param bool     $op_dot    When true, a word is always reported as
     *                            TOKEN_IDENTIFIER, even if it is a keyword.
     *
     * @return mixed The token type.
     *
     * @throws Exception On an unterminated string literal or an illegal token.
     */
    public function get($chunksize = 1000, $op_dot = false) {
        // Replay tokens that were pushed back with unget().
        while ($this->lookahead) {
            $this->lookahead--;
            $this->tokenIndex = ($this->tokenIndex + 1) & 3;
            $token = $this->tokens[$this->tokenIndex];
            if ($token->type != TOKEN_NEWLINE || $this->scanNewlines) {
                return $token->type;
            }
        }
        $conditional_comment = false;
        // Only set for compound assignments; initialised so it is always defined.
        $op = null;
        // strip whitespace and comments
        while (true) {
            $input = $this->getInput($chunksize);
            // whitespace handling; gobble up \r as well (effectively we don't have support for MAC newlines!)
            $re = $this->scanNewlines ? '/^[ \\r\\t]+/' : '/^\\s+/';
            if (preg_match($re, $input, $match)) {
                $spaces = $match[0];
                $spacelen = strlen($spaces);
                $this->cursor += $spacelen;
                if (!$this->scanNewlines) {
                    $this->lineno += substr_count($spaces, "\n");
                }
                if ($spacelen == $chunksize) {
                    // complete chunk contained whitespace
                    continue;
                }
                $input = $this->getInput($chunksize);
                if ($input == '' || $input[0] != '/') {
                    break;
                }
            }
            // Comments
            if (!preg_match('/^\\/(?:\\*(@(?:cc_on|if|elif|else|end))?.*?\\*\\/|\\/[^\\n]*)/s', $input, $match)) {
                if (!$chunksize) {
                    break;
                }
                // retry with a full chunk fetch; this also prevents breakage of long regular expressions (which will never match a comment)
                $chunksize = null;
                continue;
            }
            // check if this is a conditional (JScript) comment
            if (!empty($match[1])) {
                // Only the opener (e.g. "/*@cc_on") becomes the token; the
                // comment body is tokenized like regular code afterwards.
                $match[0] = '/*' . $match[1];
                $conditional_comment = true;
                break;
            }
            else {
                $this->cursor += strlen($match[0]);
                $this->lineno += substr_count($match[0], "\n");
            }
        }
        if ($input == '') {
            $tt = TOKEN_END;
            $match = array(
                '',
            );
        }
        elseif ($conditional_comment) {
            $tt = TOKEN_CONDCOMMENT_START;
        }
        else {
            switch ($input[0]) {
                case '0':
                    // hexadecimal; isset() guards against a lone "0" at the very end of the input
                    if (isset($input[1]) && ($input[1] == 'x' || $input[1] == 'X') && preg_match('/^0x[0-9a-f]+/i', $input, $match)) {
                        $tt = TOKEN_NUMBER;
                        break;
                    }
                // FALL THROUGH
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    // should always match
                    preg_match('/^\\d+(?:\\.\\d*)?(?:[eE][-+]?\\d+)?/', $input, $match);
                    $tt = TOKEN_NUMBER;
                    break;
                case "'":
                    if (preg_match('/^\'(?:[^\\\\\'\\r\\n]++|\\\\(?:.|\\r?\\n))*\'/', $input, $match)) {
                        $tt = TOKEN_STRING;
                    }
                    else {
                        if ($chunksize) {
                            // retry with a full chunk fetch
                            return $this->get(null);
                        }
                        throw $this->newSyntaxError('Unterminated string literal');
                    }
                    break;
                case '"':
                    if (preg_match('/^"(?:[^\\\\"\\r\\n]++|\\\\(?:.|\\r?\\n))*"/', $input, $match)) {
                        $tt = TOKEN_STRING;
                    }
                    else {
                        if ($chunksize) {
                            // retry with a full chunk fetch
                            return $this->get(null);
                        }
                        throw $this->newSyntaxError('Unterminated string literal');
                    }
                    break;
                case '/':
                    if ($this->scanOperand && preg_match('/^\\/((?:\\\\.|\\[(?:\\\\.|[^\\]])*\\]|[^\\/])+)\\/([gimy]*)/', $input, $match)) {
                        $tt = TOKEN_REGEXP;
                        break;
                    }
                // FALL THROUGH
                case '|':
                case '^':
                case '&':
                case '<':
                case '>':
                case '+':
                case '-':
                case '*':
                case '%':
                case '=':
                case '!':
                    // should always match
                    preg_match($this->opRegExp, $input, $match);
                    $op = $match[0];
                    $oplen = strlen($op);
                    // isset() guards against an operator that is the last character of the input
                    if (in_array($op, $this->assignOps, true) && isset($input[$oplen]) && $input[$oplen] == '=') {
                        // Compound assignment: $op keeps the base operator for the token's assignOp.
                        $tt = OP_ASSIGN;
                        $match[0] .= '=';
                    }
                    else {
                        $tt = $op;
                        if ($this->scanOperand) {
                            if ($op == OP_PLUS) {
                                $tt = OP_UNARY_PLUS;
                            }
                            elseif ($op == OP_MINUS) {
                                $tt = OP_UNARY_MINUS;
                            }
                        }
                        $op = null;
                    }
                    break;
                case '.':
                    if (preg_match('/^\\.\\d+(?:[eE][-+]?\\d+)?/', $input, $match)) {
                        $tt = TOKEN_NUMBER;
                        break;
                    }
                // FALL THROUGH
                case ';':
                case ',':
                case '?':
                case ':':
                case '~':
                case '[':
                case ']':
                case '{':
                case '}':
                case '(':
                case ')':
                    // these are all single
                    $match = array(
                        $input[0],
                    );
                    $tt = $input[0];
                    break;
                case '@':
                    // check end of conditional comment
                    if (substr($input, 0, 3) == '@*/') {
                        $match = array(
                            '@*/',
                        );
                        $tt = TOKEN_CONDCOMMENT_END;
                    }
                    else {
                        throw $this->newSyntaxError('Illegal token');
                    }
                    break;
                case "\n":
                    if ($this->scanNewlines) {
                        $match = array(
                            "\n",
                        );
                        $tt = TOKEN_NEWLINE;
                    }
                    else {
                        throw $this->newSyntaxError('Illegal token');
                    }
                    break;
                default:
                    // Fast path for identifiers: word chars followed by whitespace or various other tokens.
                    // Note we don't need to exclude digits in the first char, as they've already been found
                    // above.
                    if (!preg_match('/^[$\\w]+(?=[\\s\\/\\|\\^\\&<>\\+\\-\\*%=!.;,\\?:~\\[\\]\\{\\}\\(\\)@])/', $input, $match)) {
                        // Character classes per ECMA-262 edition 5.1 section 7.6
                        // Per spec, must accept Unicode 3.0, *may* accept later versions.
                        // We'll take whatever PCRE understands, which should be more recent.
                        $identifierStartChars = "\\p{L}\\p{Nl}" . "\$" . "_";
                        // \p{Mn}\p{Mc}: combining marks, \p{Nd}: digits, \p{Pc}: connector punctuation
                        $identifierPartChars = $identifierStartChars . "\\p{Mn}\\p{Mc}" . "\\p{Nd}" . "\\p{Pc}";
                        // Exactly four hex digits, the same set the decoder below accepts.
                        $unicodeEscape = "\\\\u[0-9A-Fa-f]{4}";
                        $identifierRegex = "/^" . "(?:[{$identifierStartChars}]|{$unicodeEscape})" . "(?:[{$identifierPartChars}]|{$unicodeEscape})*" . "/uS";
                        if (preg_match($identifierRegex, $input, $match)) {
                            if (strpos($match[0], '\\') !== false) {
                                // Per ECMA-262 edition 5.1, section 7.6 escape sequences should behave as if they were
                                // the original chars, but only within the boundaries of the identifier.
                                $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/', array(
                                    __CLASS__,
                                    'unicodeEscapeCallback',
                                ), $match[0]);
                                // Since our original regex didn't de-escape the originals, we need to check for validity again.
                                // No need to worry about token boundaries, as anything outside the identifier is illegal!
                                if (!preg_match("/^[{$identifierStartChars}][{$identifierPartChars}]*\$/u", $decoded)) {
                                    throw $this->newSyntaxError('Illegal token');
                                }
                                // Per spec it _ought_ to work to use these escapes for keywords words as well...
                                // but IE rejects them as invalid, while Firefox and Chrome treat them as identifiers
                                // that don't match the keyword.
                                if (in_array($decoded, $this->keywords, true)) {
                                    throw $this->newSyntaxError('Illegal token');
                                }
                                // TODO: save the decoded form for output?
                            }
                        }
                        else {
                            throw $this->newSyntaxError('Illegal token');
                        }
                    }
                    // Identifiers after an OP_DOT can include otherwise reserve keywords.
                    if ($op_dot) {
                        $tt = TOKEN_IDENTIFIER;
                    }
                    else {
                        $tt = in_array($match[0], $this->keywords, true) ? $match[0] : TOKEN_IDENTIFIER;
                    }
            }
        }
        // Advance to the next ring buffer slot, creating its token object on first use.
        $this->tokenIndex = ($this->tokenIndex + 1) & 3;
        if (!isset($this->tokens[$this->tokenIndex])) {
            $this->tokens[$this->tokenIndex] = new JSToken();
        }
        $token = $this->tokens[$this->tokenIndex];
        $token->type = $tt;
        if ($tt == OP_ASSIGN) {
            $token->assignOp = $op;
        }
        $token->start = $this->cursor;
        $token->value = $match[0];
        $this->cursor += strlen($match[0]);
        $token->end = $this->cursor;
        $token->lineno = $this->lineno;
        return $tt;
    }

    /**
     * Pushes the current token back so that the next get() returns it again.
     *
     * @throws Exception When this would be the fourth token pushed back.
     */
    public function unget() {
        if (++$this->lookahead == 4) {
            throw $this->newSyntaxError('PANIC: too much lookahead!');
        }
        $this->tokenIndex = ($this->tokenIndex - 1) & 3;
    }

    /**
     * Creates (but does not throw) a parse error carrying file name and line number.
     *
     * @param string $m Description of the problem.
     *
     * @return Exception
     */
    public function newSyntaxError($m) {
        return new Exception('Parse error: ' . $m . ' in file \'' . $this->filename . '\' on line ' . $this->lineno);
    }

    /**
     * preg_replace_callback() handler that decodes one \uXXXX escape.
     *
     * @param array $m Match array; $m[1] holds the four hex digits.
     *
     * @return string Result of decoding the matching numeric HTML entity as UTF-8.
     */
    public static function unicodeEscapeCallback($m) {
        return html_entity_decode('&#x' . $m[1] . ';', ENT_QUOTES, 'UTF-8');
    }

}

Members

Title Sort descending Modifiers Object type Summary
JSTokenizer::$assignOps private property
JSTokenizer::$cursor private property
JSTokenizer::$filename public property
JSTokenizer::$keywords private property
JSTokenizer::$lineno public property
JSTokenizer::$lookahead public property
JSTokenizer::$opRegExp private property
JSTokenizer::$opTypeNames private property
JSTokenizer::$scanNewlines public property
JSTokenizer::$scanOperand public property
JSTokenizer::$source private property
JSTokenizer::$tokenIndex public property
JSTokenizer::$tokens public property
JSTokenizer::currentToken public function
JSTokenizer::get public function
JSTokenizer::getInput public function
JSTokenizer::init public function
JSTokenizer::isDone public function
JSTokenizer::match public function
JSTokenizer::mustMatch public function
JSTokenizer::newSyntaxError public function
JSTokenizer::peek public function
JSTokenizer::peekOnSameLine public function
JSTokenizer::unget public function
JSTokenizer::unicodeEscapeCallback public static function
JSTokenizer::__construct public function