| Nick Dickinson-Wilde

Same name and namespace in other branches
6.0.x advagg_js_minify/jsminplus.inc \JSTokenizer::get()
7.x-1.x advagg_js_compress/jsminplus.inc \JSTokenizer::get()
7.x-2.x advagg_js_compress/jsminplus.inc \JSTokenizer::get()
8.x-2.x advagg_js_minify/jsminplus.inc \JSTokenizer::get()
8.x-3.x advagg_js_minify/jsminplus.inc \JSTokenizer::get()
8.x-4.x advagg_js_minify/jsminplus.inc \JSTokenizer::get()
2 calls to JSTokenizer::get()
JSTokenizer::match in advagg_js_minify/jsminplus.inc
JSTokenizer::peek in advagg_js_minify/jsminplus.inc
File

advagg_js_minify/jsminplus.inc, line 2020
Class

JSTokenizer
Code

/**
 * Scans the next token and stores it in the tokenizer's token buffer.
 *
 * Pending lookahead tokens are handed back first. Otherwise leading
 * whitespace and ordinary comments are skipped, the token at the cursor is
 * classified, and its type, value, start/end offsets and line number are
 * written to the next slot of $this->tokens.
 *
 * @param int|null $chunksize
 *   Size passed to $this->getInput(). The method switches to NULL ("a full
 *   chunk fetch") when a comment or string literal could not be matched
 *   within the first chunk.
 * @param bool $op_dot
 *   When TRUE, an identifier is always reported as TOKEN_IDENTIFIER, even if
 *   its text is listed in $this->keywords.
 *
 * @return mixed
 *   The token type: a TOKEN_* / OP_* constant, the operator or punctuator
 *   text, or the keyword text.
 *
 * Throws the object built by $this->newSyntaxError() on an unterminated
 * string literal or an illegal token (its class is not visible from here).
 */
public function get($chunksize = 1000, $op_dot = false) {
    // While lookahead tokens are pending, return the already-buffered ones
    // instead of scanning. Buffered newline tokens are skipped unless the
    // caller asked for newlines to be reported.
    while ($this->lookahead) {
        $this->lookahead--;
        // The index wraps around within 0..3.
        $this->tokenIndex = ($this->tokenIndex + 1) & 3;
        $token = $this->tokens[$this->tokenIndex];
        if ($token->type != TOKEN_NEWLINE || $this->scanNewlines) {
            return $token->type;
        }
    }
    $conditional_comment = false;
    // strip whitespace and comments
    while (true) {
        $input = $this->getInput($chunksize);
        // whitespace handling; gobble up \r as well (effectively we don't have support for MAC newlines!)
        // When newlines are significant, "\n" is left in place so that it can
        // be returned as a TOKEN_NEWLINE further down.
        $re = $this->scanNewlines ? '/^[ \\r\\t]+/' : '/^\\s+/';
        if (preg_match($re, $input, $match)) {
            $spaces = $match[0];
            $spacelen = strlen($spaces);
            $this->cursor += $spacelen;
            if (!$this->scanNewlines) {
                $this->lineno += substr_count($spaces, "\n");
            }
            if ($spacelen == $chunksize) {
                continue;
                // complete chunk contained whitespace
            }
            $input = $this->getInput($chunksize);
            // Loose comparison kept on purpose: getInput() is not visible here
            // and may not return a string at end of input - TODO confirm.
            if ($input == '' || $input[0] != '/') {
                break;
            }
        }
        // Comments
        if (!preg_match('/^\\/(?:\\*(@(?:cc_on|if|elif|else|end))?.*?\\*\\/|\\/[^\\n]*)/s', $input, $match)) {
            if (!$chunksize) {
                break;
            }
            // retry with a full chunk fetch; this also prevents breakage of long regular expressions (which will never match a comment)
            $chunksize = null;
            continue;
        }
        // check if this is a conditional (JScript) comment
        if (!empty($match[1])) {
            // Only the opening "/*@keyword" part becomes the token value; the
            // rest of the comment is tokenized by subsequent calls.
            $match[0] = '/*' . $match[1];
            $conditional_comment = true;
            break;
        }
        else {
            // Ordinary comment: skip it and keep the line counter in sync.
            $this->cursor += strlen($match[0]);
            $this->lineno += substr_count($match[0], "\n");
        }
    }
    if ($input == '') {
        $tt = TOKEN_END;
        $match = array(
            '',
        );
    }
    elseif ($conditional_comment) {
        $tt = TOKEN_CONDCOMMENT_START;
    }
    else {
        switch ($input[0]) {
            case '0':
                // hexadecimal
                // isset() guards against a lone "0" at the very end of the
                // input, where offset 1 does not exist.
                if (isset($input[1]) && ($input[1] == 'x' || $input[1] == 'X') && preg_match('/^0x[0-9a-f]+/i', $input, $match)) {
                    $tt = TOKEN_NUMBER;
                    break;
                }
            // FALL THROUGH
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                // should always match
                preg_match('/^\\d+(?:\\.\\d*)?(?:[eE][-+]?\\d+)?/', $input, $match);
                $tt = TOKEN_NUMBER;
                break;
            case "'":
                if (preg_match('/^\'(?:[^\\\\\'\\r\\n]++|\\\\(?:.|\\r?\\n))*\'/', $input, $match)) {
                    $tt = TOKEN_STRING;
                }
                else {
                    if ($chunksize) {
                        return $this->get(null);
                        // retry with a full chunk fetch
                    }
                    throw $this->newSyntaxError('Unterminated string literal');
                }
                break;
            case '"':
                if (preg_match('/^"(?:[^\\\\"\\r\\n]++|\\\\(?:.|\\r?\\n))*"/', $input, $match)) {
                    $tt = TOKEN_STRING;
                }
                else {
                    if ($chunksize) {
                        return $this->get(null);
                        // retry with a full chunk fetch
                    }
                    throw $this->newSyntaxError('Unterminated string literal');
                }
                break;
            case '/':
                // A slash starts a regular expression literal only where an
                // operand is expected; otherwise it is handled as an operator.
                if ($this->scanOperand && preg_match('/^\\/((?:\\\\.|\\[(?:\\\\.|[^\\]])*\\]|[^\\/])+)\\/([gimy]*)/', $input, $match)) {
                    $tt = TOKEN_REGEXP;
                    break;
                }
            // FALL THROUGH
            case '|':
            case '^':
            case '&':
            case '<':
            case '>':
            case '+':
            case '-':
            case '*':
            case '%':
            case '=':
            case '!':
                // should always match
                preg_match($this->opRegExp, $input, $match);
                $op = $match[0];
                $oplen = strlen($op);
                // isset() guards against the operator being the last byte of
                // the fetched input, where the following offset does not exist.
                if (in_array($op, $this->assignOps) && isset($input[$oplen]) && $input[$oplen] == '=') {
                    // Compound assignment: $op keeps the base operator and the
                    // token value gets the trailing "=".
                    $tt = OP_ASSIGN;
                    $match[0] .= '=';
                }
                else {
                    $tt = $op;
                    if ($this->scanOperand) {
                        if ($op == OP_PLUS) {
                            $tt = OP_UNARY_PLUS;
                        }
                        elseif ($op == OP_MINUS) {
                            $tt = OP_UNARY_MINUS;
                        }
                    }
                    $op = null;
                }
                break;
            case '.':
                if (preg_match('/^\\.\\d+(?:[eE][-+]?\\d+)?/', $input, $match)) {
                    $tt = TOKEN_NUMBER;
                    break;
                }
            // FALL THROUGH
            case ';':
            case ',':
            case '?':
            case ':':
            case '~':
            case '[':
            case ']':
            case '{':
            case '}':
            case '(':
            case ')':
                // these are all single
                $match = array(
                    $input[0],
                );
                $tt = $input[0];
                break;
            case '@':
                // check end of conditional comment
                if (substr($input, 0, 3) == '@*/') {
                    $match = array(
                        '@*/',
                    );
                    $tt = TOKEN_CONDCOMMENT_END;
                }
                else {
                    throw $this->newSyntaxError('Illegal token');
                }
                break;
            case "\n":
                if ($this->scanNewlines) {
                    $match = array(
                        "\n",
                    );
                    $tt = TOKEN_NEWLINE;
                }
                else {
                    throw $this->newSyntaxError('Illegal token');
                }
                break;
            default:
                // Fast path for identifiers: word chars followed by whitespace or various other tokens.
                // Note we don't need to exclude digits in the first char, as they've already been found
                // above.
                if (!preg_match('/^[$\\w]+(?=[\\s\\/\\|\\^\\&<>\\+\\-\\*%=!.;,\\?:~\\[\\]\\{\\}\\(\\)@])/', $input, $match)) {
                    // Character classes per ECMA-262 edition 5.1 section 7.6
                    // Per spec, must accept Unicode 3.0, *may* accept later versions.
                    // We'll take whatever PCRE understands, which should be more recent.
                    $identifierStartChars = "\\p{L}\\p{Nl}" . "\$" . "_";
                    // \p{Pc} is UnicodeConnectorPunctuation.
                    $identifierPartChars = $identifierStartChars . "\\p{Mn}\\p{Mc}" . "\\p{Nd}" . "\\p{Pc}";
                    // Exactly four hex digits, matching the decode pattern used below.
                    $unicodeEscape = "\\\\u[0-9A-Fa-f]{4}";
                    $identifierRegex = "/^" . "(?:[{$identifierStartChars}]|{$unicodeEscape})" . "(?:[{$identifierPartChars}]|{$unicodeEscape})*" . "/uS";
                    if (preg_match($identifierRegex, $input, $match)) {
                        if (strpos($match[0], '\\') !== false) {
                            // Per ECMA-262 edition 5.1, section 7.6 escape sequences should behave as if they were
                            // the original chars, but only within the boundaries of the identifier.
                            $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/', array(
                                __CLASS__,
                                'unicodeEscapeCallback',
                            ), $match[0]);
                            // Since our original regex didn't de-escape the originals, we need to check for validity again.
                            // No need to worry about token boundaries, as anything outside the identifier is illegal!
                            if (!preg_match("/^[{$identifierStartChars}][{$identifierPartChars}]*\$/u", $decoded)) {
                                throw $this->newSyntaxError('Illegal token');
                            }
                            // Per spec it _ought_ to work to use these escapes for keywords as well...
                            // but IE rejects them as invalid, while Firefox and Chrome treat them as identifiers
                            // that don't match the keyword.
                            if (in_array($decoded, $this->keywords)) {
                                throw $this->newSyntaxError('Illegal token');
                            }
                            // TODO: save the decoded form for output?
                        }
                    }
                    else {
                        throw $this->newSyntaxError('Illegal token');
                    }
                }
                // Identifiers after an OP_DOT can include otherwise reserved keywords.
                if ($op_dot) {
                    $tt = TOKEN_IDENTIFIER;
                }
                else {
                    $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
                }
        }
    }
    // Advance to the next buffer slot (index wraps within 0..3), creating the
    // token object on first use.
    $this->tokenIndex = ($this->tokenIndex + 1) & 3;
    if (!isset($this->tokens[$this->tokenIndex])) {
        $this->tokens[$this->tokenIndex] = new JSToken();
    }
    $token = $this->tokens[$this->tokenIndex];
    $token->type = $tt;
    if ($tt == OP_ASSIGN) {
        // Base operator of a compound assignment, or null for a plain one.
        $token->assignOp = $op;
    }
    $token->start = $this->cursor;
    $token->value = $match[0];
    $this->cursor += strlen($match[0]);
    $token->end = $this->cursor;
    $token->lineno = $this->lineno;
    return $tt;
}
function JSTokenizer::get

File

Class

Code