view src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java @ 178:1c29dc809de2

8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines Reviewed-by: jlaskey, lagergren
author hannesw
date Fri, 05 Apr 2013 19:50:10 +0200
parents 390d44ba90cf
children 635a93b61d34
line wrap: on
line source

/*
 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.nashorn.internal.runtime.regexp;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.PatternSyntaxException;

import jdk.nashorn.internal.parser.Lexer;
import jdk.nashorn.internal.parser.Scanner;
import jdk.nashorn.internal.runtime.BitVector;

/**
 * Scan a JavaScript regexp, converting to Java regex if necessary.
 *
 */
final class RegExpScanner extends Scanner {

    /**
     * String builder used to rewrite the pattern for the currently used regexp factory.
     */
    private final StringBuilder sb;

    /** Is this the special case of a regexp that never matches anything */
    private boolean neverMatches;

    /** Expected token table */
    private final Map<Character, Integer> expected = new HashMap<>();

    /** Capturing parenthesis that have been found so far. */
    private final List<Capture> caps = new LinkedList<>();

    /** Forward references to capturing parenthesis to be resolved later.*/
    private final LinkedList<Integer> forwardReferences = new LinkedList<>();

    /** Current level of zero-width negative lookahead assertions. */
    private int negativeLookaheadLevel;

    /** Are we currently inside a character class? */
    private boolean inCharClass = false;

    /** Are we currently inside a negated character class? */
    private boolean inNegativeClass = false;

    private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";

    private static class Capture {
        /**
         * Zero-width negative lookaheads enclosing the capture.
         */
        private final int negativeLookaheadLevel;

        Capture(final int negativeLookaheadLevel) {
            this.negativeLookaheadLevel = negativeLookaheadLevel;
        }

        public int getNegativeLookaheadLevel() {
            return negativeLookaheadLevel;
        }

    }

    /**
     * Constructor
     * @param string the JavaScript regexp to parse
     */
    private RegExpScanner(final String string) {
        super(string);
        sb = new StringBuilder(limit);
        reset(0);
        expected.put(']', 0);
        expected.put('}', 0);
    }

    private void processForwardReferences() {
        if (neverMatches()) {
            return;
        }

        Iterator<Integer> iterator = forwardReferences.descendingIterator();
        while (iterator.hasNext()) {
            final int pos = iterator.next();
            final int num = iterator.next();
            if (num > caps.size()) {
                // Non-existing reference should never match, if smaller than 8 convert to octal escape
                // to be compatible with other engines.
                if (num < 8) {
                    String escape = "\\x0" + num;
                    sb.insert(pos, escape);
                } else {
                    neverMatches = true;
                    break;
                }
            }
        }

        forwardReferences.clear();
    }

    /**
     * Scan a JavaScript regexp string returning a Java safe regex string.
     *
     * @param string
     *            JavaScript regexp string.
     * @return Java safe regex string.
     */
    public static RegExpScanner scan(final String string) {
        final RegExpScanner scanner = new RegExpScanner(string);

        try {
            scanner.disjunction();
        } catch (final Exception e) {
            throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
        }

        scanner.processForwardReferences();
        if (scanner.neverMatches()) {
            return null; // never matches
        }

        // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
        if (scanner.position != string.length()) {
            final String p = scanner.getStringBuilder().toString();
            throw new PatternSyntaxException(string, p, p.length() + 1);
        }

        return scanner;
     }

    /**
     * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
     * is an example where we never match
     *
     * @return boolean
     */
    private boolean neverMatches() {
        return neverMatches;
    }

    final StringBuilder getStringBuilder() {
        return sb;
    }

    String getJavaPattern() {
        return sb.toString();
    }

    BitVector getGroupsInNegativeLookahead() {
        BitVector vec = null;
        for (int i = 0; i < caps.size(); i++) {
            final Capture cap = caps.get(i);
            if (cap.getNegativeLookaheadLevel() > 0) {
                if (vec == null) {
                    vec = new BitVector(caps.size() + 1);
                }
                vec.set(i + 1);
            }
        }
        return vec;
    }

    /**
     * Commit n characters to the builder and to a given token
     * @param n     Number of characters.
     * @return Committed token
     */
    private boolean commit(final int n) {
        switch (n) {
        case 1:
            sb.append(ch0);
            skip(1);
            break;
        case 2:
            sb.append(ch0);
            sb.append(ch1);
            skip(2);
            break;
        case 3:
            sb.append(ch0);
            sb.append(ch1);
            sb.append(ch2);
            skip(3);
            break;
        default:
            assert false : "Should not reach here";
        }

        return true;
    }

    /**
     * Restart the buffers back at an earlier position.
     *
     * @param startIn
     *            Position in the input stream.
     * @param startOut
     *            Position in the output stream.
     */
    private void restart(final int startIn, final int startOut) {
        reset(startIn);
        sb.setLength(startOut);
    }

    private void push(final char ch) {
        expected.put(ch, expected.get(ch) + 1);
    }

    private void pop(final char ch) {
        expected.put(ch, Math.min(0, expected.get(ch) - 1));
    }

    /*
     * Recursive descent tokenizer starts below.
     */

    /*
     * Disjunction ::
     *      Alternative
     *      Alternative | Disjunction
     */
    private void disjunction() {
        while (true) {
            alternative();

            if (ch0 == '|') {
                commit(1);
            } else {
                break;
            }
        }
    }

    /*
     * Alternative ::
     *      [empty]
     *      Alternative Term
     */
    private void alternative() {
        while (term()) {
            // do nothing
        }
    }

    /*
     * Term ::
     *      Assertion
     *      Atom
     *      Atom Quantifier
     */
    private boolean term() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (assertion()) {
            return true;
        }

        if (atom()) {
            boolean emptyCharacterClass = false;
            if (sb.toString().endsWith("[]")) {
                emptyCharacterClass = true;
            } else if (sb.toString().endsWith("[^]")) {
                sb.setLength(sb.length() - 2);
                sb.append("\\s\\S]");
            }

            boolean quantifier = quantifier();

            if (emptyCharacterClass) {
                if (!quantifier) {
                    neverMatches = true; //never matches ever.
                }
                // Note: we could check if quantifier has min zero to mark empty character class as dead.
            }

            return true;
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * Assertion ::
     *      ^
     *      $
     *      \b
     *      \B
     *      ( ? = Disjunction )
     *      ( ? ! Disjunction )
     */
    private boolean assertion() {
        final int startIn  = position;
        final int startOut = sb.length();

        switch (ch0) {
        case '^':
        case '$':
            return commit(1);

        case '\\':
            if (ch1 == 'b' || ch1 == 'B') {
                return commit(2);
            }
            break;

        case '(':
            if (ch1 != '?') {
                break;
            }
            if (ch2 != '=' && ch2 != '!') {
                break;
            }
            final boolean isNegativeLookahead = (ch2 == '!');
            commit(3);

            if (isNegativeLookahead) {
                negativeLookaheadLevel++;
            }
            disjunction();
            if (isNegativeLookahead) {
                negativeLookaheadLevel--;
            }

            if (ch0 == ')') {
                return commit(1);
            }
            break;

        default:
            break;
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * Quantifier ::
     *      QuantifierPrefix
     *      QuantifierPrefix ?
     */
    private boolean quantifier() {
        if (quantifierPrefix()) {
            if (ch0 == '?') {
                commit(1);
            }
            return true;
        }
        return false;
    }

    /*
     * QuantifierPrefix ::
     *      *
     *      +
     *      ?
     *      { DecimalDigits }
     *      { DecimalDigits , }
     *      { DecimalDigits , DecimalDigits }
     */
    private boolean quantifierPrefix() {
        final int startIn  = position;
        final int startOut = sb.length();

        switch (ch0) {
        case '*':
        case '+':
        case '?':
            return commit(1);

        case '{':
            commit(1);

            if (!decimalDigits()) {
                break; // not a quantifier - back out
            }
            push('}');

            if (ch0 == ',') {
                commit(1);
                decimalDigits();
            }

            if (ch0 == '}') {
                pop('}');
                commit(1);
            } else {
                // Bad quantifier should be rejected but is accepted by all major engines
                restart(startIn, startOut);
                return false;
            }

            return true;

        default:
            break;
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * Atom ::
     *      PatternCharacter
     *      .
     *      \ AtomEscape
     *      CharacterClass
     *      ( Disjunction )
     *      ( ? : Disjunction )
     *
     */
    private boolean atom() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (patternCharacter()) {
            return true;
        }

        if (ch0 == '.') {
            return commit(1);
        }

        if (ch0 == '\\') {
            commit(1);

            if (atomEscape()) {
                return true;
            }
        }

        if (characterClass()) {
            return true;
        }

        if (ch0 == '(') {
            boolean capturingParens = true;
            commit(1);
            if (ch0 == '?' && ch1 == ':') {
                capturingParens = false;
                commit(2);
            }

            disjunction();

            if (ch0 == ')') {
                commit(1);
                if (capturingParens) {
                    caps.add(new Capture(negativeLookaheadLevel));
                }
                return true;
            }
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * PatternCharacter ::
     *      SourceCharacter but not any of: ^$\.*+?()[]{}|
     */
    @SuppressWarnings("fallthrough")
    private boolean patternCharacter() {
        if (atEOF()) {
            return false;
        }

        switch (ch0) {
        case '^':
        case '$':
        case '\\':
        case '.':
        case '*':
        case '+':
        case '?':
        case '(':
        case ')':
        case '[':
        case '|':
            return false;

        case '}':
        case ']':
            final int n = expected.get(ch0);
            if (n != 0) {
                return false;
            }

       case '{':
           // if not a valid quantifier escape curly brace to match itself
           // this ensures compatibility with other JS implementations
           if (!quantifierPrefix()) {
               sb.append('\\');
               return commit(1);
           }
           return false;

        default:
            return commit(1); // SOURCECHARACTER
        }
    }

    /*
     * AtomEscape ::
     *      DecimalEscape
     *      CharacterEscape
     *      CharacterClassEscape
     */
    private boolean atomEscape() {
        // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
        return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
    }

    /*
     * CharacterEscape ::
     *      ControlEscape
     *      c ControlLetter
     *      HexEscapeSequence
     *      UnicodeEscapeSequence
     *      IdentityEscape
     */
    private boolean characterEscape() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (controlEscape()) {
            return true;
        }

        if (ch0 == 'c') {
            commit(1);
            if (controlLetter()) {
                return true;
            }
            restart(startIn, startOut);
        }

        if (hexEscapeSequence() || unicodeEscapeSequence()) {
            return true;
        }

        restart(startIn, startOut);
        return false;
    }

    private boolean scanEscapeSequence(final char leader, final int length) {
        final int startIn  = position;
        final int startOut = sb.length();

        if (ch0 != leader) {
            return false;
        }

        commit(1);
        for (int i = 0; i < length; i++) {
            final char ch0l = Character.toLowerCase(ch0);
            if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
                commit(1);
            } else {
                restart(startIn, startOut);
                return false;
            }
        }

        return true;
    }

    private boolean hexEscapeSequence() {
        return scanEscapeSequence('x', 2);
    }

    private boolean unicodeEscapeSequence() {
        return scanEscapeSequence('u', 4);
    }

    /*
     * ControlEscape ::
     *      one of fnrtv
     */
    private boolean controlEscape() {
        switch (ch0) {
        case 'f':
        case 'n':
        case 'r':
        case 't':
        case 'v':
            return commit(1);

        default:
            return false;
        }
    }

    /*
     * ControlLetter ::
     *      one of abcdefghijklmnopqrstuvwxyz
     *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
     */
    private boolean controlLetter() {
        final char c = Character.toUpperCase(ch0);
        if (c >= 'A' && c <= 'Z') {
            // for some reason java regexps don't like control characters on the
            // form "\\ca".match([string with ascii 1 at char0]). Translating
            // them to unicode does it though.
            sb.setLength(sb.length() - 1);
            unicode(c - 'A' + 1);
            skip(1);
            return true;
        }
        return false;
    }

    /*
     * IdentityEscape ::
     *      SourceCharacter but not IdentifierPart
     *      <ZWJ>  (200c)
     *      <ZWNJ> (200d)
     */
    private boolean identityEscape() {
        if (atEOF()) {
            throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
        }
        // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
        if (ch0 == 'c') {
            // Ignore invalid control letter escape if within a character class
            if (inCharClass && ch1 != ']') {
                sb.setLength(sb.length() - 1);
                skip(2);
                return true;
            } else {
                sb.append('\\'); // Treat invalid \c control sequence as \\c
            }
        } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
            sb.setLength(sb.length() - 1);
        }
        return commit(1);
    }

    /*
     * DecimalEscape ::
     *      DecimalIntegerLiteral [lookahead DecimalDigit]
     */
    private boolean decimalEscape() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (ch0 == '0' && !isDecimalDigit(ch1)) {
            skip(1);
            //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
            sb.append("\u0000");
            return true;
        }

        if (isDecimalDigit(ch0)) {
            final int num = ch0 - '0';

            // Single digit escape, treat as backreference.
            if (!isDecimalDigit(ch1)) {
                if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) {
                    //  Captures that live inside a negative lookahead are dead after the
                    //  lookahead and will be undefined if referenced from outside.
                    if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
                        sb.setLength(sb.length() - 1);
                    } else {
                        sb.append(ch0);
                    }
                    skip(1);
                    return true;
                } else if (num > caps.size()) {
                    // Forward reference to a capture group. Forward references are always undefined so we
                    // can omit it from the output buffer. Additionally, if the capture group does not exist
                    // the whole regexp becomes invalid, so register the reference for later processing.
                    sb.setLength(sb.length() - 1);
                    forwardReferences.add(num);
                    forwardReferences.add(sb.length());
                    skip(1);
                    return true;
                }
            }

            if (inCharClass) {
                // Convert octal escape to unicode escape if inside character class.
                StringBuilder digit = new StringBuilder(4);
                while (isDecimalDigit(ch0)) {
                    digit.append(ch0);
                    skip(1);
                }

                int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal
                if (value > 0xff) {
                    throw new NumberFormatException(digit.toString());
                }

                unicode(value);

            } else {
                // Copy decimal escape as-is
                decimalDigits();
            }
            return true;
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * CharacterClassEscape ::
     *  one of dDsSwW
     */
    private boolean characterClassEscape() {
        switch (ch0) {
        // java.util.regex requires translation of \s and \S to explicit character list
        case 's':
            if (RegExpFactory.usesJavaUtilRegex()) {
                sb.setLength(sb.length() - 1);
                // No nested class required if we already are inside a character class
                if (inCharClass) {
                    sb.append(Lexer.getWhitespaceRegExp());
                } else {
                    sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
                }
                skip(1);
                return true;
            }
            return commit(1);
        case 'S':
            if (RegExpFactory.usesJavaUtilRegex()) {
                sb.setLength(sb.length() - 1);
                // In negative class we must use intersection to get double negation ("not anything else than space")
                sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
                skip(1);
                return true;
            }
            return commit(1);
        case 'd':
        case 'D':
        case 'w':
        case 'W':
            return commit(1);

        default:
            return false;
        }
    }

    /*
     * CharacterClass ::
     *      [ [lookahead {^}] ClassRanges ]
     *      [ ^ ClassRanges ]
     */
    private boolean characterClass() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (ch0 == '[') {
            try {
                inCharClass = true;
                push(']');
                commit(1);

                if (ch0 == '^') {
                    inNegativeClass = true;
                    commit(1);
                }

                if (classRanges() && ch0 == ']') {
                    pop(']');
                    return commit(1);
                }
            } finally {
                inCharClass = false;  // no nested character classes in JavaScript
                inNegativeClass = false;
            }
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * ClassRanges ::
     *      [empty]
     *      NonemptyClassRanges
     */
    private boolean classRanges() {
        nonemptyClassRanges();
        return true;
    }

    /*
     * NonemptyClassRanges ::
     *      ClassAtom
     *      ClassAtom NonemptyClassRangesNoDash
     *      ClassAtom - ClassAtom ClassRanges
     */
    private boolean nonemptyClassRanges() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (classAtom()) {

            if (ch0 == '-') {
                commit(1);

                if (classAtom() && classRanges()) {
                    return true;
                }
            }

            nonemptyClassRangesNoDash();

            return true;
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * NonemptyClassRangesNoDash ::
     *      ClassAtom
     *      ClassAtomNoDash NonemptyClassRangesNoDash
     *      ClassAtomNoDash - ClassAtom ClassRanges
     */
    private boolean nonemptyClassRangesNoDash() {
        final int startIn  = position;
        final int startOut = sb.length();

        if (classAtomNoDash()) {

            // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
            if (ch0 == '-') {
               commit(1);

               if (classAtom() && classRanges()) {
                   return true;
               }
               //fallthru
           }

            nonemptyClassRangesNoDash();
            return true; // still a class atom
        }

        if (classAtom()) {
            return true;
        }

        restart(startIn, startOut);
        return false;
    }

    /*
     * ClassAtom : - ClassAtomNoDash
     */
    private boolean classAtom() {

        if (ch0 == '-') {
            return commit(1);
        }

        return classAtomNoDash();
    }

    /*
     * ClassAtomNoDash ::
     *      SourceCharacter but not one of \ or ] or -
     *      \ ClassEscape
     */
    private boolean classAtomNoDash() {
        final int startIn  = position;
        final int startOut = sb.length();

        switch (ch0) {
        case ']':
        case '-':
        case '\0':
            return false;

        case '[':
            // unescaped left square bracket - add escape
            sb.append('\\');
            return commit(1);

        case '\\':
            commit(1);
            if (classEscape()) {
                return true;
            }

            restart(startIn, startOut);
            return false;

        default:
            return commit(1);
        }
    }

    /*
     * ClassEscape ::
     *      DecimalEscape
     *      b
     *      CharacterEscape
     *      CharacterClassEscape
     */
    private boolean classEscape() {

        if (decimalEscape()) {
            return true;
        }

        if (ch0 == 'b') {
            sb.setLength(sb.length() - 1);
            sb.append('\b');
            skip(1);
            return true;
        }

        // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
        return characterEscape() || characterClassEscape() || identityEscape();
    }

    /*
     * DecimalDigits
     */
    private boolean decimalDigits() {
        if (!isDecimalDigit(ch0)) {
            return false;
        }

        while (isDecimalDigit(ch0)) {
            commit(1);
        }

        return true;
    }

    private void unicode(final int value) {
        final String hex = Integer.toHexString(value);
        sb.append('u');
        for (int i = 0; i < 4 - hex.length(); i++) {
            sb.append('0');
        }
        sb.append(hex);
    }

    private static boolean isDecimalDigit(final char ch) {
        return ch >= '0' && ch <= '9';
    }
}