Mercurial > hg > openjdk > jigsaw > nashorn
changeset 113:508da3c7fc3a
8008093: Make RegExp engine pluggable
Reviewed-by: lagergren, attila
author | hannesw |
---|---|
date | Fri, 22 Feb 2013 16:31:10 +0100 |
parents | 3f0ff84aaf36 |
children | e42fd1640ff9 |
files | src/jdk/nashorn/internal/objects/Global.java src/jdk/nashorn/internal/objects/NativeRegExp.java src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java src/jdk/nashorn/internal/objects/NativeString.java src/jdk/nashorn/internal/parser/AbstractParser.java src/jdk/nashorn/internal/runtime/RegExp.java src/jdk/nashorn/internal/runtime/RegExpMatch.java src/jdk/nashorn/internal/runtime/RegExpScanner.java src/jdk/nashorn/internal/runtime/regexp/DefaultRegExp.java src/jdk/nashorn/internal/runtime/regexp/RegExp.java src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java src/jdk/nashorn/internal/runtime/regexp/RegExpMatcher.java src/jdk/nashorn/internal/runtime/regexp/RegExpResult.java src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java |
diffstat | 14 files changed, 2099 insertions(+), 1855 deletions(-) [+] |
line wrap: on
line diff
--- a/src/jdk/nashorn/internal/objects/Global.java Fri Feb 22 10:39:00 2013 -0400 +++ b/src/jdk/nashorn/internal/objects/Global.java Fri Feb 22 16:31:10 2013 +0100 @@ -50,7 +50,7 @@ import jdk.nashorn.internal.runtime.NativeJavaPackage; import jdk.nashorn.internal.runtime.OptionsObject; import jdk.nashorn.internal.runtime.PropertyDescriptor; -import jdk.nashorn.internal.runtime.RegExpMatch; +import jdk.nashorn.internal.runtime.regexp.RegExpResult; import jdk.nashorn.internal.runtime.Scope; import jdk.nashorn.internal.runtime.ScriptFunction; import jdk.nashorn.internal.runtime.ScriptObject; @@ -339,7 +339,7 @@ private ClassCache classCache; // Used to store the last RegExp result to support deprecated RegExp constructor properties - private RegExpMatch lastRegExpMatch; + private RegExpResult lastRegExpResult; private static final MethodHandle EVAL = findOwnMH("eval", Object.class, Object.class, Object.class); private static final MethodHandle PRINT = findOwnMH("print", Object.class, Object.class, Object[].class); @@ -1709,12 +1709,12 @@ return MH.findStatic(MethodHandles.publicLookup(), Global.class, name, MH.type(rtype, types)); } - RegExpMatch getLastRegExpMatch() { - return lastRegExpMatch; + RegExpResult getLastRegExpResult() { + return lastRegExpResult; } - void setLastRegExpMatch(RegExpMatch regExpMatch) { - this.lastRegExpMatch = regExpMatch; + void setLastRegExpResult(final RegExpResult regExpResult) { + this.lastRegExpResult = regExpResult; } }
--- a/src/jdk/nashorn/internal/objects/NativeRegExp.java Fri Feb 22 10:39:00 2013 -0400 +++ b/src/jdk/nashorn/internal/objects/NativeRegExp.java Fri Feb 22 16:31:10 2013 +0100 @@ -31,8 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; + import jdk.nashorn.internal.objects.annotations.Attribute; import jdk.nashorn.internal.objects.annotations.Constructor; import jdk.nashorn.internal.objects.annotations.Function; @@ -44,8 +43,10 @@ import jdk.nashorn.internal.runtime.BitVector; import jdk.nashorn.internal.runtime.JSType; import jdk.nashorn.internal.runtime.ParserException; -import jdk.nashorn.internal.runtime.RegExp; -import jdk.nashorn.internal.runtime.RegExpMatch; +import jdk.nashorn.internal.runtime.regexp.RegExp; +import jdk.nashorn.internal.runtime.regexp.RegExpFactory; +import jdk.nashorn.internal.runtime.regexp.RegExpResult; +import jdk.nashorn.internal.runtime.regexp.RegExpMatcher; import jdk.nashorn.internal.runtime.ScriptFunction; import jdk.nashorn.internal.runtime.ScriptObject; import jdk.nashorn.internal.runtime.ScriptRuntime; @@ -59,35 +60,15 @@ @Property(attributes = Attribute.NOT_ENUMERABLE | Attribute.NOT_CONFIGURABLE) public Object lastIndex; - /** Pattern string. */ - private String input; - - /** Global search flag for this regexp. */ - private boolean global; - - /** Case insensitive flag for this regexp */ - private boolean ignoreCase; - - /** Multi-line flag for this regexp */ - private boolean multiline; - - /** Java regex pattern to use for match. We compile to one of these */ - private Pattern pattern; - - private BitVector groupsInNegativeLookahead; + /** Compiled regexp */ + private RegExp regexp; // Reference to global object needed to support static RegExp properties private Global globalObject; - /* - public NativeRegExp() { - init(); - }*/ - NativeRegExp(final String input, final String flagString) { - RegExp regExp = null; try { - regExp = new RegExp(input, flagString); + this.regexp = RegExpFactory.create(input, flagString); } catch (final ParserException e) { // translate it as SyntaxError object and throw it e.throwAsEcmaException(); @@ -95,13 +76,6 @@ } this.setLastIndex(0); - this.input = regExp.getInput(); - this.global = regExp.isGlobal(); - this.ignoreCase = regExp.isIgnoreCase(); - this.multiline = regExp.isMultiline(); - this.pattern = regExp.getPattern(); - this.groupsInNegativeLookahead = regExp.getGroupsInNegativeLookahead(); - init(); } @@ -110,24 +84,8 @@ } NativeRegExp(final NativeRegExp regExp) { - this.input = regExp.getInput(); - this.global = regExp.getGlobal(); - this.multiline = regExp.getMultiline(); - this.ignoreCase = regExp.getIgnoreCase(); this.lastIndex = regExp.getLastIndexObject(); - this.pattern = regExp.getPattern(); - this.groupsInNegativeLookahead = regExp.getGroupsInNegativeLookahead(); - - init(); - } - - NativeRegExp(final Pattern pattern) { - this.input = pattern.pattern(); - this.multiline = (pattern.flags() & Pattern.MULTILINE) != 0; - this.ignoreCase = (pattern.flags() & Pattern.CASE_INSENSITIVE) != 0; - this.lastIndex = 0; - this.pattern = pattern; - + this.regexp = regExp.getRegExp(); init(); } @@ -232,16 +190,59 @@ return new NativeRegExp(patternString, flagString); } + /** + * Build a regexp that matches {@code string} as-is. All meta-characters will be escaped. + * + * @param string pattern string + * @return flat regexp + */ + static NativeRegExp flatRegExp(String string) { + // escape special characters + StringBuilder sb = null; + final int length = string.length(); + + for (int i = 0; i < length; i++) { + final char c = string.charAt(i); + switch (c) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case '{': + case '|': + if (sb == null) { + sb = new StringBuilder(length * 2); + sb.append(string, 0, i); + } + sb.append('\\'); + sb.append(c); + break; + default: + if (sb != null) { + sb.append(c); + } + break; + } + } + return new NativeRegExp(sb == null ? string : sb.toString(), ""); + } + private String getFlagString() { - final StringBuilder sb = new StringBuilder(); + final StringBuilder sb = new StringBuilder(3); - if (global) { + if (regexp.isGlobal()) { sb.append('g'); } - if (ignoreCase) { + if (regexp.isIgnoreCase()) { sb.append('i'); } - if (multiline) { + if (regexp.isMultiline()) { sb.append('m'); } @@ -255,7 +256,7 @@ @Override public String toString() { - return "/" + input + "/" + getFlagString(); + return "/" + regexp.getSource() + "/" + getFlagString(); } /** @@ -270,13 +271,8 @@ public static Object compile(final Object self, final Object pattern, final Object flags) { final NativeRegExp regExp = checkRegExp(self); final NativeRegExp compiled = newRegExp(pattern, flags); - // copy over fields to 'self' - regExp.setInput(compiled.getInput()); - regExp.setGlobal(compiled.getGlobal()); - regExp.setIgnoreCase(compiled.getIgnoreCase()); - regExp.setMultiline(compiled.getMultiline()); - regExp.setPattern(compiled.getPattern()); - regExp.setGroupsInNegativeLookahead(compiled.getGroupsInNegativeLookahead()); + // copy over regexp to 'self' + regExp.setRegExp(compiled.getRegExp()); // Some implementations return undefined. Some return 'self'. Since return // value is most likely be ignored, we can play safe and return 'self'. @@ -326,7 +322,7 @@ */ @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) public static Object source(final Object self) { - return checkRegExp(self).input; + return checkRegExp(self).getRegExp().getSource(); } /** @@ -337,7 +333,7 @@ */ @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) public static Object global(final Object self) { - return checkRegExp(self).global; + return checkRegExp(self).getRegExp().isGlobal(); } /** @@ -348,7 +344,7 @@ */ @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) public static Object ignoreCase(final Object self) { - return checkRegExp(self).ignoreCase; + return checkRegExp(self).getRegExp().isIgnoreCase(); } /** @@ -359,7 +355,7 @@ */ @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) public static Object multiline(final Object self) { - return checkRegExp(self).multiline; + return checkRegExp(self).getRegExp().isMultiline(); } /** @@ -369,7 +365,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "input") public static Object getLastInput(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getInput(); } @@ -390,7 +386,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "lastMatch") public static Object getLastMatch(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(0); } @@ -401,7 +397,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "lastParen") public static Object getLastParen(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getLastParen(); } @@ -412,7 +408,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "leftContext") public static Object getLeftContext(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getInput().substring(0, match.getIndex()); } @@ -423,7 +419,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "rightContext") public static Object getRightContext(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getInput().substring(match.getIndex() + match.length()); } @@ -434,7 +430,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$1") public static Object getGroup1(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(1); } @@ -445,7 +441,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$2") public static Object getGroup2(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(2); } @@ -456,7 +452,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$3") public static Object getGroup3(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(3); } @@ -467,7 +463,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$4") public static Object getGroup4(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(4); } @@ -478,7 +474,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$5") public static Object getGroup5(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(5); } @@ -489,7 +485,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$6") public static Object getGroup6(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(6); } @@ -500,7 +496,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$7") public static Object getGroup7(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(7); } @@ -511,7 +507,7 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$8") public static Object getGroup8(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(8); } @@ -522,34 +518,30 @@ */ @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$9") public static Object getGroup9(Object self) { - final RegExpMatch match = Global.instance().getLastRegExpMatch(); + final RegExpResult match = Global.instance().getLastRegExpResult(); return match == null ? "" : match.getGroup(9); } - private RegExpMatch execInner(final String string) { - if (this.pattern == null) { - return null; // never matches or similar, e.g. a[] - } + private RegExpResult execInner(final String string) { - final Matcher matcher = pattern.matcher(string); - final int start = this.global ? getLastIndex() : 0; - + final int start = regexp.isGlobal() ? getLastIndex() : 0; if (start < 0 || start > string.length()) { setLastIndex(0); return null; } - if (!matcher.find(start)) { + final RegExpMatcher matcher = regexp.match(string); + if (matcher == null || !matcher.search(start)) { setLastIndex(0); return null; } - if (global) { + if (regexp.isGlobal()) { setLastIndex(matcher.end()); } - final RegExpMatch match = new RegExpMatch(string, matcher.start(), groups(matcher)); - globalObject.setLastRegExpMatch(match); + final RegExpResult match = new RegExpResult(string, matcher.start(), groups(matcher)); + globalObject.setLastRegExpResult(match); return match; } @@ -557,9 +549,11 @@ * Convert java.util.regex.Matcher groups to JavaScript groups. * That is, replace null and groups that didn't match with undefined. */ - private Object[] groups(final Matcher matcher) { + private Object[] groups(final RegExpMatcher matcher) { final int groupCount = matcher.groupCount(); final Object[] groups = new Object[groupCount + 1]; + final BitVector groupsInNegativeLookahead = regexp.getGroupsInNegativeLookahead(); + for (int i = 0, lastGroupStart = matcher.start(); i <= groupCount; i++) { final int groupStart = matcher.start(i); if (lastGroupStart > groupStart @@ -586,7 +580,7 @@ * @return NativeArray of matches, string or null. */ public Object exec(final String string) { - final RegExpMatch match = execInner(string); + final RegExpResult match = execInner(string); if (match == null) { return null; @@ -617,7 +611,12 @@ * @return String with substitutions. */ Object replace(final String string, final String replacement, final ScriptFunction function) { - final Matcher matcher = pattern.matcher(string); + final RegExpMatcher matcher = regexp.match(string); + + if (matcher == null) { + return string; + } + /* * $$ -> $ * $& -> the matched substring @@ -628,8 +627,8 @@ */ String replace = replacement; - if (!global) { - if (!matcher.find()) { + if (!regexp.isGlobal()) { + if (!matcher.search(0)) { return string; } @@ -642,45 +641,39 @@ return sb.toString(); } - int end = 0; // a.k.a. lastAppendPosition setLastIndex(0); - boolean found; - try { - found = matcher.find(end); - } catch (final IndexOutOfBoundsException e) { - found = false; - } - - if (!found) { + if (!matcher.search(0)) { return string; } + int thisIndex = 0; int previousLastIndex = 0; final StringBuilder sb = new StringBuilder(); + do { if (function != null) { replace = callReplaceValue(function, matcher, string); } - appendReplacement(matcher, string, replace, sb, end); - end = matcher.end(); + + appendReplacement(matcher, string, replace, sb, thisIndex); // ECMA 15.5.4.10 String.prototype.match(regexp) - final int thisIndex = end; + thisIndex = matcher.end(); if (thisIndex == previousLastIndex) { setLastIndex(thisIndex + 1); previousLastIndex = thisIndex + 1; } else { previousLastIndex = thisIndex; } - } while (matcher.find()); + } while (previousLastIndex <= string.length() && matcher.search(previousLastIndex)); - sb.append(string, end, string.length()); + sb.append(string, thisIndex, string.length()); return sb.toString(); } - private void appendReplacement(final Matcher matcher, final String text, final String replacement, final StringBuilder sb, final int lastAppendPosition) { + private void appendReplacement(final RegExpMatcher matcher, final String text, final String replacement, final StringBuilder sb, final int lastAppendPosition) { // Process substitution string to replace group references with groups int cursor = 0; final StringBuilder result = new StringBuilder(); @@ -748,7 +741,7 @@ sb.append(result); } - private String callReplaceValue(final ScriptFunction function, final Matcher matcher, final String string) { + private String callReplaceValue(final ScriptFunction function, final RegExpMatcher matcher, final String string) { final Object[] groups = groups(matcher); final Object[] args = Arrays.copyOf(groups, groups.length + 2); @@ -782,7 +775,7 @@ return new NativeArray(); } - RegExpMatch match; + RegExpResult match; final int inputLength = input.length(); int lastLength = -1; int lastLastIndex = 0; @@ -834,7 +827,7 @@ * @return Index of match. */ Object search(final String string) { - final RegExpMatch match = execInner(string); + final RegExpResult match = execInner(string); if (match == null) { return -1; @@ -884,52 +877,20 @@ } } - private String getInput() { - return input; - } - - private void setInput(final String input) { - this.input = input; + private void setGlobal(final boolean global) { + regexp.setGlobal(global); } boolean getGlobal() { - return global; - } - - private void setGlobal(final boolean global) { - this.global = global; - } - - private boolean getIgnoreCase() { - return ignoreCase; - } - - private void setIgnoreCase(final boolean ignoreCase) { - this.ignoreCase = ignoreCase; - } - - private boolean getMultiline() { - return multiline; + return regexp.isGlobal(); } - private void setMultiline(final boolean multiline) { - this.multiline = multiline; - } - - private Pattern getPattern() { - return pattern; + private RegExp getRegExp() { + return regexp; } - private void setPattern(final Pattern pattern) { - this.pattern = pattern; - } - - private BitVector getGroupsInNegativeLookahead() { - return groupsInNegativeLookahead; - } - - private void setGroupsInNegativeLookahead(final BitVector groupsInNegativeLookahead) { - this.groupsInNegativeLookahead = groupsInNegativeLookahead; + private void setRegExp(final RegExp regexp) { + this.regexp = regexp; } }
--- a/src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java Fri Feb 22 10:39:00 2013 -0400 +++ b/src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java Fri Feb 22 16:31:10 2013 +0100 @@ -31,7 +31,7 @@ import jdk.nashorn.internal.objects.annotations.ScriptClass; import jdk.nashorn.internal.objects.annotations.Setter; import jdk.nashorn.internal.runtime.JSType; -import jdk.nashorn.internal.runtime.RegExpMatch; +import jdk.nashorn.internal.runtime.regexp.RegExpResult; import jdk.nashorn.internal.runtime.ScriptObject; import jdk.nashorn.internal.runtime.arrays.ArrayData; @@ -49,11 +49,11 @@ @Property public Object input; - NativeRegExpExecResult(final RegExpMatch match) { + NativeRegExpExecResult(final RegExpResult result) { setProto(Global.instance().getArrayPrototype()); - this.setArray(ArrayData.allocate(match.getGroups().clone())); - this.index = match.getIndex(); - this.input = match.getInput(); + this.setArray(ArrayData.allocate(result.getGroups().clone())); + this.index = result.getIndex(); + this.input = result.getInput(); } /**
--- a/src/jdk/nashorn/internal/objects/NativeString.java Fri Feb 22 10:39:00 2013 -0400 +++ b/src/jdk/nashorn/internal/objects/NativeString.java Fri Feb 22 16:31:10 2013 +0100 @@ -38,7 +38,6 @@ import java.util.Arrays; import java.util.LinkedList; import java.util.List; -import java.util.regex.Pattern; import jdk.internal.dynalink.CallSiteDescriptor; import jdk.internal.dynalink.linker.GuardedInvocation; import jdk.internal.dynalink.linker.LinkRequest; @@ -712,7 +711,7 @@ if (string instanceof NativeRegExp) { nativeRegExp = (NativeRegExp) string; } else { - nativeRegExp = new NativeRegExp(Pattern.compile(JSType.toString(string), Pattern.LITERAL)); + nativeRegExp = NativeRegExp.flatRegExp(JSType.toString(string)); } if (replacement instanceof ScriptFunction) {
--- a/src/jdk/nashorn/internal/parser/AbstractParser.java Fri Feb 22 10:39:00 2013 -0400 +++ b/src/jdk/nashorn/internal/parser/AbstractParser.java Fri Feb 22 16:31:10 2013 +0100 @@ -37,7 +37,7 @@ import jdk.nashorn.internal.runtime.ErrorManager; import jdk.nashorn.internal.runtime.JSErrorType; import jdk.nashorn.internal.runtime.ParserException; -import jdk.nashorn.internal.runtime.RegExp; +import jdk.nashorn.internal.runtime.regexp.RegExpFactory; import jdk.nashorn.internal.runtime.Source; /** @@ -427,7 +427,7 @@ if (value instanceof RegexToken) { final RegexToken regex = (RegexToken)value; try { - RegExp.validate(regex.getExpression(), regex.getOptions()); + RegExpFactory.validate(regex.getExpression(), regex.getOptions()); } catch (final ParserException e) { error(e.getMessage()); }
--- a/src/jdk/nashorn/internal/runtime/RegExp.java Fri Feb 22 10:39:00 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -package jdk.nashorn.internal.runtime; - -import static java.util.regex.Pattern.CASE_INSENSITIVE; -import static java.util.regex.Pattern.MULTILINE; -import static java.util.regex.Pattern.UNICODE_CASE; - -import java.util.HashSet; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -/** - * This class is used to represent a parsed regular expression. Accepts input - * pattern string and flagString. This is used by AbstractParser to validate - * RegExp literals as well as by NativeRegExp to parse RegExp constructor arguments. - */ -public final class RegExp { - /** Pattern string. */ - private final String input; - - /** Global search flag for this regexp.*/ - private boolean global; - - /** Case insensitive flag for this regexp */ - private boolean ignoreCase; - - /** Multi-line flag for this regexp */ - private boolean multiline; - - /** Java regexp pattern to use for match. We compile to one of these */ - private Pattern pattern; - - /** BitVector that keeps track of groups in negative lookahead */ - private BitVector groupsInNegativeLookahead; - - /** - * Creates RegExpLiteral object from given input and flagString. - * - * @param input RegExp pattern string - * @param flagString RegExp flags - * @throws ParserException if flagString is invalid or input string has syntax error. - */ - public RegExp(final String input, final String flagString) throws ParserException { - this.input = input; - final HashSet<Character> usedFlags = new HashSet<>(); - int flags = 0; - - for (final char ch : flagString.toCharArray()) { - if (usedFlags.contains(ch)) { - throwParserException("repeated.flag", Character.toString(ch)); - } - - switch (ch) { - case 'g': - this.global = true; - usedFlags.add(ch); - break; - case 'i': - this.ignoreCase = true; - flags |= CASE_INSENSITIVE | UNICODE_CASE; - usedFlags.add(ch); - break; - case 'm': - this.multiline = true; - flags |= MULTILINE; - usedFlags.add(ch); - break; - default: - throwParserException("unsupported.flag", Character.toString(ch)); - } - } - - try { - RegExpScanner parsed; - - try { - parsed = RegExpScanner.scan(input); - } catch (final PatternSyntaxException e) { - // refine the exception with a better syntax error, if this - // passes, just rethrow what we have - Pattern.compile(input, flags); - throw e; - } - - if (parsed != null) { - this.pattern = Pattern.compile(parsed.getJavaPattern(), flags); - this.groupsInNegativeLookahead = parsed.getGroupsInNegativeLookahead(); - } - } catch (final PatternSyntaxException e2) { - throwParserException("syntax", e2.getMessage()); - } - - } - - /** - * @return the input - */ - public String getInput() { - return input; - } - - /** - * @return the global - */ - public boolean isGlobal() { - return global; - } - - /** - * @return the ignoreCase - */ - public boolean isIgnoreCase() { - return ignoreCase; - } - - /** - * @return the multiline - */ - public boolean isMultiline() { - return multiline; - } - - /** - * @return the pattern - */ - public Pattern getPattern() { - return pattern; - } - - /** - * @return the groupsInNegativeLookahead - */ - public BitVector getGroupsInNegativeLookahead() { - return groupsInNegativeLookahead; - } - - /** - * Validation method for RegExp input and flagString - we don't care about the RegExp object - * - * @param input regexp input - * @param flagString flag string - * - * @throws ParserException if invalid regexp and flags - */ - @SuppressWarnings({"unused", "ResultOfObjectAllocationIgnored"}) - public static void validate(final String input, final String flagString) throws ParserException { - new RegExp(input, flagString); - } - - private static void throwParserException(final String key, final String str) throws ParserException { - throw new ParserException(ECMAErrors.getMessage("parser.error.regex." + key, str)); - } -}
--- a/src/jdk/nashorn/internal/runtime/RegExpMatch.java Fri Feb 22 10:39:00 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -package jdk.nashorn.internal.runtime; - -/** - * Match tuple to keep track of ongoing regexp match. - */ -public final class RegExpMatch { - final Object[] groups; - final int index; - final String input; - - /** - * Constructor - * - * @param input regexp input - * @param index index of match - * @param groups groups vector - */ - public RegExpMatch(final String input, final int index, final Object[] groups) { - this.input = input; - this.index = index; - this.groups = groups; - } - - /** - * Get the groups for the match - * @return group vector - */ - public Object[] getGroups() { - return groups; - } - - /** - * Get the input for the map - * @return input - */ - public String getInput() { - return input; - } - - /** - * Get the index for the match - * @return index - */ - public int getIndex() { - return index; - } - - /** - * Get the length of the match - * @return length - */ - public int length() { - return ((String)groups[0]).length(); - } - - /** - * Get the group with the given index or the empty string if group index is not valid. - * @param index the group index - * @return the group or "" - */ - public Object getGroup(int index) { - return index >= 0 && index < groups.length ? groups[index] : ""; - } - - /** - * Get the last parenthesis group, or the empty string if none exists. - * @return the last group or "" - */ - public Object getLastParen() { - return groups.length > 1 ? groups[groups.length - 1] : ""; - } - -}
--- a/src/jdk/nashorn/internal/runtime/RegExpScanner.java Fri Feb 22 10:39:00 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1411 +0,0 @@ -/* - * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -package jdk.nashorn.internal.runtime; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.regex.PatternSyntaxException; -import jdk.nashorn.internal.parser.Lexer; -import jdk.nashorn.internal.parser.Scanner; - -/** - * Scan a JavaScript regexp, converting to Java regex if necessary. - * - */ -final class RegExpScanner extends Scanner { - - /** - * String builder to accumulate the result - this contains verbatim parsed JavaScript. - * to get the java equivalent we need to create a Pattern token and return its toString() - */ - private final StringBuilder sb; - - /** An optional error message if one occurred during parse. */ - private String errorMessage; - - /** Is this the special case of a regexp that never matches anything */ - private boolean neverMatches; - - /** The resulting java.util.regex pattern string. */ - private String javaPattern; - - /** Expected token table */ - private final Map<Character, Integer> expected = new HashMap<>(); - - /** Capturing parenthesis that have been found so far. */ - private final List<Capture> caps = new LinkedList<>(); - - /** Forward references to capturing parenthesis to be resolved later.*/ - private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>(); - - /** Current level of zero-width negative lookahead assertions. */ - private int negativeLookaheadLevel; - - private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; - - private static class Capture { - /** - * Zero-width negative lookaheads enclosing the capture. - */ - private final int negativeLookaheadLevel; - /** - * Captures that live inside a negative lookahead are dead after the - * lookahead and will be undefined if referenced from outside. - */ - private boolean isDead; - - Capture(final int negativeLookaheadLevel) { - this.negativeLookaheadLevel = negativeLookaheadLevel; - } - - public int getNegativeLookaheadLevel() { - return negativeLookaheadLevel; - } - - public boolean isDead() { - return isDead; - } - - public void setDead() { - this.isDead = true; - } - } - - /** - * This is a token - the JavaScript regexp is scanned into a token tree - * A token has other tokens as children as well as "atoms", i.e. Strings. - * - */ - private static class Token { - - private enum Type { - PATTERN, - DISJUNCTION, - ALTERNATIVE, - TERM, - ASSERTION, - QUANTIFIER, - QUANTIFIER_PREFIX, - ATOM, - PATTERN_CHARACTER, - ATOM_ESCAPE, - CHARACTER_ESCAPE, - CONTROL_ESCAPE, - CONTROL_LETTER, - IDENTITY_ESCAPE, - DECIMAL_ESCAPE, - CHARACTERCLASS_ESCAPE, - CHARACTERCLASS, - CLASSRANGES, - NON_EMPTY_CLASSRANGES, - NON_EMPTY_CLASSRANGES_NODASH, - CLASSATOM, - CLASSATOM_NODASH, - CLASS_ESCAPE, - DECIMALDIGITS, - HEX_ESCAPESEQUENCE, - UNICODE_ESCAPESEQUENCE, - } - - /** - * Token tyoe - */ - private final Token.Type type; - - /** - * Child nodes - */ - private final List<Object> children; - - /** - * Parent node - */ - private Token parent; - - /** - * Dead code flag - */ - private boolean isDead; - - private static final Map<Type, ToString> toStringMap = new HashMap<>(); - private static final ToString DEFAULT_TOSTRING = new ToString(); - - private static String unicode(final int value) { - final StringBuilder sb = new StringBuilder(); - final String hex = Integer.toHexString(value); - sb.append('u'); - for (int i = 0; i < 4 - hex.length(); i++) { - sb.append('0'); - } - sb.append(hex); - - return sb.toString(); - } - - static { - toStringMap.put(Type.CHARACTERCLASS, new ToString() { - @Override - public String toString(final Token token) { - return super.toString(token).replace("\\b", "\b"); - } - }); - - // for some reason java regexps don't like control characters on the - // form "\\ca".match([string with ascii 1 at char0]). Translating - // them to unicode does it though. - toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { - @Override - public String toString(final Token token) { - final String str = super.toString(token); - if (str.length() == 2) { - return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); - } - return str; - } - }); - - toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { - @Override - public String toString(final Token token) { - final String str = super.toString(token); - - if ("\0".equals(str)) { - return str; - } - - int value; - - if (!token.hasParentOfType(Type.CLASSRANGES)) { - return str; - } - - value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal - if (value > 0xff) { - throw new NumberFormatException(str); - } - - return Token.unicode(value); - } - }); - - } - - /** - * JavaScript Token to Java regex substring framework. - * - */ - private static class ToString { - String toString(final Token token) { - final StringBuilder sb = new StringBuilder(); - for (final Object child : token.getChildren()) { - sb.append(child); - } - - //perform global substitutions that hold true for any evaluated form - String str = sb.toString(); - switch (str) { - case "\\s": - str = "[" + Lexer.getWhitespaceRegExp() + "]"; - break; - case "\\S": - str = "[^" + Lexer.getWhitespaceRegExp() + "]"; - break; - case "[^]": - str = "[\\s\\S]"; - break; - default: - break; - } - return str; - } - } - - /** - * Token iterator. Doesn't return "atom" children. i.e. string representations, - * just tokens - * - */ - private static class TokenIterator implements Iterator<Token> { - private final List<Token> preorder; - - private void init(final Token root) { - preorder.add(root); - for (final Object child : root.getChildren()) { - if (child instanceof Token) { - init((Token)child); - } - } - } - - TokenIterator(final Token root) { - preorder = new ArrayList<>(); - init(root); - } - - @Override - public boolean hasNext() { - return !preorder.isEmpty(); - } - - @Override - public Token next() { - return preorder.remove(0); - } - - @Override - public void remove() { - next(); - } - } - - /** - * Constructor - * @param type the token type - */ - Token(final Token.Type type) { - this.type = type; - children = new ArrayList<>(); - } - - /** - * Add a an "atom" child to a token - * @param child the child to add - * @return the token (for chaining) - */ - public Token add(final String child) { - children.add(child); - return this; - } - - /** - * Add a child to a token - * @param child the child - * @return the token (for chaining) - */ - public Token add(final Token child) { - if (child != null) { - children.add(child); - child.setParent(this); - } - return this; - } - - /** - * Remove a child from a token - * @param child the child to remove - * @return true if successful - */ - public boolean remove(final Token child) { - return children.remove(child); - } - - /** - * Remove the last child from a token - * @return the removed child - */ - public Object removeLast() { - return children.remove(children.size() - 1); - } - - /** - * Flag this token as dead code - * @param isDead is it dead or not - */ - private void setIsDead(final boolean isDead) { - this.isDead = isDead; - } - - /** - * Is this token dead code - * @return boolean - */ - private boolean getIsDead() { - return isDead; - } - - /** - * Get the parent of this token - * @return parent token - */ - public Token getParent() { - return parent; - } - - public boolean hasParentOfType(final Token.Type parentType) { - for (Token p = getParent(); p != null; p = p.getParent()) { - if (p.getType() == parentType) { - return true; - } - } - return false; - } - - public boolean hasChildOfType(final Token.Type childType) { - for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) { - if (iter.next().getType() == childType) { - return true; - } - } - return false; - } - - /** - * Set the parent of this token - * @param parent - */ - private void setParent(final Token parent) { - this.parent = parent; - } - - /** - * Get the children of this token - * @return an array of children, never null - */ - public Object[] getChildren() { - return children.toArray(); - } - - /** - * Reset this token, remove all children - */ - public void reset() { - children.clear(); - } - - /** - * Get a preorder token iterator with this token as root - * @return an iterator - */ - public Iterator<Token> iterator() { - return new TokenIterator(this); - } - - /** - * Get the type of this token - * @return type - */ - public Type getType() { - return type; - } - - /** - * Turn this token into Java regexp compatible text - * @return part of a java regexp - */ - @Override - public String toString() { - ToString t = toStringMap.get(getType()); - if (t == null) { - t = DEFAULT_TOSTRING; - } - return t.toString(this); - } - } - - /** - * Constructor - * @param string the JavaScript regexp to parse - */ - private RegExpScanner(final String string) { - super(string); - sb = new StringBuilder(limit); - reset(0); - expected.put(']', 0); - expected.put('}', 0); - } - - private void processForwardReferences() { - if (neverMatches()) { - return; - } - - for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) { - if (fwdRef.getKey().intValue() > caps.size()) { - neverMatches = true; - break; - } - - fwdRef.getValue().setIsDead(true); - } - - forwardReferences.clear(); - } - - /** - * Scan a JavaScript regexp string returning a Java safe regex string. - * - * @param string - * JavaScript regexp string. - * @return Java safe regex string. - */ - public static RegExpScanner scan(final String string) { - final RegExpScanner scanner = new RegExpScanner(string); - - Token pattern; - - try { - pattern = scanner.pattern(); - } catch (final Exception e) { - throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); - } - - scanner.processForwardReferences(); - if (scanner.neverMatches()) { - return null; // never matches - } - - // go over the code and remove dead code - final Iterator<Token> iter = pattern.iterator(); - while (iter.hasNext()) { - final Token next = iter.next(); - if (next.getIsDead()) { - next.getParent().remove(next); - } - } - - // turn the pattern into a string, p, the java equivalent string for our js regexp - final String p = pattern.toString(); - // if builder contains all tokens that were sent in, we know - // we correctly parsed the entire JavaScript regexp without syntax errors - if (!string.equals(scanner.getStringBuilder().toString())) { - throw new PatternSyntaxException(string, p, p.length() + 1); - } - - scanner.javaPattern = p; - return scanner; - } - - /** - * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript, - * is an example where we never match - * - * @return boolean - */ - private boolean neverMatches() { - return neverMatches; - } - - /** - * This is used to set better error messages that can be reused - * in NativeRegExp for augmenting e.g. SyntaxErrors. - * - * @return an error message or null if no extra info - */ - public String getErrorMessage() { - return errorMessage; - } - - final StringBuilder getStringBuilder() { - return sb; - } - - String getJavaPattern() { - return javaPattern; - } - - BitVector getGroupsInNegativeLookahead() { - BitVector vec = null; - for (int i = 0; i < caps.size(); i++) { - final Capture cap = caps.get(i); - if (cap.getNegativeLookaheadLevel() > 0) { - if (vec == null) { - vec = new BitVector(caps.size() + 1); - } - vec.set(i + 1); - } - } - return vec; - } - - /** - * Commit n characters to the builder and to a given token - * @param token Uncommitted token. - * @param n Number of characters. - * @return Committed token - */ - private Token commit(final Token token, final int n) { - final int startIn = position; - - switch (n) { - case 1: - sb.append(ch0); - skip(1); - break; - case 2: - sb.append(ch0); - sb.append(ch1); - skip(2); - break; - case 3: - sb.append(ch0); - sb.append(ch1); - sb.append(ch2); - skip(3); - break; - default: - assert false : "Should not reach here"; - } - - if (token == null) { - return null; - } - - return token.add(sb.substring(startIn, sb.length())); - } - - /** - * Restart the buffers back at an earlier position. - * - * @param startIn - * Position in the input stream. - * @param startOut - * Position in the output stream. - */ - private void restart(final int startIn, final int startOut) { - reset(startIn); - sb.setLength(startOut); - } - - private void push(final char ch) { - expected.put(ch, expected.get(ch) + 1); - } - - private void pop(final char ch) { - expected.put(ch, Math.min(0, expected.get(ch) - 1)); - } - - /* - * Recursive descent tokenizer starts below. - */ - - /* - * Pattern :: - * Disjunction - */ - private Token pattern() { - final Token token = new Token(Token.Type.PATTERN); - - final Token child = disjunction(); - return token.add(child); - } - - /* - * Disjunction :: - * Alternative - * Alternative | Disjunction - */ - private Token disjunction() { - final Token token = new Token(Token.Type.DISJUNCTION); - - while (true) { - token.add(alternative()); - - if (ch0 == '|') { - commit(token, 1); - } else { - break; - } - } - - return token; - } - - /* - * Alternative :: - * [empty] - * Alternative Term - */ - private Token alternative() { - final Token token = new Token(Token.Type.ALTERNATIVE); - - Token child; - while ((child = term()) != null) { - token.add(child); - } - - return token; - } - - /* - * Term :: - * Assertion - * Atom - * Atom Quantifier - */ - private Token term() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.TERM); - Token child; - - child = assertion(); - if (child != null) { - return token.add(child); - } - - child = atom(); - if (child != null) { - boolean emptyCharacterClass = false; - if ("[]".equals(child.toString())) { - emptyCharacterClass = true; - } - - token.add(child); - - final Token quantifier = quantifier(); - if (quantifier != null) { - token.add(quantifier); - } - - if (emptyCharacterClass) { - if (quantifier == null) { - neverMatches = true; //never matches ever. - } else { - //if we can get away with max zero, remove this entire token - final String qs = quantifier.toString(); - if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { - token.setIsDead(true); - } - } - } - - return token; - } - - restart(startIn, startOut); - return null; - } - - /* - * Assertion :: - * ^ - * $ - * \b - * \B - * ( ? = Disjunction ) - * ( ? ! Disjunction ) - */ - private Token assertion() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.ASSERTION); - - switch (ch0) { - case '^': - case '$': - return commit(token, 1); - - case '\\': - if (ch1 == 'b' || ch1 == 'B') { - return commit(token, 2); - } - break; - - case '(': - if (ch1 != '?') { - break; - } - if (ch2 != '=' && ch2 != '!') { - break; - } - final boolean isNegativeLookahead = (ch2 == '!'); - commit(token, 3); - - if (isNegativeLookahead) { - negativeLookaheadLevel++; - } - final Token disjunction = disjunction(); - if (isNegativeLookahead) { - for (final Capture cap : caps) { - if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) { - cap.setDead(); - } - } - negativeLookaheadLevel--; - } - - if (disjunction != null && ch0 == ')') { - token.add(disjunction); - return commit(token, 1); - } - break; - - default: - break; - } - - restart(startIn, startOut); - - return null; - } - - /* - * Quantifier :: - * QuantifierPrefix - * QuantifierPrefix ? - */ - private Token quantifier() { - final Token token = new Token(Token.Type.QUANTIFIER); - final Token child = quantifierPrefix(); - if (child != null) { - token.add(child); - if (ch0 == '?') { - commit(token, 1); - } - return token; - } - return null; - } - - /* - * QuantifierPrefix :: - * * - * + - * ? - * { DecimalDigits } - * { DecimalDigits , } - * { DecimalDigits , DecimalDigits } - */ - private Token quantifierPrefix() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); - - switch (ch0) { - case '*': - case '+': - case '?': - return commit(token, 1); - - case '{': - commit(token, 1); - - final Token child = decimalDigits(); - if (child == null) { - break; // not a quantifier - back out - } - push('}'); - token.add(child); - - if (ch0 == ',') { - commit(token, 1); - token.add(decimalDigits()); - } - - if (ch0 == '}') { - pop('}'); - commit(token, 1); - } - - return token; - - default: - break; - } - - restart(startIn, startOut); - return null; - } - - /* - * Atom :: - * PatternCharacter - * . - * \ AtomEscape - * CharacterClass - * ( Disjunction ) - * ( ? : Disjunction ) - * - */ - private Token atom() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.ATOM); - Token child; - - child = patternCharacter(); - if (child != null) { - return token.add(child); - } - - if (ch0 == '.') { - return commit(token, 1); - } - - if (ch0 == '\\') { - commit(token, 1); - child = atomEscape(); - - if (child != null) { - if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) { - final char idEscape = child.toString().charAt(0); - if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) { - token.reset(); - } - } - - token.add(child); - - // forward backreferences always match empty. JavaScript != Java - if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) { - final int refNum = Integer.parseInt(child.toString()); - - if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) { - // reference to dead in-negative-lookahead capture - token.setIsDead(true); - } else if (caps.size() < refNum) { - // forward reference: always matches against empty string (dead token). - // invalid reference (non-existant capture): pattern never matches. - forwardReferences.put(refNum, token); - } - } - - return token; - } - } - - child = characterClass(); - if (child != null) { - return token.add(child); - } - - if (ch0 == '(') { - boolean capturingParens = true; - commit(token, 1); - if (ch0 == '?' && ch1 == ':') { - capturingParens = false; - commit(token, 2); - } - - child = disjunction(); - if (child != null) { - token.add(child); - if (ch0 == ')') { - final Token atom = commit(token, 1); - if (capturingParens) { - caps.add(new Capture(negativeLookaheadLevel)); - } - return atom; - } - } - } - - restart(startIn, startOut); - return null; - } - - /* - * PatternCharacter :: - * SourceCharacter but not any of: ^$\.*+?()[]{}| - */ - @SuppressWarnings("fallthrough") - private Token patternCharacter() { - if (atEOF()) { - return null; - } - - switch (ch0) { - case '^': - case '$': - case '\\': - case '.': - case '*': - case '+': - case '?': - case '(': - case ')': - case '[': - case '|': - return null; - - case '}': - case ']': - final int n = expected.get(ch0); - if (n != 0) { - return null; - } - - case '{': - // if not a valid quantifier escape curly brace to match itself - // this ensures compatibility with other JS implementations - final Token quant = quantifierPrefix(); - return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; - - default: - return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER - } - } - - /* - * AtomEscape :: - * DecimalEscape - * CharacterEscape - * CharacterClassEscape - */ - private Token atomEscape() { - final Token token = new Token(Token.Type.ATOM_ESCAPE); - Token child; - - child = decimalEscape(); - if (child != null) { - return token.add(child); - } - - child = characterClassEscape(); - if (child != null) { - return token.add(child); - } - - child = characterEscape(); - if (child != null) { - return token.add(child); - } - - - return null; - } - - /* - * CharacterEscape :: - * ControlEscape - * c ControlLetter - * HexEscapeSequence - * UnicodeEscapeSequence - * IdentityEscape - */ - private Token characterEscape() { - final int startIn = position; - final int startOut = sb.length(); - - final Token token = new Token(Token.Type.CHARACTER_ESCAPE); - Token child; - - child = controlEscape(); - if (child != null) { - return token.add(child); - } - - if (ch0 == 'c') { - commit(token, 1); - child = controlLetter(); - if (child != null) { - return token.add(child); - } - restart(startIn, startOut); - } - - child = hexEscapeSequence(); - if (child != null) { - return token.add(child); - } - - child = unicodeEscapeSequence(); - if (child != null) { - return token.add(child); - } - - child = identityEscape(); - if (child != null) { - return token.add(child); - } - - restart(startIn, startOut); - - return null; - } - - private boolean scanEscapeSequence(final char leader, final int length, final Token token) { - final int startIn = position; - final int startOut = sb.length(); - - if (ch0 != leader) { - return false; - } - - commit(token, 1); - for (int i = 0; i < length; i++) { - final char ch0l = Character.toLowerCase(ch0); - if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { - commit(token, 1); - } else { - restart(startIn, startOut); - return false; - } - } - - return true; - } - - private Token hexEscapeSequence() { - final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); - if (scanEscapeSequence('x', 2, token)) { - return token; - } - return null; - } - - private Token unicodeEscapeSequence() { - final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); - if (scanEscapeSequence('u', 4, token)) { - return token; - } - return null; - } - - /* - * ControlEscape :: - * one of fnrtv - */ - private Token controlEscape() { - switch (ch0) { - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); - - default: - return null; - } - } - - /* - * ControlLetter :: - * one of abcdefghijklmnopqrstuvwxyz - * ABCDEFGHIJKLMNOPQRSTUVWXYZ - */ - private Token controlLetter() { - final char c = Character.toUpperCase(ch0); - if (c >= 'A' && c <= 'Z') { - final Token token = new Token(Token.Type.CONTROL_LETTER); - commit(token, 1); - return token; - } - return null; - /* - Token token = new Token(Token.Type.CONTROL_LETTER); - commit(null, 1);//add original char to builder not to token - this.neverMatches = c < 'A' || c > 'Z'; - return token.add(""+c);*/ - } - - /* - * IdentityEscape :: - * SourceCharacter but not IdentifierPart - * <ZWJ> (200c) - * <ZWNJ> (200d) - */ - private Token identityEscape() { - final Token token = new Token(Token.Type.IDENTITY_ESCAPE); - commit(token, 1); - return token; - } - - /* - * DecimalEscape :: - * DecimalIntegerLiteral [lookahead DecimalDigit] - */ - private Token decimalEscape() { - final Token token = new Token(Token.Type.DECIMAL_ESCAPE); - final int startIn = position; - final int startOut = sb.length(); - - if (ch0 == '0' && !isDecimalDigit(ch1)) { - commit(token, 1); - token.removeLast(); - // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); - return token.add("\u0000"); - } - - if (isDecimalDigit(ch0)) { - while (isDecimalDigit(ch0)) { - commit(token, 1); - } - return token; - } - - restart(startIn, startOut); - - return null; - } - - /* - * CharacterClassEscape :: - * one of dDsSwW - */ - private Token characterClassEscape() { - switch (ch0) { - case 's': - case 'S': - case 'd': - case 'D': - case 'w': - case 'W': - return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); - - default: - return null; - } - } - - /* - * CharacterClass :: - * [ [lookahead {^}] ClassRanges ] - * [ ^ ClassRanges ] - */ - private Token characterClass() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.CHARACTERCLASS); - - if (ch0 == '[') { - push(']'); - commit(token, 1); - - if (ch0 == '^') { - commit(token, 1); - } - - final Token child = classRanges(); - if (child != null && ch0 == ']') { - pop(']'); - token.add(child); - return commit(token, 1); - } - } - - restart(startIn, startOut); - return null; - } - - /* - * ClassRanges :: - * [empty] - * NonemptyClassRanges - */ - private Token classRanges() { - return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); - } - - /* - * NonemptyClassRanges :: - * ClassAtom - * ClassAtom NonemptyClassRangesNoDash - * ClassAtom - ClassAtom ClassRanges - */ - private Token nonemptyClassRanges() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES); - Token child; - - child = classAtom(); - if (child != null) { - token.add(child); - - if (ch0 == '-') { - commit(token, 1); - - final Token child1 = classAtom(); - final Token child2 = classRanges(); - if (child1 != null && child2 != null) { - token.add(child1); - token.add(child2); - - return token; - } - } - - child = nonemptyClassRangesNoDash(); - if (child != null) { - token.add(child); - return token; - } - - return token; - } - - restart(startIn, startOut); - return null; - } - - /* - * NonemptyClassRangesNoDash :: - * ClassAtom - * ClassAtomNoDash NonemptyClassRangesNoDash - * ClassAtomNoDash - ClassAtom ClassRanges - */ - private Token nonemptyClassRangesNoDash() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH); - Token child; - - child = classAtomNoDash(); - if (child != null) { - token.add(child); - - // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom - if (ch0 == '-') { - commit(token, 1); - - final Token child1 = classAtom(); - final Token child2 = classRanges(); - if (child1 != null && child2 != null) { - token.add(child1); - return token.add(child2); - } - //fallthru - } - - child = nonemptyClassRangesNoDash(); - if (child != null) { - token.add(child); - } - return token; // still a class atom - } - - child = classAtom(); - if (child != null) { - return token.add(child); - } - - restart(startIn, startOut); - return null; - } - - /* - * ClassAtom : - ClassAtomNoDash - */ - private Token classAtom() { - final Token token = new Token(Token.Type.CLASSATOM); - - if (ch0 == '-') { - return commit(token, 1); - } - - final Token child = classAtomNoDash(); - if (child != null) { - return token.add(child); - } - - return null; - } - - /* - * ClassAtomNoDash :: - * SourceCharacter but not one of \ or ] or - - * \ ClassEscape - */ - private Token classAtomNoDash() { - final int startIn = position; - final int startOut = sb.length(); - final Token token = new Token(Token.Type.CLASSATOM_NODASH); - - switch (ch0) { - case ']': - case '-': - case '\0': - return null; - - case '[': - // unescaped left square bracket - add escape - return commit(token.add("\\"), 1); - - case '\\': - commit(token, 1); - final Token child = classEscape(); - if (child != null) { - return token.add(child); - } - - restart(startIn, startOut); - return null; - - default: - return commit(token, 1); - } - } - - /* - * ClassEscape :: - * DecimalEscape - * b - * CharacterEscape - * CharacterClassEscape - */ - private Token classEscape() { - final Token token = new Token(Token.Type.CLASS_ESCAPE); - Token child; - - child = decimalEscape(); - if (child != null) { - return token.add(child); - } - - if (ch0 == 'b') { - return commit(token, 1); - } - - child = characterEscape(); - if (child != null) { - return token.add(child); - } - - child = characterClassEscape(); - if (child != null) { - return token.add(child); - } - - return null; - } - - /* - * DecimalDigits - */ - private Token decimalDigits() { - if (!isDecimalDigit(ch0)) { - return null; - } - - final Token token = new Token(Token.Type.DECIMALDIGITS); - while (isDecimalDigit(ch0)) { - commit(token, 1); - } - - return token; - } - - private static boolean isDecimalDigit(final char ch) { - return ch >= '0' && ch <= '9'; - } -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/jdk/nashorn/internal/runtime/regexp/DefaultRegExp.java Fri Feb 22 16:31:10 2013 +0100 @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.nashorn.internal.runtime.regexp; + +import jdk.nashorn.internal.runtime.ParserException; + +import static java.util.regex.Pattern.CASE_INSENSITIVE; +import static java.util.regex.Pattern.MULTILINE; +import static java.util.regex.Pattern.UNICODE_CASE; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * Default regular expression implementation based on java.util.regex package. + * + * Note that this class is not thread-safe as it stores the current match result + * and the string being matched in instance fields. + */ +public class DefaultRegExp extends RegExp { + + /** Java regexp pattern to use for match. We compile to one of these */ + private Pattern pattern; + + /** The matcher */ + private RegExpMatcher matcher; + + /** + * Construct a Regular expression from the given {@code source} and {@code flags} strings. + * + * @param source RegExp source string + * @param flags RegExp flag string + * @throws ParserException if flags is invalid or source string has syntax error. + */ + public DefaultRegExp(final String source, final String flags) throws ParserException { + super(source, flags); + + int intFlags = 0; + + if (isIgnoreCase()) { + intFlags |= CASE_INSENSITIVE | UNICODE_CASE; + } + if (isMultiline()) { + intFlags |= MULTILINE; + } + + try { + RegExpScanner parsed; + + try { + parsed = RegExpScanner.scan(source); + } catch (final PatternSyntaxException e) { + // refine the exception with a better syntax error, if this + // passes, just rethrow what we have + Pattern.compile(source, intFlags); + throw e; + } + + if (parsed != null) { + this.pattern = Pattern.compile(parsed.getJavaPattern(), intFlags); + this.groupsInNegativeLookahead = parsed.getGroupsInNegativeLookahead(); + } + } catch (final PatternSyntaxException e2) { + throwParserException("syntax", e2.getMessage()); + } + } + + @Override + public RegExpMatcher match(final String str) { + if (pattern == null) { + return null; // never matches or similar, e.g. a[] + } + + RegExpMatcher matcher = this.matcher; + + if (matcher == null || matcher.getInput() != str) { + matcher = new DefaultMatcher(str); + this.matcher = matcher; + } + + return matcher; + } + + class DefaultMatcher implements RegExpMatcher { + final String input; + final Matcher matcher; + + DefaultMatcher(final String input) { + this.input = input; + this.matcher = pattern.matcher(input); + } + + @Override + public boolean search(final int start) { + return matcher.find(start); + } + + @Override + public String getInput() { + return input; + } + + @Override + public int start() { + return matcher.start(); + } + + @Override + public int start(final int group) { + return matcher.start(group); + } + + @Override + public int end() { + return matcher.end(); + } + + @Override + public int end(final int group) { + return matcher.end(group); + } + + @Override + public String group() { + return matcher.group(); + } + + @Override + public String group(final int group) { + return matcher.group(group); + } + + @Override + public int groupCount() { + return matcher.groupCount(); + } + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExp.java Fri Feb 22 16:31:10 2013 +0100 @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.nashorn.internal.runtime.regexp; + +import jdk.nashorn.internal.runtime.BitVector; +import jdk.nashorn.internal.runtime.ECMAErrors; +import jdk.nashorn.internal.runtime.ParserException; + +import java.util.regex.MatchResult; + +/** + * This is the base class for representing a parsed regular expression. + * + * Instances of this class are created by a {@link RegExpFactory}. + */ +public abstract class RegExp { + + /** Pattern string. */ + private final String source; + + /** Global search flag for this regexp.*/ + private boolean global; + + /** Case insensitive flag for this regexp */ + private boolean ignoreCase; + + /** Multi-line flag for this regexp */ + private boolean multiline; + + /** BitVector that keeps track of groups in negative lookahead */ + protected BitVector groupsInNegativeLookahead; + + /** + * Constructor. + * + * @param source the source string + * @param flags the flags string + */ + protected RegExp(final String source, final String flags) { + this.source = source; + for (int i = 0; i < flags.length(); i++) { + final char ch = flags.charAt(i); + switch (ch) { + case 'g': + if (this.global) { + throwParserException("repeated.flag", "g"); + } + this.global = true; + break; + case 'i': + if (this.ignoreCase) { + throwParserException("repeated.flag", "i"); + } + this.ignoreCase = true; + break; + case 'm': + if (this.multiline) { + throwParserException("repeated.flag", "m"); + } + this.multiline = true; + break; + default: + throwParserException("unsupported.flag", Character.toString(ch)); + } + } + } + + /** + * Get the source pattern of this regular expression. + * + * @return the source string + */ + public String getSource() { + return source; + } + + /** + * Set the global flag of this regular expression to {@code global}. + * + * @param global the new global flag + */ + public void setGlobal(final boolean global) { + this.global = global; + } + + /** + * Get the global flag of this regular expression. + * + * @return the global flag + */ + public boolean isGlobal() { + return global; + } + + /** + * Get the ignore-case flag of this regular expression. + * + * @return the ignore-case flag + */ + public boolean isIgnoreCase() { + return ignoreCase; + } + + /** + * Get the multiline flag of this regular expression. + * + * @return the multiline flag + */ + public boolean isMultiline() { + return multiline; + } + + /** + * Get a bitset indicating which of the groups in this regular expression are inside a negative lookahead. + * + * @return the groups-in-negative-lookahead bitset + */ + public BitVector getGroupsInNegativeLookahead() { + return groupsInNegativeLookahead; + } + + /** + * Match this regular expression against {@code str}, starting at index {@code start} + * and return a {@link MatchResult} with the result. + * + * @param str the string + * @return the matcher + */ + public abstract RegExpMatcher match(String str); + + /** + * Throw a regexp parser exception. + * + * @param key the message key + * @param str string argument + * @throws jdk.nashorn.internal.runtime.ParserException + */ + protected static void throwParserException(final String key, final String str) throws ParserException { + throw new ParserException(ECMAErrors.getMessage("parser.error.regex." + key, str)); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java Fri Feb 22 16:31:10 2013 +0100 @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.nashorn.internal.runtime.regexp; + +import jdk.nashorn.internal.parser.Lexer; +import jdk.nashorn.internal.runtime.ParserException; + +/** + * Factory class for regular expressions. This class creates instances of {@link DefaultRegExp}. + */ +public class RegExpFactory { + + + private final static RegExpFactory instance = new RegExpFactory(); + + /** + * Creates a Regular expression from the given {@code pattern} and {@code flags} strings. + * + * @param pattern RegExp pattern string + * @param flags RegExp flags string + * @throws ParserException if flags is invalid or pattern string has syntax error. + */ + protected RegExp compile(final String pattern, final String flags) throws ParserException { + return new DefaultRegExp(pattern, flags); + } + + /** + * Replace a regexp token as suitable for regexp instances created by this factory. + * + * @param str a regular expression token + * @return the replacement token + */ + protected String replaceToken(final String str) { + switch (str) { + case "\\s": + return "[" + Lexer.getWhitespaceRegExp() + "]"; + case "\\S": + return "[^" + Lexer.getWhitespaceRegExp() + "]"; + case "[^]": + return "[\\s\\S]"; + default: + return str; + } + } + + /** + * Compile a regexp with the given {@code source} and {@code flags}. + * + * @param pattern RegExp pattern string + * @param flags flag string + * + * @throws ParserException if invalid source or flags + */ + public static RegExp create(final String pattern, final String flags) { + return instance.compile(pattern, flags); + } + + /** + * Replace a regexp token as needed by the currently installed factory instance. + * + * @param token a regexp token + * @return the replacement token + */ + public static String replace(final String token) { + return instance.replaceToken(token); + } + + /** + * Validate a regexp with the given {@code source} and {@code flags}. + * + * @param pattern RegExp pattern string + * @param flags flag string + * + * @throws ParserException if invalid source or flags + */ + // @SuppressWarnings({"unused"}) + public static void validate(final String pattern, final String flags) throws ParserException { + instance.compile(pattern, flags); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExpMatcher.java Fri Feb 22 16:31:10 2013 +0100 @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.nashorn.internal.runtime.regexp; + +import java.util.regex.MatchResult; + +/** + * Interface for matching a regular expression against a string and retrieving the + * match result. Extends {@link MatchResult}. + */ +public interface RegExpMatcher extends MatchResult { + + /** + * Searches for pattern starting at {@code start}. Returns {@code true} if a match was found. + * + * @param start the start index in the input string + * @return {@code true} if a match was found + */ + boolean search(int start); + + /** + * Get the input string. + * + * @return the input string + */ + String getInput(); + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExpResult.java Fri Feb 22 16:31:10 2013 +0100 @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.nashorn.internal.runtime.regexp; + +/** + * Match tuple to keep track of ongoing regexp match. + */ +public final class RegExpResult { + final Object[] groups; + final int index; + final String input; + + /** + * Constructor + * + * @param input regexp input + * @param index index of match + * @param groups groups vector + */ + public RegExpResult(final String input, final int index, final Object[] groups) { + this.input = input; + this.index = index; + this.groups = groups; + } + + /** + * Get the groups for the match + * @return group vector + */ + public Object[] getGroups() { + return groups; + } + + /** + * Get the input for the map + * @return input + */ + public String getInput() { + return input; + } + + /** + * Get the index for the match + * @return index + */ + public int getIndex() { + return index; + } + + /** + * Get the length of the match + * @return length + */ + public int length() { + return ((String)groups[0]).length(); + } + + /** + * Get the group with the given index or the empty string if group index is not valid. + * @param index the group index + * @return the group or "" + */ + public Object getGroup(int index) { + return index >= 0 && index < groups.length ? groups[index] : ""; + } + + /** + * Get the last parenthesis group, or the empty string if none exists. + * @return the last group or "" + */ + public Object getLastParen() { + return groups.length > 1 ? groups[groups.length - 1] : ""; + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Fri Feb 22 16:31:10 2013 +0100 @@ -0,0 +1,1391 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.nashorn.internal.runtime.regexp; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.regex.PatternSyntaxException; + +import jdk.nashorn.internal.parser.Scanner; +import jdk.nashorn.internal.runtime.BitVector; + +/** + * Scan a JavaScript regexp, converting to Java regex if necessary. + * + */ +final class RegExpScanner extends Scanner { + + /** + * String builder to accumulate the result - this contains verbatim parsed JavaScript. + * to get the java equivalent we need to create a Pattern token and return its toString() + */ + private final StringBuilder sb; + + /** Is this the special case of a regexp that never matches anything */ + private boolean neverMatches; + + /** The resulting java.util.regex pattern string. */ + private String javaPattern; + + /** Expected token table */ + private final Map<Character, Integer> expected = new HashMap<>(); + + /** Capturing parenthesis that have been found so far. */ + private final List<Capture> caps = new LinkedList<>(); + + /** Forward references to capturing parenthesis to be resolved later.*/ + private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>(); + + /** Current level of zero-width negative lookahead assertions. */ + private int negativeLookaheadLevel; + + private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; + + private static class Capture { + /** + * Zero-width negative lookaheads enclosing the capture. + */ + private final int negativeLookaheadLevel; + /** + * Captures that live inside a negative lookahead are dead after the + * lookahead and will be undefined if referenced from outside. + */ + private boolean isDead; + + Capture(final int negativeLookaheadLevel) { + this.negativeLookaheadLevel = negativeLookaheadLevel; + } + + public int getNegativeLookaheadLevel() { + return negativeLookaheadLevel; + } + + public boolean isDead() { + return isDead; + } + + public void setDead() { + this.isDead = true; + } + } + + /** + * This is a token - the JavaScript regexp is scanned into a token tree + * A token has other tokens as children as well as "atoms", i.e. Strings. + */ + private static class Token { + + private enum Type { + PATTERN, + DISJUNCTION, + ALTERNATIVE, + TERM, + ASSERTION, + QUANTIFIER, + QUANTIFIER_PREFIX, + ATOM, + PATTERN_CHARACTER, + ATOM_ESCAPE, + CHARACTER_ESCAPE, + CONTROL_ESCAPE, + CONTROL_LETTER, + IDENTITY_ESCAPE, + DECIMAL_ESCAPE, + CHARACTERCLASS_ESCAPE, + CHARACTERCLASS, + CLASSRANGES, + NON_EMPTY_CLASSRANGES, + NON_EMPTY_CLASSRANGES_NODASH, + CLASSATOM, + CLASSATOM_NODASH, + CLASS_ESCAPE, + DECIMALDIGITS, + HEX_ESCAPESEQUENCE, + UNICODE_ESCAPESEQUENCE, + } + + /** + * Token tyoe + */ + private final Token.Type type; + + /** + * Child nodes + */ + private final List<Object> children; + + /** + * Parent node + */ + private Token parent; + + /** + * Dead code flag + */ + private boolean isDead; + + private static final Map<Type, ToString> toStringMap = new HashMap<>(); + private static final ToString DEFAULT_TOSTRING = new ToString(); + + private static String unicode(final int value) { + final StringBuilder sb = new StringBuilder(); + final String hex = Integer.toHexString(value); + sb.append('u'); + for (int i = 0; i < 4 - hex.length(); i++) { + sb.append('0'); + } + sb.append(hex); + + return sb.toString(); + } + + static { + toStringMap.put(Type.CHARACTERCLASS, new ToString() { + @Override + public String toString(final Token token) { + return super.toString(token).replace("\\b", "\b"); + } + }); + + // for some reason java regexps don't like control characters on the + // form "\\ca".match([string with ascii 1 at char0]). Translating + // them to unicode does it though. + toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { + @Override + public String toString(final Token token) { + final String str = super.toString(token); + if (str.length() == 2) { + return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); + } + return str; + } + }); + + toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { + @Override + public String toString(final Token token) { + final String str = super.toString(token); + + if ("\0".equals(str)) { + return str; + } + + int value; + + if (!token.hasParentOfType(Type.CLASSRANGES)) { + return str; + } + + value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal + if (value > 0xff) { + throw new NumberFormatException(str); + } + + return Token.unicode(value); + } + }); + + } + + /** + * JavaScript Token to Java regex substring framework. + */ + private static class ToString { + String toString(final Token token) { + final Object[] children = token.getChildren(); + + // Allow the installed regexp factory to perform global substitutions. + switch (children.length) { + case 0: + return ""; + case 1: + return RegExpFactory.replace(children[0].toString()); + default: + final StringBuilder sb = new StringBuilder(); + for (final Object child : children) { + sb.append(child); + } + return RegExpFactory.replace(sb.toString()); + } + } + } + + /** + * Token iterator. Doesn't return "atom" children. i.e. string representations, + * just tokens + * + */ + private static class TokenIterator implements Iterator<Token> { + private final List<Token> preorder; + + private void init(final Token root) { + preorder.add(root); + for (final Object child : root.getChildren()) { + if (child instanceof Token) { + init((Token)child); + } + } + } + + TokenIterator(final Token root) { + preorder = new ArrayList<>(); + init(root); + } + + @Override + public boolean hasNext() { + return !preorder.isEmpty(); + } + + @Override + public Token next() { + return preorder.remove(0); + } + + @Override + public void remove() { + next(); + } + } + + /** + * Constructor + * @param type the token type + */ + Token(final Token.Type type) { + this.type = type; + children = new ArrayList<>(); + } + + /** + * Add a an "atom" child to a token + * @param child the child to add + * @return the token (for chaining) + */ + public Token add(final String child) { + children.add(child); + return this; + } + + /** + * Add a child to a token + * @param child the child + * @return the token (for chaining) + */ + public Token add(final Token child) { + if (child != null) { + children.add(child); + child.setParent(this); + } + return this; + } + + /** + * Remove a child from a token + * @param child the child to remove + * @return true if successful + */ + public boolean remove(final Token child) { + return children.remove(child); + } + + /** + * Remove the last child from a token + * @return the removed child + */ + public Object removeLast() { + return children.remove(children.size() - 1); + } + + /** + * Flag this token as dead code + * @param isDead is it dead or not + */ + private void setIsDead(final boolean isDead) { + this.isDead = isDead; + } + + /** + * Is this token dead code + * @return boolean + */ + private boolean getIsDead() { + return isDead; + } + + /** + * Get the parent of this token + * @return parent token + */ + public Token getParent() { + return parent; + } + + public boolean hasParentOfType(final Token.Type parentType) { + for (Token p = getParent(); p != null; p = p.getParent()) { + if (p.getType() == parentType) { + return true; + } + } + return false; + } + + public boolean hasChildOfType(final Token.Type childType) { + for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) { + if (iter.next().getType() == childType) { + return true; + } + } + return false; + } + + /** + * Set the parent of this token + * @param parent + */ + private void setParent(final Token parent) { + this.parent = parent; + } + + /** + * Get the children of this token + * @return an array of children, never null + */ + public Object[] getChildren() { + return children.toArray(); + } + + /** + * Reset this token, remove all children + */ + public void reset() { + children.clear(); + } + + /** + * Get a preorder token iterator with this token as root + * @return an iterator + */ + public Iterator<Token> iterator() { + return new TokenIterator(this); + } + + /** + * Get the type of this token + * @return type + */ + public Type getType() { + return type; + } + + /** + * Turn this token into Java regexp compatible text + * @return part of a java regexp + */ + @Override + public String toString() { + ToString t = toStringMap.get(getType()); + if (t == null) { + t = DEFAULT_TOSTRING; + } + return t.toString(this); + } + } + + /** + * Constructor + * @param string the JavaScript regexp to parse + */ + private RegExpScanner(final String string) { + super(string); + sb = new StringBuilder(limit); + reset(0); + expected.put(']', 0); + expected.put('}', 0); + } + + private void processForwardReferences() { + if (neverMatches()) { + return; + } + + for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) { + if (fwdRef.getKey().intValue() > caps.size()) { + neverMatches = true; + break; + } + + fwdRef.getValue().setIsDead(true); + } + + forwardReferences.clear(); + } + + /** + * Scan a JavaScript regexp string returning a Java safe regex string. + * + * @param string + * JavaScript regexp string. + * @return Java safe regex string. + */ + public static RegExpScanner scan(final String string) { + final RegExpScanner scanner = new RegExpScanner(string); + + Token pattern; + + try { + pattern = scanner.pattern(); + } catch (final Exception e) { + throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); + } + + scanner.processForwardReferences(); + if (scanner.neverMatches()) { + return null; // never matches + } + + // go over the code and remove dead code + final Iterator<Token> iter = pattern.iterator(); + while (iter.hasNext()) { + final Token next = iter.next(); + if (next.getIsDead()) { + next.getParent().remove(next); + } + } + + // turn the pattern into a string, p, the java equivalent string for our js regexp + final String p = pattern.toString(); + // if builder contains all tokens that were sent in, we know + // we correctly parsed the entire JavaScript regexp without syntax errors + if (!string.equals(scanner.getStringBuilder().toString())) { + throw new PatternSyntaxException(string, p, p.length() + 1); + } + + scanner.javaPattern = p; + return scanner; + } + + /** + * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript, + * is an example where we never match + * + * @return boolean + */ + private boolean neverMatches() { + return neverMatches; + } + + final StringBuilder getStringBuilder() { + return sb; + } + + String getJavaPattern() { + return javaPattern; + } + + BitVector getGroupsInNegativeLookahead() { + BitVector vec = null; + for (int i = 0; i < caps.size(); i++) { + final Capture cap = caps.get(i); + if (cap.getNegativeLookaheadLevel() > 0) { + if (vec == null) { + vec = new BitVector(caps.size() + 1); + } + vec.set(i + 1); + } + } + return vec; + } + + /** + * Commit n characters to the builder and to a given token + * @param token Uncommitted token. + * @param n Number of characters. + * @return Committed token + */ + private Token commit(final Token token, final int n) { + final int startIn = position; + + switch (n) { + case 1: + sb.append(ch0); + skip(1); + break; + case 2: + sb.append(ch0); + sb.append(ch1); + skip(2); + break; + case 3: + sb.append(ch0); + sb.append(ch1); + sb.append(ch2); + skip(3); + break; + default: + assert false : "Should not reach here"; + } + + if (token == null) { + return null; + } + + return token.add(sb.substring(startIn, sb.length())); + } + + /** + * Restart the buffers back at an earlier position. + * + * @param startIn + * Position in the input stream. + * @param startOut + * Position in the output stream. + */ + private void restart(final int startIn, final int startOut) { + reset(startIn); + sb.setLength(startOut); + } + + private void push(final char ch) { + expected.put(ch, expected.get(ch) + 1); + } + + private void pop(final char ch) { + expected.put(ch, Math.min(0, expected.get(ch) - 1)); + } + + /* + * Recursive descent tokenizer starts below. + */ + + /* + * Pattern :: + * Disjunction + */ + private Token pattern() { + final Token token = new Token(Token.Type.PATTERN); + + final Token child = disjunction(); + return token.add(child); + } + + /* + * Disjunction :: + * Alternative + * Alternative | Disjunction + */ + private Token disjunction() { + final Token token = new Token(Token.Type.DISJUNCTION); + + while (true) { + token.add(alternative()); + + if (ch0 == '|') { + commit(token, 1); + } else { + break; + } + } + + return token; + } + + /* + * Alternative :: + * [empty] + * Alternative Term + */ + private Token alternative() { + final Token token = new Token(Token.Type.ALTERNATIVE); + + Token child; + while ((child = term()) != null) { + token.add(child); + } + + return token; + } + + /* + * Term :: + * Assertion + * Atom + * Atom Quantifier + */ + private Token term() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.TERM); + Token child; + + child = assertion(); + if (child != null) { + return token.add(child); + } + + child = atom(); + if (child != null) { + boolean emptyCharacterClass = false; + if ("[]".equals(child.toString())) { + emptyCharacterClass = true; + } + + token.add(child); + + final Token quantifier = quantifier(); + if (quantifier != null) { + token.add(quantifier); + } + + if (emptyCharacterClass) { + if (quantifier == null) { + neverMatches = true; //never matches ever. + } else { + //if we can get away with max zero, remove this entire token + final String qs = quantifier.toString(); + if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { + token.setIsDead(true); + } + } + } + + return token; + } + + restart(startIn, startOut); + return null; + } + + /* + * Assertion :: + * ^ + * $ + * \b + * \B + * ( ? = Disjunction ) + * ( ? ! Disjunction ) + */ + private Token assertion() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.ASSERTION); + + switch (ch0) { + case '^': + case '$': + return commit(token, 1); + + case '\\': + if (ch1 == 'b' || ch1 == 'B') { + return commit(token, 2); + } + break; + + case '(': + if (ch1 != '?') { + break; + } + if (ch2 != '=' && ch2 != '!') { + break; + } + final boolean isNegativeLookahead = (ch2 == '!'); + commit(token, 3); + + if (isNegativeLookahead) { + negativeLookaheadLevel++; + } + final Token disjunction = disjunction(); + if (isNegativeLookahead) { + for (final Capture cap : caps) { + if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) { + cap.setDead(); + } + } + negativeLookaheadLevel--; + } + + if (disjunction != null && ch0 == ')') { + token.add(disjunction); + return commit(token, 1); + } + break; + + default: + break; + } + + restart(startIn, startOut); + + return null; + } + + /* + * Quantifier :: + * QuantifierPrefix + * QuantifierPrefix ? + */ + private Token quantifier() { + final Token token = new Token(Token.Type.QUANTIFIER); + final Token child = quantifierPrefix(); + if (child != null) { + token.add(child); + if (ch0 == '?') { + commit(token, 1); + } + return token; + } + return null; + } + + /* + * QuantifierPrefix :: + * * + * + + * ? + * { DecimalDigits } + * { DecimalDigits , } + * { DecimalDigits , DecimalDigits } + */ + private Token quantifierPrefix() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); + + switch (ch0) { + case '*': + case '+': + case '?': + return commit(token, 1); + + case '{': + commit(token, 1); + + final Token child = decimalDigits(); + if (child == null) { + break; // not a quantifier - back out + } + push('}'); + token.add(child); + + if (ch0 == ',') { + commit(token, 1); + token.add(decimalDigits()); + } + + if (ch0 == '}') { + pop('}'); + commit(token, 1); + } + + return token; + + default: + break; + } + + restart(startIn, startOut); + return null; + } + + /* + * Atom :: + * PatternCharacter + * . + * \ AtomEscape + * CharacterClass + * ( Disjunction ) + * ( ? : Disjunction ) + * + */ + private Token atom() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.ATOM); + Token child; + + child = patternCharacter(); + if (child != null) { + return token.add(child); + } + + if (ch0 == '.') { + return commit(token, 1); + } + + if (ch0 == '\\') { + commit(token, 1); + child = atomEscape(); + + if (child != null) { + if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) { + final char idEscape = child.toString().charAt(0); + if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) { + token.reset(); + } + } + + token.add(child); + + // forward backreferences always match empty. JavaScript != Java + if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) { + final int refNum = Integer.parseInt(child.toString()); + + if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) { + // reference to dead in-negative-lookahead capture + token.setIsDead(true); + } else if (caps.size() < refNum) { + // forward reference: always matches against empty string (dead token). + // invalid reference (non-existant capture): pattern never matches. + forwardReferences.put(refNum, token); + } + } + + return token; + } + } + + child = characterClass(); + if (child != null) { + return token.add(child); + } + + if (ch0 == '(') { + boolean capturingParens = true; + commit(token, 1); + if (ch0 == '?' && ch1 == ':') { + capturingParens = false; + commit(token, 2); + } + + child = disjunction(); + if (child != null) { + token.add(child); + if (ch0 == ')') { + final Token atom = commit(token, 1); + if (capturingParens) { + caps.add(new Capture(negativeLookaheadLevel)); + } + return atom; + } + } + } + + restart(startIn, startOut); + return null; + } + + /* + * PatternCharacter :: + * SourceCharacter but not any of: ^$\.*+?()[]{}| + */ + @SuppressWarnings("fallthrough") + private Token patternCharacter() { + if (atEOF()) { + return null; + } + + switch (ch0) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case '|': + return null; + + case '}': + case ']': + final int n = expected.get(ch0); + if (n != 0) { + return null; + } + + case '{': + // if not a valid quantifier escape curly brace to match itself + // this ensures compatibility with other JS implementations + final Token quant = quantifierPrefix(); + return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; + + default: + return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER + } + } + + /* + * AtomEscape :: + * DecimalEscape + * CharacterEscape + * CharacterClassEscape + */ + private Token atomEscape() { + final Token token = new Token(Token.Type.ATOM_ESCAPE); + Token child; + + child = decimalEscape(); + if (child != null) { + return token.add(child); + } + + child = characterClassEscape(); + if (child != null) { + return token.add(child); + } + + child = characterEscape(); + if (child != null) { + return token.add(child); + } + + + return null; + } + + /* + * CharacterEscape :: + * ControlEscape + * c ControlLetter + * HexEscapeSequence + * UnicodeEscapeSequence + * IdentityEscape + */ + private Token characterEscape() { + final int startIn = position; + final int startOut = sb.length(); + + final Token token = new Token(Token.Type.CHARACTER_ESCAPE); + Token child; + + child = controlEscape(); + if (child != null) { + return token.add(child); + } + + if (ch0 == 'c') { + commit(token, 1); + child = controlLetter(); + if (child != null) { + return token.add(child); + } + restart(startIn, startOut); + } + + child = hexEscapeSequence(); + if (child != null) { + return token.add(child); + } + + child = unicodeEscapeSequence(); + if (child != null) { + return token.add(child); + } + + child = identityEscape(); + if (child != null) { + return token.add(child); + } + + restart(startIn, startOut); + + return null; + } + + private boolean scanEscapeSequence(final char leader, final int length, final Token token) { + final int startIn = position; + final int startOut = sb.length(); + + if (ch0 != leader) { + return false; + } + + commit(token, 1); + for (int i = 0; i < length; i++) { + final char ch0l = Character.toLowerCase(ch0); + if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { + commit(token, 1); + } else { + restart(startIn, startOut); + return false; + } + } + + return true; + } + + private Token hexEscapeSequence() { + final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); + if (scanEscapeSequence('x', 2, token)) { + return token; + } + return null; + } + + private Token unicodeEscapeSequence() { + final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); + if (scanEscapeSequence('u', 4, token)) { + return token; + } + return null; + } + + /* + * ControlEscape :: + * one of fnrtv + */ + private Token controlEscape() { + switch (ch0) { + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); + + default: + return null; + } + } + + /* + * ControlLetter :: + * one of abcdefghijklmnopqrstuvwxyz + * ABCDEFGHIJKLMNOPQRSTUVWXYZ + */ + private Token controlLetter() { + final char c = Character.toUpperCase(ch0); + if (c >= 'A' && c <= 'Z') { + final Token token = new Token(Token.Type.CONTROL_LETTER); + commit(token, 1); + return token; + } + return null; + /* + Token token = new Token(Token.Type.CONTROL_LETTER); + commit(null, 1);//add original char to builder not to token + this.neverMatches = c < 'A' || c > 'Z'; + return token.add(""+c);*/ + } + + /* + * IdentityEscape :: + * SourceCharacter but not IdentifierPart + * <ZWJ> (200c) + * <ZWNJ> (200d) + */ + private Token identityEscape() { + final Token token = new Token(Token.Type.IDENTITY_ESCAPE); + commit(token, 1); + return token; + } + + /* + * DecimalEscape :: + * DecimalIntegerLiteral [lookahead DecimalDigit] + */ + private Token decimalEscape() { + final Token token = new Token(Token.Type.DECIMAL_ESCAPE); + final int startIn = position; + final int startOut = sb.length(); + + if (ch0 == '0' && !isDecimalDigit(ch1)) { + commit(token, 1); + token.removeLast(); + // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); + return token.add("\u0000"); + } + + if (isDecimalDigit(ch0)) { + while (isDecimalDigit(ch0)) { + commit(token, 1); + } + return token; + } + + restart(startIn, startOut); + + return null; + } + + /* + * CharacterClassEscape :: + * one of dDsSwW + */ + private Token characterClassEscape() { + switch (ch0) { + case 's': + case 'S': + case 'd': + case 'D': + case 'w': + case 'W': + return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); + + default: + return null; + } + } + + /* + * CharacterClass :: + * [ [lookahead {^}] ClassRanges ] + * [ ^ ClassRanges ] + */ + private Token characterClass() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.CHARACTERCLASS); + + if (ch0 == '[') { + push(']'); + commit(token, 1); + + if (ch0 == '^') { + commit(token, 1); + } + + final Token child = classRanges(); + if (child != null && ch0 == ']') { + pop(']'); + token.add(child); + return commit(token, 1); + } + } + + restart(startIn, startOut); + return null; + } + + /* + * ClassRanges :: + * [empty] + * NonemptyClassRanges + */ + private Token classRanges() { + return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); + } + + /* + * NonemptyClassRanges :: + * ClassAtom + * ClassAtom NonemptyClassRangesNoDash + * ClassAtom - ClassAtom ClassRanges + */ + private Token nonemptyClassRanges() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES); + Token child; + + child = classAtom(); + if (child != null) { + token.add(child); + + if (ch0 == '-') { + commit(token, 1); + + final Token child1 = classAtom(); + final Token child2 = classRanges(); + if (child1 != null && child2 != null) { + token.add(child1); + token.add(child2); + + return token; + } + } + + child = nonemptyClassRangesNoDash(); + if (child != null) { + token.add(child); + return token; + } + + return token; + } + + restart(startIn, startOut); + return null; + } + + /* + * NonemptyClassRangesNoDash :: + * ClassAtom + * ClassAtomNoDash NonemptyClassRangesNoDash + * ClassAtomNoDash - ClassAtom ClassRanges + */ + private Token nonemptyClassRangesNoDash() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH); + Token child; + + child = classAtomNoDash(); + if (child != null) { + token.add(child); + + // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom + if (ch0 == '-') { + commit(token, 1); + + final Token child1 = classAtom(); + final Token child2 = classRanges(); + if (child1 != null && child2 != null) { + token.add(child1); + return token.add(child2); + } + //fallthru + } + + child = nonemptyClassRangesNoDash(); + if (child != null) { + token.add(child); + } + return token; // still a class atom + } + + child = classAtom(); + if (child != null) { + return token.add(child); + } + + restart(startIn, startOut); + return null; + } + + /* + * ClassAtom : - ClassAtomNoDash + */ + private Token classAtom() { + final Token token = new Token(Token.Type.CLASSATOM); + + if (ch0 == '-') { + return commit(token, 1); + } + + final Token child = classAtomNoDash(); + if (child != null) { + return token.add(child); + } + + return null; + } + + /* + * ClassAtomNoDash :: + * SourceCharacter but not one of \ or ] or - + * \ ClassEscape + */ + private Token classAtomNoDash() { + final int startIn = position; + final int startOut = sb.length(); + final Token token = new Token(Token.Type.CLASSATOM_NODASH); + + switch (ch0) { + case ']': + case '-': + case '\0': + return null; + + case '[': + // unescaped left square bracket - add escape + return commit(token.add("\\"), 1); + + case '\\': + commit(token, 1); + final Token child = classEscape(); + if (child != null) { + return token.add(child); + } + + restart(startIn, startOut); + return null; + + default: + return commit(token, 1); + } + } + + /* + * ClassEscape :: + * DecimalEscape + * b + * CharacterEscape + * CharacterClassEscape + */ + private Token classEscape() { + final Token token = new Token(Token.Type.CLASS_ESCAPE); + Token child; + + child = decimalEscape(); + if (child != null) { + return token.add(child); + } + + if (ch0 == 'b') { + return commit(token, 1); + } + + child = characterEscape(); + if (child != null) { + return token.add(child); + } + + child = characterClassEscape(); + if (child != null) { + return token.add(child); + } + + return null; + } + + /* + * DecimalDigits + */ + private Token decimalDigits() { + if (!isDecimalDigit(ch0)) { + return null; + } + + final Token token = new Token(Token.Type.DECIMALDIGITS); + while (isDecimalDigit(ch0)) { + commit(token, 1); + } + + return token; + } + + private static boolean isDecimalDigit(final char ch) { + return ch >= '0' && ch <= '9'; + } +}