Mercurial > hg > icedtea9-forest > jdk
changeset 5564:af209a223b6b
7014640: To add a metachar \R for line ending and character classes for vertical/horizontal ws \v \V \h \H
Summary: added proposed constructs
Reviewed-by: alanb
author | sherman |
---|---|
date | Tue, 08 May 2012 10:57:13 -0700 |
parents | 48513d156965 |
children | 1ece20885be4 |
files | src/share/classes/java/util/regex/Pattern.java test/java/util/regex/RegExTest.java |
diffstat | 2 files changed, 189 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/src/share/classes/java/util/regex/Pattern.java Tue May 08 02:59:10 2012 -0400 +++ b/src/share/classes/java/util/regex/Pattern.java Tue May 08 10:57:13 2012 -0700 @@ -152,15 +152,24 @@ * <td headers="matches">A digit: <tt>[0-9]</tt></td></tr> * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td> * <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr> + * <tr><td valign="top" headers="construct predef"><tt>\h</tt></td> + * <td headers="matches">A horizontal whitespace character: + * <tt>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</tt></td></tr> + * <tr><td valign="top" headers="construct predef"><tt>\H</tt></td> + * <td headers="matches">A non-horizontal whitespace character: <tt>[^\h]</tt></td></tr> * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td> * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr> * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td> * <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr> + * <tr><td valign="top" headers="construct predef"><tt>\v</tt></td> + * <td headers="matches">A vertical whitespace character: <tt>[\n\x0B\f\r\x85\u2028\u2029]</tt> + * </td></tr> + * <tr><td valign="top" headers="construct predef"><tt>\V</tt></td> + * <td headers="matches">A non-vertical whitespace character: <tt>[^\v]</tt></td></tr> * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td> * <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr> * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td> * <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr> - * * <tr><th> </th></tr> * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr> * @@ -244,6 +253,13 @@ * <td headers="matches">The end of the input</td></tr> * * <tr><th> </th></tr> + * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr> + * <tr><td valign="top" 
headers="construct lineending"><tt>\R</tt></td> + * <td headers="matches">Any Unicode linebreak sequence, is equivalent to + * <tt>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] + * </tt></td></tr> + * + * <tr><th> </th></tr> * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr> * * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td> @@ -599,11 +615,9 @@ * <li> Noncharacter_Code_Point * <li> Assigned * </ul> - - * <p> - * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in - * conformance with the recommendation of <i>Annex C: Compatibility Properties</i> + * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b> + * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i> * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified. * <p> @@ -668,12 +682,6 @@ * * <ul> * <li><p> Predefined character classes (Unicode character) - * <p><tt>\h </tt>A horizontal whitespace - * <p><tt>\H </tt>A non horizontal whitespace - * <p><tt>\v </tt>A vertical whitespace - * <p><tt>\V </tt>A non vertical whitespace - * <p><tt>\R </tt>Any Unicode linebreak sequence - * <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt> * <p><tt>\X </tt>Match Unicode * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> * <i>extended grapheme cluster</i></a> @@ -2178,7 +2186,7 @@ } unread(); prev = cursor; - ch = escape(false, first == 0); + ch = escape(false, first == 0, false); if (ch >= 0) { append(ch, first); first++; @@ -2276,7 +2284,7 @@ * If the returned value is greater than zero, it is the value that * matches the escape sequence. 
*/ - private int escape(boolean inclass, boolean create) { + private int escape(boolean inclass, boolean create, boolean isrange) { int ch = skip(); switch (ch) { case '0': @@ -2318,6 +2326,8 @@ if (create) root = new LastMatch(); return -1; case 'H': + if (create) root = new HorizWS().complement(); + return -1; case 'I': case 'J': case 'K': @@ -2327,8 +2337,11 @@ case 'O': case 'P': case 'Q': + break; case 'R': - break; + if (inclass) break; + if (create) root = new LineEnding(); + return -1; case 'S': if (create) root = has(UNICODE_CHARACTER_CLASS) ? new Utype(UnicodeProp.WHITE_SPACE).complement() @@ -2336,8 +2349,10 @@ return -1; case 'T': case 'U': + break; case 'V': - break; + if (create) root = new VertWS().complement(); + return -1; case 'W': if (create) root = has(UNICODE_CHARACTER_CLASS) ? new Utype(UnicodeProp.WORD).complement() @@ -2373,7 +2388,10 @@ case 'f': return '\f'; case 'g': + break; case 'h': + if (create) root = new HorizWS(); + return -1; case 'i': case 'j': break; @@ -2413,7 +2431,18 @@ case 'u': return u(); case 'v': - return '\013'; + // '\v' was implemented as VT/0x0B in releases < 1.8 (though + // undocumented). In JDK8 '\v' is specified as a predefined + // character class for all vertical whitespace characters. + // So [-1, root=VertWS node] pair is returned (instead of a + // single 0x0B). This breaks the range if '\v' is used as + // the start or end value, such as [\v-...] or [...-\v], in + // which a single definite value (0x0B) is expected. For + // compatiblity concern '\013'/0x0B is returned if isrange. + if (isrange) + return '\013'; + if (create) root = new VertWS(); + return -1; case 'w': if (create) root = has(UNICODE_CHARACTER_CLASS) ? 
new Utype(UnicodeProp.WORD) @@ -2590,13 +2619,14 @@ oneLetter = false; return family(oneLetter, comp); } else { // ordinary escape + boolean isrange = temp[cursor+1] == '-'; unread(); - ch = escape(true, true); + ch = escape(true, true, isrange); if (ch == -1) return (CharProperty) root; } } else { - ch = single(); + next(); } if (ch >= 0) { if (peek() == '-') { @@ -2606,9 +2636,15 @@ } if (endRange != ']') { next(); - int m = single(); - if (m < ch) + int m = peek(); + if (m == '\\') { + m = escape(true, false, true); + } else { + next(); + } + if (m < ch) { throw error("Illegal character range"); + } if (has(CASE_INSENSITIVE)) return caseInsensitiveRangeFor(ch, m); else @@ -2620,17 +2656,6 @@ throw error("Unexpected character '"+((char)ch)+"'"); } - private int single() { - int ch = peek(); - switch (ch) { - case '\\': - return escape(true, false); - default: - next(); - return ch; - } - } - /** * Parses a Unicode character family and returns its representative node. */ @@ -3695,6 +3720,35 @@ } /** + * Node class that matches a Unicode line ending '\R' + */ + static final class LineEnding extends Node { + boolean match(Matcher matcher, int i, CharSequence seq) { + // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029]) + if (i < matcher.to) { + int ch = seq.charAt(i); + if (ch == 0x0A || ch == 0x0B || ch == 0x0C || + ch == 0x85 || ch == 0x2028 || ch == 0x2029) + return next.match(matcher, i + 1, seq); + if (ch == 0x0D) { + i++; + if (i < matcher.to && seq.charAt(i) == 0x0A) + i++; + return next.match(matcher, i, seq); + } + } else { + matcher.hitEnd = true; + } + return false; + } + boolean study(TreeInfo info) { + info.minLength++; + info.maxLength += 2; + return next.study(info); + } + } + + /** * Abstract node class to match one character satisfying some * boolean property. */ @@ -3789,7 +3843,6 @@ } } - /** * Node class that matches a Unicode block. */ @@ -3838,7 +3891,6 @@ } } - /** * Node class that matches a POSIX type. 
*/ @@ -3851,6 +3903,28 @@ } /** + * Node class that matches a Perl vertical whitespace + */ + static final class VertWS extends BmpCharProperty { + boolean isSatisfiedBy(int cp) { + return (cp >= 0x0A && cp <= 0x0D) || + cp == 0x85 || cp == 0x2028 || cp == 0x2029; + } + } + + /** + * Node class that matches a Perl horizontal whitespace + */ + static final class HorizWS extends BmpCharProperty { + boolean isSatisfiedBy(int cp) { + return cp == 0x09 || cp == 0x20 || cp == 0xa0 || + cp == 0x1680 || cp == 0x180e || + cp >= 0x2000 && cp <= 0x200a || + cp == 0x202f || cp == 0x205f || cp == 0x3000; + } + } + + /** * Base class for all Slice nodes */ static class SliceNode extends Node {
--- a/test/java/util/regex/RegExTest.java Tue May 08 02:59:10 2012 -0400 +++ b/test/java/util/regex/RegExTest.java Tue May 08 10:57:13 2012 -0700 @@ -33,7 +33,7 @@ * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066 - * 7067045 + * 7067045 7014640 */ import java.util.regex.*; @@ -141,6 +141,8 @@ unicodePropertiesTest(); unicodeHexNotationTest(); unicodeClassesTest(); + horizontalAndVerticalWSTest(); + linebreakTest(); if (failure) { throw new RuntimeException("RegExTest failed, 1st failure: " + @@ -857,13 +859,18 @@ // in replacement string try { "\uac00".replaceAll("\uac00", "$"); + failCount++; + } catch (IllegalArgumentException iie) { + } catch (Exception e) { + failCount++; + } + try { "\uac00".replaceAll("\uac00", "\\"); failCount++; } catch (IllegalArgumentException iie) { } catch (Exception e) { failCount++; } - report("Literal replacement"); } @@ -3838,4 +3845,77 @@ failCount++; report("unicodePredefinedClasses"); } + + private static void horizontalAndVerticalWSTest() throws Exception { + String hws = new String (new char[] { + 0x09, 0x20, 0xa0, 0x1680, 0x180e, + 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, + 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, + 0x202f, 0x205f, 0x3000 }); + String vws = new String (new char[] { + 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 }); + if (!Pattern.compile("\\h+").matcher(hws).matches() || + !Pattern.compile("[\\h]+").matcher(hws).matches()) + failCount++; + if (Pattern.compile("\\H").matcher(hws).find() || + Pattern.compile("[\\H]").matcher(hws).find()) + failCount++; + if (!Pattern.compile("\\v+").matcher(vws).matches() || + !Pattern.compile("[\\v]+").matcher(vws).matches()) + failCount++; + if (Pattern.compile("\\V").matcher(vws).find() || + Pattern.compile("[\\V]").matcher(vws).find()) + failCount++; + String prefix = "abcd"; + String suffix = 
"efgh"; + String ng = "A"; + for (int i = 0; i < hws.length(); i++) { + String c = String.valueOf(hws.charAt(i)); + Matcher m = Pattern.compile("\\h").matcher(prefix + c + suffix); + if (!m.find() || !c.equals(m.group())) + failCount++; + m = Pattern.compile("[\\h]").matcher(prefix + c + suffix); + if (!m.find() || !c.equals(m.group())) + failCount++; + + m = Pattern.compile("\\H").matcher(hws.substring(0, i) + ng + hws.substring(i)); + if (!m.find() || !ng.equals(m.group())) + failCount++; + m = Pattern.compile("[\\H]").matcher(hws.substring(0, i) + ng + hws.substring(i)); + if (!m.find() || !ng.equals(m.group())) + failCount++; + } + for (int i = 0; i < vws.length(); i++) { + String c = String.valueOf(vws.charAt(i)); + Matcher m = Pattern.compile("\\v").matcher(prefix + c + suffix); + if (!m.find() || !c.equals(m.group())) + failCount++; + m = Pattern.compile("[\\v]").matcher(prefix + c + suffix); + if (!m.find() || !c.equals(m.group())) + failCount++; + + m = Pattern.compile("\\V").matcher(vws.substring(0, i) + ng + vws.substring(i)); + if (!m.find() || !ng.equals(m.group())) + failCount++; + m = Pattern.compile("[\\V]").matcher(vws.substring(0, i) + ng + vws.substring(i)); + if (!m.find() || !ng.equals(m.group())) + failCount++; + } + // \v in range is interpreted as 0x0B. This is the undocumented behavior + if (!Pattern.compile("[\\v-\\v]").matcher(String.valueOf((char)0x0B)).matches()) + failCount++; + report("horizontalAndVerticalWSTest"); + } + + private static void linebreakTest() throws Exception { + String linebreaks = new String (new char[] { + 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 }); + String crnl = "\r\n"; + if (!Pattern.compile("\\R+").matcher(linebreaks).matches() || + !Pattern.compile("\\R").matcher(crnl).matches() || + Pattern.compile("\\R\\R").matcher(crnl).matches()) + failCount++; + report("linebreakTest"); + } + }