changeset 5564:af209a223b6b

7014640: To add a metachar \R for line ending and character classes for vertical/horizontal ws \v \V \h \H Summary: added proposed constructs Reviewed-by: alanb
author sherman
date Tue, 08 May 2012 10:57:13 -0700
parents 48513d156965
children 1ece20885be4
files src/share/classes/java/util/regex/Pattern.java test/java/util/regex/RegExTest.java
diffstat 2 files changed, 189 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/classes/java/util/regex/Pattern.java	Tue May 08 02:59:10 2012 -0400
+++ b/src/share/classes/java/util/regex/Pattern.java	Tue May 08 10:57:13 2012 -0700
@@ -152,15 +152,24 @@
  *     <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
  * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
  *     <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\h</tt></td>
+ *     <td headers="matches">A horizontal whitespace character:
+ *     <tt>[ \t\xA0&#92;u1680&#92;u180e&#92;u2000-&#92;u200a&#92;u202f&#92;u205f&#92;u3000]</tt></td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\H</tt></td>
+ *     <td headers="matches">A non-horizontal whitespace character: <tt>[^\h]</tt></td></tr>
  * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
  *     <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
  * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
  *     <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\v</tt></td>
+ *     <td headers="matches">A vertical whitespace character: <tt>[\n\x0B\f\r\x85&#92;u2028&#92;u2029]</tt>
+ *     </td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\V</tt></td>
+ *     <td headers="matches">A non-vertical whitespace character: <tt>[^\v]</tt></td></tr>
  * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
  *     <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
  * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
  *     <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
- *
  * <tr><th>&nbsp;</th></tr>
  * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
  *
@@ -244,6 +253,13 @@
  *     <td headers="matches">The end of the input</td></tr>
  *
  * <tr><th>&nbsp;</th></tr>
+ * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
+ * <tr><td valign="top" headers="construct lineending"><tt>\R</tt></td>
+ *     <td headers="matches">Any Unicode linebreak sequence; equivalent to
+ *     <tt>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
+ *     </tt></td></tr>
+ *
+ * <tr><th>&nbsp;</th></tr>
  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
  *
  * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
@@ -599,11 +615,9 @@
  *   <li> Noncharacter_Code_Point
  *   <li> Assigned
  * </ul>
-
-
  * <p>
- * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
- * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
+ * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b>
+ * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
  * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression
  * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
  * <p>
@@ -668,12 +682,6 @@
  *
  * <ul>
  *    <li><p> Predefined character classes (Unicode character)
- *    <p><tt>\h&nbsp;&nbsp;&nbsp;&nbsp;</tt>A horizontal whitespace
- *    <p><tt>\H&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non horizontal whitespace
- *    <p><tt>\v&nbsp;&nbsp;&nbsp;&nbsp;</tt>A vertical whitespace
- *    <p><tt>\V&nbsp;&nbsp;&nbsp;&nbsp;</tt>A non vertical whitespace
- *    <p><tt>\R&nbsp;&nbsp;&nbsp;&nbsp;</tt>Any Unicode linebreak sequence
- *    <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt>
  *    <p><tt>\X&nbsp;&nbsp;&nbsp;&nbsp;</tt>Match Unicode
  *    <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
  *    <i>extended grapheme cluster</i></a>
@@ -2178,7 +2186,7 @@
                 }
                 unread();
                 prev = cursor;
-                ch = escape(false, first == 0);
+                ch = escape(false, first == 0, false);
                 if (ch >= 0) {
                     append(ch, first);
                     first++;
@@ -2276,7 +2284,7 @@
      * If the returned value is greater than zero, it is the value that
      * matches the escape sequence.
      */
-    private int escape(boolean inclass, boolean create) {
+    private int escape(boolean inclass, boolean create, boolean isrange) {
         int ch = skip();
         switch (ch) {
         case '0':
@@ -2318,6 +2326,8 @@
             if (create) root = new LastMatch();
             return -1;
         case 'H':
+            if (create) root = new HorizWS().complement();
+            return -1;
         case 'I':
         case 'J':
         case 'K':
@@ -2327,8 +2337,11 @@
         case 'O':
         case 'P':
         case 'Q':
+            break;
         case 'R':
-            break;
+            if (inclass) break;
+            if (create) root = new LineEnding();
+            return -1;
         case 'S':
             if (create) root = has(UNICODE_CHARACTER_CLASS)
                                ? new Utype(UnicodeProp.WHITE_SPACE).complement()
@@ -2336,8 +2349,10 @@
             return -1;
         case 'T':
         case 'U':
+            break;
         case 'V':
-            break;
+            if (create) root = new VertWS().complement();
+            return -1;
         case 'W':
             if (create) root = has(UNICODE_CHARACTER_CLASS)
                                ? new Utype(UnicodeProp.WORD).complement()
@@ -2373,7 +2388,10 @@
         case 'f':
             return '\f';
         case 'g':
+            break;
         case 'h':
+            if (create) root = new HorizWS();
+            return -1;
         case 'i':
         case 'j':
             break;
@@ -2413,7 +2431,18 @@
         case 'u':
             return u();
         case 'v':
-            return '\013';
+            // '\v' was implemented as VT/0x0B in releases < 1.8 (though
+            // undocumented). In JDK8 '\v' is specified as a predefined
+            // character class for all vertical whitespace characters.
+            // So [-1, root=VertWS node] pair is returned (instead of a
+            // single 0x0B). This breaks the range if '\v' is used as
+            // the start or end value, such as [\v-...] or [...-\v], in
+            // which a single definite value (0x0B) is expected. For
+            // compatibility reasons '\013'/0x0B is returned if isrange.
+            if (isrange)
+                return '\013';
+            if (create) root = new VertWS();
+            return -1;
         case 'w':
             if (create) root = has(UNICODE_CHARACTER_CLASS)
                                ? new Utype(UnicodeProp.WORD)
@@ -2590,13 +2619,14 @@
                     oneLetter = false;
                 return family(oneLetter, comp);
             } else { // ordinary escape
+                boolean isrange = temp[cursor+1] == '-';
                 unread();
-                ch = escape(true, true);
+                ch = escape(true, true, isrange);
                 if (ch == -1)
                     return (CharProperty) root;
             }
         } else {
-            ch = single();
+            next();
         }
         if (ch >= 0) {
             if (peek() == '-') {
@@ -2606,9 +2636,15 @@
                 }
                 if (endRange != ']') {
                     next();
-                    int m = single();
-                    if (m < ch)
+                    int m = peek();
+                    if (m == '\\') {
+                        m = escape(true, false, true);
+                    } else {
+                        next();
+                    }
+                    if (m < ch) {
                         throw error("Illegal character range");
+                    }
                     if (has(CASE_INSENSITIVE))
                         return caseInsensitiveRangeFor(ch, m);
                     else
@@ -2620,17 +2656,6 @@
         throw error("Unexpected character '"+((char)ch)+"'");
     }
 
-    private int single() {
-        int ch = peek();
-        switch (ch) {
-        case '\\':
-            return escape(true, false);
-        default:
-            next();
-            return ch;
-        }
-    }
-
     /**
      * Parses a Unicode character family and returns its representative node.
      */
@@ -3695,6 +3720,35 @@
     }
 
     /**
+     * Node class that matches a Unicode line ending '\R'
+     */
+    static final class LineEnding extends Node {   // consumes one line ending: CR LF as a unit, or one single char
+        boolean match(Matcher matcher, int i, CharSequence seq) {
+            // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029])
+            if (i < matcher.to) {
+                int ch = seq.charAt(i);
+                if (ch == 0x0A || ch == 0x0B || ch == 0x0C ||
+                    ch == 0x85 || ch == 0x2028 || ch == 0x2029)
+                    return next.match(matcher, i + 1, seq);   // single-char line ending (everything except CR)
+                if (ch == 0x0D) {   // CR: may be the first half of CR LF
+                    i++;
+                    if (i < matcher.to && seq.charAt(i) == 0x0A)
+                        i++;   // swallow the LF too, so CR LF counts as one line ending
+                    return next.match(matcher, i, seq);   // NOTE(review): no retry with CR alone if next fails after CR LF
+                }
+            } else {
+                matcher.hitEnd = true;   // nothing left to read before matcher.to
+            }
+            return false;   // char at i is not a line ending, or no input was left
+        }
+        boolean study(TreeInfo info) {
+            info.minLength++;       // consumes at least one char ...
+            info.maxLength += 2;    // ... and at most two (CR LF)
+            return next.study(info);
+        }
+    }
+
+    /**
      * Abstract node class to match one character satisfying some
      * boolean property.
      */
@@ -3789,7 +3843,6 @@
         }
     }
 
-
     /**
      * Node class that matches a Unicode block.
      */
@@ -3838,7 +3891,6 @@
         }
     }
 
-
     /**
      * Node class that matches a POSIX type.
      */
@@ -3851,6 +3903,28 @@
     }
 
     /**
+     * Node class that matches a Perl vertical whitespace
+     */
+    static final class VertWS extends BmpCharProperty {   // property behind \v (and, complemented, \V)
+        boolean isSatisfiedBy(int cp) {
+            return (cp >= 0x0A && cp <= 0x0D) ||   // LF, VT, FF, CR
+                   cp == 0x85 || cp == 0x2028 || cp == 0x2029;   // NEL, LINE SEPARATOR, PARAGRAPH SEPARATOR
+        }
+    }
+
+    /**
+     * Node class that matches a Perl horizontal whitespace
+     */
+    static final class HorizWS extends BmpCharProperty {   // property behind \h (and, complemented, \H)
+        boolean isSatisfiedBy(int cp) {
+            return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||   // TAB, SPACE, NO-BREAK SPACE
+                   cp == 0x1680 || cp == 0x180e ||             // OGHAM SPACE MARK, MONGOLIAN VOWEL SEPARATOR
+                   cp >= 0x2000 && cp <= 0x200a ||             // EN QUAD .. HAIR SPACE (&& binds tighter than ||)
+                   cp == 0x202f || cp == 0x205f || cp == 0x3000;   // NARROW NBSP, MEDIUM MATHEMATICAL SPACE, IDEOGRAPHIC SPACE
+        }
+    }
+
+    /**
      * Base class for all Slice nodes
      */
     static class SliceNode extends Node {
--- a/test/java/util/regex/RegExTest.java	Tue May 08 02:59:10 2012 -0400
+++ b/test/java/util/regex/RegExTest.java	Tue May 08 10:57:13 2012 -0700
@@ -33,7 +33,7 @@
  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
  * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
- * 7067045
+ * 7067045 7014640
  */
 
 import java.util.regex.*;
@@ -141,6 +141,8 @@
         unicodePropertiesTest();
         unicodeHexNotationTest();
         unicodeClassesTest();
+        horizontalAndVerticalWSTest();
+        linebreakTest();
         if (failure) {
             throw new
                 RuntimeException("RegExTest failed, 1st failure: " +
@@ -857,13 +859,18 @@
         // in replacement string
         try {
             "\uac00".replaceAll("\uac00", "$");
+            failCount++;
+        } catch (IllegalArgumentException iie) {
+        } catch (Exception e) {
+            failCount++;
+        }
+        try {
             "\uac00".replaceAll("\uac00", "\\");
             failCount++;
         } catch (IllegalArgumentException iie) {
         } catch (Exception e) {
             failCount++;
         }
-
         report("Literal replacement");
     }
 
@@ -3838,4 +3845,77 @@
             failCount++;
         report("unicodePredefinedClasses");
     }
+
+    private static void horizontalAndVerticalWSTest() throws Exception {   // checks \h \H \v \V, bare and inside [...]
+        String hws = new String (new char[] {   // every char the \h class is specified to accept
+                                     0x09, 0x20, 0xa0, 0x1680, 0x180e,
+                                     0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
+                                     0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+                                     0x202f, 0x205f, 0x3000 });
+        String vws = new String (new char[] {   // every char the \v class is specified to accept
+                                     0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 });
+        if (!Pattern.compile("\\h+").matcher(hws).matches() ||   // whole string must be horizontal ws
+            !Pattern.compile("[\\h]+").matcher(hws).matches())
+            failCount++;
+        if (Pattern.compile("\\H").matcher(hws).find() ||        // ... and contain nothing that is not
+            Pattern.compile("[\\H]").matcher(hws).find())
+            failCount++;
+        if (!Pattern.compile("\\v+").matcher(vws).matches() ||   // same two checks for vertical ws
+            !Pattern.compile("[\\v]+").matcher(vws).matches())
+            failCount++;
+        if (Pattern.compile("\\V").matcher(vws).find() ||
+            Pattern.compile("[\\V]").matcher(vws).find())
+            failCount++;
+        String prefix = "abcd";   // non-whitespace text placed before the probed char
+        String suffix = "efgh";   // non-whitespace text placed after the probed char
+        String ng = "A";          // a char that is neither horizontal nor vertical ws
+        for (int i = 0; i < hws.length(); i++) {
+            String c = String.valueOf(hws.charAt(i));
+            Matcher m = Pattern.compile("\\h").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))   // first \h hit must be exactly the embedded char
+                failCount++;
+            m = Pattern.compile("[\\h]").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))
+                failCount++;
+            // insert ng at position i of the all-ws string; it must be what \H finds first
+            m = Pattern.compile("\\H").matcher(hws.substring(0, i) + ng + hws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+            m = Pattern.compile("[\\H]").matcher(hws.substring(0, i) + ng + hws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+        }
+        for (int i = 0; i < vws.length(); i++) {
+            String c = String.valueOf(vws.charAt(i));
+            Matcher m = Pattern.compile("\\v").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))   // first \v hit must be exactly the embedded char
+                failCount++;
+            m = Pattern.compile("[\\v]").matcher(prefix + c + suffix);
+            if (!m.find() || !c.equals(m.group()))
+                failCount++;
+            // insert ng at position i of the all-ws string; it must be what \V finds first
+            m = Pattern.compile("\\V").matcher(vws.substring(0, i) + ng + vws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+            m = Pattern.compile("[\\V]").matcher(vws.substring(0, i) + ng + vws.substring(i));
+            if (!m.find() || !ng.equals(m.group()))
+                failCount++;
+        }
+        // \v in range is interpreted as 0x0B. This is the undocumented behavior
+        if (!Pattern.compile("[\\v-\\v]").matcher(String.valueOf((char)0x0B)).matches())
+            failCount++;
+        report("horizontalAndVerticalWSTest");
+    }
+
+    private static void linebreakTest() throws Exception {   // checks the \R linebreak matcher
+        String linebreaks = new String (new char[] {   // the single-char line endings accepted by \R
+            0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 });
+        String crnl = "\r\n";   // the one two-char line ending
+        if (!Pattern.compile("\\R+").matcher(linebreaks).matches() ||   // each single char is a linebreak
+            !Pattern.compile("\\R").matcher(crnl).matches() ||          // CR LF is matched by one \R
+            Pattern.compile("\\R\\R").matcher(crnl).matches())          // ... and is expected not to split into two
+            failCount++;
+        report("linebreakTest");
+    }
+
 }