changeset 4310:94d02b3c5ac4

7039066: j.u.rgex does not match TR18 RL1.4 Simple Word Boundaries and RL1.2 Properties Summary: updated the regex Unicode property support Reviewed-by: alanb
author sherman
date Thu, 28 Apr 2011 20:48:36 -0700
parents 775b77e74bec
children 0b1354ecf5a3
files src/share/classes/java/util/regex/Pattern.java src/share/classes/java/util/regex/UnicodeProp.java test/java/util/regex/POSIX_ASCII.java test/java/util/regex/POSIX_Unicode.java test/java/util/regex/RegExTest.java
diffstat 5 files changed, 981 insertions(+), 53 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/classes/java/util/regex/Pattern.java	Thu Apr 28 20:18:57 2011 -0700
+++ b/src/share/classes/java/util/regex/Pattern.java	Thu Apr 28 20:48:36 2011 -0700
@@ -206,13 +206,15 @@
  *     <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
  *
  * <tr><th>&nbsp;</th></tr>
- * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
+ * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
  * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
- *     <td headers="matches">A Latin&nbsp;script character (simple <a href="#ubc">script</a>)</td></tr>
+ *     <td headers="matches">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>
  * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
- *     <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr>
+ *     <td headers="matches">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>
  * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
- *     <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
+ *     <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
+ * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
+ *     <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
  * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
  *     <td headers="matches">A currency symbol</td></tr>
  * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
@@ -328,10 +330,11 @@
  *     <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
  * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
  *     <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
- * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux)&nbsp;</tt></td>
+ * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU)&nbsp;</tt></td>
  *     <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
- * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
+ * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
+ * on - off</td></tr>
  * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>&nbsp;&nbsp;</td>
  *     <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
@@ -518,61 +521,140 @@
  *
  * <p> This class is in conformance with Level 1 of <a
  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
- * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
+ * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
  * Canonical Equivalents.
- *
- * <p> Unicode escape sequences such as <tt>&#92;u2014</tt> in Java source code
+ * <p>
+ * <b>Unicode escape sequences</b> such as <tt>&#92;u2014</tt> in Java source code
  * are processed as described in section 3.3 of
  * <cite>The Java&trade; Language Specification</cite>.
- * Such escape sequences are also
- * implemented directly by the regular-expression parser so that Unicode
- * escapes can be used in expressions that are read from files or from the
- * keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and <tt>"\\u2014"</tt>,
- * while not equal, compile into the same pattern, which matches the character
- * with hexadecimal value <tt>0x2014</tt>.
- *
- * <p> A Unicode character can also be represented in a regular-expression by
- * using its hexadecimal code point value directly as described in construct
+ * Such escape sequences are also implemented directly by the regular-expression
+ * parser so that Unicode escapes can be used in expressions that are read from
+ * files or from the keyboard.  Thus the strings <tt>"&#92;u2014"</tt> and
+ * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
+ * matches the character with hexadecimal value <tt>0x2014</tt>.
+ * <p>
+ * A Unicode character can also be represented in a regular-expression by
+ * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
  * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F
  * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive
  * Unicode escape sequences of the surrogate pair
  * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
- *
- * <a name="ubc">
- * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
- * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
+ * <p>
+ * Unicode scripts, blocks, categories and binary properties are written with
+ * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
+ * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
  * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
  * does not match if the input has that property.
  * <p>
- * Scripts are specified either with the prefix {@code Is}, as in
+ * Scripts, blocks, categories and binary properties can be used both inside
+ * and outside of a character class.
+ * <a name="usc">
+ * <p>
+ * <b>Scripts</b> are specified either with the prefix {@code Is}, as in
  * {@code IsHiragana}, or by using  the {@code script} keyword (or its short
  * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
  * <p>
- * Blocks are specified with the prefix {@code In}, as in
+ * The script names supported by <code>Pattern</code> are the valid script names
+ * accepted and defined by
+ * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
+ * <a name="ubc">
+ * <p>
+ * <b>Blocks</b> are specified with the prefix {@code In}, as in
  * {@code InMongolian}, or by using the keyword {@code block} (or its short
  * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
  * <p>
- * Categories may be specified with the optional prefix {@code Is}:
+ * The block names supported by <code>Pattern</code> are the valid block names
+ * accepted and defined by
+ * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
+ * <p>
+ * <a name="ucc">
+ * <b>Categories</b> may be specified with the optional prefix {@code Is}:
  * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
  * letters. Same as scripts and blocks, categories can also be specified
  * by using the keyword {@code general_category} (or its short form
  * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
  * <p>
- * Scripts, blocks and categories can be used both inside and outside of a
- * character class.
- * <p> The supported categories are those of
+ * The supported categories are those of
  * <a href="http://www.unicode.org/unicode/standard/standard.html">
  * <i>The Unicode Standard</i></a> in the version specified by the
  * {@link java.lang.Character Character} class. The category names are those
  * defined in the Standard, both normative and informative.
- * The script names supported by <code>Pattern</code> are the valid script names
- * accepted and defined by
- * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
- * The block names supported by <code>Pattern</code> are the valid block names
- * accepted and defined by
- * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
+ * <p>
+ * <a name="ubpc">
+ * <b>Binary properties</b> are specified with the prefix {@code Is}, as in
+ * {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
+ * are
+ * <ul>
+ *   <li> Alphabetic
+ *   <li> Ideographic
+ *   <li> Letter
+ *   <li> Lowercase
+ *   <li> Uppercase
+ *   <li> Titlecase
+ *   <li> Punctuation
+ *   <li> Control
+ *   <li> White_Space
+ *   <li> Digit
+ *   <li> Hex_Digit
+ *   <li> Noncharacter_Code_Point
+ *   <li> Assigned
+ * </ul>
+
+
+ * <p>
+ * <b>Predefined character classes</b> and <b>POSIX character classes</b> are in
+ * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
+ * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
+ * </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified.
  * <p>
- * <a name="jcc"> <p>Categories that behave like the java.lang.Character
+ * <table border="0" cellpadding="1" cellspacing="0"
+ *  summary="predefined and posix character classes in Unicode mode">
+ * <tr align="left">
+ * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
+ * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
+ *</tr>
+ * <tr><td><tt>\p{Lower}</tt></td>
+ *     <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
+ * <tr><td><tt>\p{Upper}</tt></td>
+ *     <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
+ * <tr><td><tt>\p{ASCII}</tt></td>
+ *     <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
+ * <tr><td><tt>\p{Alpha}</tt></td>
+ *     <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
+ * <tr><td><tt>\p{Digit}</tt></td>
+ *     <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
+ * <tr><td><tt>\p{Alnum}</tt></td>
+ *     <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
+ * <tr><td><tt>\p{Punct}</tt></td>
+ *     <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
+ * <tr><td><tt>\p{Graph}</tt></td>
+ *     <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
+ * <tr><td><tt>\p{Print}</tt></td>
+ *     <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
+ * <tr><td><tt>\p{Blank}</tt></td>
+ *     <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
+ * <tr><td><tt>\p{Cntrl}</tt></td>
+ *     <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
+ * <tr><td><tt>\p{XDigit}</tt></td>
+ *     <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
+ * <tr><td><tt>\p{Space}</tt></td>
+ *     <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
+ * <tr><td><tt>\d</tt></td>
+ *     <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
+ * <tr><td><tt>\D</tt></td>
+ *     <td>A non-digit: <tt>[^\d]</tt></td></tr>
+ * <tr><td><tt>\s</tt></td>
+ *     <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
+ * <tr><td><tt>\S</tt></td>
+ *     <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
+ * <tr><td><tt>\w</tt></td>
+ *     <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
+ * <tr><td><tt>\W</tt></td>
+ *     <td>A non-word character: <tt>[^\w]</tt></td></tr>
+ * </table>
+ * <p>
+ * <a name="jcc">
+ * Categories that behave like the java.lang.Character
  * boolean is<i>methodname</i> methods (except for the deprecated ones) are
  * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
  * the specified property has the name <tt>java<i>methodname</i></tt>.
@@ -796,6 +878,28 @@
      */
     public static final int CANON_EQ = 0x80;
 
+    /**
+     * Enables the Unicode version of <i>Predefined character classes</i> and
+     * <i>POSIX character classes</i>.
+     *
+     * <p> When this flag is specified then the (US-ASCII only)
+     * <i>Predefined character classes</i> and <i>POSIX character classes</i>
+     * are in conformance with
+     * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
+     * Standard #18: Unicode Regular Expressions</i></a>
+     * <i>Annex C: Compatibility Properties</i>.
+     * <p>
+     * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
+     * flag expression&nbsp;<tt>(?U)</tt>.
+     * <p>
+     * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
+     * folding.
+     * <p>
+     * Specifying this flag may impose a performance penalty.  </p>
+     * @since 1.7
+     */
+    public static final int UNICODE_CHARACTER_CLASS = 0x100;
+
     /* Pattern has only two serialized components: The pattern string
      * and the flags, which are all that is needed to recompile the pattern
      * when it is deserialized.
@@ -918,7 +1022,8 @@
      *         Match flags, a bit mask that may include
      *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
      *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
-     *         {@link #LITERAL} and {@link #COMMENTS}
+     *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
+     *         and {@link #COMMENTS}
      *
      * @throws  IllegalArgumentException
      *          If bit values other than those corresponding to the defined
@@ -1209,6 +1314,10 @@
         pattern = p;
         flags = f;
 
+        // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
+        if ((flags & UNICODE_CHARACTER_CLASS) != 0)
+            flags |= UNICODE_CASE;
+
         // Reset group index count
         capturingGroupCount = 1;
         localCount = 0;
@@ -2164,12 +2273,14 @@
             return -1;
         case 'B':
             if (inclass) break;
-            if (create) root = new Bound(Bound.NONE);
+            if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
             return -1;
         case 'C':
             break;
         case 'D':
-            if (create) root = new Ctype(ASCII.DIGIT).complement();
+            if (create) root = has(UNICODE_CHARACTER_CLASS)
+                               ? new Utype(UnicodeProp.DIGIT).complement()
+                               : new Ctype(ASCII.DIGIT).complement();
             return -1;
         case 'E':
         case 'F':
@@ -2191,14 +2302,18 @@
         case 'R':
             break;
         case 'S':
-            if (create) root = new Ctype(ASCII.SPACE).complement();
+            if (create) root = has(UNICODE_CHARACTER_CLASS)
+                               ? new Utype(UnicodeProp.WHITE_SPACE).complement()
+                               : new Ctype(ASCII.SPACE).complement();
             return -1;
         case 'T':
         case 'U':
         case 'V':
             break;
         case 'W':
-            if (create) root = new Ctype(ASCII.WORD).complement();
+            if (create) root = has(UNICODE_CHARACTER_CLASS)
+                               ? new Utype(UnicodeProp.WORD).complement()
+                               : new Ctype(ASCII.WORD).complement();
             return -1;
         case 'X':
         case 'Y':
@@ -2216,12 +2331,14 @@
             return '\007';
         case 'b':
             if (inclass) break;
-            if (create) root = new Bound(Bound.BOTH);
+            if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
             return -1;
         case 'c':
             return c();
         case 'd':
-            if (create) root = new Ctype(ASCII.DIGIT);
+            if (create) root = has(UNICODE_CHARACTER_CLASS)
+                               ? new Utype(UnicodeProp.DIGIT)
+                               : new Ctype(ASCII.DIGIT);
             return -1;
         case 'e':
             return '\033';
@@ -2259,7 +2376,9 @@
         case 'r':
             return '\r';
         case 's':
-            if (create) root = new Ctype(ASCII.SPACE);
+            if (create) root = has(UNICODE_CHARACTER_CLASS)
+                               ? new Utype(UnicodeProp.WHITE_SPACE)
+                               : new Ctype(ASCII.SPACE);
             return -1;
         case 't':
             return '\t';
@@ -2268,7 +2387,9 @@
         case 'v':
             return '\013';
         case 'w':
-            if (create) root = new Ctype(ASCII.WORD);
+            if (create) root = has(UNICODE_CHARACTER_CLASS)
+                               ? new Utype(UnicodeProp.WORD)
+                               : new Ctype(ASCII.WORD);
             return -1;
         case 'x':
             return x();
@@ -2490,7 +2611,7 @@
     {
         next();
         String name;
-        CharProperty node;
+        CharProperty node = null;
 
         if (singleLetter) {
             int c = temp[cursor];
@@ -2536,11 +2657,21 @@
             } else if (name.startsWith("Is")) {
                 // \p{isGeneralCategory} and \p{isScriptName}
                 name = name.substring(2);
-                node = CharPropertyNames.charPropertyFor(name);
+                UnicodeProp uprop = UnicodeProp.forName(name);
+                if (uprop != null)
+                    node = new Utype(uprop);
+                if (node == null)
+                    node = CharPropertyNames.charPropertyFor(name);
                 if (node == null)
                     node = unicodeScriptPropertyFor(name);
             } else {
-                node = charPropertyNodeFor(name);
+                if (has(UNICODE_CHARACTER_CLASS)) {
+                    UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+                    if (uprop != null)
+                        node = new Utype(uprop);
+                }
+                if (node == null)
+                    node = charPropertyNodeFor(name);
             }
         }
         if (maybeComplement) {
@@ -2822,6 +2953,9 @@
             case 'x':
                 flags |= COMMENTS;
                 break;
+            case 'U':
+                flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+                break;
             case '-': // subFlag then fall through
                 ch = next();
                 subFlag();
@@ -2861,6 +2995,8 @@
             case 'x':
                 flags &= ~COMMENTS;
                 break;
+            case 'U':
+                flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
             default:
                 return;
             }
@@ -3664,6 +3800,18 @@
     }
 
     /**
+     * Node class that matches a Unicode "type"
+     */
+    static final class Utype extends CharProperty {
+        final UnicodeProp uprop;
+        Utype(UnicodeProp uprop) { this.uprop = uprop; }
+        boolean isSatisfiedBy(int ch) {
+            return uprop.is(ch);
+        }
+    }
+
+
+    /**
      * Node class that matches a POSIX type.
      */
     static final class Ctype extends BmpCharProperty {
@@ -5025,9 +5173,17 @@
         static int BOTH = 0x3;
         static int NONE = 0x4;
         int type;
-        Bound(int n) {
+        boolean useUWORD;
+        Bound(int n, boolean useUWORD) {
             type = n;
-        }
+            this.useUWORD = useUWORD;
+        }
+
+        boolean isWord(int ch) {
+            return useUWORD ? UnicodeProp.WORD.is(ch)
+                            : (ch == '_' || Character.isLetterOrDigit(ch));
+        }
+
         int check(Matcher matcher, int i, CharSequence seq) {
             int ch;
             boolean left = false;
@@ -5039,14 +5195,14 @@
             }
             if (i > startIndex) {
                 ch = Character.codePointBefore(seq, i);
-                left = (ch == '_' || Character.isLetterOrDigit(ch) ||
+                left = (isWord(ch) ||
                     ((Character.getType(ch) == Character.NON_SPACING_MARK)
                      && hasBaseCharacter(matcher, i-1, seq)));
             }
             boolean right = false;
             if (i < endIndex) {
                 ch = Character.codePointAt(seq, i);
-                right = (ch == '_' || Character.isLetterOrDigit(ch) ||
+                right = (isWord(ch) ||
                     ((Character.getType(ch) == Character.NON_SPACING_MARK)
                      && hasBaseCharacter(matcher, i, seq)));
             } else {
@@ -5428,6 +5584,12 @@
             defClone("javaUpperCase", new CloneableProperty() {
                 boolean isSatisfiedBy(int ch) {
                     return Character.isUpperCase(ch);}});
+            defClone("javaAlphabetic", new CloneableProperty() {
+                boolean isSatisfiedBy(int ch) {
+                    return Character.isAlphabetic(ch);}});
+            defClone("javaIdeographic", new CloneableProperty() {
+                boolean isSatisfiedBy(int ch) {
+                    return Character.isIdeographic(ch);}});
             defClone("javaTitleCase", new CloneableProperty() {
                 boolean isSatisfiedBy(int ch) {
                     return Character.isTitleCase(ch);}});
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/classes/java/util/regex/UnicodeProp.java	Thu Apr 28 20:48:36 2011 -0700
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package java.util.regex;
+
+import java.util.HashMap;
+import java.util.Locale;
+
+enum UnicodeProp {
+
+    ALPHABETIC {
+        public boolean is(int ch) {
+            return Character.isAlphabetic(ch);
+        }
+    },
+
+    LETTER {
+        public boolean is(int ch) {
+            return Character.isLetter(ch);
+        }
+    },
+
+    IDEOGRAPHIC {
+        public boolean is(int ch) {
+            return Character.isIdeographic(ch);
+        }
+    },
+
+    LOWERCASE {
+        public boolean is(int ch) {
+            return Character.isLowerCase(ch);
+        }
+    },
+
+    UPPERCASE {
+        public boolean is(int ch) {
+            return Character.isUpperCase(ch);
+        }
+    },
+
+    TITLECASE {
+        public boolean is(int ch) {
+            return Character.isTitleCase(ch);
+        }
+    },
+
+    WHITE_SPACE {
+        // \p{Whitespace}
+        public boolean is(int ch) {
+            return ((((1 << Character.SPACE_SEPARATOR) |
+                      (1 << Character.LINE_SEPARATOR) |
+                      (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
+                   != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
+        }
+    },
+
+    CONTROL {
+        // \p{gc=Control}
+        public boolean is(int ch) {
+            return Character.getType(ch) == Character.CONTROL;
+        }
+    },
+
+    PUNCTUATION {
+        // \p{gc=Punctuation}
+        public boolean is(int ch) {
+            return ((((1 << Character.CONNECTOR_PUNCTUATION) |
+                      (1 << Character.DASH_PUNCTUATION) |
+                      (1 << Character.START_PUNCTUATION) |
+                      (1 << Character.END_PUNCTUATION) |
+                      (1 << Character.OTHER_PUNCTUATION) |
+                      (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
+                      (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
+                   != 0;
+        }
+    },
+
+    HEX_DIGIT {
+        // \p{gc=Decimal_Number}
+        // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
+        public boolean is(int ch) {
+            return DIGIT.is(ch) ||
+                   (ch >= 0x0030 && ch <= 0x0039) ||
+                   (ch >= 0x0041 && ch <= 0x0046) ||
+                   (ch >= 0x0061 && ch <= 0x0066) ||
+                   (ch >= 0xFF10 && ch <= 0xFF19) ||
+                   (ch >= 0xFF21 && ch <= 0xFF26) ||
+                   (ch >= 0xFF41 && ch <= 0xFF46);
+        }
+    },
+
+    ASSIGNED {
+        public boolean is(int ch) {
+            return Character.getType(ch) != Character.UNASSIGNED;
+        }
+    },
+
+    NONCHARACTER_CODE_POINT {
+        // PropList.txt:Noncharacter_Code_Point
+        public boolean is(int ch) {
+            return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
+        }
+    },
+
+    DIGIT {
+        // \p{gc=Decimal_Number}
+        public boolean is(int ch) {
+            return Character.isDigit(ch);
+        }
+    },
+
+    ALNUM {
+        // \p{alpha}
+        // \p{digit}
+        public boolean is(int ch) {
+            return ALPHABETIC.is(ch) || DIGIT.is(ch);
+        }
+    },
+
+    BLANK {
+        // \p{Whitespace} --
+        // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
+        //  \p{gc=Line_Separator}
+        //  \p{gc=Paragraph_Separator}]
+        public boolean is(int ch) {
+            return Character.getType(ch) == Character.SPACE_SEPARATOR ||
+                   ch == 0x9; // \N{HT}
+        }
+    },
+
+    GRAPH {
+        // [^
+        //  \p{space}
+        //  \p{gc=Control}
+        //  \p{gc=Surrogate}
+        //  \p{gc=Unassigned}]
+        public boolean is(int ch) {
+            return ((((1 << Character.SPACE_SEPARATOR) |
+                      (1 << Character.LINE_SEPARATOR) |
+                      (1 << Character.PARAGRAPH_SEPARATOR) |
+                      (1 << Character.CONTROL) |
+                      (1 << Character.SURROGATE) |
+                      (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
+                   == 0;
+        }
+    },
+
+    PRINT {
+        // \p{graph}
+        // \p{blank}
+        // -- \p{cntrl}
+        public boolean is(int ch) {
+            return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
+        }
+    },
+
+    WORD {
+        //  \p{alpha}
+        //  \p{gc=Mark}
+        //  \p{digit}
+        //  \p{gc=Connector_Punctuation}
+
+        public boolean is(int ch) {
+            return ALPHABETIC.is(ch) ||
+                   ((((1 << Character.NON_SPACING_MARK) |
+                      (1 << Character.ENCLOSING_MARK) |
+                      (1 << Character.COMBINING_SPACING_MARK) |
+                      (1 << Character.DECIMAL_DIGIT_NUMBER) |
+                      (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
+                   != 0;
+        }
+    };
+
+    private final static HashMap<String, String> posix = new HashMap<>();
+    private final static HashMap<String, String> aliases = new HashMap<>();
+    static {
+        posix.put("ALPHA", "ALPHABETIC");
+        posix.put("LOWER", "LOWERCASE");
+        posix.put("UPPER", "UPPERCASE");
+        posix.put("SPACE", "WHITE_SPACE");
+        posix.put("PUNCT", "PUNCTUATION");
+        posix.put("XDIGIT","HEX_DIGIT");
+        posix.put("ALNUM", "ALNUM");
+        posix.put("CNTRL", "CONTROL");
+        posix.put("DIGIT", "DIGIT");
+        posix.put("BLANK", "BLANK");
+        posix.put("GRAPH", "GRAPH");
+        posix.put("PRINT", "PRINT");
+
+        aliases.put("WHITESPACE", "WHITE_SPACE");
+        aliases.put("HEXDIGIT","HEX_DIGIT");
+        aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
+    }
+
+    public static UnicodeProp forName(String propName) {
+        propName = propName.toUpperCase(Locale.ENGLISH);
+        String alias = aliases.get(propName);
+        if (alias != null)
+            propName = alias;
+        try {
+            return valueOf (propName);
+        } catch (IllegalArgumentException x) {}
+        return null;
+    }
+
+    public static UnicodeProp forPOSIXName(String propName) {
+        propName = posix.get(propName.toUpperCase(Locale.ENGLISH));
+        if (propName == null)
+            return null;
+        return valueOf (propName);
+    }
+
+    public abstract boolean is(int ch);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/java/util/regex/POSIX_ASCII.java	Thu Apr 28 20:48:36 2011 -0700
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+
+// Test-side reference table of the POSIX character classes for 7-bit ASCII.
+final class POSIX_ASCII {
+    static final int UPPER   = 0x00000100;   // 'A'-'Z'
+
+    static final int LOWER   = 0x00000200;   // 'a'-'z'
+
+    static final int DIGIT   = 0x00000400;   // '0'-'9'
+
+    static final int SPACE   = 0x00000800;   // HT, LF, VT, FF, CR and the space character
+
+    static final int PUNCT   = 0x00001000;   // printable characters that are neither alphanumeric nor space
+
+    static final int CNTRL   = 0x00002000;   // 0x00-0x1F and DEL (0x7F)
+
+    static final int BLANK   = 0x00004000;   // HT and the space character
+
+    static final int HEX     = 0x00008000;   // '0'-'9', 'A'-'F', 'a'-'f'
+
+    static final int UNDER   = 0x00010000;   // '_' only
+
+    static final int ASCII   = 0x0000FF00;   // mask of the eight flag bits UPPER..HEX (UNDER lies outside it)
+
+    static final int ALPHA   = (UPPER|LOWER);                   // letters
+
+    static final int ALNUM   = (UPPER|LOWER|DIGIT);             // letters and digits
+
+    static final int GRAPH   = (PUNCT|UPPER|LOWER|DIGIT);       // letters, digits and punctuation
+
+    static final int WORD    = (UPPER|LOWER|UNDER|DIGIT);       // letters, digits and '_'
+
+    static final int XDIGIT  = (HEX);                           // same bit as HEX
+    // Flag word per code point 0x00-0x7F; digit and letter entries also carry that character's digit value (0-35) in the low bits.
+    private static final int[] ctype = new int[] {
+        CNTRL,                  /* 00 (NUL) */
+        CNTRL,                  /* 01 (SOH) */
+        CNTRL,                  /* 02 (STX) */
+        CNTRL,                  /* 03 (ETX) */
+        CNTRL,                  /* 04 (EOT) */
+        CNTRL,                  /* 05 (ENQ) */
+        CNTRL,                  /* 06 (ACK) */
+        CNTRL,                  /* 07 (BEL) */
+        CNTRL,                  /* 08 (BS)  */
+        SPACE+CNTRL+BLANK,      /* 09 (HT)  */
+        SPACE+CNTRL,            /* 0A (LF)  */
+        SPACE+CNTRL,            /* 0B (VT)  */
+        SPACE+CNTRL,            /* 0C (FF)  */
+        SPACE+CNTRL,            /* 0D (CR)  */
+        CNTRL,                  /* 0E (SI)  */
+        CNTRL,                  /* 0F (SO)  */
+        CNTRL,                  /* 10 (DLE) */
+        CNTRL,                  /* 11 (DC1) */
+        CNTRL,                  /* 12 (DC2) */
+        CNTRL,                  /* 13 (DC3) */
+        CNTRL,                  /* 14 (DC4) */
+        CNTRL,                  /* 15 (NAK) */
+        CNTRL,                  /* 16 (SYN) */
+        CNTRL,                  /* 17 (ETB) */
+        CNTRL,                  /* 18 (CAN) */
+        CNTRL,                  /* 19 (EM)  */
+        CNTRL,                  /* 1A (SUB) */
+        CNTRL,                  /* 1B (ESC) */
+        CNTRL,                  /* 1C (FS)  */
+        CNTRL,                  /* 1D (GS)  */
+        CNTRL,                  /* 1E (RS)  */
+        CNTRL,                  /* 1F (US)  */
+        SPACE+BLANK,            /* 20 SPACE */
+        PUNCT,                  /* 21 !     */
+        PUNCT,                  /* 22 "     */
+        PUNCT,                  /* 23 #     */
+        PUNCT,                  /* 24 $     */
+        PUNCT,                  /* 25 %     */
+        PUNCT,                  /* 26 &     */
+        PUNCT,                  /* 27 '     */
+        PUNCT,                  /* 28 (     */
+        PUNCT,                  /* 29 )     */
+        PUNCT,                  /* 2A *     */
+        PUNCT,                  /* 2B +     */
+        PUNCT,                  /* 2C ,     */
+        PUNCT,                  /* 2D -     */
+        PUNCT,                  /* 2E .     */
+        PUNCT,                  /* 2F /     */
+        DIGIT+HEX+0,            /* 30 0     */
+        DIGIT+HEX+1,            /* 31 1     */
+        DIGIT+HEX+2,            /* 32 2     */
+        DIGIT+HEX+3,            /* 33 3     */
+        DIGIT+HEX+4,            /* 34 4     */
+        DIGIT+HEX+5,            /* 35 5     */
+        DIGIT+HEX+6,            /* 36 6     */
+        DIGIT+HEX+7,            /* 37 7     */
+        DIGIT+HEX+8,            /* 38 8     */
+        DIGIT+HEX+9,            /* 39 9     */
+        PUNCT,                  /* 3A :     */
+        PUNCT,                  /* 3B ;     */
+        PUNCT,                  /* 3C <     */
+        PUNCT,                  /* 3D =     */
+        PUNCT,                  /* 3E >     */
+        PUNCT,                  /* 3F ?     */
+        PUNCT,                  /* 40 @     */
+        UPPER+HEX+10,           /* 41 A     */
+        UPPER+HEX+11,           /* 42 B     */
+        UPPER+HEX+12,           /* 43 C     */
+        UPPER+HEX+13,           /* 44 D     */
+        UPPER+HEX+14,           /* 45 E     */
+        UPPER+HEX+15,           /* 46 F     */
+        UPPER+16,               /* 47 G     */
+        UPPER+17,               /* 48 H     */
+        UPPER+18,               /* 49 I     */
+        UPPER+19,               /* 4A J     */
+        UPPER+20,               /* 4B K     */
+        UPPER+21,               /* 4C L     */
+        UPPER+22,               /* 4D M     */
+        UPPER+23,               /* 4E N     */
+        UPPER+24,               /* 4F O     */
+        UPPER+25,               /* 50 P     */
+        UPPER+26,               /* 51 Q     */
+        UPPER+27,               /* 52 R     */
+        UPPER+28,               /* 53 S     */
+        UPPER+29,               /* 54 T     */
+        UPPER+30,               /* 55 U     */
+        UPPER+31,               /* 56 V     */
+        UPPER+32,               /* 57 W     */
+        UPPER+33,               /* 58 X     */
+        UPPER+34,               /* 59 Y     */
+        UPPER+35,               /* 5A Z     */
+        PUNCT,                  /* 5B [     */
+        PUNCT,                  /* 5C \     */
+        PUNCT,                  /* 5D ]     */
+        PUNCT,                  /* 5E ^     */
+        PUNCT|UNDER,            /* 5F _     */
+        PUNCT,                  /* 60 `     */
+        LOWER+HEX+10,           /* 61 a     */
+        LOWER+HEX+11,           /* 62 b     */
+        LOWER+HEX+12,           /* 63 c     */
+        LOWER+HEX+13,           /* 64 d     */
+        LOWER+HEX+14,           /* 65 e     */
+        LOWER+HEX+15,           /* 66 f     */
+        LOWER+16,               /* 67 g     */
+        LOWER+17,               /* 68 h     */
+        LOWER+18,               /* 69 i     */
+        LOWER+19,               /* 6A j     */
+        LOWER+20,               /* 6B k     */
+        LOWER+21,               /* 6C l     */
+        LOWER+22,               /* 6D m     */
+        LOWER+23,               /* 6E n     */
+        LOWER+24,               /* 6F o     */
+        LOWER+25,               /* 70 p     */
+        LOWER+26,               /* 71 q     */
+        LOWER+27,               /* 72 r     */
+        LOWER+28,               /* 73 s     */
+        LOWER+29,               /* 74 t     */
+        LOWER+30,               /* 75 u     */
+        LOWER+31,               /* 76 v     */
+        LOWER+32,               /* 77 w     */
+        LOWER+33,               /* 78 x     */
+        LOWER+34,               /* 79 y     */
+        LOWER+35,               /* 7A z     */
+        PUNCT,                  /* 7B {     */
+        PUNCT,                  /* 7C |     */
+        PUNCT,                  /* 7D }     */
+        PUNCT,                  /* 7E ~     */
+        CNTRL,                  /* 7F (DEL) */
+    };
+    // Returns the table entry for ch, or 0 when ch lies outside 0x00-0x7F (negative values included).
+    static int getType(int ch) {
+        return ((ch & 0xFFFFFF80) == 0 ? ctype[ch] : 0);
+    }
+    // True when the entry for ch has at least one of the bits in type set.
+    static boolean isType(int ch, int type) {
+        return (getType(ch) & type) != 0;
+    }
+    // True when ch is in 0x00-0x7F.
+    static boolean isAscii(int ch) {
+        return ((ch & 0xFFFFFF80) == 0);
+    }
+    // True for 'A'-'Z' and 'a'-'z'.
+    static boolean isAlpha(int ch) {
+        return isType(ch, ALPHA);
+    }
+    // True for '0'-'9': if ch is outside the range one difference is negative, which makes the OR negative.
+    static boolean isDigit(int ch) {
+        return ((ch-'0')|('9'-ch)) >= 0;
+    }
+    // True for ASCII letters and digits.
+    static boolean isAlnum(int ch) {
+        return isType(ch, ALNUM);
+    }
+    // True for ASCII letters, digits and punctuation (the space character is excluded).
+    static boolean isGraph(int ch) {
+        return isType(ch, GRAPH);
+    }
+    // True for 0x20-0x7E, using the same sign-bit range check as isDigit.
+    static boolean isPrint(int ch) {
+        return ((ch-0x20)|(0x7E-ch)) >= 0;
+    }
+    // True for the characters flagged PUNCT in the table.
+    static boolean isPunct(int ch) {
+        return isType(ch, PUNCT);
+    }
+    // True for HT, LF, VT, FF, CR and the space character.
+    static boolean isSpace(int ch) {
+        return isType(ch, SPACE);
+    }
+    // True for '0'-'9', 'A'-'F' and 'a'-'f'.
+    static boolean isHexDigit(int ch) {
+        return isType(ch, HEX);
+    }
+    // True for 0x00-0x1F and DEL.
+    static boolean isCntrl(int ch) {
+        return isType(ch, CNTRL);
+    }
+    // True for 'a'-'z' (sign-bit range check).
+    static boolean isLower(int ch) {
+        return ((ch-'a')|('z'-ch)) >= 0;
+    }
+    // True for 'A'-'Z' (sign-bit range check).
+    static boolean isUpper(int ch) {
+        return ((ch-'A')|('Z'-ch)) >= 0;
+    }
+    // True for ASCII letters, digits and '_'.
+    static boolean isWord(int ch) {
+        return isType(ch, WORD);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/java/util/regex/POSIX_Unicode.java	Thu Apr 28 20:48:36 2011 -0700
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.util.HashMap;
+import java.util.Locale;
+
+// Reference predicates for the POSIX classes in Unicode mode, built on java.lang.Character; RegExTest compares them with the regex engine.
+final public class POSIX_Unicode {
+    public static boolean isAlpha(int ch) {
+        return Character.isAlphabetic(ch);
+    }
+
+    public static boolean isLower(int ch) {
+        return Character.isLowerCase(ch);
+    }
+
+    public static boolean isUpper(int ch) {
+        return Character.isUpperCase(ch);
+    }
+
+    // \p{Whitespace}: gc=Zs, Zl or Zp, plus U+0009..U+000D and U+0085
+    public static boolean isSpace(int ch) {
+        return ((((1 << Character.SPACE_SEPARATOR) |
+                  (1 << Character.LINE_SEPARATOR) |
+                  (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
+                   != 0 ||
+               (ch >= 0x9 && ch <= 0xd) ||
+               (ch == 0x85);
+    }
+
+    // \p{gc=Control}
+    public static boolean isCntrl(int ch) {
+        return Character.getType(ch) == Character.CONTROL;
+    }
+
+    // \p{gc=Punctuation}: bit mask over the seven punctuation general categories
+    public static boolean isPunct(int ch) {
+        return ((((1 << Character.CONNECTOR_PUNCTUATION) |
+                  (1 << Character.DASH_PUNCTUATION) |
+                  (1 << Character.START_PUNCTUATION) |
+                  (1 << Character.END_PUNCTUATION) |
+                  (1 << Character.OTHER_PUNCTUATION) |
+                  (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
+                  (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
+              != 0;
+    }
+
+    // \p{gc=Decimal_Number}
+    // \p{Hex_Digit}    -> PropList.txt: Hex_Digit (0-9, A-F, a-f in ASCII and at U+FF10.., U+FF21.., U+FF41..)
+    public static boolean isHexDigit(int ch) {
+        return Character.isDigit(ch) ||
+               (ch >= 0x0030 && ch <= 0x0039) ||
+               (ch >= 0x0041 && ch <= 0x0046) ||
+               (ch >= 0x0061 && ch <= 0x0066) ||
+               (ch >= 0xFF10 && ch <= 0xFF19) ||
+               (ch >= 0xFF21 && ch <= 0xFF26) ||
+               (ch >= 0xFF41 && ch <= 0xFF46);
+    }
+
+    // \p{gc=Decimal_Number}
+    public static boolean isDigit(int ch) {
+        return Character.isDigit(ch);
+    }
+
+    // \p{alpha}
+    // \p{digit}
+    public static boolean isAlnum(int ch) {
+        return Character.isAlphabetic(ch) || Character.isDigit(ch);
+    }
+
+    // \p{Whitespace} minus
+    // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
+    //  \p{gc=Line_Separator}
+    //  \p{gc=Paragraph_Separator}]
+    public static boolean isBlank(int ch) {
+        int type = Character.getType(ch);
+        return isSpace(ch) &&
+               ch != 0xa && ch != 0xb && ch != 0xc && ch != 0xd && ch != 0x85 &&
+               type != Character.LINE_SEPARATOR &&
+               type != Character.PARAGRAPH_SEPARATOR;
+    }
+
+    // [^
+    //  \p{space}
+    //  \p{gc=Control}
+    //  \p{gc=Surrogate}
+    //  \p{gc=Unassigned}]
+    public static boolean isGraph(int ch) {
+        int type = Character.getType(ch);
+        return !(isSpace(ch) ||
+                 Character.CONTROL == type ||
+                 Character.SURROGATE == type ||
+                 Character.UNASSIGNED == type);
+    }
+
+    // \p{graph}
+    // \p{blank}
+    // minus \p{cntrl}
+    public static boolean isPrint(int ch) {
+        return (isGraph(ch) || isBlank(ch)) && !isCntrl(ch);
+    }
+
+    // PropList.txt:Noncharacter_Code_Point -> low 16 bits are FFFE or FFFF, or ch is in U+FDD0..U+FDEF
+    public static boolean isNoncharacterCodePoint(int ch) {
+        return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
+    }
+
+    //  \p{alpha}
+    //  \p{gc=Mark}
+    //  \p{digit}
+    //  \p{gc=Connector_Punctuation}
+    public static boolean isWord(int ch) {
+        return isAlpha(ch) ||
+               ((((1 << Character.NON_SPACING_MARK) |
+                  (1 << Character.ENCLOSING_MARK) |
+                  (1 << Character.COMBINING_SPACING_MARK) |
+                  (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
+               != 0 ||
+               isDigit(ch);
+    }
+}
--- a/test/java/util/regex/RegExTest.java	Thu Apr 28 20:18:57 2011 -0700
+++ b/test/java/util/regex/RegExTest.java	Thu Apr 28 20:48:36 2011 -0700
@@ -32,7 +32,7 @@
  * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475 6919132 6931676 6948903 7014645
+ * 6350801 6676425 6878475 6919132 6931676 6948903 7014645 7039066
  */
 
 import java.util.regex.*;
@@ -137,6 +137,7 @@
         nonBmpClassComplementTest();
         unicodePropertiesTest();
         unicodeHexNotationTest();
+        unicodeClassesTest();
         if (failure)
             throw new RuntimeException("Failure in the RE handling.");
         else
@@ -3656,5 +3657,146 @@
                  failCount++;
          }
          report("unicodeHexNotation");
-     }
+    }
+
+    // Cross-checks the predefined classes and \b over U+0001..U+2FFFF in ASCII mode, with UNICODE_CHARACTER_CLASS, and with (?U).
+    private static void unicodeClassesTest() throws Exception {
+        Matcher lower  = Pattern.compile("\\p{Lower}").matcher("");
+        Matcher upper  = Pattern.compile("\\p{Upper}").matcher("");
+        Matcher ASCII  = Pattern.compile("\\p{ASCII}").matcher("");
+        Matcher alpha  = Pattern.compile("\\p{Alpha}").matcher("");
+        Matcher digit  = Pattern.compile("\\p{Digit}").matcher("");
+        Matcher alnum  = Pattern.compile("\\p{Alnum}").matcher("");
+        Matcher punct  = Pattern.compile("\\p{Punct}").matcher("");
+        Matcher graph  = Pattern.compile("\\p{Graph}").matcher("");
+        Matcher print  = Pattern.compile("\\p{Print}").matcher("");
+        Matcher blank  = Pattern.compile("\\p{Blank}").matcher("");
+        Matcher cntrl  = Pattern.compile("\\p{Cntrl}").matcher("");
+        Matcher xdigit = Pattern.compile("\\p{XDigit}").matcher("");
+        Matcher space  = Pattern.compile("\\p{Space}").matcher("");
+        Matcher bound  = Pattern.compile("\\b").matcher("");
+        Matcher word   = Pattern.compile("\\w++").matcher("");
+        // UNICODE_CHARACTER_CLASS
+        Matcher lowerU  = Pattern.compile("\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher upperU  = Pattern.compile("\\p{Upper}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher ASCIIU  = Pattern.compile("\\p{ASCII}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher alphaU  = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher digitU  = Pattern.compile("\\p{Digit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher alnumU  = Pattern.compile("\\p{Alnum}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher punctU  = Pattern.compile("\\p{Punct}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher graphU  = Pattern.compile("\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher printU  = Pattern.compile("\\p{Print}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher blankU  = Pattern.compile("\\p{Blank}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher cntrlU  = Pattern.compile("\\p{Cntrl}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher xdigitU = Pattern.compile("\\p{XDigit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher spaceU  = Pattern.compile("\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher boundU  = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher wordU   = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        // embedded flag (?U) only: no compile-time flag, so the inline switch is what enables Unicode mode
+        Matcher lowerEU  = Pattern.compile("(?U)\\p{Lower}").matcher("");
+        Matcher graphEU  = Pattern.compile("(?U)\\p{Graph}").matcher("");
+        Matcher wordEU   = Pattern.compile("(?U)\\w").matcher("");
+
+        Matcher bwb    = Pattern.compile("\\b\\w\\b").matcher("");
+        Matcher bwbU   = Pattern.compile("\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher bwbEU  = Pattern.compile("(?U)\\b\\w++\\b").matcher("");
+        // properties
+        Matcher lowerP  = Pattern.compile("\\p{IsLowerCase}").matcher("");
+        Matcher upperP  = Pattern.compile("\\p{IsUpperCase}").matcher("");
+        Matcher titleP  = Pattern.compile("\\p{IsTitleCase}").matcher("");
+        Matcher letterP = Pattern.compile("\\p{IsLetter}").matcher("");
+        Matcher alphaP  = Pattern.compile("\\p{IsAlphabetic}").matcher("");
+        Matcher ideogP  = Pattern.compile("\\p{IsIdeographic}").matcher("");
+        Matcher cntrlP  = Pattern.compile("\\p{IsControl}").matcher("");
+        Matcher spaceP  = Pattern.compile("\\p{IsWhiteSpace}").matcher("");
+        Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher("");
+        Matcher nonCCPP = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher("");
+
+        // javaMethod
+        Matcher lowerJ  = Pattern.compile("\\p{javaLowerCase}").matcher("");
+        Matcher upperJ  = Pattern.compile("\\p{javaUpperCase}").matcher("");
+        Matcher alphaJ  = Pattern.compile("\\p{javaAlphabetic}").matcher("");
+        Matcher ideogJ  = Pattern.compile("\\p{javaIdeographic}").matcher("");
+
+        for (int cp = 1; cp < 0x30000; cp++) {    // one failure is counted per code point with any mismatch
+            String str = new String(Character.toChars(cp));
+            int type = Character.getType(cp);
+            if (// lower
+                POSIX_ASCII.isLower(cp)   != lower.reset(str).matches()  ||
+                Character.isLowerCase(cp) != lowerU.reset(str).matches() ||
+                Character.isLowerCase(cp) != lowerP.reset(str).matches() ||
+                Character.isLowerCase(cp) != lowerEU.reset(str).matches()||
+                Character.isLowerCase(cp) != lowerJ.reset(str).matches()||
+                // upper
+                POSIX_ASCII.isUpper(cp)   != upper.reset(str).matches()  ||
+                POSIX_Unicode.isUpper(cp) != upperU.reset(str).matches() ||
+                Character.isUpperCase(cp) != upperP.reset(str).matches() ||
+                Character.isUpperCase(cp) != upperJ.reset(str).matches() ||
+                // alpha
+                POSIX_ASCII.isAlpha(cp)   != alpha.reset(str).matches()  ||
+                POSIX_Unicode.isAlpha(cp) != alphaU.reset(str).matches() ||
+                Character.isAlphabetic(cp)!= alphaP.reset(str).matches() ||
+                Character.isAlphabetic(cp)!= alphaJ.reset(str).matches() ||
+                // digit
+                POSIX_ASCII.isDigit(cp)   != digit.reset(str).matches()  ||
+                Character.isDigit(cp)     != digitU.reset(str).matches() ||
+                // alnum
+                POSIX_ASCII.isAlnum(cp)   != alnum.reset(str).matches()  ||
+                POSIX_Unicode.isAlnum(cp) != alnumU.reset(str).matches() ||
+                // punct
+                POSIX_ASCII.isPunct(cp)   != punct.reset(str).matches()  ||
+                POSIX_Unicode.isPunct(cp) != punctU.reset(str).matches() ||
+                // graph
+                POSIX_ASCII.isGraph(cp)   != graph.reset(str).matches()  ||
+                POSIX_Unicode.isGraph(cp) != graphU.reset(str).matches() ||
+                POSIX_Unicode.isGraph(cp) != graphEU.reset(str).matches()||
+                // blank
+                POSIX_ASCII.isType(cp, POSIX_ASCII.BLANK)
+                                          != blank.reset(str).matches()  ||
+                POSIX_Unicode.isBlank(cp) != blankU.reset(str).matches() ||
+                // print
+                POSIX_ASCII.isPrint(cp)   != print.reset(str).matches()  ||
+                POSIX_Unicode.isPrint(cp) != printU.reset(str).matches() ||
+                // cntrl
+                POSIX_ASCII.isCntrl(cp)   != cntrl.reset(str).matches()  ||
+                POSIX_Unicode.isCntrl(cp) != cntrlU.reset(str).matches() ||
+                (Character.CONTROL == type) != cntrlP.reset(str).matches() ||
+                // hexdigit
+                POSIX_ASCII.isHexDigit(cp)   != xdigit.reset(str).matches()  ||
+                POSIX_Unicode.isHexDigit(cp) != xdigitU.reset(str).matches() ||
+                // space
+                POSIX_ASCII.isSpace(cp)   != space.reset(str).matches()  ||
+                POSIX_Unicode.isSpace(cp) != spaceU.reset(str).matches() ||
+                POSIX_Unicode.isSpace(cp) != spaceP.reset(str).matches() ||
+                // word
+                POSIX_ASCII.isWord(cp)   != word.reset(str).matches()  ||
+                POSIX_Unicode.isWord(cp) != wordU.reset(str).matches() ||
+                POSIX_Unicode.isWord(cp) != wordEU.reset(str).matches()||
+                // bwordb
+                POSIX_ASCII.isWord(cp) != bwb.reset(str).matches() ||
+                POSIX_Unicode.isWord(cp) != bwbU.reset(str).matches() ||
+                // properties
+                Character.isTitleCase(cp) != titleP.reset(str).matches() ||
+                Character.isLetter(cp)    != letterP.reset(str).matches()||
+                Character.isIdeographic(cp) != ideogP.reset(str).matches() ||
+                Character.isIdeographic(cp) != ideogJ.reset(str).matches() ||
+                (Character.UNASSIGNED == type) == definedP.reset(str).matches() ||   // IsAssigned must be the complement
+                POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches())
+                failCount++;
+        }
+
+        // bounds/word align
+        twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
+        if (!bwbU.reset("\u0180sherman\u0400").matches())
+            failCount++;
+        twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
+        if (!bwbU.reset("\u0180sh\u0345erman\u0400").matches())
+            failCount++;
+        twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
+        if (!bwbU.reset("\u0724\u0739\u0724").matches())
+            failCount++;
+        if (!bwbEU.reset("\u0724\u0739\u0724").matches())
+            failCount++;
+        report("unicodePredefinedClasses");
+    }
 }