Mercurial > hg > icedtea9-forest > jdk
changeset 5817:c76ad79a5a2f
7183053: Optimize DoubleByte charset for String.getBytes()/new String(byte[])
Summary: DoubleByte implements the sun.nio.cs.ArrayDecoder/ArrayEncoder interfaces
Reviewed-by: alanb
author | sherman |
---|---|
date | Tue, 17 Jul 2012 19:57:31 -0700 |
parents | b6f78869c66d |
children | 89129c0737f1 |
files | src/share/classes/sun/nio/cs/ext/DoubleByte.java src/share/classes/sun/nio/cs/ext/HKSCS.java test/sun/nio/cs/StrCodingBenchmark.java test/sun/nio/cs/StrCodingBenchmarkDB.java test/sun/nio/cs/TestStringCoding.java |
diffstat | 5 files changed, 431 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/src/share/classes/sun/nio/cs/ext/DoubleByte.java Tue Jul 17 11:01:44 2012 -0700 +++ b/src/share/classes/sun/nio/cs/ext/DoubleByte.java Tue Jul 17 19:57:31 2012 -0700 @@ -33,6 +33,8 @@ import java.nio.charset.CoderResult; import java.util.Arrays; import sun.nio.cs.Surrogate; +import sun.nio.cs.ArrayDecoder; +import sun.nio.cs.ArrayEncoder; import static sun.nio.cs.CharsetMapping.*; /* @@ -107,7 +109,7 @@ } public static class Decoder extends CharsetDecoder - implements DelegatableDecoder + implements DelegatableDecoder, ArrayDecoder { final char[][] b2c; @@ -209,6 +211,29 @@ return decodeBufferLoop(src, dst); } + public int decode(byte[] src, int sp, int len, char[] dst) { + int dp = 0; + int sl = sp + len; + char repl = replacement().charAt(0); + while (sp < sl) { + int b1 = src[sp++] & 0xff; + char c = b2cSB[b1]; + if (c == UNMAPPABLE_DECODING) { + if (sp < sl) { + int b2 = src[sp++] & 0xff; + if (b2 >= b2Min && b2 <= b2Max) { + c = b2c[b1][b2 - b2Min]; + } + } + if (c == UNMAPPABLE_DECODING) { + c = repl; + } + } + dst[dp++] = c; + } + return dp; + } + public void implReset() { super.implReset(); } @@ -228,6 +253,7 @@ return UNMAPPABLE_DECODING; return b2c[b1][b2 - b2Min]; } + } // IBM_EBCDIC_DBCS @@ -367,6 +393,46 @@ src.position(mark); } } + + public int decode(byte[] src, int sp, int len, char[] dst) { + int dp = 0; + int sl = sp + len; + currentState = SBCS; + char repl = replacement().charAt(0); + while (sp < sl) { + int b1 = src[sp++] & 0xff; + if (b1 == SO) { // Shift out + if (currentState != SBCS) + dst[dp++] = repl; + else + currentState = DBCS; + } else if (b1 == SI) { + if (currentState != DBCS) + dst[dp++] = repl; + else + currentState = SBCS; + } else { + char c = UNMAPPABLE_DECODING; + if (currentState == SBCS) { + c = b2cSB[b1]; + if (c == UNMAPPABLE_DECODING) + c = repl; + } else { + if (sl == sp) { + c = repl; + } else { + int b2 = src[sp++] & 0xff; + if (b2 < b2Min || b2 > b2Max || + (c = b2c[b1][b2 - b2Min]) == UNMAPPABLE_DECODING) { + c 
= repl; + } + } + } + dst[dp++] = c; + } + } + return dp; + } } // EBCDIC_DBCS_ONLY @@ -405,9 +471,37 @@ return CoderResult.malformedForLength(1); return CoderResult.unmappableForLength(2); } + + public int decode(byte[] src, int sp, int len, char[] dst) { + int dp = 0; + int sl = sp + len; + char repl = replacement().charAt(0); + while (sp < sl) { + int b1 = src[sp++] & 0xff; + char c = b2cSB[b1]; + if (c == UNMAPPABLE_DECODING) { + if (sp < sl) { + int b2 = src[sp++] & 0xff; + if (b2 < b2Min || b2 > b2Max || + (c = b2c[b1][b2 - b2Min]) == UNMAPPABLE_DECODING) { + if (b1 == SS2 || b1 == SS3) { + sp--; + } + c = repl; + } + } else { + c = repl; + } + } + dst[dp++] = c; + } + return dp; + } } - public static class Encoder extends CharsetEncoder { + public static class Encoder extends CharsetEncoder + implements ArrayEncoder + { final int MAX_SINGLEBYTE = 0xff; private final char[] c2b; private final char[] c2bIndex; @@ -516,6 +610,35 @@ return encodeBufferLoop(src, dst); } + public int encode(char[] src, int sp, int len, byte[] dst) { + int dp = 0; + int sl = sp + len; + int dl = dst.length; + while (sp < sl) { + char c = src[sp++]; + int bb = encodeChar(c); + if (bb == UNMAPPABLE_ENCODING) { + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(src[sp])) { + sp++; + } + byte[] repl = replacement(); + dst[dp++] = repl[0]; + if (repl.length > 1) + dst[dp++] = repl[1]; + continue; + } //else + if (bb > MAX_SINGLEBYTE) { // DoubleByte + dst[dp++] = (byte)(bb >> 8); + dst[dp++] = (byte)bb; + } else { // SingleByte + dst[dp++] = (byte)bb; + } + + } + return dp; + } + public int encodeChar(char ch) { return c2b[c2bIndex[ch >> 8] + (ch & 0xff)]; } @@ -604,7 +727,6 @@ } } - // EBCDIC_DBCS_ONLY public static class Encoder_EBCDIC_DBCSONLY extends Encoder { Encoder_EBCDIC_DBCSONLY(Charset cs, byte[] repl, char[] c2b, char[] c2bIndex) { @@ -619,7 +741,6 @@ } } - // for IBM_EBCDIC_DBCS public static class Encoder_EBCDIC extends Encoder { static final int 
SBCS = 0; static final int DBCS = 1; @@ -741,6 +862,47 @@ src.position(mark); } } + + public int encode(char[] src, int sp, int len, byte[] dst) { + int dp = 0; + int sl = sp + len; + while (sp < sl) { + char c = src[sp++]; + int bb = encodeChar(c); + + if (bb == UNMAPPABLE_ENCODING) { + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(src[sp])) { + sp++; + } + byte[] repl = replacement(); + dst[dp++] = repl[0]; + if (repl.length > 1) + dst[dp++] = repl[1]; + continue; + } //else + if (bb > MAX_SINGLEBYTE) { // DoubleByte + if (currentState == SBCS) { + currentState = DBCS; + dst[dp++] = SO; + } + dst[dp++] = (byte)(bb >> 8); + dst[dp++] = (byte)bb; + } else { // SingleByte + if (currentState == DBCS) { + currentState = SBCS; + dst[dp++] = SI; + } + dst[dp++] = (byte)bb; + } + } + + if (currentState == DBCS) { + currentState = SBCS; + dst[dp++] = SI; + } + return dp; + } } // EUC_SIMPLE
--- a/src/share/classes/sun/nio/cs/ext/HKSCS.java Tue Jul 17 11:01:44 2012 -0700 +++ b/src/share/classes/sun/nio/cs/ext/HKSCS.java Tue Jul 17 19:57:31 2012 -0700 @@ -175,6 +175,40 @@ } } + public int decode(byte[] src, int sp, int len, char[] dst) { + int dp = 0; + int sl = sp + len; + char repl = replacement().charAt(0); + while (sp < sl) { + int b1 = src[sp++] & 0xff; + char c = decodeSingle(b1); + if (c == UNMAPPABLE_DECODING) { + if (sl == sp) { + c = repl; + } else { + int b2 = src[sp++] & 0xff; + if (b2 < b2Min || b2 > b2Max) { + c = repl; + } else if ((c = decodeDouble(b1, b2)) == UNMAPPABLE_DECODING) { + c = decodeDoubleEx(b1, b2); //supp + if (c == UNMAPPABLE_DECODING) { + c = decodeBig5(b1, b2); //big5 + if (c == UNMAPPABLE_DECODING) + c = repl; + } else { + // supplementary character in u+2xxxx area + dst[dp++] = Surrogate.high(0x20000 + c); + dst[dp++] = Surrogate.low(0x20000 + c); + continue; + } + } + } + } + dst[dp++] = c; + } + return dp; + } + public CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) { if (src.hasArray() && dst.hasArray()) return decodeArrayLoop(src, dst); @@ -322,6 +356,36 @@ return encodeBufferLoop(src, dst); } + public int encode(char[] src, int sp, int len, byte[] dst) { + int dp = 0; + int sl = sp + len; + while (sp < sl) { + char c = src[sp++]; + int bb = encodeChar(c); + if (bb == UNMAPPABLE_ENCODING) { + if (!Character.isHighSurrogate(c) || sp == sl || + !Character.isLowSurrogate(src[sp]) || + (bb = encodeSupp(Character.toCodePoint(c, src[sp++]))) + == UNMAPPABLE_ENCODING) { + byte[] repl = replacement(); + dst[dp++] = repl[0]; + if (repl.length > 1) + dst[dp++] = repl[1]; + continue; + } + sp++; + } + if (bb > MAX_SINGLEBYTE) { // DoubleByte + dst[dp++] = (byte)(bb >> 8); + dst[dp++] = (byte)bb; + } else { // SingleByte + dst[dp++] = (byte)bb; + } + } + return dp; + } + + static char[] C2B_UNMAPPABLE = new char[0x100]; static { Arrays.fill(C2B_UNMAPPABLE, (char)UNMAPPABLE_ENCODING);
--- a/test/sun/nio/cs/StrCodingBenchmark.java Tue Jul 17 11:01:44 2012 -0700 +++ b/test/sun/nio/cs/StrCodingBenchmark.java Tue Jul 17 19:57:31 2012 -0700 @@ -75,7 +75,7 @@ return nanoss; } - public static void time(Job ... jobs) throws Throwable { + public static long[] time(Job ... jobs) throws Throwable { long[] warmup = time0(jobs); // Warm up run long[] nanoss = time0(jobs); // Real timing run @@ -110,6 +110,7 @@ // Print out absolute and relative times, calibrated against first job for (int i = 0; i < jobs.length; i++) System.out.printf(format, jobs[i].name(), milliss[i], ratios[i]); + return milliss; } public static Job[] filter(Pattern filter, Job[] jobs) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/sun/nio/cs/StrCodingBenchmarkDB.java Tue Jul 17 19:57:31 2012 -0700 @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +import java.util.*; +import java.nio.*; +import java.nio.charset.*; +import java.util.concurrent.*; +import java.util.regex.Pattern; + +public class StrCodingBenchmarkDB extends StrCodingBenchmark { + + + public static void main(String[] args) throws Throwable { + final int itrs = Integer.getInteger("iterations", 100000); + //final int itrs = Integer.getInteger("iterations", 12); + final int size = Integer.getInteger("size", 2048); + final int subsize = Integer.getInteger("subsize", 128); + final int maxchar = Integer.getInteger("maxchar", 128); + final String regex = System.getProperty("filter"); + final Pattern filter = (regex == null) ? 
null : Pattern.compile(regex); + final boolean useSecurityManager = Boolean.getBoolean("SecurityManager"); + if (useSecurityManager) + System.setSecurityManager(new PermissiveSecurityManger()); + final Random rnd = new Random(); + + String[] csns = new String[] { + "Big5", + "Johab", + "EUC_CN", + "EUC_KR", + "MS932", + "MS936", + "MS949", + "MS950", + "GBK", + + "Big5_HKSCS", + "Big5_HKSCS_2001", + "Big5_Solaris", + "MS950_HKSCS", + "MS950_HKSCS_XP", + "IBM1364", + "IBM1381", + "IBM1383", + "IBM930", + "IBM933", + "IBM935", + "IBM937", + "IBM939", + "IBM942", + "IBM943", + "IBM948", + "IBM949", + "IBM950", + "IBM970", + }; + + ArrayList<long[]> sum = new ArrayList<>(); + + for (final String csn : csns) { + final Charset cs = Charset.forName(csn); + List<Integer> cps = new ArrayList<>(0x4000); + int off = 0; + int cp = 0; + int n = 0; + CharsetEncoder enc = cs.newEncoder(); + while (cp < 0x10000 && n < cps.size()) { + if (enc.canEncode((char)cp)) { + cps.add(cp); + n++; + } + cp++; + } + Collections.shuffle(cps); + char[] ca = new char[cps.size()]; + for (int i = 0; i < cps.size(); i++) + ca[i] = (char)(int)cps.get(i); + + + System.out.printf("%n--------%s---------%n", csn); + for (int sz = 8; sz <= 2048; sz *= 2) { + System.out.printf(" [len=%d]%n", sz); + + final char[] chars = Arrays.copyOf(ca, sz); + final String str = new String(chars); + final byte[] bs = str.getBytes(cs); + + Job[] jobs = { + + new Job("String decode: csn") { + public void work() throws Throwable { + for (int i = 0; i < itrs; i++) + new String(bs, csn); + }}, + + new Job("String decode: cs") { + public void work() throws Throwable { + for (int i = 0; i < itrs; i++) + new String(bs, cs); + }}, + + new Job("String encode: csn") { + public void work() throws Throwable { + for (int i = 0; i < itrs; i++) + str.getBytes(csn); + }}, + + new Job("String encode: cs") { + public void work() throws Throwable { + for (int i = 0; i < itrs; i++) + str.getBytes(cs); + }}, + }; + sum.add(time(jobs)); + + } 
+ } + } +}
--- a/test/sun/nio/cs/TestStringCoding.java Tue Jul 17 11:01:44 2012 -0700 +++ b/test/sun/nio/cs/TestStringCoding.java Tue Jul 17 19:57:31 2012 -0700 @@ -24,7 +24,7 @@ */ /* @test - @bug 6636323 6636319 7040220 7096080 + @bug 6636323 6636319 7040220 7096080 7183053 @summary Test if StringCoding and NIO result have the same de/encoding result * @run main/othervm/timeout=2000 TestStringCoding */ @@ -70,11 +70,62 @@ } test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen)); } + + testMixed(cs); System.out.println("done!"); } } } + static void testMixed(Charset cs) throws Throwable { + CharsetDecoder dec = cs.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + CharsetEncoder enc = cs.newEncoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + List<Integer> cps = new ArrayList<>(0x10000); + int off = 0; + int cp = 0; + while (cp < 0x10000) { + if (enc.canEncode((char)cp)) { + cps.add(cp); + } + cp++; + } + Collections.shuffle(cps); + char[] bmpCA = new char[cps.size()]; + for (int i = 0; i < cps.size(); i++) + bmpCA[i] = (char)(int)cps.get(i); + String bmpStr = new String(bmpCA); + //getBytes(csn); + byte[] bmpBA = bmpStr.getBytes(cs.name()); + ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); + byte[] baNIO = new byte[bf.limit()]; + bf.get(baNIO, 0, baNIO.length); + if (!Arrays.equals(bmpBA, baNIO)) { + throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); + } + + //getBytes(cs); + bmpBA = bmpStr.getBytes(cs); + if (!Arrays.equals(bmpBA, baNIO)) + throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); + + //new String(csn); + String strSC = new String(bmpBA, cs.name()); + String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); + if(!strNIO.equals(strSC)) { + throw new RuntimeException("new String(csn) failed -> " + cs.name()); + } + + //new String(cs); + strSC = new String(bmpBA, cs); + if 
(!strNIO.equals(strSC)) + throw new RuntimeException("new String(cs) failed -> " + cs.name()); + + } + static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable { String bmpStr = new String(bmpCA); CharsetDecoder dec = cs.newDecoder() @@ -100,6 +151,7 @@ //new String(csn); String strSC = new String(sbBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString(); + if(!strNIO.equals(strSC)) throw new RuntimeException("new String(csn) failed -> " + cs.name()); @@ -112,7 +164,7 @@ if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { if (cs.name().equals("UTF-8") || // utf8 handles surrogates - cs.name().equals("CESU-8")) // utf8 handles surrogates + cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] { (byte)'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder)enc; @@ -137,12 +189,16 @@ cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); + /* sun.nio.cs.ArrayDeEncoder works on the assumption that the + invoker (StringCoder) allocates enough output buf, utf8 + and double-byte coder does not check the output buffer limit. ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); - if (n != 7 || !"abABABc".equals(new String(ba, 0, n, - cs.name()))) + if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) { throw new RuntimeException("encode4(surrogates) failed -> " + cs.name()); + } + */ } }