# HG changeset patch # User Greg Lewis # Date 1517695315 28800 # Node ID 6e4b0b4481b997f687c47dbd510e080f895c0124 # Parent 9eace51e0a47ae216c858a22d236e272b653810e# Parent b84d5a3bd874f9747aa56aea22450553c32666b2 Merge from main OpenJDK repository diff -r 9eace51e0a47 -r 6e4b0b4481b9 .hgtags --- a/.hgtags Fri Sep 08 09:32:12 2017 -0700 +++ b/.hgtags Sat Feb 03 14:01:55 2018 -0800 @@ -848,3 +848,5 @@ 56ad25be7d88c2c2da562fe1e8879c8723d01da1 jdk7u141-b02 75662a7ec1719b3133636d09bd078968579a55ab jdk7u151-b00 d0c7cea0660f7a8188a7b8c1f6d1a6c8d6388fb0 jdk7u151-b01 +fc789043683d3cf424f97176bd77cf7abe5bd01a jdk7u161-b00 +2965926dc5176c075e7a68c5d82bfd0ffa91cd5e jdk7u161-b01 diff -r 9eace51e0a47 -r 6e4b0b4481b9 THIRD_PARTY_README --- a/THIRD_PARTY_README Fri Sep 08 09:32:12 2017 -0700 +++ b/THIRD_PARTY_README Sat Feb 03 14:01:55 2018 -0800 @@ -3134,14 +3134,14 @@ ------------------------------------------------------------------------------- -%% This notice is provided with respect to zlib v1.2.3, which is included +%% This notice is provided with respect to zlib v1.2.11, which may be included with JRE 7, JDK 7, and OpenJDK 7 --- begin of LICENSE --- - version 1.2.3, July 18th, 2005 - - Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + version 1.2.11, January 15th, 2017 + + Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/ppc/vm/ppc.ad --- a/src/cpu/ppc/vm/ppc.ad Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/ppc/vm/ppc.ad Sat Feb 03 14:01:55 2018 -0800 @@ -2445,6 +2445,11 @@ return false; } +// PPC AES support not yet implemented +const bool Matcher::pass_original_key_for_aes() { + return false; +} + // RETURNS: whether this branch offset is short enough that a short // branch can be used. 
// diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/ppc/vm/stubGenerator_ppc.cpp --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -2093,6 +2093,10 @@ // arraycopy stubs used by compilers generate_arraycopy_stubs(); + if (UseAESIntrinsics) { + guarantee(!UseAESIntrinsics, "not yet implemented."); + } + // Safefetch stubs. generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/sparc/vm/assembler_sparc.hpp --- a/src/cpu/sparc/vm/assembler_sparc.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -630,6 +630,7 @@ orncc_op3 = 0x16, xnorcc_op3 = 0x17, addccc_op3 = 0x18, + aes4_op3 = 0x19, umulcc_op3 = 0x1a, smulcc_op3 = 0x1b, subccc_op3 = 0x1c, @@ -663,6 +664,8 @@ fpop1_op3 = 0x34, fpop2_op3 = 0x35, impdep1_op3 = 0x36, + aes3_op3 = 0x36, + flog3_op3 = 0x36, impdep2_op3 = 0x37, jmpl_op3 = 0x38, rett_op3 = 0x39, @@ -724,41 +727,56 @@ enum opfs { // selected opfs - fmovs_opf = 0x01, - fmovd_opf = 0x02, - - fnegs_opf = 0x05, - fnegd_opf = 0x06, - - fadds_opf = 0x41, - faddd_opf = 0x42, - fsubs_opf = 0x45, - fsubd_opf = 0x46, - - fmuls_opf = 0x49, - fmuld_opf = 0x4a, - fdivs_opf = 0x4d, - fdivd_opf = 0x4e, - - fcmps_opf = 0x51, - fcmpd_opf = 0x52, - - fstox_opf = 0x81, - fdtox_opf = 0x82, - fxtos_opf = 0x84, - fxtod_opf = 0x88, - fitos_opf = 0xc4, - fdtos_opf = 0xc6, - fitod_opf = 0xc8, - fstod_opf = 0xc9, - fstoi_opf = 0xd1, - fdtoi_opf = 0xd2, - - mdtox_opf = 0x110, - mstouw_opf = 0x111, - mstosw_opf = 0x113, - mxtod_opf = 0x118, - mwtos_opf = 0x119 + fmovs_opf = 0x01, + fmovd_opf = 0x02, + + fnegs_opf = 0x05, + fnegd_opf = 0x06, + + fadds_opf = 0x41, + faddd_opf = 0x42, + fsubs_opf = 0x45, + fsubd_opf = 0x46, + + fmuls_opf = 0x49, + fmuld_opf = 0x4a, + fdivs_opf = 0x4d, + fdivd_opf = 0x4e, + + fcmps_opf = 
0x51, + fcmpd_opf = 0x52, + + fstox_opf = 0x81, + fdtox_opf = 0x82, + fxtos_opf = 0x84, + fxtod_opf = 0x88, + fitos_opf = 0xc4, + fdtos_opf = 0xc6, + fitod_opf = 0xc8, + fstod_opf = 0xc9, + fstoi_opf = 0xd1, + fdtoi_opf = 0xd2, + + mdtox_opf = 0x110, + mstouw_opf = 0x111, + mstosw_opf = 0x113, + mxtod_opf = 0x118, + mwtos_opf = 0x119, + + aes_kexpand0_opf = 0x130, + aes_kexpand2_opf = 0x131 + }; + + enum op5s { + aes_eround01_op5 = 0x00, + aes_eround23_op5 = 0x01, + aes_dround01_op5 = 0x02, + aes_dround23_op5 = 0x03, + aes_eround01_l_op5 = 0x04, + aes_eround23_l_op5 = 0x05, + aes_dround01_l_op5 = 0x06, + aes_dround23_l_op5 = 0x07, + aes_kexpand1_op5 = 0x08 }; enum RCondition { rc_z = 1, rc_lez = 2, rc_lz = 3, rc_nz = 5, rc_gz = 6, rc_gez = 7, rc_last = rc_gez }; @@ -979,6 +997,7 @@ static int immed( bool i) { return u_field(i ? 1 : 0, 13, 13); } static int opf_low6( int w) { return u_field(w, 10, 5); } static int opf_low5( int w) { return u_field(w, 9, 5); } + static int op5( int x) { return u_field(x, 8, 5); } static int trapcc( CC cc) { return u_field(cc, 12, 11); } static int sx( int i) { return u_field(i, 12, 12); } // shift x=1 means 64-bit static int opf( int x) { return u_field(x, 13, 5); } @@ -1003,6 +1022,7 @@ static int fd( FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 29, 25); }; static int fs1(FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 18, 14); }; static int fs2(FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 4, 0); }; + static int fs3(FloatRegister r, FloatRegisterImpl::Width fwa) { return u_field(r->encoding(fwa), 13, 9); }; // some float instructions use this encoding on the op3 field static int alt_op3(int op, FloatRegisterImpl::Width w) { @@ -1111,6 +1131,12 @@ return x & ((1 << 10) - 1); } + // AES crypto instructions supported only on certain processors + static void aes_only() { assert( VM_Version::has_aes(), "This instruction only works 
on SPARC with AES instructions support"); } + + // instruction only in VIS1 + static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); } + // instruction only in VIS3 static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); } @@ -1243,6 +1269,24 @@ void addccc( Register s1, int simm13a, Register d ) { emit_long( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } + // 4-operand AES instructions + + void aes_eround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_eround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_eround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, 
FloatRegisterImpl::D) | op5(aes_eround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_eround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_dround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_kexpand1( FloatRegister s1, FloatRegister s2, int imm5a, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | u_field(imm5a, 13, 9) | op5(aes_kexpand1_op5) | fs2(s2, FloatRegisterImpl::D) ); } + + + // 3-operand AES instructions + + void aes_kexpand0( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand0_opf) | fs2(s2, FloatRegisterImpl::D) ); } + void aes_kexpand2( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand2_opf) | fs2(s2, FloatRegisterImpl::D) ); } + // pp 136 inline void bpr(RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none); @@ -1360,6 +1404,10 @@ void 
fmul( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_long( op(arith_op) | fd(d, dw) | op3(fpop1_op3) | fs1(s1, sw) | opf(0x60 + sw + dw*4) | fs2(s2, sw)); } void fdiv( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_long( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x4c + w) | fs2(s2, w)); } + // FXORs/FXORd instructions + + void fxor( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(flog3_op3) | fs1(s1, w) | opf(0x6E - w) | fs2(s2, w)); } + // pp 164 void fsqrt( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_long( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x28 + w) | fs2(s, w)); } diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/sparc/vm/sparc.ad --- a/src/cpu/sparc/vm/sparc.ad Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/sparc/vm/sparc.ad Sat Feb 03 14:01:55 2018 -0800 @@ -1911,6 +1911,12 @@ return false; } +// Current (2013) SPARC platforms need to read original key +// to construct decryption expanded key +const bool Matcher::pass_original_key_for_aes() { + return true; +} + // USII supports fxtof through the whole range of number, USIII doesn't const bool Matcher::convL2FSupported(void) { return VM_Version::has_fast_fxtof(); diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -3375,6 +3375,775 @@ } } + address generate_aescrypt_encryptBlock() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); + Label L_doLast128bit, L_storeOutput; + address start = __ pc(); + Register from = O0; // source byte array + Register to = O1; // destination byte array + Register key = O2; // expanded key array + const 
Register keylen = O4; //reg for storing expanded key array length + + // read expanded key length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // load input into F54-F56; F30-F31 used as temp + __ ldf(FloatRegisterImpl::S, from, 0, F30); + __ ldf(FloatRegisterImpl::S, from, 4, F31); + __ fmov(FloatRegisterImpl::D, F30, F54); + __ ldf(FloatRegisterImpl::S, from, 8, F30); + __ ldf(FloatRegisterImpl::S, from, 12, F31); + __ fmov(FloatRegisterImpl::D, F30, F56); + + // load expanded key + for ( int i = 0; i <= 38; i += 2 ) { + __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); + } + + // perform cipher transformation + __ fxor(FloatRegisterImpl::D, F0, F54, F54); + __ fxor(FloatRegisterImpl::D, F2, F56, F56); + // rounds 1 through 8 + for ( int i = 4; i <= 28; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F54, F56, F58); + __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60); + __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54); + __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56); + } + __ aes_eround01(F36, F54, F56, F58); //round 9 + __ aes_eround23(F38, F54, F56, F60); + + // 128-bit original key size + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit); + + for ( int i = 40; i <= 50; i += 2 ) { + __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) ); + } + __ aes_eround01(F40, F58, F60, F54); //round 10 + __ aes_eround23(F42, F58, F60, F56); + __ aes_eround01(F44, F54, F56, F58); //round 11 + __ aes_eround23(F46, F54, F56, F60); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput); + + __ ldf(FloatRegisterImpl::D, key, 208, F52); + __ aes_eround01(F48, F58, F60, F54); //round 12 + __ aes_eround23(F50, F58, F60, F56); + __ ldf(FloatRegisterImpl::D, key, 216, F46); + __ ldf(FloatRegisterImpl::D, key, 224, F48); + __ ldf(FloatRegisterImpl::D, key, 232, F50); + __ 
aes_eround01(F52, F54, F56, F58); //round 13 + __ aes_eround23(F46, F54, F56, F60); + __ br(Assembler::always, false, Assembler::pt, L_storeOutput); + __ delayed()->nop(); + + __ BIND(L_doLast128bit); + __ ldf(FloatRegisterImpl::D, key, 160, F48); + __ ldf(FloatRegisterImpl::D, key, 168, F50); + + __ BIND(L_storeOutput); + // perform last round of encryption common for all key sizes + __ aes_eround01_l(F48, F58, F60, F54); //last round + __ aes_eround23_l(F50, F58, F60, F56); + + // store output into the destination array, F0-F1 used as temp + __ fmov(FloatRegisterImpl::D, F54, F0); + __ stf(FloatRegisterImpl::S, F0, to, 0); + __ stf(FloatRegisterImpl::S, F1, to, 4); + __ fmov(FloatRegisterImpl::D, F56, F0); + __ stf(FloatRegisterImpl::S, F0, to, 8); + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); + + return start; + } + + address generate_aescrypt_decryptBlock() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); + address start = __ pc(); + Label L_expand192bit, L_expand256bit, L_common_transform; + Register from = O0; // source byte array + Register to = O1; // destination byte array + Register key = O2; // expanded key array + Register original_key = O3; // original key array only required during decryption + const Register keylen = O4; // reg for storing expanded key array length + + // read expanded key array length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // load input into F52-F54; F30,F31 used as temp + __ ldf(FloatRegisterImpl::S, from, 0, F30); + __ ldf(FloatRegisterImpl::S, from, 4, F31); + __ fmov(FloatRegisterImpl::D, F30, F52); + __ ldf(FloatRegisterImpl::S, from, 8, F30); + __ ldf(FloatRegisterImpl::S, from, 12, F31); + __ fmov(FloatRegisterImpl::D, F30, F54); + + // load original key from SunJCE expanded decryption key + for ( int i = 0; i <= 3; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, 
as_FloatRegister(i)); + } + + // 256-bit original key size + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); + + // 128-bit original key size + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 4 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); + } + + // perform 128-bit key specific inverse cipher transformation + __ fxor(FloatRegisterImpl::D, F42, F54, F54); + __ fxor(FloatRegisterImpl::D, F40, F52, F52); + __ br(Assembler::always, false, Assembler::pt, L_common_transform); + __ delayed()->nop(); + + __ BIND(L_expand192bit); + + // start loading rest of the 192-bit key + __ ldf(FloatRegisterImpl::S, original_key, 16, F4); + __ ldf(FloatRegisterImpl::S, original_key, 20, F5); + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 6 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); + } + __ aes_kexpand1(F42, F46, 7, F48); + __ aes_kexpand2(F44, F48, F50); + + // perform 192-bit key specific inverse cipher transformation + __ fxor(FloatRegisterImpl::D, F50, F54, F54); + __ fxor(FloatRegisterImpl::D, F48, F52, F52); + __ aes_dround23(F46, F52, F54, F58); + __ aes_dround01(F44, F52, F54, F56); + __ aes_dround23(F42, F56, F58, F54); + __ aes_dround01(F40, F56, F58, F52); + __ br(Assembler::always, false, Assembler::pt, L_common_transform); + __ delayed()->nop(); + + __ 
BIND(L_expand256bit); + + // load rest of the 256-bit key + for ( int i = 4; i <= 7; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 40; i += 8 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); + __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); + __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); + } + __ aes_kexpand1(F48, F54, 6, F56); + __ aes_kexpand2(F50, F56, F58); + + for ( int i = 0; i <= 6; i += 2 ) { + __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); + } + + // load input into F52-F54 + __ ldf(FloatRegisterImpl::D, from, 0, F52); + __ ldf(FloatRegisterImpl::D, from, 8, F54); + + // perform 256-bit key specific inverse cipher transformation + __ fxor(FloatRegisterImpl::D, F0, F54, F54); + __ fxor(FloatRegisterImpl::D, F2, F52, F52); + __ aes_dround23(F4, F52, F54, F58); + __ aes_dround01(F6, F52, F54, F56); + __ aes_dround23(F50, F56, F58, F54); + __ aes_dround01(F48, F56, F58, F52); + __ aes_dround23(F46, F52, F54, F58); + __ aes_dround01(F44, F52, F54, F56); + __ aes_dround23(F42, F56, F58, F54); + __ aes_dround01(F40, F56, F58, F52); + + for ( int i = 0; i <= 7; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // perform inverse cipher transformations common for all key sizes + __ BIND(L_common_transform); + for ( int i = 38; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F52, F54, F58); + __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56); + if ( i != 6) { + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52); + } else { + __ 
aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52); + } + } + + // store output to destination array, F0-F1 used as temp + __ fmov(FloatRegisterImpl::D, F52, F0); + __ stf(FloatRegisterImpl::S, F0, to, 0); + __ stf(FloatRegisterImpl::S, F1, to, 4); + __ fmov(FloatRegisterImpl::D, F54, F0); + __ stf(FloatRegisterImpl::S, F0, to, 8); + __ retl(); + __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); + + return start; + } + + address generate_cipherBlockChaining_encryptAESCrypt() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + Label L_cbcenc128, L_cbcenc192, L_cbcenc256; + address start = __ pc(); + Register from = O0; // source byte array + Register to = O1; // destination byte array + Register key = O2; // expanded key array + Register rvec = O3; // init vector + const Register len_reg = O4; // cipher length + const Register keylen = O5; // reg for storing expanded key array length + + // save cipher len to return in the end + __ mov(len_reg, L1); + + // read expanded key length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // load init vector + __ ldf(FloatRegisterImpl::D, rvec, 0, F60); + __ ldf(FloatRegisterImpl::D, rvec, 8, F62); + __ ldx(key,0,G1); + __ ldx(key,8,G2); + + // start loading expanded key + for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); + } + + // 128-bit original key size + __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128); + + for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) { + __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); + } + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192); + + for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) { + __ ldf(FloatRegisterImpl::D, 
key, j, as_FloatRegister(i)); + } + + // 256-bit original key size + __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); + __ delayed()->nop(); + + __ align(OptoLoopAlignment); + __ BIND(L_cbcenc128); + __ ldx(from,0,G3); + __ ldx(from,8,G4); + __ xor3(G1,G3,G3); + __ xor3(G2,G4,G4); + __ movxtod(G3,F56); + __ movxtod(G4,F58); + __ fxor(FloatRegisterImpl::D, F60, F56, F60); + __ fxor(FloatRegisterImpl::D, F62, F58, F62); + + // TEN_EROUNDS + for ( int i = 0; i <= 32; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); + if (i != 32 ) { + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); + } else { + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); + } + } + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + __ add(from, 16, from); + __ add(to, 16, to); + __ subcc(len_reg, 16, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); + __ delayed()->nop(); + __ stf(FloatRegisterImpl::D, F60, rvec, 0); + __ stf(FloatRegisterImpl::D, F62, rvec, 8); + __ retl(); + __ delayed()->mov(L1, O0); + + __ align(OptoLoopAlignment); + __ BIND(L_cbcenc192); + __ ldx(from,0,G3); + __ ldx(from,8,G4); + __ xor3(G1,G3,G3); + __ xor3(G2,G4,G4); + __ movxtod(G3,F56); + __ movxtod(G4,F58); + __ fxor(FloatRegisterImpl::D, F60, F56, F60); + __ fxor(FloatRegisterImpl::D, F62, F58, F62); + + // TWELEVE_EROUNDS + for ( int i = 0; i <= 40; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); + if (i != 40 ) { + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); + } else { + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); + } + } + + __ 
stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + __ add(from, 16, from); + __ subcc(len_reg, 16, len_reg); + __ add(to, 16, to); + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); + __ delayed()->nop(); + __ stf(FloatRegisterImpl::D, F60, rvec, 0); + __ stf(FloatRegisterImpl::D, F62, rvec, 8); + __ retl(); + __ delayed()->mov(L1, O0); + + __ align(OptoLoopAlignment); + __ BIND(L_cbcenc256); + __ ldx(from,0,G3); + __ ldx(from,8,G4); + __ xor3(G1,G3,G3); + __ xor3(G2,G4,G4); + __ movxtod(G3,F56); + __ movxtod(G4,F58); + __ fxor(FloatRegisterImpl::D, F60, F56, F60); + __ fxor(FloatRegisterImpl::D, F62, F58, F62); + + // FOURTEEN_EROUNDS + for ( int i = 0; i <= 48; i += 8 ) { + __ aes_eround01(as_FloatRegister(i), F60, F62, F56); + __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58); + if (i != 48 ) { + __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62); + } else { + __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60); + __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62); + } + } + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + __ add(from, 16, from); + __ subcc(len_reg, 16, len_reg); + __ add(to, 16, to); + __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); + __ delayed()->nop(); + __ stf(FloatRegisterImpl::D, F60, rvec, 0); + __ stf(FloatRegisterImpl::D, F62, rvec, 8); + __ retl(); + __ delayed()->mov(L1, O0); + + return start; + } + + address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; + Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; + address start = __ pc(); + Register from = I0; // source byte array + Register to = 
I1; // destination byte array + Register key = I2; // expanded key array + Register rvec = I3; // init vector + const Register len_reg = I4; // cipher length + const Register original_key = I5; // original key array only required during decryption + const Register keylen = L6; // reg for storing expanded key array length + + // save cipher len before save_frame, to return in the end + __ mov(O4, L0); + __ save_frame(0); //args are read from I* registers since we save the frame in the beginning + + // load original key from SunJCE expanded decryption key + for ( int i = 0; i <= 3; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // load initial vector + __ ldx(rvec,0,L0); + __ ldx(rvec,8,L1); + + // read expanded key array length + __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); + + // 256-bit original key size + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit); + + // 128-bit original key size + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 4 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6)); + } + + // load expanded key[last-1] and key[last] elements + __ movdtox(F40,L2); + __ movdtox(F42,L3); + + __ and3(len_reg, 16, L4); + __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); + __ delayed()->nop(); + + __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); + __ delayed()->nop(); + + __ BIND(L_expand192bit); + // load rest of the 192-bit key + __ ldf(FloatRegisterImpl::S, original_key, 16, F4); + __ ldf(FloatRegisterImpl::S, original_key, 20, F5); + + // 
perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 36; i += 6 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10)); + } + __ aes_kexpand1(F42, F46, 7, F48); + __ aes_kexpand2(F44, F48, F50); + + // load expanded key[last-1] and key[last] elements + __ movdtox(F48,L2); + __ movdtox(F50,L3); + + __ and3(len_reg, 16, L4); + __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); + __ delayed()->nop(); + + __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); + __ delayed()->nop(); + + __ BIND(L_expand256bit); + // load rest of the 256-bit key + for ( int i = 4; i <= 7; i++ ) { + __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); + } + + // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions + for ( int i = 0; i <= 40; i += 8 ) { + __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8)); + __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10)); + __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12)); + __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14)); + } + __ aes_kexpand1(F48, F54, 6, F56); + __ aes_kexpand2(F50, F56, F58); + + // load expanded key[last-1] and key[last] elements + __ movdtox(F56,L2); + __ movdtox(F58,L3); + + __ and3(len_reg, 16, L4); + __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); + __ delayed()->nop(); + + __ BIND(L_dec_first_block_start); + __ ldx(from,0,L4); + __ ldx(from,8,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + // 128-bit original key size + __ 
cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192); + + __ aes_dround23(F54, F60, F62, F58); + __ aes_dround01(F52, F60, F62, F56); + __ aes_dround23(F50, F56, F58, F62); + __ aes_dround01(F48, F56, F58, F60); + + __ BIND(L_dec_first_block192); + __ aes_dround23(F46, F60, F62, F58); + __ aes_dround01(F44, F60, F62, F56); + __ aes_dround23(F42, F56, F58, F62); + __ aes_dround01(F40, F56, F58, F60); + + __ BIND(L_dec_first_block128); + for ( int i = 38; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + if ( i != 6) { + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); + } + } + + __ movxtod(L0,F56); + __ movxtod(L1,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 0); + __ stf(FloatRegisterImpl::D, F62, to, 8); + + __ add(from, 16, from); + __ add(to, 16, to); + __ subcc(len_reg, 16, len_reg); + __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end); + __ delayed()->nop(); + + // 256-bit original key size + __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256); + + // 192-bit original key size + __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192); + + __ align(OptoLoopAlignment); + __ BIND(L_dec_next2_blocks128); + __ nop(); + + // F40:F42 used for first 16-bytes + __ ldx(from,0,G4); + __ ldx(from,8,G5); + __ xor3(L2,G4,G1); + __ movxtod(G1,F40); + __ xor3(L3,G5,G1); + __ movxtod(G1,F42); + + // F60:F62 used for next 16-bytes + __ 
ldx(from,16,L4); + __ ldx(from,24,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + for ( int i = 38; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F40, F42, F44); + __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46); + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + if (i != 6 ) { + __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42); + __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40); + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42); + __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40); + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); + } + } + + __ movxtod(L0,F46); + __ movxtod(L1,F44); + __ fxor(FloatRegisterImpl::D, F46, F40, F40); + __ fxor(FloatRegisterImpl::D, F44, F42, F42); + + __ stf(FloatRegisterImpl::D, F40, to, 0); + __ stf(FloatRegisterImpl::D, F42, to, 8); + + __ movxtod(G4,F56); + __ movxtod(G5,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 16); + __ stf(FloatRegisterImpl::D, F62, to, 24); + + __ add(from, 32, from); + __ add(to, 32, to); + __ subcc(len_reg, 32, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); + __ delayed()->nop(); + __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); + __ delayed()->nop(); + + __ align(OptoLoopAlignment); + __ BIND(L_dec_next2_blocks192); + __ nop(); + + // F48:F50 used for first 16-bytes + __ ldx(from,0,G4); + __ ldx(from,8,G5); + __ xor3(L2,G4,G1); + __ movxtod(G1,F48); + __ xor3(L3,G5,G1); + __ movxtod(G1,F50); + + // F60:F62 used for next 16-bytes + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ 
xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ xor3(L3,L5,G1); + __ movxtod(G1,F62); + + for ( int i = 46; i >= 6; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F48, F50, F52); + __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54); + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + if (i != 6 ) { + __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50); + __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48); + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } else { + __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50); + __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48); + __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60); + } + } + + __ movxtod(L0,F54); + __ movxtod(L1,F52); + __ fxor(FloatRegisterImpl::D, F54, F48, F48); + __ fxor(FloatRegisterImpl::D, F52, F50, F50); + + __ stf(FloatRegisterImpl::D, F48, to, 0); + __ stf(FloatRegisterImpl::D, F50, to, 8); + + __ movxtod(G4,F56); + __ movxtod(G5,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 16); + __ stf(FloatRegisterImpl::D, F62, to, 24); + + __ add(from, 32, from); + __ add(to, 32, to); + __ subcc(len_reg, 32, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); + __ delayed()->nop(); + __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); + __ delayed()->nop(); + + __ align(OptoLoopAlignment); + __ BIND(L_dec_next2_blocks256); + __ nop(); + + // F0:F2 used for first 16-bytes + __ ldx(from,0,G4); + __ ldx(from,8,G5); + __ xor3(L2,G4,G1); + __ movxtod(G1,F0); + __ xor3(L3,G5,G1); + __ movxtod(G1,F2); + + // F60:F62 used for next 16-bytes + __ ldx(from,16,L4); + __ ldx(from,24,L5); + __ xor3(L2,L4,G1); + __ movxtod(G1,F60); + __ 
xor3(L3,L5,G1); + __ movxtod(G1,F62); + + __ aes_dround23(F54, F0, F2, F4); + __ aes_dround01(F52, F0, F2, F6); + __ aes_dround23(F54, F60, F62, F58); + __ aes_dround01(F52, F60, F62, F56); + __ aes_dround23(F50, F6, F4, F2); + __ aes_dround01(F48, F6, F4, F0); + __ aes_dround23(F50, F56, F58, F62); + __ aes_dround01(F48, F56, F58, F60); + // save F48:F54 in temp registers + __ movdtox(F54,G2); + __ movdtox(F52,G3); + __ movdtox(F50,G6); + __ movdtox(F48,G1); + for ( int i = 46; i >= 14; i -= 8 ) { + __ aes_dround23(as_FloatRegister(i), F0, F2, F4); + __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6); + __ aes_dround23(as_FloatRegister(i), F60, F62, F58); + __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56); + __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2); + __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0); + __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62); + __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60); + } + // init F48:F54 with F0:F6 values (original key) + __ ldf(FloatRegisterImpl::D, original_key, 0, F48); + __ ldf(FloatRegisterImpl::D, original_key, 8, F50); + __ ldf(FloatRegisterImpl::D, original_key, 16, F52); + __ ldf(FloatRegisterImpl::D, original_key, 24, F54); + __ aes_dround23(F54, F0, F2, F4); + __ aes_dround01(F52, F0, F2, F6); + __ aes_dround23(F54, F60, F62, F58); + __ aes_dround01(F52, F60, F62, F56); + __ aes_dround23_l(F50, F6, F4, F2); + __ aes_dround01_l(F48, F6, F4, F0); + __ aes_dround23_l(F50, F56, F58, F62); + __ aes_dround01_l(F48, F56, F58, F60); + // re-init F48:F54 with their original values + __ movxtod(G2,F54); + __ movxtod(G3,F52); + __ movxtod(G6,F50); + __ movxtod(G1,F48); + + __ movxtod(L0,F6); + __ movxtod(L1,F4); + __ fxor(FloatRegisterImpl::D, F6, F0, F0); + __ fxor(FloatRegisterImpl::D, F4, F2, F2); + + __ stf(FloatRegisterImpl::D, F0, to, 0); + __ stf(FloatRegisterImpl::D, F2, to, 8); + + __ movxtod(G4,F56); + __ movxtod(G5,F58); + __ mov(L4,L0); + __ mov(L5,L1); + __ fxor(FloatRegisterImpl::D, F56, 
F60, F60); + __ fxor(FloatRegisterImpl::D, F58, F62, F62); + + __ stf(FloatRegisterImpl::D, F60, to, 16); + __ stf(FloatRegisterImpl::D, F62, to, 24); + + __ add(from, 32, from); + __ add(to, 32, to); + __ subcc(len_reg, 32, len_reg); + __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256); + __ delayed()->nop(); + + __ BIND(L_cbcdec_end); + __ stx(L0, rvec, 0); + __ stx(L1, rvec, 8); + __ restore(); + __ mov(L0, O0); + __ retl(); + __ delayed()->nop(); + + return start; + } + void generate_initial() { // Generates all stubs and initializes the entry points @@ -3431,6 +4200,14 @@ // Don't initialize the platform math functions since sparc // doesn't have intrinsics for these operations. + + // generate AES intrinsics code + if (UseAESIntrinsics) { + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + } } diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/sparc/vm/vm_version_sparc.cpp --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -242,7 +242,7 @@ assert((OptoLoopAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size"); char buf[512]; - jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", (has_v9() ? ", v9" : (has_v8() ? ", v8" : "")), (has_hardware_popc() ? ", popc" : ""), (has_vis1() ? ", vis1" : ""), @@ -250,6 +250,7 @@ (has_vis3() ? ", vis3" : ""), (has_blk_init() ? ", blk_init" : ""), (has_cbcond() ? ", cbcond" : ""), + (has_aes() ? ", aes" : ""), (is_ultra3() ? ", ultra3" : ""), (is_sun4v() ? ", sun4v" : ""), (is_niagara_plus() ? 
", niagara_plus" : (is_niagara() ? ", niagara" : "")), @@ -273,6 +274,41 @@ if (!has_vis1()) // Drop to 0 if no VIS1 support UseVIS = 0; + // T2 and above should have support for AES instructions + if (has_aes()) { + if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1 + if (FLAG_IS_DEFAULT(UseAES)) { + FLAG_SET_DEFAULT(UseAES, true); + } + if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { + FLAG_SET_DEFAULT(UseAESIntrinsics, true); + } + // we disable both the AES flags if either of them is disabled on the command line + if (!UseAES || !UseAESIntrinsics) { + FLAG_SET_DEFAULT(UseAES, false); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + } else { + if (UseAES || UseAESIntrinsics) { + warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled."); + if (UseAES) { + FLAG_SET_DEFAULT(UseAES, false); + } + if (UseAESIntrinsics) { + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + } + } + } else if (UseAES || UseAESIntrinsics) { + warning("AES instructions are not available on this CPU"); + if (UseAES) { + FLAG_SET_DEFAULT(UseAES, false); + } + if (UseAESIntrinsics) { + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + } + #ifndef PRODUCT if (PrintMiscellaneous && Verbose) { tty->print("Allocation"); diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/sparc/vm/vm_version_sparc.hpp --- a/src/cpu/sparc/vm/vm_version_sparc.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/sparc/vm/vm_version_sparc.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -49,7 +49,8 @@ M_family = 15, T_family = 16, T1_model = 17, - sparc5_instructions = 18 + sparc5_instructions = 18, + aes_instructions = 19 }; enum Feature_Flag_Set { @@ -75,6 +76,7 @@ T_family_m = 1 << T_family, T1_model_m = 1 << T1_model, sparc5_instructions_m = 1 << sparc5_instructions, + aes_instructions_m = 1 << aes_instructions, generic_v8_m = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m, generic_v9_m = generic_v8_m | v9_instructions_m, @@ -126,6 +128,7 @@ static bool 
has_blk_init() { return (_features & blk_init_instructions_m) != 0; } static bool has_cbcond() { return (_features & cbcond_instructions_m) != 0; } static bool has_sparc5_instr() { return (_features & sparc5_instructions_m) != 0; } + static bool has_aes() { return (_features & aes_instructions_m) != 0; } static bool supports_compare_and_exchange() { return has_v9(); } diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -2433,6 +2433,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_encryptAESCrypt() { assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); @@ -2513,7 +2516,7 @@ __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object handleSOERegisters(false /*restoring*/); - __ movl(rax, 0); // return 0 (why?) + __ movptr(rax, len_param); // return length __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -2587,6 +2590,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_decryptAESCrypt() { assert(UseAES, "need AES instructions and misaligned SSE support"); @@ -2680,7 +2686,7 @@ __ movptr(rvec , rvec_param); // restore this since used in loop __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object handleSOERegisters(false /*restoring*/); - __ movl(rax, 0); // return 0 (why?) 
+ __ movptr(rax, len_param); // return length __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -3246,6 +3246,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_encryptAESCrypt() { assert(UseAES, "need AES instructions and misaligned SSE support"); __ align(CodeEntryAlignment); @@ -3261,7 +3264,7 @@ #ifndef _WIN64 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else - const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 const Register len_reg = r10; // pick the first volatile windows register #endif const Register pos = rax; @@ -3288,6 +3291,8 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(xmm_save(i), as_XMMRegister(i)); } +#else + __ push(len_reg); // Save #endif const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front @@ -3330,8 +3335,10 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(as_XMMRegister(i), xmm_save(i)); } + __ movl(rax, len_mem); +#else + __ pop(rax); // return length #endif - __ movl(rax, 0); // return 0 (why?) 
__ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -3400,6 +3407,9 @@ // c_rarg3 - r vector byte array address // c_rarg4 - input length // + // Output: + // rax - input length + // address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { assert(UseAES, "need AES instructions and misaligned SSE support"); @@ -3418,7 +3428,7 @@ #ifndef _WIN64 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) #else - const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 + const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 const Register len_reg = r10; // pick the first volatile windows register #endif const Register pos = rax; @@ -3439,7 +3449,10 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(xmm_save(i), as_XMMRegister(i)); } +#else + __ push(len_reg); // Save #endif + // the java expanded key ordering is rotated one position from what we want // so we start from 0x10 here and hit 0x00 last const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front @@ -3545,8 +3558,10 @@ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { __ movdqu(as_XMMRegister(i), xmm_save(i)); } + __ movl(rax, len_mem); +#else + __ pop(rax); // return length #endif - __ movl(rax, 0); // return 0 (why?) __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/cpu/x86/vm/x86.ad --- a/src/cpu/x86/vm/x86.ad Fri Sep 08 09:32:12 2017 -0700 +++ b/src/cpu/x86/vm/x86.ad Sat Feb 03 14:01:55 2018 -0800 @@ -592,6 +592,12 @@ return !AlignVector; // can be changed by flag } +// x86 AES instructions are compatible with SunJCE expanded +// keys, hence we do not need to pass the original key to stubs +const bool Matcher::pass_original_key_for_aes() { + return false; +} + // Helper methods for MachSpillCopyNode::implementation(). 
static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo, int src_hi, int dst_hi, uint ireg, outputStream* st) { diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp --- a/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -132,6 +132,11 @@ #endif if (av & AV_SPARC_CBCOND) features |= cbcond_instructions_m; +#ifndef AV_SPARC_AES +#define AV_SPARC_AES 0x00020000 /* aes instrs supported */ +#endif + if (av & AV_SPARC_AES) features |= aes_instructions_m; + } else { // getisax(2) failed, use the old legacy code. #ifndef PRODUCT diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/classfile/vmSymbols.hpp --- a/src/share/vm/classfile/vmSymbols.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/classfile/vmSymbols.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -739,7 +739,7 @@ do_intrinsic(_cipherBlockChaining_decryptAESCrypt, com_sun_crypto_provider_cipherBlockChaining, decrypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ do_name( encrypt_name, "encrypt") \ do_name( decrypt_name, "decrypt") \ - do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)V") \ + do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)I") \ \ /* support for sun.misc.Unsafe */ \ do_class(sun_misc_Unsafe, "sun/misc/Unsafe") \ diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/interpreter/linkResolver.cpp --- a/src/share/vm/interpreter/linkResolver.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/interpreter/linkResolver.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -415,6 +415,40 @@ } } +void LinkResolver::check_method_loader_constraints(methodHandle& resolved_method, + KlassHandle resolved_klass, + Symbol* method_name, + Symbol* method_signature, + KlassHandle current_klass, + const char* method_type, TRAPS) { + Handle loader (THREAD, 
instanceKlass::cast(current_klass())->class_loader()); + Handle class_loader (THREAD, instanceKlass::cast(resolved_method->method_holder())->class_loader()); + { + ResourceMark rm(THREAD); + char* failed_type_name = + SystemDictionary::check_signature_loaders(method_signature, loader, + class_loader, true, CHECK); + if (failed_type_name != NULL) { + const char* msg = "loader constraint violation: when resolving %s" + " \"%s\" the class loader (instance of %s) of the current class, %s," + " and the class loader (instance of %s) for the method's defining class, %s, have" + " different Class objects for the type %s used in the signature"; + char* sig = methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()), method_name, method_signature); + const char* loader1 = SystemDictionary::loader_name(loader()); + char* current = instanceKlass::cast(current_klass())->name()->as_C_string(); + const char* loader2 = SystemDictionary::loader_name(class_loader()); + char* resolved = instanceKlass::cast(resolved_klass())->name()->as_C_string(); + size_t buflen = strlen(msg) + strlen(sig) + strlen(loader1) + + strlen(current) + strlen(loader2) + strlen(resolved) + + strlen(failed_type_name) + strlen(method_type) + 1; + char* buf = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, buflen); + jio_snprintf(buf, buflen, msg, method_type, sig, loader1, current, loader2, + resolved, failed_type_name); + THROW_MSG(vmSymbols::java_lang_LinkageError(), buf); + } + } +} + void LinkResolver::resolve_method(methodHandle& resolved_method, KlassHandle resolved_klass, Symbol* method_name, Symbol* method_signature, KlassHandle current_klass, bool check_access, TRAPS) { @@ -478,32 +512,8 @@ CHECK); // check loader constraints - Handle loader (THREAD, instanceKlass::cast(current_klass())->class_loader()); - Handle class_loader (THREAD, instanceKlass::cast(resolved_method->method_holder())->class_loader()); - { - ResourceMark rm(THREAD); - char* failed_type_name = - 
SystemDictionary::check_signature_loaders(method_signature, loader, - class_loader, true, CHECK); - if (failed_type_name != NULL) { - const char* msg = "loader constraint violation: when resolving method" - " \"%s\" the class loader (instance of %s) of the current class, %s," - " and the class loader (instance of %s) for resolved class, %s, have" - " different Class objects for the type %s used in the signature"; - char* sig = methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()),method_name,method_signature); - const char* loader1 = SystemDictionary::loader_name(loader()); - char* current = instanceKlass::cast(current_klass())->name()->as_C_string(); - const char* loader2 = SystemDictionary::loader_name(class_loader()); - char* resolved = instanceKlass::cast(resolved_klass())->name()->as_C_string(); - size_t buflen = strlen(msg) + strlen(sig) + strlen(loader1) + - strlen(current) + strlen(loader2) + strlen(resolved) + - strlen(failed_type_name); - char* buf = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, buflen); - jio_snprintf(buf, buflen, msg, sig, loader1, current, loader2, - resolved, failed_type_name); - THROW_MSG(vmSymbols::java_lang_LinkageError(), buf); - } - } + check_method_loader_constraints(resolved_method, resolved_klass, method_name, + method_signature, current_klass, "method", CHECK); } } @@ -540,34 +550,8 @@ } if (check_access) { - HandleMark hm(THREAD); - Handle loader (THREAD, instanceKlass::cast(current_klass())->class_loader()); - Handle class_loader (THREAD, instanceKlass::cast(resolved_method->method_holder())->class_loader()); - { - ResourceMark rm(THREAD); - char* failed_type_name = - SystemDictionary::check_signature_loaders(method_signature, loader, - class_loader, true, CHECK); - if (failed_type_name != NULL) { - const char* msg = "loader constraint violation: when resolving " - "interface method \"%s\" the class loader (instance of %s) of the " - "current class, %s, and the class loader (instance of %s) for " - "resolved 
class, %s, have different Class objects for the type %s " - "used in the signature"; - char* sig = methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()),method_name,method_signature); - const char* loader1 = SystemDictionary::loader_name(loader()); - char* current = instanceKlass::cast(current_klass())->name()->as_C_string(); - const char* loader2 = SystemDictionary::loader_name(class_loader()); - char* resolved = instanceKlass::cast(resolved_klass())->name()->as_C_string(); - size_t buflen = strlen(msg) + strlen(sig) + strlen(loader1) + - strlen(current) + strlen(loader2) + strlen(resolved) + - strlen(failed_type_name); - char* buf = NEW_RESOURCE_ARRAY_IN_THREAD(THREAD, char, buflen); - jio_snprintf(buf, buflen, msg, sig, loader1, current, loader2, - resolved, failed_type_name); - THROW_MSG(vmSymbols::java_lang_LinkageError(), buf); - } - } + check_method_loader_constraints(resolved_method, resolved_klass, method_name, + method_signature, current_klass, "interface method", CHECK); } } @@ -819,6 +803,10 @@ methodOopDesc::name_and_sig_as_C_string(Klass::cast(resolved_klass()), resolved_method->name(), resolved_method->signature())); + } else if (sel_method() != resolved_method()) { + check_method_loader_constraints(sel_method, resolved_klass, + sel_method->name(), sel_method->signature(), + current_klass, "method", CHECK); } } } diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/interpreter/linkResolver.hpp --- a/src/share/vm/interpreter/linkResolver.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/interpreter/linkResolver.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -125,6 +125,9 @@ static void resolve_pool (KlassHandle& resolved_klass, Symbol*& method_name, Symbol*& method_signature, KlassHandle& current_klass, constantPoolHandle pool, int index, TRAPS); static void resolve_interface_method(methodHandle& resolved_method, KlassHandle resolved_klass, Symbol* method_name, Symbol* method_signature, KlassHandle current_klass, bool check_access, TRAPS); + static void check_method_loader_constraints(methodHandle& resolved_method, KlassHandle resolved_klass, + Symbol* method_name, Symbol* method_signature, + KlassHandle current_klass, const char* method_type, TRAPS); static void resolve_method (methodHandle& resolved_method, KlassHandle resolved_klass, Symbol* method_name, Symbol* method_signature, KlassHandle current_klass, bool check_access, TRAPS); static void linktime_resolve_static_method (methodHandle& resolved_method, KlassHandle resolved_klass, Symbol* method_name, Symbol* method_signature, KlassHandle current_klass, bool check_access, TRAPS); diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/memory/filemap.cpp --- a/src/share/vm/memory/filemap.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/memory/filemap.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -185,7 +185,12 @@ tty->print_cr(" %s", _full_path); } - // Remove the existing file in case another process has it open. +#ifdef _WINDOWS // On Windows, need WRITE permission to remove the file. + chmod(_full_path, _S_IREAD | _S_IWRITE); +#endif + + // Use remove() to delete the existing file because, on Unix, this will + // allow processes that have it open continued access to the file. 
remove(_full_path); int fd = open(_full_path, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, 0444); if (fd < 0) { diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/chaitin.cpp --- a/src/share/vm/opto/chaitin.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/chaitin.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -445,6 +445,9 @@ // Peephole remove copies post_allocate_copy_removal(); + // Merge multidefs if multiple defs representing the same value are used in a single block. + merge_multidefs(); + #ifdef ASSERT // Veify the graph after RA. verify(&live_arena); diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/chaitin.hpp --- a/src/share/vm/opto/chaitin.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/chaitin.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -511,6 +511,32 @@ // Extend the node to LRG mapping void add_reference( const Node *node, const Node *old_node); + // Record the first use of a def in the block for a register. + class RegDefUse { + Node* _def; + Node* _first_use; + public: + RegDefUse() : _def(NULL), _first_use(NULL) { } + Node* def() const { return _def; } + Node* first_use() const { return _first_use; } + + void update(Node* def, Node* use) { + if (_def != def) { + _def = def; + _first_use = use; + } + } + void clear() { + _def = NULL; + _first_use = NULL; + } + }; + typedef GrowableArray<RegDefUse> RegToDefUseMap; + int possibly_merge_multidef(Node *n, uint k, Block *block, RegToDefUseMap& reg2defuse); + + // Merge nodes that are a part of a multidef lrg and produce the same value within a block.
+ void merge_multidefs(); + private: static int _final_loads, _final_stores, _final_copies, _final_memoves; diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/library_call.cpp --- a/src/share/vm/opto/library_call.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/library_call.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -290,6 +290,7 @@ bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); + Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object); }; @@ -5503,10 +5504,22 @@ Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); if (k_start == NULL) return false; - // Call the stub. - make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), - stubAddr, stubName, TypePtr::BOTTOM, - src_start, dest_start, k_start); + if (Matcher::pass_original_key_for_aes()) { + // on SPARC we need to pass the original key since key expansion needs to happen in intrinsics due to + // compatibility issues between Java key expansion and SPARC crypto instructions + Node* original_k_start = get_original_key_start_from_aescrypt_object(aescrypt_object); + if (original_k_start == NULL) return false; + + // Call the stub. + make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, original_k_start); + } else { + // Call the stub. 
+ make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::aescrypt_block_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start); + } return true; } @@ -5584,14 +5597,29 @@ if (objRvec == NULL) return false; Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE); - // Call the stub, passing src_start, dest_start, k_start, r_start and src_len - make_runtime_call(RC_LEAF|RC_NO_FP, - OptoRuntime::cipherBlockChaining_aescrypt_Type(), - stubAddr, stubName, TypePtr::BOTTOM, - src_start, dest_start, k_start, r_start, len); - - // return is void so no result needs to be pushed - + Node* cbcCrypt; + if (Matcher::pass_original_key_for_aes()) { + // on SPARC we need to pass the original key since key expansion needs to happen in intrinsics due to + // compatibility issues between Java key expansion and SPARC crypto instructions + Node* original_k_start = get_original_key_start_from_aescrypt_object(aescrypt_object); + if (original_k_start == NULL) return false; + + // Call the stub, passing src_start, dest_start, k_start, r_start, src_len and original_k_start + cbcCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::cipherBlockChaining_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, r_start, len, original_k_start); + } else { + // Call the stub, passing src_start, dest_start, k_start, r_start and src_len + cbcCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::cipherBlockChaining_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, r_start, len); + } + + // return cipher length (int) + Node* retvalue = _gvn.transform(new (C) ProjNode(cbcCrypt, TypeFunc::Parms)); + set_result(retvalue); return true; } @@ -5606,6 +5634,17 @@ return k_start; } +//------------------------------get_original_key_start_from_aescrypt_object----------------------- +Node * LibraryCallKit::get_original_key_start_from_aescrypt_object(Node *aescrypt_object) { + Node* objAESCryptKey = 
load_field_from_object(aescrypt_object, "lastKey", "[B", /*is_exact*/ false); + assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt"); + if (objAESCryptKey == NULL) return (Node *) NULL; + + // now have the array, need to get the start address of the lastKey array + Node* original_k_start = array_element_address(objAESCryptKey, intcon(0), T_BYTE); + return original_k_start; +} + //----------------------------inline_cipherBlockChaining_AESCrypt_predicate---------------------------- // Return node representing slow path of predicate check. // the pseudo code we want to emulate with this predicate is: diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/machnode.hpp --- a/src/share/vm/opto/machnode.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/machnode.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -539,6 +539,29 @@ #endif }; +// MachMergeNode is similar to a PhiNode in a sense it merges multiple values, +// however it doesn't have a control input and is more like a MergeMem. +// It is inserted after the register allocation is done to ensure that nodes use single +// definition of a multidef lrg in a block. 
+class MachMergeNode : public MachIdealNode { +public: + MachMergeNode(Node *n1) { + init_class_id(Class_MachMerge); + add_req(NULL); + add_req(n1); + } + virtual const RegMask &out_RegMask() const { return in(1)->out_RegMask(); } + virtual const RegMask &in_RegMask(uint idx) const { return in(1)->in_RegMask(idx); } + virtual const class Type *bottom_type() const { return in(1)->bottom_type(); } + virtual uint ideal_reg() const { return Matcher::base2reg[bottom_type()->base()]; } + virtual uint oper_input_base() const { return 1; } + virtual void emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { } + virtual uint size(PhaseRegAlloc *ra_) const { return 0; } +#ifndef PRODUCT + virtual const char *Name() const { return "MachMerge"; } +#endif +}; + //------------------------------MachBranchNode-------------------------------- // Abstract machine branch Node class MachBranchNode : public MachIdealNode { diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/matcher.hpp --- a/src/share/vm/opto/matcher.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/matcher.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -267,6 +267,9 @@ // CPU supports misaligned vectors store/load. static const bool misaligned_vectors_ok(); + // Should original key array reference be passed to AES stubs + static const bool pass_original_key_for_aes(); + // Used to determine a "low complexity" 64-bit constant. (Zero is simple.) // The standard of comparison is one (StoreL ConL) vs. two (StoreI ConI). // Depends on the details of 64-bit constant generation on the CPU. 
diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/node.hpp --- a/src/share/vm/opto/node.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/node.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -95,6 +95,7 @@ class MachSafePointNode; class MachSpillCopyNode; class MachTempNode; +class MachMergeNode; class Matcher; class MemBarNode; class MemBarStoreStoreNode; @@ -586,6 +587,7 @@ DEFINE_CLASS_ID(MachTemp, Mach, 3) DEFINE_CLASS_ID(MachConstantBase, Mach, 4) DEFINE_CLASS_ID(MachConstant, Mach, 5) + DEFINE_CLASS_ID(MachMerge, Mach, 6) DEFINE_CLASS_ID(Type, Node, 2) DEFINE_CLASS_ID(Phi, Type, 0) @@ -747,6 +749,7 @@ DEFINE_CLASS_QUERY(MachSafePoint) DEFINE_CLASS_QUERY(MachSpillCopy) DEFINE_CLASS_QUERY(MachTemp) + DEFINE_CLASS_QUERY(MachMerge) DEFINE_CLASS_QUERY(Mem) DEFINE_CLASS_QUERY(MemBar) DEFINE_CLASS_QUERY(MemBarStoreStore) diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/phase.cpp --- a/src/share/vm/opto/phase.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/phase.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -71,6 +71,7 @@ elapsedTimer Phase::_t_computeLive; elapsedTimer Phase::_t_regAllocSplit; elapsedTimer Phase::_t_postAllocCopyRemoval; +elapsedTimer Phase::_t_mergeMultidefs; elapsedTimer Phase::_t_fixupSpills; // Subtimers for _t_output @@ -132,11 +133,12 @@ tty->print_cr (" computeLive : %3.3f sec", Phase::_t_computeLive.seconds()); tty->print_cr (" regAllocSplit : %3.3f sec", Phase::_t_regAllocSplit.seconds()); tty->print_cr (" postAllocCopyRemoval: %3.3f sec", Phase::_t_postAllocCopyRemoval.seconds()); + tty->print_cr (" mergeMultidefs: %3.3f sec", Phase::_t_mergeMultidefs.seconds()); tty->print_cr (" fixupSpills : %3.3f sec", Phase::_t_fixupSpills.seconds()); double regalloc_subtotal = Phase::_t_ctorChaitin.seconds() + Phase::_t_buildIFGphysical.seconds() + Phase::_t_computeLive.seconds() + Phase::_t_regAllocSplit.seconds() + Phase::_t_fixupSpills.seconds() + - Phase::_t_postAllocCopyRemoval.seconds(); + Phase::_t_postAllocCopyRemoval.seconds() + 
Phase::_t_mergeMultidefs.seconds(); double percent_of_regalloc = ((regalloc_subtotal == 0.0) ? 0.0 : (regalloc_subtotal / Phase::_t_registerAllocation.seconds() * 100.0)); tty->print_cr (" subtotal : %3.3f sec, %3.2f %%", regalloc_subtotal, percent_of_regalloc); } diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/phase.hpp --- a/src/share/vm/opto/phase.hpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/phase.hpp Sat Feb 03 14:01:55 2018 -0800 @@ -107,6 +107,7 @@ static elapsedTimer _t_computeLive; static elapsedTimer _t_regAllocSplit; static elapsedTimer _t_postAllocCopyRemoval; + static elapsedTimer _t_mergeMultidefs; static elapsedTimer _t_fixupSpills; // Subtimers for _t_output diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/postaloc.cpp --- a/src/share/vm/opto/postaloc.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/postaloc.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -256,20 +256,6 @@ // intermediate copies might be illegal, i.e., value is stored down to stack // then reloaded BUT survives in a register the whole way. Node *val = skip_copies(n->in(k)); - - if (val == x && nk_idx != 0 && - regnd[nk_reg] != NULL && regnd[nk_reg] != x && - n2lidx(x) == n2lidx(regnd[nk_reg])) { - // When rematerialzing nodes and stretching lifetimes, the - // allocator will reuse the original def for multidef LRG instead - // of the current reaching def because it can't know it's safe to - // do so. After allocation completes if they are in the same LRG - // then it should use the current reaching def instead. - n->set_req(k, regnd[nk_reg]); - blk_adjust += yank_if_dead(val, current_block, &value, &regnd); - val = skip_copies(n->in(k)); - } - if (val == x) return blk_adjust; // No progress? int n_regs = RegMask::num_registers(val->ideal_reg()); @@ -375,6 +361,94 @@ return false; } +// The algorithm works as follows: +// We traverse the block top to bottom. possibly_merge_multidef() is invoked for every input edge k +// of the instruction n.
We check to see if the input is a multidef lrg. If it is, we record the fact that we've +// seen a definition (coming as an input) and add that fact to the reg2defuse array. The array maps registers to their +// current reaching definitions (we track only multidefs though). With each definition we also associate the first +// instruction we saw use it. If we encounter the situation when we observe a def (an input) that is a part of the +// same lrg but is different from the previously seen def we merge the two with a MachMerge node and substitute +// all the uses that we've seen so far to use the merge. After that we keep replacing the new defs in the same lrg +// as they get encountered with the merge node and keep adding these defs to the merge inputs. +void PhaseChaitin::merge_multidefs() { + NOT_PRODUCT( Compile::TracePhase t3("mergeMultidefs", &_t_mergeMultidefs, TimeCompiler); ) + ResourceMark rm; + // Keep track of the defs seen in registers and collect their uses in the block. + RegToDefUseMap reg2defuse(_max_reg, _max_reg, RegDefUse()); + for (uint i = 0; i < _cfg._num_blocks; i++) { + Block* block = _cfg._blocks[i]; + for (uint j = 1; j < block->_nodes.size(); j++) { + Node* n = block->_nodes[j]; + if (n->is_Phi()) continue; + for (uint k = 1; k < n->req(); k++) { + j += possibly_merge_multidef(n, k, block, reg2defuse); + } + // Null out the value produced by the instruction itself, since we're only interested in defs + // implicitly defined by the uses. We are actually interested in tracking only redefinitions + // of the multidef lrgs in the same register. For that matter it's enough to track changes in + // the base register only and ignore other effects of multi-register lrgs and fat projections. + // It is also ok to ignore defs coming from singledefs. After an implicit overwrite by one of + // those our register is guaranteed to be used by another lrg and we won't attempt to merge it.
+ uint lrg = n2lidx(n); + if (lrg > 0 && lrgs(lrg).is_multidef()) { + OptoReg::Name reg = lrgs(lrg).reg(); + reg2defuse.at(reg).clear(); + } + } + // Clear reg->def->use tracking for the next block + for (int j = 0; j < reg2defuse.length(); j++) { + reg2defuse.at(j).clear(); + } + } +} + +int PhaseChaitin::possibly_merge_multidef(Node *n, uint k, Block *block, RegToDefUseMap& reg2defuse) { + int blk_adjust = 0; + + uint lrg = n2lidx(n->in(k)); + if (lrg > 0 && lrgs(lrg).is_multidef()) { + OptoReg::Name reg = lrgs(lrg).reg(); + + Node* def = reg2defuse.at(reg).def(); + if (def != NULL && lrg == n2lidx(def) && def != n->in(k)) { + // Same lrg but different node, we have to merge. + MachMergeNode* merge; + if (def->is_MachMerge()) { // is it already a merge? + merge = def->as_MachMerge(); + } else { + merge = new (C) MachMergeNode(def); + + // Insert the merge node into the block before the first use. + uint use_index = block->find_node(reg2defuse.at(reg).first_use()); + block->_nodes.insert(use_index++, merge); + + // Let the allocator know about the new node, use the same lrg + _names.extend(merge->_idx, lrg); + blk_adjust++; + + // Fixup all the uses (there is at least one) that happened between the first + // use and before the current one. + for (; use_index < block->_nodes.size(); use_index++) { + Node* use = block->_nodes[use_index]; + if (use == n) { + break; + } + use->replace_edge(def, merge); + } + } + if (merge->find_edge(n->in(k)) == -1) { + merge->add_req(n->in(k)); + } + n->set_req(k, merge); + } + + // update the uses + reg2defuse.at(reg).update(n->in(k), n); + } + + return blk_adjust; +} + //------------------------------post_allocate_copy_removal--------------------- // Post-Allocation peephole copy removal. 
We do this in 1 pass over the diff -r 9eace51e0a47 -r 6e4b0b4481b9 src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/opto/runtime.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -827,12 +827,18 @@ const TypeFunc* OptoRuntime::aescrypt_block_Type() { // create input type (domain) int num_args = 3; + if (Matcher::pass_original_key_for_aes()) { + num_args = 4; + } int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; fields[argp++] = TypePtr::NOTNULL; // src fields[argp++] = TypePtr::NOTNULL; // dest fields[argp++] = TypePtr::NOTNULL; // k array + if (Matcher::pass_original_key_for_aes()) { + fields[argp++] = TypePtr::NOTNULL; // original k array + } assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); @@ -847,6 +853,9 @@ const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { // create input type (domain) int num_args = 5; + if (Matcher::pass_original_key_for_aes()) { + num_args = 6; + } int argcnt = num_args; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; @@ -855,13 +864,16 @@ fields[argp++] = TypePtr::NOTNULL; // k array fields[argp++] = TypePtr::NOTNULL; // r array fields[argp++] = TypeInt::INT; // src len + if (Matcher::pass_original_key_for_aes()) { + fields[argp++] = TypePtr::NOTNULL; // original k array + } assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); - // no result type needed + // returning cipher len (int) fields = TypeTuple::fields(1); - fields[TypeFunc::Parms+0] = NULL; // void - const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + fields[TypeFunc::Parms+0] = TypeInt::INT; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields); return TypeFunc::make(domain, range); } diff -r 9eace51e0a47 -r 6e4b0b4481b9 
src/share/vm/runtime/arguments.cpp --- a/src/share/vm/runtime/arguments.cpp Fri Sep 08 09:32:12 2017 -0700 +++ b/src/share/vm/runtime/arguments.cpp Sat Feb 03 14:01:55 2018 -0800 @@ -2772,8 +2772,6 @@ // Enable parallel GC and adaptive generation sizing FLAG_SET_CMDLINE(bool, UseParallelGC, true); - FLAG_SET_DEFAULT(ParallelGCThreads, - Abstract_VM_Version::parallel_worker_threads()); // Encourage steady state memory management FLAG_SET_CMDLINE(uintx, ThresholdTolerance, 100); diff -r 9eace51e0a47 -r 6e4b0b4481b9 test/compiler/7184394/TestAESMain.java --- a/test/compiler/7184394/TestAESMain.java Fri Sep 08 09:32:12 2017 -0700 +++ b/test/compiler/7184394/TestAESMain.java Sat Feb 03 14:01:55 2018 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -39,20 +39,32 @@ System.out.println(iters + " iterations"); TestAESEncode etest = new TestAESEncode(); etest.prepare(); + // warm-up for 20K iterations + System.out.println("Starting encryption warm-up"); + for (int i=0; i<20000; i++) { + etest.run(); + } + System.out.println("Finished encryption warm-up"); long start = System.nanoTime(); for (int i=0; i= minMemory; + } + + private static boolean canUseAggressiveHeapOption() throws Exception { + if (!haveRequiredMemory()) { + System.out.println( + "Skipping test of " + option + " : insufficient memory"); + return false; + } + return true; + } +} +