changeset 5686:8e3cc52cbcef

Add support for AES Intrinsics (backport from jdk8); also incorporates 3 subsequent updates
author adinn
date Mon, 24 Nov 2014 13:37:15 +0000
parents 4868ef1912f1
children f7326d2a6cda
files src/cpu/aarch64/vm/assembler_aarch64.cpp src/cpu/aarch64/vm/assembler_aarch64.hpp src/cpu/aarch64/vm/icache_aarch64.cpp src/cpu/aarch64/vm/icache_aarch64.hpp src/cpu/aarch64/vm/stubGenerator_aarch64.cpp src/cpu/aarch64/vm/vm_version_aarch64.cpp
diffstat 6 files changed, 682 insertions(+), 245 deletions(-)
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Mon Nov 24 13:37:15 2014 +0000
@@ -1194,18 +1194,11 @@
   }
 
 #ifndef PRODUCT
-  {
-    address PC = __ pc();
-    __ bl(__ pc()+(1<<27)-4);
-    NativeCall* call = nativeCall_at(PC);
-    ptrdiff_t offset = call->destination()-PC;
-    assert(offset == (1<<27)-4, "broken branch coding");
-    PC = __ pc();
-    __ bl(__ pc()-(1<<27));
-    call = nativeCall_at(PC);
-    offset = call->destination()-PC;
-    assert(offset == -(1<<27), "broken branch coding");
-  }
+
+  address PC = __ pc();
+  __ ld1(v0, __ T16B, Address(r16)); // No offset
+  __ ld1(v0, __ T16B, __ post(r16, 0)); // Post-index
+  __ ld1(v0, __ T16B, Address(r16, r17)); // Register offset
 
 
 #endif // PRODUCT
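
The three loads above exercise the three Address modes accepted by the new
ld_st() dispatcher (see the assembler_aarch64.hpp hunk below). A sketch of
what each form denotes, assuming the usual A64 ld1 (multiple structures)
addressing variants:

    __ ld1(v0, __ T16B, Address(r16));      // base only:      ld1 {v0.16b}, [x16]
    __ ld1(v0, __ T16B, __ post(r16, 16));  // post-index imm: ld1 {v0.16b}, [x16], #16
    __ ld1(v0, __ T16B, Address(r16, r17)); // register form:  ld1 {v0.16b}, [x16], x17

Note that the immediate post-index step is hard-wired by the encoding
(Rm = 0b11111 selects "increment by transfer size"), so for a single 16B
register it is always #16 regardless of the offset passed to post().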
@@ -3805,131 +3798,131 @@
   if (UseNeon) {
       cmp(len, 64);
       br(Assembler::LT, L_by16);
-      v_eor(v16, T16B, v16, v16);
+      eor(v16, T16B, v16, v16);
 
     Label L_fold;
 
       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
 
-      v_ld1(v0, v1, T2D, buf, 32);
-      v_ld1r(v4, T2D, tmp, 8);
-      v_ld1r(v5, T2D, tmp, 8);
-      v_ld1r(v6, T2D, tmp, 8);
-      v_ld1r(v7, T2D, tmp, 8);
-      v_mov(v16, T4S, 0, crc);
-
-      v_eor(v0, T16B, v0, v16);
+      ld1(v0, v1, T2D, post(buf, 32));
+      ld1r(v4, T2D, post(tmp, 8));
+      ld1r(v5, T2D, post(tmp, 8));
+      ld1r(v6, T2D, post(tmp, 8));
+      ld1r(v7, T2D, post(tmp, 8));
+      mov(v16, T4S, 0, crc);
+
+      eor(v0, T16B, v0, v16);
       sub(len, len, 64);
 
     BIND(L_fold);
-      v_pmull(v22, T8H, v0, v5, T8B);
-      v_pmull(v20, T8H, v0, v7, T8B);
-      v_pmull(v23, T8H, v0, v4, T8B);
-      v_pmull(v21, T8H, v0, v6, T8B);
+      pmull(v22, T8H, v0, v5, T8B);
+      pmull(v20, T8H, v0, v7, T8B);
+      pmull(v23, T8H, v0, v4, T8B);
+      pmull(v21, T8H, v0, v6, T8B);
     
-      v_pmull2(v18, T8H, v0, v5, T16B);
-      v_pmull2(v16, T8H, v0, v7, T16B);
-      v_pmull2(v19, T8H, v0, v4, T16B);
-      v_pmull2(v17, T8H, v0, v6, T16B);
+      pmull2(v18, T8H, v0, v5, T16B);
+      pmull2(v16, T8H, v0, v7, T16B);
+      pmull2(v19, T8H, v0, v4, T16B);
+      pmull2(v17, T8H, v0, v6, T16B);
     
-      v_uzp1(v24, v20, v22, T8H);
-      v_uzp2(v25, v20, v22, T8H);
-      v_eor(v20, T16B, v24, v25);
+      uzp1(v24, v20, v22, T8H);
+      uzp2(v25, v20, v22, T8H);
+      eor(v20, T16B, v24, v25);
     
-      v_uzp1(v26, v16, v18, T8H);
-      v_uzp2(v27, v16, v18, T8H);
-      v_eor(v16, T16B, v26, v27);
+      uzp1(v26, v16, v18, T8H);
+      uzp2(v27, v16, v18, T8H);
+      eor(v16, T16B, v26, v27);
     
-      v_ushll2(v22, T4S, v20, T8H, 8);
-      v_ushll(v20, T4S, v20, T4H, 8);
+      ushll2(v22, T4S, v20, T8H, 8);
+      ushll(v20, T4S, v20, T4H, 8);
     
-      v_ushll2(v18, T4S, v16, T8H, 8);
-      v_ushll(v16, T4S, v16, T4H, 8);
+      ushll2(v18, T4S, v16, T8H, 8);
+      ushll(v16, T4S, v16, T4H, 8);
     
-      v_eor(v22, T16B, v23, v22);
-      v_eor(v18, T16B, v19, v18);
-      v_eor(v20, T16B, v21, v20);
-      v_eor(v16, T16B, v17, v16);
+      eor(v22, T16B, v23, v22);
+      eor(v18, T16B, v19, v18);
+      eor(v20, T16B, v21, v20);
+      eor(v16, T16B, v17, v16);
     
-      v_uzp1(v17, v16, v20, T2D);
-      v_uzp2(v21, v16, v20, T2D);
-      v_eor(v17, T16B, v17, v21);
+      uzp1(v17, v16, v20, T2D);
+      uzp2(v21, v16, v20, T2D);
+      eor(v17, T16B, v17, v21);
     
-      v_ushll2(v20, T2D, v17, T4S, 16);
-      v_ushll(v16, T2D, v17, T2S, 16);
+      ushll2(v20, T2D, v17, T4S, 16);
+      ushll(v16, T2D, v17, T2S, 16);
     
-      v_eor(v20, T16B, v20, v22);
-      v_eor(v16, T16B, v16, v18);
+      eor(v20, T16B, v20, v22);
+      eor(v16, T16B, v16, v18);
     
-      v_uzp1(v17, v20, v16, T2D);
-      v_uzp2(v21, v20, v16, T2D);
-      v_eor(v28, T16B, v17, v21);
+      uzp1(v17, v20, v16, T2D);
+      uzp2(v21, v20, v16, T2D);
+      eor(v28, T16B, v17, v21);
     
-      v_pmull(v22, T8H, v1, v5, T8B);
-      v_pmull(v20, T8H, v1, v7, T8B);
-      v_pmull(v23, T8H, v1, v4, T8B);
-      v_pmull(v21, T8H, v1, v6, T8B);
+      pmull(v22, T8H, v1, v5, T8B);
+      pmull(v20, T8H, v1, v7, T8B);
+      pmull(v23, T8H, v1, v4, T8B);
+      pmull(v21, T8H, v1, v6, T8B);
     
-      v_pmull2(v18, T8H, v1, v5, T16B);
-      v_pmull2(v16, T8H, v1, v7, T16B);
-      v_pmull2(v19, T8H, v1, v4, T16B);
-      v_pmull2(v17, T8H, v1, v6, T16B);
+      pmull2(v18, T8H, v1, v5, T16B);
+      pmull2(v16, T8H, v1, v7, T16B);
+      pmull2(v19, T8H, v1, v4, T16B);
+      pmull2(v17, T8H, v1, v6, T16B);
     
-      v_ld1(v0, v1, T2D, buf, 32);
+      ld1(v0, v1, T2D, post(buf, 32));
     
-      v_uzp1(v24, v20, v22, T8H);
-      v_uzp2(v25, v20, v22, T8H);
-      v_eor(v20, T16B, v24, v25);
+      uzp1(v24, v20, v22, T8H);
+      uzp2(v25, v20, v22, T8H);
+      eor(v20, T16B, v24, v25);
     
-      v_uzp1(v26, v16, v18, T8H);
-      v_uzp2(v27, v16, v18, T8H);
-      v_eor(v16, T16B, v26, v27);
+      uzp1(v26, v16, v18, T8H);
+      uzp2(v27, v16, v18, T8H);
+      eor(v16, T16B, v26, v27);
     
-      v_ushll2(v22, T4S, v20, T8H, 8);
-      v_ushll(v20, T4S, v20, T4H, 8);
+      ushll2(v22, T4S, v20, T8H, 8);
+      ushll(v20, T4S, v20, T4H, 8);
     
-      v_ushll2(v18, T4S, v16, T8H, 8);
-      v_ushll(v16, T4S, v16, T4H, 8);
+      ushll2(v18, T4S, v16, T8H, 8);
+      ushll(v16, T4S, v16, T4H, 8);
     
-      v_eor(v22, T16B, v23, v22);
-      v_eor(v18, T16B, v19, v18);
-      v_eor(v20, T16B, v21, v20);
-      v_eor(v16, T16B, v17, v16);
+      eor(v22, T16B, v23, v22);
+      eor(v18, T16B, v19, v18);
+      eor(v20, T16B, v21, v20);
+      eor(v16, T16B, v17, v16);
     
-      v_uzp1(v17, v16, v20, T2D);
-      v_uzp2(v21, v16, v20, T2D);
-      v_eor(v16, T16B, v17, v21);
+      uzp1(v17, v16, v20, T2D);
+      uzp2(v21, v16, v20, T2D);
+      eor(v16, T16B, v17, v21);
     
-      v_ushll2(v20, T2D, v16, T4S, 16);
-      v_ushll(v16, T2D, v16, T2S, 16);
+      ushll2(v20, T2D, v16, T4S, 16);
+      ushll(v16, T2D, v16, T2S, 16);
     
-      v_eor(v20, T16B, v22, v20);
-      v_eor(v16, T16B, v16, v18);
+      eor(v20, T16B, v22, v20);
+      eor(v16, T16B, v16, v18);
     
-      v_uzp1(v17, v20, v16, T2D);
-      v_uzp2(v21, v20, v16, T2D);
-      v_eor(v20, T16B, v17, v21);
+      uzp1(v17, v20, v16, T2D);
+      uzp2(v21, v20, v16, T2D);
+      eor(v20, T16B, v17, v21);
     
-      v_shl(v16, v28, T2D, 1);
-      v_shl(v17, v20, T2D, 1);
+      shl(v16, v28, T2D, 1);
+      shl(v17, v20, T2D, 1);
     
-      v_eor(v0, T16B, v0, v16);
-      v_eor(v1, T16B, v1, v17);
+      eor(v0, T16B, v0, v16);
+      eor(v1, T16B, v1, v17);
 
       subs(len, len, 32);
       br(Assembler::GE, L_fold);
 
       mov(crc, 0);
-      v_mov(tmp, v0, T1D, 0);
+      mov(tmp, v0, T1D, 0);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
-      v_mov(tmp, v0, T1D, 1);
+      mov(tmp, v0, T1D, 1);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
-      v_mov(tmp, v1, T1D, 0);
+      mov(tmp, v1, T1D, 0);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
-      v_mov(tmp, v1, T1D, 1);
+      mov(tmp, v1, T1D, 1);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
 
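
The bulk of this hunk drops the v_ prefixes from the vector instruction
names; the substantive change is that the raw trailing (base, imm) and
(base, Xm) argument pairs are replaced by a single Address argument. A
before/after sketch of the two call styles, taken from the lines above:

    // before: addressing variant chosen by extra raw arguments
    v_ld1(v0, v1, T2D, buf, 32);   // post-index by 32
    v_ld1r(v4, T2D, tmp, 8);       // load one 64-bit element, replicate to both lanes
    // after: the Address object names the mode explicitly
    ld1(v0, v1, T2D, post(buf, 32));
    ld1r(v4, T2D, post(tmp, 8));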
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Nov 24 13:37:15 2014 +0000
@@ -442,15 +442,16 @@
     }
   }
 
-  Register base() {
-    guarantee((_mode == base_plus_offset | _mode == base_plus_offset_reg),
+  Register base() const {
+    guarantee((_mode == base_plus_offset | _mode == base_plus_offset_reg
+               | _mode == post),
 	      "wrong mode");
     return _base;
   }
-  long offset() {
+  long offset() const {
     return _offset;
   }
-  Register index() {
+  Register index() const {
     return _index;
   }
   mode getMode() const {
@@ -1872,7 +1873,7 @@
  * We just use FloatRegister in the following. They are exactly the same
  * as SIMD registers.
  */
-public:
+ public:
 
   enum SIMD_Arrangement {
        T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
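
The SIMD_Arrangement enumerators are ordered so that encoding fields fall
out of the enum value directly, which is how the encoders below use them:

    // T & 1  -> Q bit (bit 30): 0 = 64-bit vector, 1 = 128-bit vector
    // T >> 1 -> size field (bits 11:10): 00 = B, 01 = H, 10 = S, 11 = D
    // e.g. T16B = 1: Q = 1, size = 00;  T2D = 7: Q = 1, size = 11
    f((int)T & 1, 30);
    f((int)T >> 1, 11, 10);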
@@ -1882,7 +1883,136 @@
        S32, D64, Q128
   };
 
-  void v_shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+ private:
+
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int op1, int op2) {
+    starti;
+    f(0,31), f((int)T & 1, 30);
+    f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+             int imm, int op1, int op2) {
+    starti;
+    f(0,31), f((int)T & 1, 30);
+    f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+             Register Xm, int op1, int op2) {
+    starti;
+    f(0,31), f((int)T & 1, 30);
+    f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) {
+    switch (a.getMode()) {
+    case Address::base_plus_offset:
+      guarantee(a.offset() == 0, "no offset allowed here");
+      ld_st(Vt, T, a.base(), op1, op2);
+      break;
+    case Address::post:
+      ld_st(Vt, T, a.base(), a.offset(), op1, op2);
+      break;
+    case Address::base_plus_offset_reg:
+      ld_st(Vt, T, a.base(), a.index(), op1, op2);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+  }
+
+ public:
+
+#define INSN1(NAME, op1, op2)					\
+  void NAME(FloatRegister Vt, SIMD_Arrangement T, const Address &a) {	\
+   ld_st(Vt, T, a, op1, op2);						\
+ }
+
+#define INSN2(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, const Address &a) { \
+    assert(Vt->successor() == Vt2, "Registers must be ordered");	\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+#define INSN3(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,	\
+            SIMD_Arrangement T, const Address &a) {			\
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3,		\
+           "Registers must be ordered");				\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+#define INSN4(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,	\
+            FloatRegister Vt4, SIMD_Arrangement T, const Address &a) {	\
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3 &&		\
+           Vt3->successor() == Vt4, "Registers must be ordered");	\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+  INSN1(ld1,  0b001100010, 0b0111);
+  INSN2(ld1,  0b001100010, 0b1010);
+  INSN3(ld1,  0b001100010, 0b0110);
+  INSN4(ld1,  0b001100010, 0b0010);
+
+  INSN2(ld2,  0b001100010, 0b1000);
+  INSN3(ld3,  0b001100010, 0b0100);
+  INSN4(ld4,  0b001100010, 0b0000);
+
+  INSN1(st1,  0b001100000, 0b0111);
+  INSN2(st1,  0b001100000, 0b1010);
+  INSN3(st1,  0b001100000, 0b0110);
+  INSN4(st1,  0b001100000, 0b0010);
+
+  INSN2(st2,  0b001100000, 0b1000);
+  INSN3(st3,  0b001100000, 0b0100);
+  INSN4(st4,  0b001100000, 0b0000);
+
+  INSN1(ld1r, 0b001101010, 0b1100);
+  INSN2(ld2r, 0b001101011, 0b1100);
+  INSN3(ld3r, 0b001101010, 0b1110);
+  INSN4(ld4r, 0b001101011, 0b1110);
+
+#undef INSN1
+#undef INSN2
+#undef INSN3
+#undef INSN4
+
+#define INSN(NAME, opc)                                                                 \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    assert(T == T8B || T == T16B, "must be T8B or T16B");                               \
+    f(0, 31), f((int)T & 1, 30), f(opc, 29, 21);                                        \
+    rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);                              \
+  }
+
+  INSN(eor, 0b101110001);
+  INSN(orr, 0b001110101);
+  INSN(andr, 0b001110001);
+  INSN(bic, 0b001110011);
+  INSN(bif, 0b101110111);
+  INSN(bit, 0b101110101);
+  INSN(bsl, 0b101110011);
+  INSN(orn, 0b001110111);
+
+#undef INSN
+
+#define INSN(NAME, opc)                           \
+  void NAME(FloatRegister Vd, FloatRegister Vn) { \
+    starti;                                       \
+    f(opc, 31, 10), rf(Vn, 5), rf(Vd, 0);         \
+  }
+
+  INSN(aese, 0b0100111000101000010010);
+  INSN(aesd, 0b0100111000101000010110);
+  INSN(aesmc, 0b0100111000101000011010);
+  INSN(aesimc, 0b0100111000101000011110);
+
+#undef INSN
+
+  void shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
     starti;
     /* The encodings for the immh:immb fields (bits 22:16) are
      *   0001 xxx	8B/16B, shift = xxx
@@ -1895,7 +2025,7 @@
     f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
   }
 
-  void v_ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+  void ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
     starti;
     /* The encodings for the immh:immb fields (bits 22:16) are
      *   0001 xxx	8H, 8B/16b shift = xxx
@@ -1908,22 +2038,22 @@
     f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
     f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
   }
-  void v_ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,  SIMD_Arrangement Tb, int shift) {
-    v_ushll(Vd, Ta, Vn, Tb, shift);
+  void ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,  SIMD_Arrangement Tb, int shift) {
+    ushll(Vd, Ta, Vn, Tb, shift);
   }
 
-  void v_uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T, int op = 0){
+  void uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T, int op = 0){
     starti;
     f(0, 31), f((T & 0x1), 30), f(0b001110, 29, 24), f((T >> 1), 23, 22), f(0, 21);
     rf(Vm, 16), f(0, 15), f(op, 14), f(0b0110, 13, 10), rf(Vn, 5), rf(Vd, 0);
   }
-  void v_uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T){
-    v_uzp1(Vd, Vn, Vm, T, 1);
+  void uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T){
+    uzp1(Vd, Vn, Vm, T, 1);
   }
  
   // Move from general purpose register
   //   mov  Vd.T[index], Rn
-  void v_mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
+  void mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
     starti;
     f(0b01001110000, 31, 21), f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16); 
     f(0b000111, 15, 10), rf(Xn, 5), rf(Vd, 0);
@@ -1931,7 +2061,7 @@
 
   // Move to general purpose register
   //   mov  Rd, Vn.T[index]
-  void v_mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
+  void mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
     starti;
     f(0, 31), f((T >= T1D) ? 1:0, 30), f(0b001110000, 29, 21);
     f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
@@ -1939,149 +2069,23 @@
   }
 
   // We do not handle the 1Q arrangement.
-  void v_pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+  void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
     starti;
     assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
     f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
     rf(Vn, 5), rf(Vd, 0);
   }
-  void v_pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
-    v_pmull(Vd, Ta, Vn, Vm, Tb);
-  }
-
-  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
-    starti;
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  void pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+    pmull(Vd, Ta, Vn, Vm, Tb);
   }
 
-  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((8 << ((int)T & 1)) == imm, "size/imm mismatch");      
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((16 << ((int)T & 1)) == imm, "size/imm mismatch");     
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((24 << ((int)T & 1)) == imm, "size/imm mismatch");
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, Register Xm) {
+  void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn)
+  {
     starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((32 << ((int)T & 1)) == imm, "size/imm mismatch");
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-
-  void v_st1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_st1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }  
-  void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-
-  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b001101010000001100, 29, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b001101110, 29, 21), rf(Xm, 16);
-    f(0b1100, 15, 12), f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((1 << ((int)T & 3)) == imm, "size/imm mismatch");
-    f(0, 31), f((int)T & 1, 30), f(0b001101110111111100, 29, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-
-  void v_eor(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) {
-    starti;
-    assert(T == T8B || T == T16B, "must be T8B or T16B");
-    f(0, 31), f((int)T & 1, 30), f(0b101110001, 29, 21);
-    rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);
+    assert(T <= T8H, "must be one of T8B, T16B, T4H, T8H");
+    f(0, 31), f((int)T & 1, 30), f(0b101110, 29, 24);
+    f(T <= T16B ? 0b00 : 0b01, 23, 22), f(0b100000000010, 21, 10);
+    rf(Vn, 5), rf(Vd, 0);
   }
 
   // CRC32 instructions
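
rev32 is added here because the AES stubs below byte-swap every round key
they load from the little-endian int[] key schedule before feeding it to
aese/aesd. A scalar C++ sketch of its per-word effect for the T16B
arrangement:

    #include <cstdint>
    #include <utility>

    // rev32(Vd, T16B, Vn): reverse the byte order within each 32-bit
    // word of the vector (four words in a 16-byte arrangement)
    void rev32_sketch(uint8_t v[16]) {
      for (int i = 0; i < 16; i += 4) {
        std::swap(v[i], v[i + 3]);
        std::swap(v[i + 1], v[i + 2]);
      }
    }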
@@ -2276,6 +2280,8 @@
 class MacroAssembler: public Assembler {
   friend class LIR_Assembler;
 
+  using Assembler::mov;
+
  protected:
 
   // Support for VM calls
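
The using-declaration is needed because MacroAssembler adds mov overloads
of its own, and a member name declared in a derived class hides all
base-class overloads of that name regardless of signature. A minimal
illustration (types hypothetical):

    struct Base    { void mov(const char* s); };
    struct Derived : Base {
      using Base::mov;   // re-expose the hidden Base overload
      void mov(double d);
    };
    // without the using-declaration, Derived d; d.mov("x") fails to
    // compile: Derived::mov(double) hides every Base::mov overload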
--- a/src/cpu/aarch64/vm/icache_aarch64.cpp	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/icache_aarch64.cpp	Mon Nov 24 13:37:15 2014 +0000
@@ -32,7 +32,10 @@
 
 void ICacheStubGenerator::generate_icache_flush(
   		ICache::flush_icache_stub_t* flush_icache_stub) {
-  aarch64TestHook();
   // Give anyone who calls this a surprise
   *flush_icache_stub = (ICache::flush_icache_stub_t)NULL;
 }
+
+void ICache::initialize() {
+  aarch64TestHook();
+}
--- a/src/cpu/aarch64/vm/icache_aarch64.hpp	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/icache_aarch64.hpp	Mon Nov 24 13:37:15 2014 +0000
@@ -33,7 +33,7 @@
 
 class ICache : public AbstractICache {
  public:
-  static void initialize() {}
+  static void initialize();
   static void invalidate_word(address addr) {
     __clear_cache((char *)addr, (char *)(addr + 3));
   }
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Mon Nov 24 13:37:15 2014 +0000
@@ -1676,6 +1676,414 @@
                                                                         /*dest_uninitialized*/true);
   }
 
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    Label L_doLast;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+    __ enter();
+
+    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+    __ ld1(v0, __ T16B, from); // get 16 bytes of input
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+    __ aese(v0, v3);
+    __ aesmc(v0, v0);
+    __ aese(v0, v4);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+    __ aese(v0, v3);
+    __ aesmc(v0, v0);
+    __ aese(v0, v4);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 44);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 52);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ BIND(L_doLast);
+
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+
+    __ ld1(v1, __ T16B, key);
+    __ rev32(v1, __ T16B, v1);
+    __ eor(v0, __ T16B, v0, v1);
+
+    __ st1(v0, __ T16B, to);
+
+    __ mov(r0, 0);
+
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES, "need AES instructions");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+    __ ld1(v0, __ T16B, from); // get 16 bytes of input
+
+    __ ld1(v5, __ T16B, __ post(key, 16));
+    __ rev32(v5, __ T16B, v5);
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v3);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v4);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v3);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v4);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 44);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 52);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ BIND(L_doLast);
+
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+
+    __ eor(v0, __ T16B, v0, v5);
+
+    __ st1(v0, __ T16B, to);
+
+    __ mov(r0, 0);
+
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   x0        - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES, "need AES instructions");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+
+    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+      __ enter();
+
+      __ mov(rscratch2, len_reg);
+      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+      __ ld1(v0, __ T16B, rvec);
+
+      __ cmpw(keylen, 52);
+      __ br(Assembler::CC, L_loadkeys_44);
+      __ br(Assembler::EQ, L_loadkeys_52);
+
+      __ ld1(v17, v18, __ T16B, __ post(key, 32));
+      __ rev32(v17, __ T16B, v17);
+      __ rev32(v18, __ T16B, v18);
+    __ BIND(L_loadkeys_52);
+      __ ld1(v19, v20, __ T16B, __ post(key, 32));
+      __ rev32(v19, __ T16B, v19);
+      __ rev32(v20, __ T16B, v20);
+    __ BIND(L_loadkeys_44);
+      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
+      __ rev32(v21, __ T16B, v21);
+      __ rev32(v22, __ T16B, v22);
+      __ rev32(v23, __ T16B, v23);
+      __ rev32(v24, __ T16B, v24);
+      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
+      __ rev32(v25, __ T16B, v25);
+      __ rev32(v26, __ T16B, v26);
+      __ rev32(v27, __ T16B, v27);
+      __ rev32(v28, __ T16B, v28);
+      __ ld1(v29, v30, v31, __ T16B, key);
+      __ rev32(v29, __ T16B, v29);
+      __ rev32(v30, __ T16B, v30);
+      __ rev32(v31, __ T16B, v31);
+
+    __ BIND(L_aes_loop);
+      __ ld1(v1, __ T16B, __ post(from, 16));
+      __ eor(v0, __ T16B, v0, v1);
+
+      __ br(Assembler::CC, L_rounds_44);
+      __ br(Assembler::EQ, L_rounds_52);
+
+      __ aese(v0, v17); __ aesmc(v0, v0);
+      __ aese(v0, v18); __ aesmc(v0, v0);
+    __ BIND(L_rounds_52);
+      __ aese(v0, v19); __ aesmc(v0, v0);
+      __ aese(v0, v20); __ aesmc(v0, v0);
+    __ BIND(L_rounds_44);
+      __ aese(v0, v21); __ aesmc(v0, v0);
+      __ aese(v0, v22); __ aesmc(v0, v0);
+      __ aese(v0, v23); __ aesmc(v0, v0);
+      __ aese(v0, v24); __ aesmc(v0, v0);
+      __ aese(v0, v25); __ aesmc(v0, v0);
+      __ aese(v0, v26); __ aesmc(v0, v0);
+      __ aese(v0, v27); __ aesmc(v0, v0);
+      __ aese(v0, v28); __ aesmc(v0, v0);
+      __ aese(v0, v29); __ aesmc(v0, v0);
+      __ aese(v0, v30);
+      __ eor(v0, __ T16B, v0, v31);
+
+      __ st1(v0, __ T16B, __ post(to, 16));
+      __ sub(len_reg, len_reg, 16);
+      __ cbnz(len_reg, L_aes_loop);
+
+      __ st1(v0, __ T16B, rvec);
+
+      __ mov(r0, rscratch2);
+
+      __ leave();
+      __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   x0        - input length
+  //
+  address generate_cipherBlockChaining_decryptAESCrypt() {
+    assert(UseAES, "need AES instructions");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+
+    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+      __ enter();
+
+      __ mov(rscratch2, len_reg);
+      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+      __ ld1(v2, __ T16B, rvec);
+
+      __ ld1(v31, __ T16B, __ post(key, 16));
+      __ rev32(v31, __ T16B, v31);
+
+      __ cmpw(keylen, 52);
+      __ br(Assembler::CC, L_loadkeys_44);
+      __ br(Assembler::EQ, L_loadkeys_52);
+
+      __ ld1(v17, v18, __ T16B, __ post(key, 32));
+      __ rev32(v17, __ T16B, v17);
+      __ rev32(v18, __ T16B, v18);
+    __ BIND(L_loadkeys_52);
+      __ ld1(v19, v20, __ T16B, __ post(key, 32));
+      __ rev32(v19, __ T16B, v19);
+      __ rev32(v20, __ T16B, v20);
+    __ BIND(L_loadkeys_44);
+      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
+      __ rev32(v21, __ T16B, v21);
+      __ rev32(v22, __ T16B, v22);
+      __ rev32(v23, __ T16B, v23);
+      __ rev32(v24, __ T16B, v24);
+      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
+      __ rev32(v25, __ T16B, v25);
+      __ rev32(v26, __ T16B, v26);
+      __ rev32(v27, __ T16B, v27);
+      __ rev32(v28, __ T16B, v28);
+      __ ld1(v29, v30, __ T16B, key);
+      __ rev32(v29, __ T16B, v29);
+      __ rev32(v30, __ T16B, v30);
+
+    __ BIND(L_aes_loop);
+      __ ld1(v0, __ T16B, __ post(from, 16));
+      __ orr(v1, __ T16B, v0, v0);
+
+      __ br(Assembler::CC, L_rounds_44);
+      __ br(Assembler::EQ, L_rounds_52);
+
+      __ aesd(v0, v17); __ aesimc(v0, v0);
+      __ aesd(v0, v18); __ aesimc(v0, v0);
+    __ BIND(L_rounds_52);
+      __ aesd(v0, v19); __ aesimc(v0, v0);
+      __ aesd(v0, v20); __ aesimc(v0, v0);
+    __ BIND(L_rounds_44);
+      __ aesd(v0, v21); __ aesimc(v0, v0);
+      __ aesd(v0, v22); __ aesimc(v0, v0);
+      __ aesd(v0, v23); __ aesimc(v0, v0);
+      __ aesd(v0, v24); __ aesimc(v0, v0);
+      __ aesd(v0, v25); __ aesimc(v0, v0);
+      __ aesd(v0, v26); __ aesimc(v0, v0);
+      __ aesd(v0, v27); __ aesimc(v0, v0);
+      __ aesd(v0, v28); __ aesimc(v0, v0);
+      __ aesd(v0, v29); __ aesimc(v0, v0);
+      __ aesd(v0, v30);
+      __ eor(v0, __ T16B, v0, v31);
+      __ eor(v0, __ T16B, v0, v2);
+
+      __ st1(v0, __ T16B, __ post(to, 16));
+      __ orr(v2, __ T16B, v1, v1);
+
+      __ sub(len_reg, len_reg, 16);
+      __ cbnz(len_reg, L_aes_loop);
+
+      __ st1(v2, __ T16B, rvec);
+
+      __ mov(r0, rscratch2);
+
+      __ leave();
+      __ ret(lr);
+
+    return start;
+  }
+
   // AARCH64 use safefetch stubs unless we are building for the simulator
   // in which case the x86 asm code in linux_aarch64.S is used
 
@@ -1930,6 +2338,13 @@
     generate_arraycopy_stubs();
 
 #ifndef BUILTIN_SIM
+    if (UseAESIntrinsics) {
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
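
All four AES stubs dispatch on the key-schedule length read from the int[]
header, comparing keylen against 44 and 52. For reference, the standard
AES correspondence those comparisons rely on:

    // the AES key schedule holds 4 * (rounds + 1) 32-bit words:
    //   AES-128: 44 ints -> 10 rounds
    //   AES-192: 52 ints -> 12 rounds
    //   AES-256: 60 ints -> 14 rounds
    static int rounds_for(int keylen_in_ints) { return keylen_in_ints / 4 - 1; }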
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Fri Nov 21 20:35:24 2014 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Nov 24 13:37:15 2014 +0000
@@ -47,6 +47,10 @@
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
 
+#ifndef HWCAP_AES
+#define HWCAP_AES   (1<<3)
+#endif
+
 #ifndef HWCAP_CRC32
 #define HWCAP_CRC32 (1<<7)
 #endif
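
The fallback defines cover toolchains whose <asm/hwcap.h> predates these
bits; the values themselves are fixed by the Linux AArch64 ABI. The probe
that fills auxv (already present in this file via <sys/auxv.h>) amounts to:

    unsigned long auxv = getauxval(AT_HWCAP);        // kernel hwcap bits
    bool cpu_has_aes   = (auxv & HWCAP_AES)   != 0;  // bit 3
    bool cpu_has_crc32 = (auxv & HWCAP_CRC32) != 0;  // bit 7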
@@ -121,6 +125,22 @@
   if (UseCRC32 && (auxv & HWCAP_CRC32) == 0) {
     warning("UseCRC32 specified, but not supported on this CPU");
   }
+  if (auxv & HWCAP_AES) {
+    UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
+    UseAESIntrinsics =
+        UseAESIntrinsics || (UseAES && FLAG_IS_DEFAULT(UseAESIntrinsics));
+    if (UseAESIntrinsics && !UseAES) {
+      warning("UseAESIntrinsics enabled, but UseAES not, enabling");
+      UseAES = true;
+    }
+  } else {
+    if (UseAES) {
+      warning("UseAES specified, but not supported on this CPU");
+    }
+    if (UseAESIntrinsics) {
+      warning("UseAESIntrinsics specified, but not supported on this CPU");
+    }
+  }
 #endif
 
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {