changeset 6027:81528bb814f8

merge
author adinn
date Thu, 04 Dec 2014 14:30:02 +0000
parents 205e1ae8868b (diff) 0c2099cd04cd (current diff)
children 6712ee98b46e
diffstat 37 files changed, 2229 insertions(+), 2503 deletions(-)
--- a/make/linux/makefiles/vm.make	Fri Nov 28 03:10:21 2014 +0000
+++ b/make/linux/makefiles/vm.make	Thu Dec 04 14:30:02 2014 +0000
@@ -92,6 +92,10 @@
 BUILD_USER    = -DHOTSPOT_BUILD_USER="\"$(HOTSPOT_BUILD_USER)\""
 VM_DISTRO     = -DHOTSPOT_VM_DISTRO="\"$(HOTSPOT_VM_DISTRO)\""
 
+ifeq ($(BUILTIN_SIM), true)
+  HS_LIB_ARCH=-DHOTSPOT_LIB_ARCH="\"aarch64\""
+endif
+
 CXXFLAGS =           \
   ${SYSDEFS}         \
   ${INCLUDES}        \
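
A minimal standalone sketch (illustrative only, not part of this changeset) of how a quoted define such as the HS_LIB_ARCH setting above reaches C++ code; the fallback value is an assumption so the sketch builds on its own:

    // hs_lib_arch_sketch.cpp -- the makefile normally injects
    //   -DHOTSPOT_LIB_ARCH="\"aarch64\""
    #include <cstdio>

    #ifndef HOTSPOT_LIB_ARCH
    #define HOTSPOT_LIB_ARCH "aarch64"  // fallback so this sketch compiles standalone
    #endif

    int main() {
      // the -D option expands to a C string literal, usable directly
      std::printf("lib arch: %s\n", HOTSPOT_LIB_ARCH);
      return 0;
    }
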
--- a/src/cpu/aarch64/vm/aarch64.ad	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Dec 04 14:30:02 2014 +0000
@@ -804,11 +804,6 @@
 
 //=============================================================================
 
-// Emit an interrupt that is caught by the debugger (for debugging compiler).
-void emit_break(CodeBuffer &cbuf) {
-  Unimplemented();
-}
-
 #ifndef PRODUCT
 void MachBreakpointNode::format(PhaseRegAlloc *ra_, outputStream *st) const {
   st->print("BREAKPOINT");
@@ -1363,12 +1358,10 @@
   return 4;
 }
 
-// !!! FIXME AARCH64 -- this needs to be reworked for jdk7
-
 uint size_java_to_interp()
 {
-  // count a mov mem --> to 3 movz/k and a branch
-  return 4 * NativeInstruction::instruction_size;
+  // on jdk7 we only need a mov oop and a branch
+  return 2 * NativeInstruction::instruction_size;
 }
 
 // Offset from start of compiled java to interpreter stub to the load
@@ -1395,11 +1388,11 @@
   // static stub relocation stores the instruction address of the call
   const RelocationHolder &rspec = static_stub_Relocation::spec(mark);
   __ relocate(rspec);
-  // !!! FIXME AARCH64
   // static stub relocation also tags the methodOop in the code-stream.
-  // for jdk7 we have to use movoop and locate the oop in the cpool
-  // if we use an immediate then patching fails to update the pool
-  // oop and GC overwrites the patch with movk/z 0x0000 again
+  //
+  // n.b. for jdk7 we have to use movoop and locate the oop in the
+  // cpool; if we use an immediate then patching fails to update the
+  // pool oop and GC overwrites the patch with movk/z 0x0000 again
   __ movoop(rmethod, (jobject) NULL);
   // This is recognized as unresolved by relocs/nativeinst/ic code
   __ b(__ pc());
@@ -1412,9 +1405,8 @@
 // relocation entries for call stub, compiled java to interpreter
 uint reloc_java_to_interp()
 {
-  // TODO fixme
-  // return a large number
-  return 5;
+  // n.b. on jdk7 we use a movoop and a branch
+  return 2;
 }
 
 //=============================================================================
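
The three hunks above re-cost the compiled-java-to-interpreter stub for jdk7: the old movz/movk immediate sequence gave a 4-instruction stub, while the movoop-via-constant-pool form is just an oop load plus a branch. A standalone sketch of the accounting (instruction size is AArch64's fixed 4 bytes; the reloc count mirrors reloc_java_to_interp above):

    // java_to_interp_sketch.cpp -- illustrative arithmetic only
    #include <cstdio>

    int main() {
      const unsigned instruction_size = 4;  // AArch64 fixed-width encoding
      const unsigned stub_insns = 2;        // movoop (cpool load) + b
      const unsigned stub_bytes = stub_insns * instruction_size;
      const unsigned relocs     = 2;        // as returned by reloc_java_to_interp()
      std::printf("stub: %u insns, %u bytes, %u relocs\n",
                  stub_insns, stub_bytes, relocs);
      return 0;
    }
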
@@ -2414,16 +2406,13 @@
     int disp = $mem$$disp;
     if (index == -1) {
       __ prfm(Address(base, disp), PLDL1KEEP);
-      __ nop();
     } else {
       Register index_reg = as_Register(index);
       if (disp == 0) {
-        // __ prfm(Address(base, index_reg, Address::lsl(scale)), PLDL1KEEP);
-        __ nop();
+        __ prfm(Address(base, index_reg, Address::lsl(scale)), PLDL1KEEP);
       } else {
         __ lea(rscratch1, Address(base, disp));
 	__ prfm(Address(rscratch1, index_reg, Address::lsl(scale)), PLDL1KEEP);
-        __ nop();
       }
     }
   %}
@@ -2441,11 +2430,9 @@
       Register index_reg = as_Register(index);
       if (disp == 0) {
         __ prfm(Address(base, index_reg, Address::lsl(scale)), PSTL1KEEP);
-        __ nop();
       } else {
         __ lea(rscratch1, Address(base, disp));
 	__ prfm(Address(rscratch1, index_reg, Address::lsl(scale)), PSTL1KEEP);
-        __ nop();
       }
     }
   %}
@@ -2458,16 +2445,13 @@
     int disp = $mem$$disp;
     if (index == -1) {
       __ prfm(Address(base, disp), PSTL1STRM);
-      __ nop();
     } else {
       Register index_reg = as_Register(index);
       if (disp == 0) {
         __ prfm(Address(base, index_reg, Address::lsl(scale)), PSTL1STRM);
-        __ nop();
       } else {
         __ lea(rscratch1, Address(base, disp));
 	__ prfm(Address(rscratch1, index_reg, Address::lsl(scale)), PSTL1STRM);
-        __ nop();
       }
     }
   %}
@@ -2589,7 +2573,12 @@
     Register dst_reg = as_Register($dst$$reg);
     unsigned long off;
     __ adrp(dst_reg, ExternalAddress(page), off);
-    assert(off == 0, "assumed offset == 0");
+    assert((off & 0x3ffL) == 0, "assumed offset aligned to 0x400");
+    // n.b. intra-page offset will never change even if this gets
+    // relocated so it is safe to omit the lea when off == 0
+    if (off != 0) {
+      __ lea(dst_reg, Address(dst_reg, off));
+    }
   %}
 
   enc_class aarch64_enc_mov_n(iRegN dst, immN src) %{
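
The relaxed assert plus conditional lea above rely on adrp splitting an address into a 4KB page base and an intra-page offset that, per the new comment, relocation never changes. A standalone sketch of that arithmetic (addresses hypothetical):

    // adrp_offset_sketch.cpp
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t target = 0x12345400ULL;       // hypothetical polling-page address
      uint64_t page   = target & ~0xfffULL;  // what adrp leaves in dst_reg
      uint64_t off    = target &  0xfffULL;  // intra-page offset
      assert((off & 0x3ffULL) == 0);         // mirrors the new assert above
      if (off != 0) {
        page += off;                         // stands in for: lea dst, Address(dst, off)
      }
      assert(page == target);
      return 0;
    }
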
@@ -3374,6 +3363,16 @@
   interface(CONST_INTER);
 %}
 
+operand immI_le_4()
+%{
+  predicate(n->get_int() <= 4);
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 operand immI_31()
 %{
   predicate(n->get_int() == 31);
@@ -4698,17 +4697,14 @@
 attributes %{
   // ARM instructions are of fixed length
  fixed_size_instructions;        // Fixed size instructions
-  // TODO does this relate to how many instructions can be scheduled
-  // at once? just guess 8 for now
-  max_instructions_per_bundle = 8;   // Up to 8 instructions per bundle
+  max_instructions_per_bundle = 2;   // A53 = 2, A57 = 4
   // ARM instructions come in 32-bit word units
   instruction_unit_size = 4;         // An instruction is 4 bytes long
-  // TODO identify correct cache line size  just guess 64 for now
   instruction_fetch_unit_size = 64;  // The processor fetches one line
   instruction_fetch_units = 1;       // of 64 bytes
 
   // List of nop instructions
-  //nops( MachNop );
+  nops( MachNop );
 %}
 
 // We don't use an actual pipeline model so don't care about resources
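
Quick arithmetic on the fetch geometry declared above (a sketch only):

    // fetch_geometry_sketch.cpp
    #include <cstdio>

    int main() {
      const int instruction_unit_size       = 4;   // bytes per instruction
      const int instruction_fetch_unit_size = 64;  // one fetched line, in bytes
      std::printf("%d instructions per fetch line\n",
                  instruction_fetch_unit_size / instruction_unit_size);  // prints 16
      return 0;
    }
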
@@ -4718,21 +4714,387 @@
 //----------RESOURCES----------------------------------------------------------
 // Resources are the functional units available to the machine
 
-resources( D0, D1, D2, DECODE = D0 | D1 | D2,
-           MS0, MS1, MS2, MEM = MS0 | MS1 | MS2,
-           BR, FPU,
-           ALU0, ALU1, ALU2, ALU = ALU0 | ALU1 | ALU2);
+resources( INS0, INS1, INS01 = INS0 | INS1,
+           ALU0, ALU1, ALU = ALU0 | ALU1,
+           MAC,
+           DIV,
+           BRANCH,
+           LDST,
+           NEON_FP);
 
 //----------PIPELINE DESCRIPTION-----------------------------------------------
 // Pipeline Description specifies the stages in the machine's pipeline
 
 // Generic P2/P3 pipeline
-pipe_desc(S0, S1, S2, S3, S4, S5);
+pipe_desc(ISS, EX1, EX2, WR);
 
 //----------PIPELINE CLASSES---------------------------------------------------
 // Pipeline Classes describe the stages in which input and output are
 // referenced by the hardware pipeline.
 
+//------- Integer ALU operations --------------------------
+
+// Integer ALU reg-reg operation
+// Operands needed in EX1, result generated in EX2
+// Eg.	ADD	x0, x1, x2
+pipe_class ialu_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  src2   : EX1(read);
+  INS01  : ISS; // Dual issue as instruction 0 or 1
+  ALU    : EX2;
+%}
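
To read the stage annotations: ialu_reg_reg needs its sources in EX1 but only produces its result in EX2, so two dependent ALU ops issued back to back leave a one-cycle bubble. A toy model of that calculation (editorial sketch assuming one stage per cycle, not ADLC semantics):

    // stage_bubble_sketch.cpp
    #include <cstdio>

    int main() {
      enum { ISS = 0, EX1 = 1, EX2 = 2, WR = 3 };  // the pipe_desc stages above
      int producer_writes = EX2;   // dst  : EX2(write)
      int consumer_reads  = EX1;   // srcN : EX1(read)
      // Issued on consecutive cycles, the consumer's EX1 coincides with the
      // producer's EX2, so it waits (EX2 - EX1) extra cycles for the result.
      int bubble = producer_writes - consumer_reads;
      std::printf("dependent ALU bubble: %d cycle(s)\n", bubble);  // 1
      return 0;
    }
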
+
+// Integer ALU reg-reg operation with constant shift
+// Shifted register must be available in LATE_ISS instead of EX1
+// Eg.	ADD	x0, x1, x2, LSL #2
+pipe_class ialu_reg_reg_shift(iRegI dst, iRegI src1, iRegI src2, immI shift)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg operation with constant shift
+// Eg.	LSL	x0, x1, #shift
+pipe_class ialu_reg_shift(iRegI dst, iRegI src1)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-reg operation with variable shift
+// Both operands must be available in LATE_ISS instead of EX1
+// Result is available in EX1 instead of EX2
+// Eg.	LSLV	x0, x1, x2
+pipe_class ialu_reg_reg_vshift(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX1(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX1;
+%}
+
+// Integer ALU reg-reg operation with extract
+// As for _vshift above, but result generated in EX2
+// Eg.	EXTR	x0, x1, x2, #N
+pipe_class ialu_reg_reg_extr(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS1   : ISS; // Can only dual issue as Instruction 1
+  ALU    : EX1;
+%}
+
+// Integer ALU reg operation
+// Eg.	NEG	x0, x1
+pipe_class ialu_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-immediate operation
+// Eg.	ADD	x0, x1, #N
+pipe_class ialu_reg_imm(iRegI dst, iRegI src1)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU immediate operation (no source operands)
+// Eg.	MOV	x0, #N
+pipe_class ialu_imm(iRegI dst)
+%{
+  single_instruction;
+  dst    : EX1(write);
+  INS01  : ISS;
+  ALU    : EX1;
+%}
+
+//------- Compare operation -------------------------------
+
+// Compare reg-reg
+// Eg.	CMP	x0, x1
+pipe_class icmp_reg_reg(rFlagsReg cr, iRegI op1, iRegI op2)
+%{
+  single_instruction;
+//  fixed_latency(16);
+  cr     : EX2(write);
+  op1    : EX1(read);
+  op2    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Compare reg-imm
+// Eg.	CMP	x0, #N
+pipe_class icmp_reg_imm(rFlagsReg cr, iRegI op1)
+%{
+  single_instruction;
+//  fixed_latency(16);
+  cr     : EX2(write);
+  op1    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+//------- Conditional instructions ------------------------
+
+// Conditional no operands
+// Eg.	CSINC	x0, zr, zr, <cond>
+pipe_class icond_none(iRegI dst, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Conditional 2 operand
+// EG.	CSEL	X0, X1, X2, <cond>
+pipe_class icond_reg_reg(iRegI dst, iRegI src1, iRegI src2, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  src1   : EX1(read);
+  src2   : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Conditional 1 operand
+// EG.	CSEL	X0, X1, ZR, <cond>
+pipe_class icond_reg(iRegI dst, iRegI src, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  src    : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+//------- Multiply pipeline operations --------------------
+
+// Multiply reg-reg
+// Eg.	MUL	w0, w1, w2
+pipe_class imul_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Multiply accumulate
+// Eg.	MADD	w0, w1, w2, w3
+pipe_class imac_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  src3   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Long multiply reg-reg
+// Eg.	MUL	x0, x1, x2
+pipe_class lmul_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(3); // Maximum latency for 64 bit mul
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Long multiply accumulate
+// Eg.	MADD	x0, x1, x2, x3
+pipe_class lmac_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3)
+%{
+  single_instruction;
+  fixed_latency(3); // Maximum latency for 64 bit mul
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  src3   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+//------- Divide pipeline operations --------------------
+
+// Eg.	SDIV	w0, w1, w2
+pipe_class idiv_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(8); // Maximum latency for 32 bit divide
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS0   : ISS; // Can only dual issue as instruction 0
+  DIV    : WR;
+%}
+
+// Eg.	SDIV	x0, x1, x2
+pipe_class ldiv_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(16); // Maximum latency for 64 bit divide
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS0   : ISS; // Can only dual issue as instruction 0
+  DIV    : WR;
+%}
+
+//------- Load pipeline operations ------------------------
+
+// Load - prefetch
+// Eg.	PRFM	<mem>
+pipe_class iload_prefetch(memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Load - reg, mem
+// Eg.	LDR	x0, <mem>
+pipe_class iload_reg_mem(iRegI dst, memory mem)
+%{
+  single_instruction;
+  dst    : WR(write);
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Load - reg, reg
+// Eg.	LDR	x0, [sp, x1]
+pipe_class iload_reg_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+//------- Store pipeline operations -----------------------
+
+// Store - zr, mem
+// Eg.	STR	zr, <mem>
+pipe_class istore_mem(memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Store - reg, mem
+// Eg.	STR	x0, <mem>
+pipe_class istore_reg_mem(iRegI src, memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  src    : EX2(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Store - reg, reg
+// Eg. STR	x0, [sp, x1]
+pipe_class istore_reg_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : ISS(read);
+  src    : EX2(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+//------- Branch pipeline operations ----------------------
+
+// Branch
+pipe_class pipe_branch()
+%{
+  single_instruction;
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+// Conditional branch
+pipe_class pipe_branch_cond(rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+// Compare & Branch
+// EG.	CBZ/CBNZ
+pipe_class pipe_cmp_branch(iRegI op1)
+%{
+  single_instruction;
+  op1    : EX1(read);
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+//------- Synchronisation operations ----------------------
+
+// Any operation requiring serialization.
+// EG.	DMB/Atomic Ops/Load Acquire/Str Release
+pipe_class pipe_serial()
+%{
+  single_instruction;
+  force_serialization;
+  fixed_latency(16);
+  INS01  : ISS(2); // Cannot dual issue with any other instruction
+  LDST   : WR;
+%}
+
+// Generic big/slow expanded idiom - also serialized
+pipe_class pipe_slow()
+%{
+  instruction_count(10);
+  multiple_bundles;
+  force_serialization;
+  fixed_latency(16);
+  INS01  : ISS(2); // Cannot dual issue with any other instruction
+  LDST   : WR;
+%}
+
 // Empty pipeline class
 pipe_class pipe_class_empty()
 %{
@@ -4754,13 +5116,6 @@
   fixed_latency(16);
 %}
 
-// Pipeline class for traps.
-pipe_class pipe_class_trap()
-%{
-  single_instruction;
-  fixed_latency(100);
-%}
-
 // Pipeline class for memory operations.
 pipe_class pipe_class_memory()
 %{
@@ -4777,7 +5132,7 @@
 
 // Define the class for the Nop node.
 define %{
-   MachNop = pipe_class_default;
+  MachNop = pipe_class_empty;
 %}
 
 %}
@@ -4817,7 +5172,7 @@
 
   ins_encode(aarch64_enc_ldrsbw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Byte (8 bit signed) into long
@@ -4830,7 +5185,7 @@
 
   ins_encode(aarch64_enc_ldrsb(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Byte (8 bit unsigned)
@@ -4843,7 +5198,7 @@
 
   ins_encode(aarch64_enc_ldrb(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Byte (8 bit unsigned) into long
@@ -4856,7 +5211,7 @@
 
   ins_encode(aarch64_enc_ldrb(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Short (16 bit signed)
@@ -4869,7 +5224,7 @@
 
   ins_encode(aarch64_enc_ldrshw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Short (16 bit signed) into long
@@ -4882,7 +5237,7 @@
 
   ins_encode(aarch64_enc_ldrsh(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Char (16 bit unsigned)
@@ -4895,7 +5250,7 @@
 
   ins_encode(aarch64_enc_ldrh(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Short/Char (16 bit unsigned) into long
@@ -4908,7 +5263,7 @@
 
   ins_encode(aarch64_enc_ldrh(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Integer (32 bit signed)
@@ -4921,7 +5276,7 @@
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Integer (32 bit signed) into long
@@ -4934,7 +5289,7 @@
 
   ins_encode(aarch64_enc_ldrsw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Integer (32 bit unsigned) into long
@@ -4947,7 +5302,7 @@
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Long (64 bit signed)
@@ -4960,7 +5315,7 @@
 
   ins_encode(aarch64_enc_ldr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Range
@@ -4973,7 +5328,7 @@
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Pointer
@@ -4986,7 +5341,7 @@
 
   ins_encode(aarch64_enc_ldr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Compressed Pointer
@@ -4999,7 +5354,7 @@
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Klass Pointer
@@ -5012,7 +5367,7 @@
 
   ins_encode(aarch64_enc_ldr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Narrow Klass Pointer
@@ -5025,7 +5380,7 @@
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Float
@@ -5065,7 +5420,7 @@
 
   ins_encode( aarch64_enc_movw_imm(dst, src) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Long Constant
@@ -5078,7 +5433,7 @@
 
   ins_encode( aarch64_enc_mov_imm(dst, src) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Pointer Constant
@@ -5094,7 +5449,7 @@
 
   ins_encode(aarch64_enc_mov_p(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Null Pointer Constant
@@ -5108,7 +5463,7 @@
 
   ins_encode(aarch64_enc_mov_p0(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Pointer Constant One
@@ -5122,7 +5477,7 @@
 
   ins_encode(aarch64_enc_mov_p1(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Poll Page Constant
@@ -5136,7 +5491,7 @@
 
   ins_encode(aarch64_enc_mov_poll_page(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Byte Map Base Constant
@@ -5150,7 +5505,7 @@
 
   ins_encode(aarch64_enc_mov_byte_map_base(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Narrow Pointer Constant
@@ -5164,7 +5519,7 @@
 
   ins_encode(aarch64_enc_mov_n(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Narrow Null Pointer Constant
@@ -5178,7 +5533,7 @@
 
   ins_encode(aarch64_enc_mov_n0(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Packed Float Constant
@@ -5254,7 +5609,7 @@
 
   ins_encode(aarch64_enc_strb0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Byte
@@ -5267,7 +5622,7 @@
 
   ins_encode(aarch64_enc_strb(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 
@@ -5280,7 +5635,7 @@
 
   ins_encode(aarch64_enc_strb0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Char/Short
@@ -5293,7 +5648,7 @@
 
   ins_encode(aarch64_enc_strh(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 instruct storeimmC0(immI0 zero, memory mem)
@@ -5305,7 +5660,7 @@
 
   ins_encode(aarch64_enc_strh0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Integer
@@ -5319,7 +5674,7 @@
 
   ins_encode(aarch64_enc_strw(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 instruct storeimmI0(immI0 zero, memory mem)
@@ -5331,7 +5686,7 @@
 
   ins_encode(aarch64_enc_strw0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Long (64 bit signed)
@@ -5344,7 +5699,7 @@
 
   ins_encode(aarch64_enc_str(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 // Store Long (64 bit signed)
@@ -5357,7 +5712,7 @@
 
   ins_encode(aarch64_enc_str0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Pointer
@@ -5370,7 +5725,7 @@
 
   ins_encode(aarch64_enc_str(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 // Store Pointer
@@ -5383,7 +5738,7 @@
 
   ins_encode(aarch64_enc_str0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Save last Java PC to thread anchor
@@ -5500,7 +5855,7 @@
 
   ins_encode( aarch64_enc_prefetchr(mem) );
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_prefetch);
 %}
 
 instruct prefetchw( memory mem ) %{
@@ -5511,7 +5866,7 @@
 
   ins_encode( aarch64_enc_prefetchw(mem) );
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_prefetch);
 %}
 
 instruct prefetchnta( memory mem ) %{
@@ -5522,64 +5877,64 @@
 
   ins_encode( aarch64_enc_prefetchnta(mem) );
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_prefetch);
 %}
 
 // ============================================================================
 // BSWAP Instructions
 
-instruct bytes_reverse_int(iRegINoSp dst) %{
-  match(Set dst (ReverseBytesI dst));
-
-  ins_cost(INSN_COST);
-  format %{ "revw  $dst, $dst" %}
-
-  ins_encode %{
-    __ revw(as_Register($dst$$reg), as_Register($dst$$reg));
-  %}
-
-  ins_pipe( pipe_class_default );
-%}
-
-instruct bytes_reverse_long(iRegLNoSp dst) %{
-  match(Set dst (ReverseBytesL dst));
-
-  ins_cost(INSN_COST);
-  format %{ "rev  $dst, $dst" %}
-
-  ins_encode %{
-    __ rev(as_Register($dst$$reg), as_Register($dst$$reg));
-  %}
-
-  ins_pipe( pipe_class_default );
-%}
-
-instruct bytes_reverse_unsigned_short(iRegINoSp dst) %{
-  match(Set dst (ReverseBytesUS dst));
-
-  ins_cost(INSN_COST);
-  format %{ "rev16w  $dst, $dst" %}
-
-  ins_encode %{
-    __ rev16w(as_Register($dst$$reg), as_Register($dst$$reg));
-  %}
-
-  ins_pipe( pipe_class_default );
-%}
-
-instruct bytes_reverse_short(iRegINoSp dst) %{
-  match(Set dst (ReverseBytesS dst));
-
-  ins_cost(INSN_COST);
-  format %{ "rev16w  $dst, $dst\n\t"
+instruct bytes_reverse_int(iRegINoSp dst, iRegIorL2I src) %{
+  match(Set dst (ReverseBytesI src));
+
+  ins_cost(INSN_COST);
+  format %{ "revw  $dst, $src" %}
+
+  ins_encode %{
+    __ revw(as_Register($dst$$reg), as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct bytes_reverse_long(iRegLNoSp dst, iRegL src) %{
+  match(Set dst (ReverseBytesL src));
+
+  ins_cost(INSN_COST);
+  format %{ "rev  $dst, $src" %}
+
+  ins_encode %{
+    __ rev(as_Register($dst$$reg), as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct bytes_reverse_unsigned_short(iRegINoSp dst, iRegIorL2I src) %{
+  match(Set dst (ReverseBytesUS src));
+
+  ins_cost(INSN_COST);
+  format %{ "rev16w  $dst, $src" %}
+
+  ins_encode %{
+    __ rev16w(as_Register($dst$$reg), as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct bytes_reverse_short(iRegINoSp dst, iRegIorL2I src) %{
+  match(Set dst (ReverseBytesS src));
+
+  ins_cost(INSN_COST);
+  format %{ "rev16w  $dst, $src\n\t"
             "sbfmw $dst, $dst, #0, #15" %}
 
   ins_encode %{
-    __ rev16w(as_Register($dst$$reg), as_Register($dst$$reg));
+    __ rev16w(as_Register($dst$$reg), as_Register($src$$reg));
     __ sbfmw(as_Register($dst$$reg), as_Register($dst$$reg), 0U, 15U);
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe(ialu_reg);
 %}
 
 // ============================================================================
@@ -5594,7 +5949,7 @@
     __ clzw(as_Register($dst$$reg), as_Register($src$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe( ialu_reg );
 %}
 
 instruct countLeadingZerosL(iRegI dst, iRegL src) %{
@@ -5606,7 +5961,7 @@
     __ clz(as_Register($dst$$reg), as_Register($src$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe( ialu_reg );
 %}
 
 instruct countTrailingZerosI(iRegI dst, iRegI src) %{
@@ -5620,7 +5975,7 @@
     __ clzw(as_Register($dst$$reg), as_Register($dst$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe( ialu_reg );
 %}
 
 instruct countTrailingZerosL(iRegI dst, iRegL src) %{
@@ -5634,7 +5989,7 @@
     __ clz(as_Register($dst$$reg), as_Register($dst$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe( ialu_reg );
 %}
 
 // ============================================================================
@@ -5651,7 +6006,7 @@
     __ membar(Assembler::Membar_mask_bits(Assembler::LoadLoad|Assembler::LoadStore));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_release()
@@ -5663,7 +6018,7 @@
   ins_encode %{
   __ membar(Assembler::AnyAny);
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_volatile() %{
@@ -5676,7 +6031,7 @@
     __ membar(Assembler::AnyAny);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct unnecessary_membar_volatile() %{
@@ -5698,7 +6053,7 @@
     __ membar(Assembler::StoreStore);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_acquire_lock() %{
@@ -5711,7 +6066,7 @@
     __ membar(Assembler::Membar_mask_bits(Assembler::LoadLoad|Assembler::LoadStore));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_release_lock() %{
@@ -5724,7 +6079,7 @@
     __ membar(Assembler::AnyAny);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 // ============================================================================
@@ -5742,7 +6097,7 @@
     }
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct castP2X(iRegLNoSp dst, iRegP src) %{
@@ -5757,7 +6112,7 @@
     }
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Convert oop into int for vectors alignment masking
@@ -5770,7 +6125,7 @@
     __ movw($dst$$Register, $src$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Convert compressed oop into int for vectors alignment masking
@@ -5786,7 +6141,7 @@
     __ movw($dst$$Register, $src$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 
@@ -5802,7 +6157,7 @@
     Register d = $dst$$Register;
     __ encode_heap_oop(d, s);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct encodeHeapOop_not_null(iRegNNoSp dst, iRegP src, rFlagsReg cr) %{
@@ -5813,7 +6168,7 @@
   ins_encode %{
     __ encode_heap_oop_not_null($dst$$Register, $src$$Register);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct decodeHeapOop(iRegPNoSp dst, iRegN src, rFlagsReg cr) %{
@@ -5827,7 +6182,7 @@
     Register d = $dst$$Register;
     __ decode_heap_oop(d, s);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct decodeHeapOop_not_null(iRegPNoSp dst, iRegN src, rFlagsReg cr) %{
@@ -5841,7 +6196,7 @@
     Register d = $dst$$Register;
     __ decode_heap_oop_not_null(d, s);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct checkCastPP(iRegPNoSp dst)
@@ -5913,7 +6268,7 @@
 
   ins_encode(aarch64_enc_ldaxr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 // Conditional-store of the updated heap-top.
@@ -5938,7 +6293,7 @@
 
   ins_encode(aarch64_enc_stlxr(newval, heap_top_ptr));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 // this has to be implemented as a CAS
@@ -5955,7 +6310,7 @@
 
   ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 // this has to be implemented as a CAS
@@ -5972,7 +6327,7 @@
 
   ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher
@@ -5992,7 +6347,7 @@
  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 instruct compareAndSwapL(iRegINoSp res, memory mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
@@ -6009,7 +6364,7 @@
  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 instruct compareAndSwapP(iRegINoSp res, memory mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
@@ -6026,7 +6381,7 @@
  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 instruct compareAndSwapN(iRegINoSp res, memory mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
@@ -6043,7 +6398,7 @@
  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 
@@ -6053,7 +6408,7 @@
   ins_encode %{
     __ atomic_xchgw($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_setL(indirect mem, iRegLNoSp newv, iRegL prev) %{
@@ -6062,7 +6417,7 @@
   ins_encode %{
     __ atomic_xchg($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_setN(indirect mem, iRegNNoSp newv, iRegI prev) %{
@@ -6071,7 +6426,7 @@
   ins_encode %{
     __ atomic_xchgw($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_setP(indirect mem, iRegPNoSp newv, iRegP prev) %{
@@ -6080,7 +6435,7 @@
   ins_encode %{
     __ atomic_xchg($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 
@@ -6091,7 +6446,7 @@
   ins_encode %{
     __ atomic_add($newval$$Register, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addL_no_res(indirect mem, Universe dummy, iRegL incr) %{
@@ -6102,7 +6457,7 @@
   ins_encode %{
     __ atomic_add(noreg, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addLi(indirect mem, iRegLNoSp newval, immLAddSub incr) %{
@@ -6112,7 +6467,7 @@
   ins_encode %{
     __ atomic_add($newval$$Register, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addLi_no_res(indirect mem, Universe dummy, immLAddSub incr) %{
@@ -6123,7 +6478,7 @@
   ins_encode %{
     __ atomic_add(noreg, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addI(indirect mem, iRegINoSp newval, iRegIorL2I incr) %{
@@ -6133,7 +6488,7 @@
   ins_encode %{
     __ atomic_addw($newval$$Register, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addI_no_res(indirect mem, Universe dummy, iRegIorL2I incr) %{
@@ -6144,7 +6499,7 @@
   ins_encode %{
     __ atomic_addw(noreg, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addIi(indirect mem, iRegINoSp newval, immIAddSub incr) %{
@@ -6154,7 +6509,7 @@
   ins_encode %{
     __ atomic_addw($newval$$Register, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addIi_no_res(indirect mem, Universe dummy, immIAddSub incr) %{
@@ -6165,7 +6520,7 @@
   ins_encode %{
     __ atomic_addw(noreg, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 // ============================================================================
@@ -6194,7 +6549,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUI_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, iRegI src1, iRegI src2) %{
@@ -6210,7 +6565,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
@@ -6235,7 +6590,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUI_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, immI0 zero, iRegI src2) %{
@@ -6251,7 +6606,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovI_reg_zero(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, iRegI src1, immI0 zero) %{
@@ -6267,7 +6622,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUI_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, iRegI src1, immI0 zero) %{
@@ -6283,7 +6638,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 // special case for creating a boolean 0 or 1
@@ -6307,7 +6662,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_none);
 %}
 
 instruct cmovUI_reg_zero_one(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, immI0 zero, immI_1 one) %{
@@ -6326,7 +6681,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_none);
 %}
 
 instruct cmovL_reg_reg(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, iRegL src1, iRegL src2) %{
@@ -6342,7 +6697,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUL_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, iRegL src1, iRegL src2) %{
@@ -6358,7 +6713,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
@@ -6376,7 +6731,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUL_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, iRegL src1, immL0 zero) %{
@@ -6392,7 +6747,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovL_zero_reg(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, immL0 zero, iRegL src2) %{
@@ -6408,7 +6763,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUL_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, immL0 zero, iRegL src2) %{
@@ -6440,7 +6795,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUP_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, iRegP src1, iRegP src2) %{
@@ -6456,7 +6811,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
@@ -6474,7 +6829,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUP_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, iRegP src1, immP0 zero) %{
@@ -6490,7 +6845,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovP_zero_reg(cmpOp cmp, rFlagsReg cr, iRegPNoSp dst, immP0 zero, iRegP src2) %{
@@ -6506,7 +6861,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUP_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, immP0 zero, iRegP src2) %{
@@ -6522,7 +6877,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovN_reg_reg(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, iRegN src1, iRegN src2) %{
@@ -6538,7 +6893,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUN_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, iRegN src1, iRegN src2) %{
@@ -6554,7 +6909,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
@@ -6572,7 +6927,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUN_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, iRegN src1, immN0 zero) %{
@@ -6588,7 +6943,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovN_zero_reg(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, immN0 zero, iRegN src2) %{
@@ -6604,7 +6959,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovUN_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, immN0 zero, iRegN src2) %{
@@ -6620,7 +6975,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovF_reg(cmpOp cmp, rFlagsReg cr, vRegF dst, vRegF src1,  vRegF src2)
@@ -6719,7 +7074,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct addI_reg_imm(iRegINoSp dst, iRegI src1, immIAddSub src2) %{
@@ -6733,7 +7088,7 @@
 
   ins_encode(aarch64_enc_addsubw_imm(dst, src1, src2));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 instruct addI_reg_imm_i2l(iRegINoSp dst, iRegL src1, immIAddSub src2) %{
@@ -6747,7 +7102,7 @@
 
   ins_encode(aarch64_enc_addsubw_imm(dst, src1, src2));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Pointer Addition
@@ -6763,7 +7118,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct addP_reg_reg_ext(iRegPNoSp dst, iRegP src1, iRegIorL2I src2) %{
@@ -6778,7 +7133,7 @@
            as_Register($src2$$reg), ext::sxtw);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct addP_reg_reg_lsl(iRegPNoSp dst, iRegP src1, iRegL src2, immIScale scale) %{
@@ -6793,7 +7148,7 @@
 		   Address::lsl($scale$$constant)));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct addP_reg_reg_ext_shift(iRegPNoSp dst, iRegP src1, iRegIorL2I src2, immIScale scale) %{
@@ -6808,7 +7163,7 @@
 		   Address::sxtw($scale$$constant)));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct lshift_ext(iRegLNoSp dst, iRegIorL2I src, immI scale, rFlagsReg cr) %{
@@ -6823,7 +7178,7 @@
           $scale$$constant & 63, MIN(32, (-$scale$$constant) & 63));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Pointer Immediate Addition
@@ -6840,7 +7195,7 @@
 
   ins_encode( aarch64_enc_addsub_imm(dst, src1, src2) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Long Addition
@@ -6857,7 +7212,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // No constant pool entries required. Long Immediate Addition.
@@ -6872,7 +7227,7 @@
 
   ins_encode( aarch64_enc_addsub_imm(dst, src1, src2) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Integer Subtraction
@@ -6888,7 +7243,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // Immediate Subtraction
@@ -6903,7 +7258,7 @@
 
   ins_encode(aarch64_enc_addsubw_imm(dst, src1, src2));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Long Subtraction
@@ -6920,7 +7275,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // No constant pool entries required. Long Immediate Subtraction.
@@ -6935,7 +7290,7 @@
 
   ins_encode( aarch64_enc_addsub_imm(dst, src1, src2) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Integer Negation (special case for sub)
@@ -6947,11 +7302,11 @@
   format %{ "negw $dst, $src\t# int" %}
 
   ins_encode %{
-    __ negsw(as_Register($dst$$reg),
+    __ negw(as_Register($dst$$reg),
              as_Register($src$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Long Negation
@@ -6967,7 +7322,7 @@
 	   as_Register($src$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Integer Multiply
@@ -6984,7 +7339,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imul_reg_reg);
 %}
 
 instruct smulI(iRegLNoSp dst, iRegIorL2I src1, iRegIorL2I src2) %{
@@ -6999,7 +7354,7 @@
 	     as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imul_reg_reg);
 %}
 
 // Long Multiply
@@ -7016,7 +7371,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmul_reg_reg);
 %}
 
 instruct mulHiL_rReg(iRegLNoSp dst, iRegL src1, iRegL src2, rFlagsReg cr)
@@ -7032,7 +7387,7 @@
 	     as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmul_reg_reg);
 %}
 
 // Combined Integer Multiply & Add/Sub
@@ -7050,7 +7405,7 @@
              as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imac_reg_reg);
 %}
 
 instruct msubI(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, iRegIorL2I src3) %{
@@ -7066,7 +7421,7 @@
              as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imac_reg_reg);
 %}
 
 // Combined Long Multiply & Add/Sub
@@ -7084,7 +7439,7 @@
             as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmac_reg_reg);
 %}
 
 instruct msubL(iRegLNoSp dst, iRegL src1, iRegL src2, iRegL src3) %{
@@ -7100,7 +7455,7 @@
             as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmac_reg_reg);
 %}
 
 // Integer Divide
@@ -7112,7 +7467,7 @@
   format %{ "sdivw  $dst, $src1, $src2" %}
 
   ins_encode(aarch64_enc_divw(dst, src1, src2));
-  ins_pipe(pipe_class_default);
+  ins_pipe(idiv_reg_reg);
 %}
 
 instruct signExtract(iRegINoSp dst, iRegI src, immI_31 div1, immI_31 div2) %{
@@ -7122,7 +7477,7 @@
   ins_encode %{
     __ lsrw(as_Register($dst$$reg), as_Register($src$$reg), 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 instruct div2Round(iRegINoSp dst, iRegI src, immI_31 div1, immI_31 div2) %{
@@ -7136,7 +7491,7 @@
 	      as_Register($src$$reg),
 	      Assembler::LSR, 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Long Divide
@@ -7148,7 +7503,7 @@
   format %{ "sdiv   $dst, $src1, $src2" %}
 
   ins_encode(aarch64_enc_div(dst, src1, src2));
-  ins_pipe(pipe_class_default);
+  ins_pipe(ldiv_reg_reg);
 %}
 
 instruct signExtractL(iRegLNoSp dst, iRegL src, immL_63 div1, immL_63 div2) %{
@@ -7158,7 +7513,7 @@
   ins_encode %{
     __ lsr(as_Register($dst$$reg), as_Register($src$$reg), 63);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 instruct div2RoundL(iRegLNoSp dst, iRegL src, immL_63 div1, immL_63 div2) %{
@@ -7172,7 +7527,7 @@
 	      as_Register($src$$reg),
 	      Assembler::LSR, 63);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Integer Remainder
@@ -7185,7 +7540,7 @@
             "msubw($dst, rscratch1, $src2, $src1" %}
 
   ins_encode(aarch64_enc_modw(dst, src1, src2));
-  ins_pipe(pipe_class_default);
+  ins_pipe(idiv_reg_reg);
 %}
 
 // Long Remainder
@@ -7198,7 +7553,7 @@
             "msub($dst, rscratch1, $src2, $src1" %}
 
   ins_encode(aarch64_enc_mod(dst, src1, src2));
-  ins_pipe(pipe_class_default);
+  ins_pipe(ldiv_reg_reg);
 %}
 
 // Integer Shifts
@@ -7216,7 +7571,7 @@
              as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Left Immediate
@@ -7232,7 +7587,7 @@
             $src2$$constant & 0x1f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Right Logical Register
@@ -7264,7 +7619,7 @@
             $src2$$constant & 0x1f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Right Arithmetic Register
@@ -7280,7 +7635,7 @@
              as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Arithmetic Immediate
@@ -7296,7 +7651,7 @@
             $src2$$constant & 0x1f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Combined Int Mask and Right Shift (using UBFM)
@@ -7317,7 +7672,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Left Immediate
@@ -7333,7 +7688,7 @@
             $src2$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Right Logical Register
@@ -7349,7 +7704,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Logical Immediate
@@ -7365,7 +7720,23 @@
            $src2$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
+%}
+
+// A special-case pattern for card table stores.
+instruct urShiftP_reg_imm(iRegLNoSp dst, iRegP src1, immI src2) %{
+  match(Set dst (URShiftL (CastP2X src1) src2));
+
+  ins_cost(INSN_COST);
+  format %{ "lsr $dst, p2x($src1), ($src2 & 0x3f)" %}
+
+  ins_encode %{
+    __ lsr(as_Register($dst$$reg),
+           as_Register($src1$$reg),
+           $src2$$constant & 0x3f);
+  %}
+
+  ins_pipe(ialu_reg_shift);
 %}
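
The card-table write barrier computes base + (oop_address >> card_shift), i.e. a pointer-to-long cast followed by an unsigned right shift, which is exactly the (URShiftL (CastP2X ...)) shape the new rule matches. A standalone sketch of the index arithmetic (the 512-byte card size is HotSpot's usual value, assumed here):

    // card_index_sketch.cpp
    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned card_shift = 9;             // log2(512-byte card), assumed
      uint64_t addr = 0x00007f1234567840ULL;     // hypothetical heap address
      uint64_t card_index = addr >> card_shift;  // the URShiftL(CastP2X src1) value
      assert(card_index == addr / 512);
      return 0;
    }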
 
 // Shift Right Arithmetic Register
@@ -7381,7 +7752,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Arithmetic Immediate
@@ -7397,7 +7768,7 @@
            $src2$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // BEGIN This section of the file is automatically generated. Do not edit --------------
@@ -7416,7 +7787,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 instruct regI_not_reg(iRegINoSp dst,
                          iRegI src1, immI_M1 m1,
@@ -7432,7 +7803,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct AndI_reg_not_reg(iRegINoSp dst,
@@ -7449,7 +7820,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AndL_reg_not_reg(iRegLNoSp dst,
@@ -7466,7 +7837,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct OrI_reg_not_reg(iRegINoSp dst,
@@ -7483,7 +7854,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct OrL_reg_not_reg(iRegLNoSp dst,
@@ -7500,7 +7871,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct XorI_reg_not_reg(iRegINoSp dst,
@@ -7517,7 +7888,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct XorL_reg_not_reg(iRegLNoSp dst,
@@ -7534,7 +7905,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AndI_reg_URShift_not_reg(iRegINoSp dst,
@@ -7552,7 +7923,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_URShift_not_reg(iRegLNoSp dst,
@@ -7570,7 +7941,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_RShift_not_reg(iRegINoSp dst,
@@ -7588,7 +7959,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_RShift_not_reg(iRegLNoSp dst,
@@ -7606,7 +7977,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_LShift_not_reg(iRegINoSp dst,
@@ -7624,7 +7995,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_LShift_not_reg(iRegLNoSp dst,
@@ -7642,7 +8013,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_URShift_not_reg(iRegINoSp dst,
@@ -7660,7 +8031,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_URShift_not_reg(iRegLNoSp dst,
@@ -7678,7 +8049,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_RShift_not_reg(iRegINoSp dst,
@@ -7696,7 +8067,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_RShift_not_reg(iRegLNoSp dst,
@@ -7714,7 +8085,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_LShift_not_reg(iRegINoSp dst,
@@ -7732,7 +8103,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_LShift_not_reg(iRegLNoSp dst,
@@ -7750,7 +8121,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_URShift_not_reg(iRegINoSp dst,
@@ -7768,7 +8139,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_URShift_not_reg(iRegLNoSp dst,
@@ -7786,7 +8157,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_RShift_not_reg(iRegINoSp dst,
@@ -7804,7 +8175,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_RShift_not_reg(iRegLNoSp dst,
@@ -7822,7 +8193,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_LShift_not_reg(iRegINoSp dst,
@@ -7840,7 +8211,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_LShift_not_reg(iRegLNoSp dst,
@@ -7858,7 +8229,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_URShift_reg(iRegINoSp dst,
@@ -7877,7 +8248,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_URShift_reg(iRegLNoSp dst,
@@ -7896,7 +8267,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_RShift_reg(iRegINoSp dst,
@@ -7915,7 +8286,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_RShift_reg(iRegLNoSp dst,
@@ -7934,7 +8305,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_LShift_reg(iRegINoSp dst,
@@ -7953,7 +8324,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_LShift_reg(iRegLNoSp dst,
@@ -7972,7 +8343,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_URShift_reg(iRegINoSp dst,
@@ -7991,7 +8362,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_URShift_reg(iRegLNoSp dst,
@@ -8010,7 +8381,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_RShift_reg(iRegINoSp dst,
@@ -8029,7 +8400,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_RShift_reg(iRegLNoSp dst,
@@ -8048,7 +8419,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_LShift_reg(iRegINoSp dst,
@@ -8067,7 +8438,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_LShift_reg(iRegLNoSp dst,
@@ -8086,7 +8457,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_URShift_reg(iRegINoSp dst,
@@ -8105,7 +8476,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_URShift_reg(iRegLNoSp dst,
@@ -8124,7 +8495,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_RShift_reg(iRegINoSp dst,
@@ -8143,7 +8514,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_RShift_reg(iRegLNoSp dst,
@@ -8162,7 +8533,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_LShift_reg(iRegINoSp dst,
@@ -8181,7 +8552,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_LShift_reg(iRegLNoSp dst,
@@ -8200,7 +8571,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddI_reg_URShift_reg(iRegINoSp dst,
@@ -8219,7 +8590,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddL_reg_URShift_reg(iRegLNoSp dst,
@@ -8238,7 +8609,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddI_reg_RShift_reg(iRegINoSp dst,
@@ -8257,7 +8628,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddL_reg_RShift_reg(iRegLNoSp dst,
@@ -8276,7 +8647,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddI_reg_LShift_reg(iRegINoSp dst,
@@ -8295,7 +8666,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddL_reg_LShift_reg(iRegLNoSp dst,
@@ -8314,7 +8685,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubI_reg_URShift_reg(iRegINoSp dst,
@@ -8333,7 +8704,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubL_reg_URShift_reg(iRegLNoSp dst,
@@ -8352,7 +8723,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubI_reg_RShift_reg(iRegINoSp dst,
@@ -8371,7 +8742,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubL_reg_RShift_reg(iRegLNoSp dst,
@@ -8390,7 +8761,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubI_reg_LShift_reg(iRegINoSp dst,
@@ -8409,7 +8780,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubL_reg_LShift_reg(iRegLNoSp dst,
@@ -8428,7 +8799,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 
@@ -8453,7 +8824,7 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Left followed by Shift Right.
@@ -8476,7 +8847,7 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Left followed by Shift Right.
@@ -8499,7 +8870,7 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Left followed by Shift Right.
@@ -8522,7 +8893,7 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 // Bitfield extract with shift & mask
 
@@ -8539,7 +8910,7 @@
     __ ubfxw(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 instruct ubfxL(iRegLNoSp dst, iRegL src, immI rshift, immL_bitmask mask)
 %{
@@ -8554,7 +8925,7 @@
     __ ubfx(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // We can use ubfx when extending an And with a mask when we know mask
@@ -8572,7 +8943,7 @@
     __ ubfx(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Rotations
@@ -8589,7 +8960,7 @@
     __ extr(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 63);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 
 instruct extrOrI(iRegINoSp dst, iRegI src1, iRegI src2, immI lshift, immI rshift, rFlagsReg cr)
@@ -8604,7 +8975,7 @@
     __ extrw(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 
 instruct extrAddL(iRegLNoSp dst, iRegL src1, iRegL src2, immI lshift, immI rshift, rFlagsReg cr)
@@ -8619,7 +8990,7 @@
     __ extr(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 63);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 
 instruct extrAddI(iRegINoSp dst, iRegI src1, iRegI src2, immI lshift, immI rshift, rFlagsReg cr)
@@ -8634,7 +9005,7 @@
     __ extrw(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 
 
@@ -8651,7 +9022,7 @@
     __ rorv(as_Register($dst$$reg), as_Register($src$$reg),
 	    rscratch1);
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // rol expander
@@ -8667,7 +9038,7 @@
     __ rorvw(as_Register($dst$$reg), as_Register($src$$reg),
 	    rscratch1);
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 instruct rolL_rReg_Var_C_64(iRegL dst, iRegL src, iRegI shift, immI_64 c_64, rFlagsReg cr)
@@ -8718,7 +9089,7 @@
     __ rorv(as_Register($dst$$reg), as_Register($src$$reg),
 	    as_Register($shift$$reg));
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // ror expander
@@ -8733,7 +9104,7 @@
     __ rorvw(as_Register($dst$$reg), as_Register($src$$reg),
 	    as_Register($shift$$reg));
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 instruct rorL_rReg_Var_C_64(iRegL dst, iRegL src, iRegI shift, immI_64 c_64, rFlagsReg cr)
@@ -8784,7 +9155,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %};
 
 instruct SubExtI(iRegLNoSp dst, iRegL src1, iRegIorL2I src2, rFlagsReg cr)
@@ -8797,7 +9168,7 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %};
 
 
@@ -8811,7 +9182,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtI_sxtb(iRegINoSp dst, iRegI src1, iRegI src2, immI_24 lshift, immI_24 rshift, rFlagsReg cr)
@@ -8824,7 +9195,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtI_uxtb(iRegINoSp dst, iRegI src1, iRegI src2, immI_24 lshift, immI_24 rshift, rFlagsReg cr)
@@ -8837,7 +9208,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_sxth(iRegLNoSp dst, iRegL src1, iRegL src2, immI_48 lshift, immI_48 rshift, rFlagsReg cr)
@@ -8850,7 +9221,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_sxtw(iRegLNoSp dst, iRegL src1, iRegL src2, immI_32 lshift, immI_32 rshift, rFlagsReg cr)
@@ -8863,7 +9234,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_sxtb(iRegLNoSp dst, iRegL src1, iRegL src2, immI_56 lshift, immI_56 rshift, rFlagsReg cr)
@@ -8876,7 +9247,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxtb(iRegLNoSp dst, iRegL src1, iRegL src2, immI_56 lshift, immI_56 rshift, rFlagsReg cr)
@@ -8889,7 +9260,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 
@@ -8903,7 +9274,7 @@
      __ addw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtI_uxth_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_65535 mask, rFlagsReg cr)
@@ -8916,7 +9287,7 @@
      __ addw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxtb_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_255 mask, rFlagsReg cr)
@@ -8929,7 +9300,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxth_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_65535 mask, rFlagsReg cr)
@@ -8942,7 +9313,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxtw_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_4294967295 mask, rFlagsReg cr)
@@ -8955,7 +9326,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtI_uxtb_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_255 mask, rFlagsReg cr)
@@ -8968,7 +9339,7 @@
      __ subw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtI_uxth_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_65535 mask, rFlagsReg cr)
@@ -8981,7 +9352,7 @@
      __ subw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtL_uxtb_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_255 mask, rFlagsReg cr)
@@ -8994,7 +9365,7 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtL_uxth_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_65535 mask, rFlagsReg cr)
@@ -9007,7 +9378,7 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtL_uxtw_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_4294967295 mask, rFlagsReg cr)
@@ -9020,7 +9391,7 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // END This section of the file is automatically generated. Do not edit --------------
@@ -9382,7 +9753,7 @@
 	    as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct andI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immILog src2, rFlagsReg cr) %{
@@ -9397,7 +9768,7 @@
 	    (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Or Instructions
@@ -9414,7 +9785,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct orI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immILog src2) %{
@@ -9429,7 +9800,7 @@
             (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Xor Instructions
@@ -9446,7 +9817,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct xorI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immILog src2) %{
@@ -9461,7 +9832,7 @@
             (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Long Logical Instructions
@@ -9479,7 +9850,7 @@
 	    as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct andL_reg_imm(iRegLNoSp dst, iRegL src1, immLLog src2, rFlagsReg cr) %{
@@ -9494,7 +9865,7 @@
             (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Or Instructions
@@ -9511,7 +9882,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct orL_reg_imm(iRegLNoSp dst, iRegL src1, immLLog src2) %{
@@ -9526,7 +9897,7 @@
            (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Xor Instructions
@@ -9543,7 +9914,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct xorL_reg_imm(iRegLNoSp dst, iRegL src1, immLLog src2) %{
@@ -9558,7 +9929,7 @@
            (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 instruct convI2L_reg_reg(iRegLNoSp dst, iRegIorL2I src)
@@ -9570,7 +9941,7 @@
   ins_encode %{
     __ sbfm($dst$$Register, $src$$Register, 0, 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // this pattern occurs in bigmath arithmetic
@@ -9584,7 +9955,7 @@
     __ ubfm($dst$$Register, $src$$Register, 0, 31);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 instruct convL2I_reg(iRegINoSp dst, iRegL src) %{
@@ -9597,7 +9968,7 @@
     __ movw(as_Register($dst$$reg), as_Register($src$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct convI2B(iRegINoSp dst, iRegI src, rFlagsReg cr)
@@ -9615,7 +9986,7 @@
     __ cset(as_Register($dst$$reg), Assembler::NE);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct convP2B(iRegINoSp dst, iRegP src, rFlagsReg cr)
@@ -9633,7 +10004,7 @@
     __ cset(as_Register($dst$$reg), Assembler::NE);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct convD2F_reg(vRegF dst, vRegD src) %{
@@ -9782,7 +10153,7 @@
     __ ldrw($dst$$Register, Address(sp, $src$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_reg);
 
 %}
 
@@ -9818,7 +10189,7 @@
     __ ldr($dst$$Register, Address(sp, $src$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_reg);
 
 %}
 
@@ -9872,7 +10243,7 @@
     __ strw($src$$Register, Address(sp, $dst$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_reg);
 
 %}
 
@@ -9908,7 +10279,7 @@
     __ str($src$$Register, Address(sp, $dst$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_reg);
 
 %}
 
@@ -10014,7 +10385,7 @@
 
   ins_encode(aarch64_enc_cmpw(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compI_reg_immI0(rFlagsReg cr, iRegI op1, immI0 zero)
@@ -10028,7 +10399,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, zero));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compI_reg_immIAddSub(rFlagsReg cr, iRegI op1, immIAddSub op2)
@@ -10042,7 +10413,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compI_reg_immI(rFlagsReg cr, iRegI op1, immI op2)
@@ -10056,7 +10427,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 // Unsigned compare Instructions; really, same as signed compare
@@ -10074,7 +10445,7 @@
 
   ins_encode(aarch64_enc_cmpw(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compU_reg_immI0(rFlagsRegU cr, iRegI op1, immI0 zero)
@@ -10088,7 +10459,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, zero));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compU_reg_immIAddSub(rFlagsRegU cr, iRegI op1, immIAddSub op2)
@@ -10102,7 +10473,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compU_reg_immI(rFlagsRegU cr, iRegI op1, immI op2)
@@ -10130,7 +10501,7 @@
 
   ins_encode(aarch64_enc_cmp(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compL_reg_immI0(rFlagsReg cr, iRegL op1, immI0 zero)
@@ -10144,7 +10515,7 @@
 
   ins_encode(aarch64_enc_cmp_imm_addsub(op1, zero));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compL_reg_immLAddSub(rFlagsReg cr, iRegL op1, immLAddSub op2)
@@ -10158,7 +10529,7 @@
 
   ins_encode(aarch64_enc_cmp_imm_addsub(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compL_reg_immL(rFlagsReg cr, iRegL op1, immL op2)
@@ -10172,7 +10543,7 @@
 
   ins_encode(aarch64_enc_cmp_imm(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compP_reg_reg(rFlagsRegU cr, iRegP op1, iRegP op2)
@@ -10186,7 +10557,7 @@
 
   ins_encode(aarch64_enc_cmpp(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compN_reg_reg(rFlagsRegU cr, iRegN op1, iRegN op2)
@@ -10200,7 +10571,7 @@
 
   ins_encode(aarch64_enc_cmpn(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct testP_reg(rFlagsRegU cr, iRegP op1, immP0 zero)
@@ -10214,7 +10585,7 @@
 
   ins_encode(aarch64_enc_testp(op1));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct testN_reg(rFlagsRegU cr, iRegN op1, immN0 zero)
@@ -10228,7 +10599,7 @@
 
   ins_encode(aarch64_enc_testn(op1));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 // FP comparisons
@@ -10400,6 +10771,29 @@
 
 %}
 
+// Manifest a CmpL result in an integer register.
+// (src1 < src2) ? -1 : ((src1 > src2) ? 1 : 0)
+instruct cmpL3_reg_reg(iRegINoSp dst, iRegL src1, iRegL src2, rFlagsReg flags)
+%{
+  match(Set dst (CmpL3 src1 src2));
+  effect(KILL flags);
+
+  ins_cost(INSN_COST * 6);
+  format %{
+      "cmp $src1, $src2"
+      "csetw $dst, ne"
+      "cnegw $dst, lt"
+  %}
+  // format %{ "CmpL3 $dst, $src1, $src2" %}
+  ins_encode %{
+    __ cmp($src1$$Register, $src2$$Register);
+    __ csetw($dst$$Register, Assembler::NE);
+    __ cnegw($dst$$Register, $dst$$Register, Assembler::LT);
+  %}
+
+  ins_pipe(ialu_reg_reg);
+%}
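+
+// A hedged scalar model of the three-instruction sequence above
+// (illustrative only, not emitted code):
+//
+//   static int cmpL3_model(jlong src1, jlong src2) {
+//     int dst = (src1 != src2) ? 1 : 0;   // csetw $dst, ne
+//     if (src1 < src2) dst = -dst;        // cnegw $dst, $dst, lt
+//     return dst;                         // -1, 0 or 1
+//   }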
+
 instruct cmpLTMask_reg_reg(iRegINoSp dst, iRegI p, iRegI q, rFlagsReg cr)
 %{
   match(Set dst (CmpLTMask p q));
@@ -10418,7 +10812,7 @@
     __ subw(as_Register($dst$$reg), zr, as_Register($dst$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct cmpLTMask_reg_zero(iRegINoSp dst, iRegI src, immI0 zero, rFlagsReg cr)
@@ -10434,7 +10828,7 @@
     __ asrw(as_Register($dst$$reg), as_Register($src$$reg), 31);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // ============================================================================
@@ -10462,7 +10856,7 @@
              Assembler::LT);
   %}
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct maxI_rReg(iRegINoSp dst, iRegI src1, iRegI src2, rFlagsReg cr)
@@ -10487,7 +10881,7 @@
              Assembler::GT);
   %}
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // ============================================================================
@@ -10505,7 +10899,7 @@
 
   ins_encode(aarch64_enc_b(lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // Conditional Near Branch
@@ -10526,7 +10920,7 @@
 
   ins_encode(aarch64_enc_br_con(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch_cond);
 %}
 
 // Conditional Near Branch Unsigned
@@ -10547,7 +10941,7 @@
 
   ins_encode(aarch64_enc_br_conU(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch_cond);
 %}
 
 // Make use of CBZ and CBNZ.  These instructions, as well as being
@@ -10570,7 +10964,7 @@
     else
       __ cbnzw($op1$$Register, *L);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_cmp_branch);
 %}
 
 instruct cmpL_imm0_branch(cmpOp cmp, iRegL op1, immL0 op2, label labl, rFlagsReg cr) %{
@@ -10589,7 +10983,7 @@
     else
       __ cbnz($op1$$Register, *L);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_cmp_branch);
 %}
 
 instruct cmpP_imm0_branch(cmpOp cmp, iRegP op1, immP0 op2, label labl, rFlagsReg cr) %{
@@ -10608,7 +11002,7 @@
     else
       __ cbnz($op1$$Register, *L);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_cmp_branch);
 %}
 
 // Conditional Far Branch
@@ -10629,7 +11023,7 @@
 
   ins_encode(aarch64_enc_br_con(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // counted loop end branch near Unsigned
@@ -10646,7 +11040,7 @@
 
   ins_encode(aarch64_enc_br_conU(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // counted loop end branch far
@@ -10668,7 +11062,7 @@
 
   ins_encode(aarch64_enc_fast_lock(object, box, tmp, tmp2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(pipe_serial);
 %}
 
 instruct cmpFastUnlock(rFlagsReg cr, iRegP object, iRegP box, iRegPNoSp tmp, iRegPNoSp tmp2)
@@ -10681,7 +11075,7 @@
 
   ins_encode(aarch64_enc_fast_unlock(object, box, tmp, tmp2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(pipe_serial);
 %}
 
 
@@ -10701,7 +11095,7 @@
   ins_encode %{
     __ read_polling_page(as_Register($poll$$reg), relocInfo::poll_type);
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 
@@ -10861,7 +11255,7 @@
 
   ins_encode( /*empty*/ );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_class_empty);
 %}
 
 // Rethrow exception: The exception oop will come in the first
@@ -10888,7 +11282,7 @@
 
   ins_encode( aarch64_enc_ret() );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // Die now.
@@ -10960,6 +11354,44 @@
   ins_pipe(pipe_class_memory);
 %}
 
+instruct string_indexof(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
+       iRegI_R0 result, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result" %}
+
+  ins_encode %{
+    __ string_indexof($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, $cnt2$$Register,
+                      $tmp1$$Register, $tmp2$$Register,
+                      $tmp3$$Register, $tmp4$$Register,
+                      -1, $result$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
+instruct string_indexof_con(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
+                 immI_le_4 int_cnt2, iRegI_R0 result, iRegI tmp1, iRegI tmp2,
+                 iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result" %}
+
+  ins_encode %{
+    int icnt2 = (int)$int_cnt2$$constant;
+    __ string_indexof($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, zr,
+                      $tmp1$$Register, $tmp2$$Register,
+                      $tmp3$$Register, $tmp4$$Register,
+                      icnt2, $result$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
 instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
                         iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
 %{
@@ -10975,6 +11407,20 @@
   ins_pipe(pipe_class_memory);
 %}
 
+instruct array_equals(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
+                      iRegP_R10 tmp, rFlagsReg cr)
+%{
+  match(Set result (AryEq ary1 ary2));
+  effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr);
+
+  format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
+  ins_encode %{
+    __ char_arrays_equals($ary1$$Register, $ary2$$Register,
+                          $result$$Register, $tmp$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
 // ============================================================================
 // This name is KNOWN by the ADLC and cannot be changed.
 // The ADLC forces a 'TypeRawPtr::BOTTOM' output type
--- a/src/cpu/aarch64/vm/aarch64Test.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/aarch64Test.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -35,43 +35,4 @@
   CodeBuffer code(b);
   MacroAssembler _masm(&code);
   entry(&code);
-  // dive now before we hit all the Unimplemented() calls
-  // exit(0);
-
-#if 0
-  // old test code to compute sum of squares
-  enum { r0, r1, r2, r3, r4, LR = 30 };
-
-  address entry = __ pc();
-
-  __ _mov_imm(r0, 100);
-  address loop = __ pc();
-  __ _sub_imm(r0, r0, 1);
-  __ _cbnz(r0, loop);
-  // __ _br(LR);
-
-  char stack[4096];
-  unsigned long memory[100];
-
-  __ _mov_imm(r0, 1);
-  __ _mov_imm(r4, 100);
-  loop = __ pc();
-  __ _mov(r1, r0);
-  __ _mul(r2, r1, r1);
-  __ _str_post(r2, r3, 8);
-  __ _add_imm(r0, r0, 1);
-  __ _sub_imm(r4, r4, 1);
-  __ _cbnz(r4, loop);
-  __ _br(LR);
-  
-  Disassembler::decode(entry, __ pc());
-
-  sim.init((u_int64_t)entry, (u_int64_t)stack + sizeof stack,
-	   (u_int64_t)stack);
-  sim.getCPUState().xreg((GReg)r3, 0) = (u_int64_t)memory;
-  sim.run();
-  printf("Table of squares:\n");
-  for (int i = 0; i < 100; i++)
-    printf("  %d\n", memory[i]);
-#endif
 }
--- a/src/cpu/aarch64/vm/aarch64_ad.m4	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/aarch64_ad.m4	Thu Dec 04 14:30:02 2014 +0000
@@ -18,7 +18,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}')dnl
 define(`BASE_INVERTED_INSN',
 `
@@ -40,7 +40,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')dnl
 define(`INVERTED_SHIFT_INSN',
 `
@@ -63,7 +63,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}')dnl
 define(`NOT_INSN',
 `instruct reg$1_not_reg(iReg$1NoSp dst,
@@ -80,7 +80,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}')dnl
 dnl
 define(`BOTH_SHIFT_INSNS',
@@ -142,7 +142,7 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}')
 BFM_INSN(L, 63, RShift, sbfm)
 BFM_INSN(I, 31, RShift, sbfmw)
@@ -164,7 +164,7 @@
     __ $3(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}')
 BFX_INSN(I,URShift,ubfxw)
 BFX_INSN(L,URShift,ubfx)
@@ -184,7 +184,7 @@
     __ ubfx(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Rotations
@@ -202,7 +202,7 @@
     __ $4(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & $2);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 ')dnl
 EXTRACT_INSN(L, 63, Or, extr)
@@ -223,7 +223,7 @@
     __ $3(as_Register($dst$$reg), as_Register($src$$reg),
 	    rscratch1);
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}')dnl
 define(`ROR_EXPAND', `
 // $2 expander
@@ -238,7 +238,7 @@
     __ $3(as_Register($dst$$reg), as_Register($src$$reg),
 	    as_Register($shift$$reg));
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}')dnl
 define(ROL_INSN, `
 instruct $3$1_rReg_Var_C$2(iRegL dst, iRegL src, iRegI shift, immI$2 c$2, rFlagsReg cr)
@@ -284,7 +284,7 @@
      __ $4(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::$5);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')dnl
 ADD_SUB_CONV(I,L,Add,add,sxtw);
 ADD_SUB_CONV(I,L,Sub,sub,sxtw);
@@ -300,7 +300,7 @@
      __ $5(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::$6);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')
 ADD_SUB_EXTENDED(I,16,Add,RShift,add,sxth,32)
 ADD_SUB_EXTENDED(I,8,Add,RShift,add,sxtb,32)
@@ -322,7 +322,7 @@
      __ $4(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::$5);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')
 dnl
 ADD_SUB_ZERO_EXTEND(I,255,Add,addw,uxtb)
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -1194,18 +1194,11 @@
   }
 
 #ifndef PRODUCT
-  {
-    address PC = __ pc();
-    __ bl(__ pc()+(1<<27)-4);
-    NativeCall* call = nativeCall_at(PC);
-    ptrdiff_t offset = call->destination()-PC;
-    assert(offset == (1<<27)-4, "broken branch coding");
-    PC = __ pc();
-    __ bl(__ pc()-(1<<27));
-    call = nativeCall_at(PC);
-    offset = call->destination()-PC;
-    assert(offset == -(1<<27), "broken branch coding");
-  }
+
+  address PC = __ pc();
+  __ ld1(v0, __ T16B, Address(r16)); // No offset
+  __ ld1(v0, __ T16B, __ post(r16, 0)); // Post-index
+  __ ld1(v0, __ T16B, Address(r16, r17)); // Register post-index
 
 
 #endif // PRODUCT
@@ -1464,7 +1457,7 @@
 
 bool Assembler::operand_valid_for_add_sub_immediate(long imm) {
   bool shift = false;
-  unsigned long uimm = labs(imm);
+  unsigned long uimm = uabs(imm);
   if (uimm < (1 << 12))
     return true;
   if (uimm < (1 << 24)
@@ -1573,7 +1566,8 @@
 
 // Implementation of MacroAssembler
 
-void MacroAssembler::pd_patch_instruction(address branch, address target) {
+int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
+  int instructions = 1;
   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
   long offset = (target - branch) >> 2;
   unsigned insn = *(unsigned*)branch;
@@ -1609,15 +1603,22 @@
       //   2 - adrp    Rx, target_page
       //       add     Ry, Rx, #offset_in_page
       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
-      // In the first 2 cases we must check that Rx is the same in the adrp and the
-      // subsequent ldr/str or add instruction. Otherwise we could accidentally end
-      // up treating a type 3 relocation as a type 1 or 2 just because it happened
-      // to be followed by a random unrelated ldr/str or add instruction.
+      //
+      // In the first 2 cases we must check that Rx is the same in the
+      // adrp and the subsequent ldr/str or add instruction. Otherwise
+      // we could accidentally end up treating a type 3 relocation as
+      // a type 1 or 2 just because it happened to be followed by a
+      // random unrelated ldr/str or add instruction.
       //
-      // In the case of a type 3 relocation, we know that these are only generated
-      // for the safepoint polling page, or for the card type byte map base so we
-      // assert as much and of course that the offset is 0.
-      //
+      // In the case of a type 3 relocation, we know that these are
+      // only generated for the safepoint polling page, the crc table
+      // base or the card type byte map base so we assert as much
+      // and of course that the offset is 0.
+      // 
+      // In jdk7 the card type byte map base is aligned on a 1K
+      // boundary, which may fail to be 4K aligned. In that case the
+      // card table load will fall into type 2.
+
       unsigned insn2 = ((unsigned*)branch)[1];
       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 		Instruction_aarch64::extract(insn, 4, 0) ==
@@ -1627,19 +1628,24 @@
 	Instruction_aarch64::patch(branch + sizeof (unsigned),
 				    21, 10, offset_lo >> size);
 	guarantee(((dest >> size) << size) == dest, "misaligned target");
+        instructions = 2;
       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 		Instruction_aarch64::extract(insn, 4, 0) ==
 			Instruction_aarch64::extract(insn2, 4, 0)) {
 	// add (immediate)
+	assert (((jbyte *)target !=
+		 ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base) ||
+		(offset_lo & 0x3FFl) == 0, "offset must be 0x400 aligned for byte map base");
 	Instruction_aarch64::patch(branch + sizeof (unsigned),
 				   21, 10, offset_lo);
+        instructions = 2;
       } else {
 	assert((jbyte *)target ==
 		((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
                target == StubRoutines::crc_table_addr() ||
                (address)target == os::get_polling_page(),
-	       "adrp must be polling page or byte map base");
-	assert(offset_lo == 0, "offset must be 0 for polling page or byte map base");
+	       "adrp must be polling page, crc_table or byte map base");
+	assert(offset_lo == 0, "offset must be 0 for polling page, crc_table or byte map base");
       }
     }
     int offset_lo = offset & 3;
@@ -1655,6 +1661,7 @@
     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
     assert(pd_call_destination(branch) == target, "should be");
+    instructions = 2;
   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
     // nothing to do
@@ -1662,19 +1669,34 @@
   } else {
     ShouldNotReachHere();
   }
+  return instructions * NativeInstruction::instruction_size;
 }
 
-void MacroAssembler::patch_oop(address insn_addr, address o) {
+int MacroAssembler::patch_oop(address insn_addr, address o) {
+  int instructions;
   unsigned insn = *(unsigned*)insn_addr;
+  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
+
+  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
+  // narrow OOPs by setting the upper 16 bits in the first
+  // instruction.
   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
-      // Move narrow constant
-      assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
-      narrowOop n = oopDesc::encode_heap_oop((oop)o);
-      Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
-      Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
+    // Move narrow OOP
+    assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
+    narrowOop n = oopDesc::encode_heap_oop((oop)o);
+    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
+    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
+    instructions = 2;
   } else {
-    pd_patch_instruction(insn_addr, o);
+    // Move wide OOP
+    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
+    uintptr_t dest = (uintptr_t)o;
+    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
+    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
+    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
+    instructions = 3;
   }
+  return instructions * NativeInstruction::instruction_size;
 }
 
 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
@@ -2236,15 +2258,27 @@
   while (offset() % modulus != 0) nop();
 }
 
-// these are meant to be no-ops overridden by InterpreterMacroAssembler
-
-void MacroAssembler::check_and_handle_earlyret(Register java_thread) { Unimplemented(); }
-
-void MacroAssembler::check_and_handle_popframe(Register java_thread) { Unimplemented(); }
+// these are no-ops overridden by InterpreterMacroAssembler
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 
 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                       Register tmp,
-                                                      int offset) { Unimplemented(); return RegisterOrConstant(r0); }
+                                                      int offset) {
+  intptr_t value = *delayed_value_addr;
+  if (value != 0)
+    return RegisterOrConstant(value + offset);
+
+  // load indirectly to solve generation ordering problem
+  ldr(tmp, ExternalAddress((address) delayed_value_addr));
+
+  if (offset != 0)
+    add(tmp, tmp, offset);
+
+  return RegisterOrConstant(tmp);
+}
 
 void MacroAssembler:: notify(int type) {
   if (type == bytecode_start) {
@@ -2687,6 +2721,9 @@
 					Label *retaddr) {
   Label E, L;
 
+  // !!! FIXME AARCH64 -- we normally need to save rmethod as it is
+  // volatile. However, we don't need to when calling from the
+  // interpreter.
   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
 
   // We add 1 to number_of_arguments because the thread in arg0 is
@@ -2697,6 +2734,7 @@
     bind(*retaddr);
 
   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
+  maybe_isb();
 }
 
 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
@@ -3187,7 +3225,7 @@
   }
 }
 
-void MacroAssembler::increment(Address dst, int value)
+void MacroAssembler::incrementw(Address dst, int value)
 {
   assert(!dst.uses(rscratch1), "invalid dst for address increment");
   ldrw(rscratch1, dst);
@@ -3195,7 +3233,7 @@
   strw(rscratch1, dst);
 }
 
-void MacroAssembler::incrementw(Address dst, int value)
+void MacroAssembler::increment(Address dst, int value)
 {
   assert(!dst.uses(rscratch1), "invalid dst for address increment");
   ldr(rscratch1, dst);
@@ -3312,7 +3350,7 @@
   if (operand_valid_for_add_sub_immediate((int)imm)) {
     (this->*insn1)(Rd, Rn, imm);
   } else {
-    if (labs(imm) < (1 << 24)) {
+    if (uabs(imm) < (1 << 24)) {
        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
     } else {
@@ -3760,131 +3798,131 @@
   if (UseNeon) {
       cmp(len, 64);
       br(Assembler::LT, L_by16);
-      v_eor(v16, T16B, v16, v16);
+      eor(v16, T16B, v16, v16);
 
     Label L_fold;
 
       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
 
-      v_ld1(v0, v1, T2D, buf, 32);
-      v_ld1r(v4, T2D, tmp, 8);
-      v_ld1r(v5, T2D, tmp, 8);
-      v_ld1r(v6, T2D, tmp, 8);
-      v_ld1r(v7, T2D, tmp, 8);
-      v_mov(v16, T4S, 0, crc);
-
-      v_eor(v0, T16B, v0, v16);
+      ld1(v0, v1, T2D, post(buf, 32));
+      ld1r(v4, T2D, post(tmp, 8));
+      ld1r(v5, T2D, post(tmp, 8));
+      ld1r(v6, T2D, post(tmp, 8));
+      ld1r(v7, T2D, post(tmp, 8));
+      mov(v16, T4S, 0, crc);
+
+      eor(v0, T16B, v0, v16);
       sub(len, len, 64);
 
     BIND(L_fold);
-      v_pmull(v22, T8H, v0, v5, T8B);
-      v_pmull(v20, T8H, v0, v7, T8B);
-      v_pmull(v23, T8H, v0, v4, T8B);
-      v_pmull(v21, T8H, v0, v6, T8B);
+      pmull(v22, T8H, v0, v5, T8B);
+      pmull(v20, T8H, v0, v7, T8B);
+      pmull(v23, T8H, v0, v4, T8B);
+      pmull(v21, T8H, v0, v6, T8B);
     
-      v_pmull2(v18, T8H, v0, v5, T16B);
-      v_pmull2(v16, T8H, v0, v7, T16B);
-      v_pmull2(v19, T8H, v0, v4, T16B);
-      v_pmull2(v17, T8H, v0, v6, T16B);
+      pmull2(v18, T8H, v0, v5, T16B);
+      pmull2(v16, T8H, v0, v7, T16B);
+      pmull2(v19, T8H, v0, v4, T16B);
+      pmull2(v17, T8H, v0, v6, T16B);
     
-      v_uzp1(v24, v20, v22, T8H);
-      v_uzp2(v25, v20, v22, T8H);
-      v_eor(v20, T16B, v24, v25);
+      uzp1(v24, v20, v22, T8H);
+      uzp2(v25, v20, v22, T8H);
+      eor(v20, T16B, v24, v25);
     
-      v_uzp1(v26, v16, v18, T8H);
-      v_uzp2(v27, v16, v18, T8H);
-      v_eor(v16, T16B, v26, v27);
+      uzp1(v26, v16, v18, T8H);
+      uzp2(v27, v16, v18, T8H);
+      eor(v16, T16B, v26, v27);
     
-      v_ushll2(v22, T4S, v20, T8H, 8);
-      v_ushll(v20, T4S, v20, T4H, 8);
+      ushll2(v22, T4S, v20, T8H, 8);
+      ushll(v20, T4S, v20, T4H, 8);
     
-      v_ushll2(v18, T4S, v16, T8H, 8);
-      v_ushll(v16, T4S, v16, T4H, 8);
+      ushll2(v18, T4S, v16, T8H, 8);
+      ushll(v16, T4S, v16, T4H, 8);
     
-      v_eor(v22, T16B, v23, v22);
-      v_eor(v18, T16B, v19, v18);
-      v_eor(v20, T16B, v21, v20);
-      v_eor(v16, T16B, v17, v16);
+      eor(v22, T16B, v23, v22);
+      eor(v18, T16B, v19, v18);
+      eor(v20, T16B, v21, v20);
+      eor(v16, T16B, v17, v16);
     
-      v_uzp1(v17, v16, v20, T2D);
-      v_uzp2(v21, v16, v20, T2D);
-      v_eor(v17, T16B, v17, v21);
+      uzp1(v17, v16, v20, T2D);
+      uzp2(v21, v16, v20, T2D);
+      eor(v17, T16B, v17, v21);
     
-      v_ushll2(v20, T2D, v17, T4S, 16);
-      v_ushll(v16, T2D, v17, T2S, 16);
+      ushll2(v20, T2D, v17, T4S, 16);
+      ushll(v16, T2D, v17, T2S, 16);
     
-      v_eor(v20, T16B, v20, v22);
-      v_eor(v16, T16B, v16, v18);
+      eor(v20, T16B, v20, v22);
+      eor(v16, T16B, v16, v18);
     
-      v_uzp1(v17, v20, v16, T2D);
-      v_uzp2(v21, v20, v16, T2D);
-      v_eor(v28, T16B, v17, v21);
+      uzp1(v17, v20, v16, T2D);
+      uzp2(v21, v20, v16, T2D);
+      eor(v28, T16B, v17, v21);
     
-      v_pmull(v22, T8H, v1, v5, T8B);
-      v_pmull(v20, T8H, v1, v7, T8B);
-      v_pmull(v23, T8H, v1, v4, T8B);
-      v_pmull(v21, T8H, v1, v6, T8B);
+      pmull(v22, T8H, v1, v5, T8B);
+      pmull(v20, T8H, v1, v7, T8B);
+      pmull(v23, T8H, v1, v4, T8B);
+      pmull(v21, T8H, v1, v6, T8B);
     
-      v_pmull2(v18, T8H, v1, v5, T16B);
-      v_pmull2(v16, T8H, v1, v7, T16B);
-      v_pmull2(v19, T8H, v1, v4, T16B);
-      v_pmull2(v17, T8H, v1, v6, T16B);
+      pmull2(v18, T8H, v1, v5, T16B);
+      pmull2(v16, T8H, v1, v7, T16B);
+      pmull2(v19, T8H, v1, v4, T16B);
+      pmull2(v17, T8H, v1, v6, T16B);
     
-      v_ld1(v0, v1, T2D, buf, 32);
+      ld1(v0, v1, T2D, post(buf, 32));
     
-      v_uzp1(v24, v20, v22, T8H);
-      v_uzp2(v25, v20, v22, T8H);
-      v_eor(v20, T16B, v24, v25);
+      uzp1(v24, v20, v22, T8H);
+      uzp2(v25, v20, v22, T8H);
+      eor(v20, T16B, v24, v25);
     
-      v_uzp1(v26, v16, v18, T8H);
-      v_uzp2(v27, v16, v18, T8H);
-      v_eor(v16, T16B, v26, v27);
+      uzp1(v26, v16, v18, T8H);
+      uzp2(v27, v16, v18, T8H);
+      eor(v16, T16B, v26, v27);
     
-      v_ushll2(v22, T4S, v20, T8H, 8);
-      v_ushll(v20, T4S, v20, T4H, 8);
+      ushll2(v22, T4S, v20, T8H, 8);
+      ushll(v20, T4S, v20, T4H, 8);
     
-      v_ushll2(v18, T4S, v16, T8H, 8);
-      v_ushll(v16, T4S, v16, T4H, 8);
+      ushll2(v18, T4S, v16, T8H, 8);
+      ushll(v16, T4S, v16, T4H, 8);
     
-      v_eor(v22, T16B, v23, v22);
-      v_eor(v18, T16B, v19, v18);
-      v_eor(v20, T16B, v21, v20);
-      v_eor(v16, T16B, v17, v16);
+      eor(v22, T16B, v23, v22);
+      eor(v18, T16B, v19, v18);
+      eor(v20, T16B, v21, v20);
+      eor(v16, T16B, v17, v16);
     
-      v_uzp1(v17, v16, v20, T2D);
-      v_uzp2(v21, v16, v20, T2D);
-      v_eor(v16, T16B, v17, v21);
+      uzp1(v17, v16, v20, T2D);
+      uzp2(v21, v16, v20, T2D);
+      eor(v16, T16B, v17, v21);
     
-      v_ushll2(v20, T2D, v16, T4S, 16);
-      v_ushll(v16, T2D, v16, T2S, 16);
+      ushll2(v20, T2D, v16, T4S, 16);
+      ushll(v16, T2D, v16, T2S, 16);
     
-      v_eor(v20, T16B, v22, v20);
-      v_eor(v16, T16B, v16, v18);
+      eor(v20, T16B, v22, v20);
+      eor(v16, T16B, v16, v18);
     
-      v_uzp1(v17, v20, v16, T2D);
-      v_uzp2(v21, v20, v16, T2D);
-      v_eor(v20, T16B, v17, v21);
+      uzp1(v17, v20, v16, T2D);
+      uzp2(v21, v20, v16, T2D);
+      eor(v20, T16B, v17, v21);
     
-      v_shl(v16, v28, T2D, 1);
-      v_shl(v17, v20, T2D, 1);
+      shl(v16, v28, T2D, 1);
+      shl(v17, v20, T2D, 1);
     
-      v_eor(v0, T16B, v0, v16);
-      v_eor(v1, T16B, v1, v17);
+      eor(v0, T16B, v0, v16);
+      eor(v1, T16B, v1, v17);
 
       subs(len, len, 32);
       br(Assembler::GE, L_fold);
 
       mov(crc, 0);
-      v_mov(tmp, v0, T1D, 0);
+      mov(tmp, v0, T1D, 0);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
-      v_mov(tmp, v0, T1D, 1);
+      mov(tmp, v0, T1D, 1);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
-      v_mov(tmp, v1, T1D, 0);
+      mov(tmp, v1, T1D, 0);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
-      v_mov(tmp, v1, T1D, 1);
+      mov(tmp, v1, T1D, 1);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
 
@@ -4707,7 +4745,7 @@
 
 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
   relocInfo::relocType rtype = dest.rspec().reloc()->type();
-  if (labs(pc() - dest.target()) >= (1LL << 32)) {
+  if (uabs(pc() - dest.target()) >= (1LL << 32)) {
     guarantee(rtype == relocInfo::none
 	      || rtype == relocInfo::external_word_type
 	      || rtype == relocInfo::poll_type
@@ -4760,6 +4798,346 @@
   }
 }
 
+// Search for str1 in str2 and return index or -1
+void MacroAssembler::string_indexof(Register str2, Register str1,
+                                    Register cnt2, Register cnt1,
+                                    Register tmp1, Register tmp2,
+                                    Register tmp3, Register tmp4,
+                                    int icnt1, Register result) {
+  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
+
+  Register ch1 = rscratch1;
+  Register ch2 = rscratch2;
+  Register cnt1tmp = tmp1;
+  Register cnt2tmp = tmp2;
+  Register cnt1_neg = cnt1;
+  Register cnt2_neg = cnt2;
+  Register result_tmp = tmp4;
+
+  // Note, inline_string_indexOf() generates checks:
+  // if (substr.count > string.count) return -1;
+  // if (substr.count == 0) return 0;
+
+// We have two strings: a source string in str2/cnt2 and a pattern string
+// in str1/cnt1. Find the first occurrence of the pattern in the source,
+// or return -1.
+
+// For a larger pattern and source we use a simplified Boyer-Moore
+// algorithm; with a small pattern or source we use a linear scan.
+
+  if (icnt1 == -1) {
+    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
+    br(LO, LINEARSEARCH);       // a byte array.
+    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
+    br(HS, LINEARSEARCH);
+  }
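+
+  // A hedged restatement of the dispatch above:
+  //   if (cnt1 < 8 || cnt1 >= 256) goto LINEARSEARCH;  // skips are stored in bytes
+  //   if (cnt1 >= cnt2 / 4)        goto LINEARSEARCH;  // BM needs source > 4 * pattern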
+
+// The Boyer-Moore algorithm is based on the description here:
+//
+// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
+//
+// This describes an algorithm with two shift rules: the 'Bad Character'
+// rule and the 'Good Suffix' rule.
+//
+// These rules are essentially heuristics for how far we can shift the
+// pattern along the search string.
+//
+// The implementation here uses the 'Bad Character' rule only because of the
+// complexity of initialisation for the 'Good Suffix' rule.
+//
+// This is also known as the Boyer-Moore-Horspool algorithm:
+//
+// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
+//
+// #define ASIZE 128
+//
+//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
+//       int i, j;
+//       unsigned c;
+//       unsigned char bc[ASIZE];
+//    
+//       /* Preprocessing */
+//       for (i = 0; i < ASIZE; ++i)
+//          bc[i] = 0;
+//       for (i = 0; i < m - 1; ) {
+//          c = x[i];
+//          ++i;
+//          if (c < ASIZE) bc[c] = i;
+//       }
+//    
+//       /* Searching */
+//       j = 0;
+//       while (j <= n - m) {
+//          c = y[j+m-1];
+//          if (x[m-1] == c) {
+//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
+//            if (i < 0) return j;
+//          }
+//          if (c < ASIZE)
+//            j = j - bc[c] + m;
+//          else
+//            j += 1; // Advance by 1 only if char >= ASIZE
+//       }
+//       return -1;
+//    }
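+//
+//    Worked example (hedged): for x = "abcab" (m = 5) the preprocessing
+//    loop leaves bc['a'] = 4, bc['b'] = 2, bc['c'] = 3 (index + 1 of the
+//    last occurrence in x[0..m-2]).  If the last char of the current
+//    window is 'c', the pattern advances by m - bc['c'] = 2 positions.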
+
+  if (icnt1 == -1) {
+    BIND(BM);
+
+    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
+    Label BMADV, BMMATCH, BMCHECKEND;
+
+    Register cnt1end = tmp2;
+    Register str2end = cnt2;
+    Register skipch = tmp2;
+
+    // Restrict ASIZE to 128 to reduce stack space/initialisation.
+    // The presence of chars >= ASIZE in the target string does not affect
+    // performance, but we must be careful not to initialise them in the stack
+    // array.
+    // The presence of chars >= ASIZE in the source string may adversely affect
+    // performance since we can only advance by one when we encounter one.
+
+      stp(zr, zr, pre(sp, -128));
+      for (int i = 1; i < 8; i++)
+          stp(zr, zr, Address(sp, i*16));
+
+      mov(cnt1tmp, 0);
+      sub(cnt1end, cnt1, 1);
+    BIND(BCLOOP);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, 128);
+      add(cnt1tmp, cnt1tmp, 1);
+      br(HS, BCSKIP);
+      strb(cnt1tmp, Address(sp, ch1));
+    BIND(BCSKIP);
+      cmp(cnt1tmp, cnt1end);
+      br(LT, BCLOOP);
+
+      mov(result_tmp, str2);
+
+      sub(cnt2, cnt2, cnt1);
+      add(str2end, str2, cnt2, LSL, 1);
+    BIND(BMLOOPSTR2);
+      sub(cnt1tmp, cnt1, 1);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, skipch);
+      br(NE, BMSKIP);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(LT, BMMATCH);
+    BIND(BMLOOPSTR1);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, ch2);
+      br(NE, BMSKIP);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(GE, BMLOOPSTR1);
+    BIND(BMMATCH);
+      sub(result_tmp, str2, result_tmp);
+      lsr(result, result_tmp, 1);
+      add(sp, sp, 128);
+      b(DONE);
+    BIND(BMADV);
+      add(str2, str2, 2);
+      b(BMCHECKEND);
+    BIND(BMSKIP);
+      cmp(skipch, 128);
+      br(HS, BMADV);
+      ldrb(ch2, Address(sp, skipch));
+      add(str2, str2, cnt1, LSL, 1);
+      sub(str2, str2, ch2, LSL, 1);
+    BIND(BMCHECKEND);
+      cmp(str2, str2end);
+      br(LE, BMLOOPSTR2);
+      add(sp, sp, 128);
+      b(NOMATCH);
+  }
+
+  BIND(LINEARSEARCH);
+  {
+    Label DO1, DO2, DO3;
+
+    Register str2tmp = tmp2;
+    Register first = tmp3;
+
+    if (icnt1 == -1)
+    {
+        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
+
+        cmp(cnt1, 4);
+        br(LT, DOSHORT);
+
+        sub(cnt2, cnt2, cnt1);
+        sub(cnt1, cnt1, 4);
+        mov(result_tmp, cnt2);
+
+        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt1_neg, zr, cnt1, LSL, 1);
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+        ldr(first, Address(str1, cnt1_neg));
+
+      BIND(FIRST_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        cmp(first, ch2);
+        br(EQ, STR1_LOOP);
+      BIND(STR2_NEXT);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, FIRST_LOOP);
+        b(NOMATCH);
+
+      BIND(STR1_LOOP);
+        adds(cnt1tmp, cnt1_neg, 8);
+        add(cnt2tmp, cnt2_neg, 8);
+        br(GE, LAST_WORD);
+
+      BIND(STR1_NEXT);
+        ldr(ch1, Address(str1, cnt1tmp));
+        ldr(ch2, Address(str2, cnt2tmp));
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        adds(cnt1tmp, cnt1tmp, 8);
+        add(cnt2tmp, cnt2tmp, 8);
+        br(LT, STR1_NEXT);
+
+      BIND(LAST_WORD);
+        ldr(ch1, Address(str1));
+        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
+        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        b(MATCH);
+
+      BIND(DOSHORT);
+        cmp(cnt1, 2);
+        br(LT, DO1);
+        br(GT, DO3);
+    }
+
+    if (icnt1 == 4) {
+      Label CH1_LOOP;
+
+        ldr(ch1, str1);
+        sub(cnt2, cnt2, 4);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(CH1_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        cmp(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, CH1_LOOP);
+        b(NOMATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 2) {
+      Label CH1_LOOP;
+
+      BIND(DO2);
+        ldrw(ch1, str1);
+        sub(cnt2, cnt2, 2);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(CH1_LOOP);
+        ldrw(ch2, Address(str2, cnt2_neg));
+        cmp(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, CH1_LOOP);
+        b(NOMATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 3) {
+      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
+
+      BIND(DO3);
+        ldrw(first, str1);
+        ldrh(ch1, Address(str1, 4));
+
+        sub(cnt2, cnt2, 3);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(FIRST_LOOP);
+        ldrw(ch2, Address(str2, cnt2_neg));
+        cmpw(first, ch2);
+        br(EQ, STR1_LOOP);
+      BIND(STR2_NEXT);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, FIRST_LOOP);
+        b(NOMATCH);
+
+      BIND(STR1_LOOP);
+        add(cnt2tmp, cnt2_neg, 4);
+        ldrh(ch2, Address(str2, cnt2tmp));
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        b(MATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 1) {
+      Label CH1_LOOP, HAS_ZERO;
+      Label DO1_SHORT, DO1_LOOP;
+
+      BIND(DO1);
+        ldrh(ch1, str1);
+        cmp(cnt2, 4);
+        br(LT, DO1_SHORT);
+
+        orr(ch1, ch1, ch1, LSL, 16);
+        orr(ch1, ch1, ch1, LSL, 32);
+
+        sub(cnt2, cnt2, 4);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+        mov(tmp3, 0x0001000100010001);
+      BIND(CH1_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        eor(ch2, ch1, ch2);
+        sub(tmp1, ch2, tmp3);
+        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
+        bics(tmp1, tmp1, tmp2);
+        br(NE, HAS_ZERO);
+        adds(cnt2_neg, cnt2_neg, 8);
+        br(LT, CH1_LOOP);
+
+        cmp(cnt2_neg, 8);
+        mov(cnt2_neg, 0);
+        br(LT, CH1_LOOP);
+        b(NOMATCH);
+
+      BIND(HAS_ZERO);
+        rev(tmp1, tmp1);
+        clz(tmp1, tmp1);
+        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
+        b(MATCH);
+
+      BIND(DO1_SHORT);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+      BIND(DO1_LOOP);
+        ldrh(ch2, Address(str2, cnt2_neg));
+        cmpw(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LT, DO1_LOOP);
+    }
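+
+    // The DO1 loop above relies on the SWAR "has-zero" trick applied to
+    // 16-bit lanes.  A hedged C model, with tmp3 = 0x0001000100010001:
+    //   uint64_t x = ch2 ^ ch1;                      // lane is 0 iff it matched
+    //   uint64_t t = (x - 0x0001000100010001ul)
+    //                & ~(x | 0x7fff7fff7fff7ffful);  // bics tmp1, tmp1, tmp2
+    //   // t is non-zero iff some lane of x is zero; the lowest such
+    //   // lane is flagged exactly, which is what HAS_ZERO recovers.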
+  }
+  BIND(NOMATCH);
+    mov(result, -1);
+    b(DONE);
+  BIND(MATCH);
+    add(result, result_tmp, cnt2_neg, ASR, 1);
+  BIND(DONE);
+}
+
 // Compare strings.
 void MacroAssembler::string_compare(Register str1, Register str2,
                                     Register cnt1, Register cnt2, Register result,
@@ -4919,3 +5297,72 @@
 
   BLOCK_COMMENT("} string_equals");
 }
+
+
+// Compare char[] arrays aligned to 4 bytes
+void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
+                                        Register result, Register tmp1)
+{
+  Register cnt1 = rscratch1;
+  Register cnt2 = rscratch2;
+  Register tmp2 = rscratch2;
+
+  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
+
+  int length_offset  = arrayOopDesc::length_offset_in_bytes();
+  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+  BLOCK_COMMENT("char_arrays_equals  {");
+
+    // different until proven equal
+    mov(result, false);
+
+    // same array?
+    cmp(ary1, ary2);
+    br(Assembler::EQ, SAME);
+
+    // ne if either null
+    cbz(ary1, DIFFER);
+    cbz(ary2, DIFFER);
+
+    // lengths ne?
+    ldrw(cnt1, Address(ary1, length_offset));
+    ldrw(cnt2, Address(ary2, length_offset));
+    cmp(cnt1, cnt2);
+    br(Assembler::NE, DIFFER);
+
+    lea(ary1, Address(ary1, base_offset));
+    lea(ary2, Address(ary2, base_offset));
+
+    subs(cnt1, cnt1, 4);
+    br(LT, TAIL03);
+
+  BIND(NEXT);
+    ldr(tmp1, Address(post(ary1, 8)));
+    ldr(tmp2, Address(post(ary2, 8)));
+    subs(cnt1, cnt1, 4);
+    eor(tmp1, tmp1, tmp2);
+    cbnz(tmp1, DIFFER);
+    br(GE, NEXT);
+
+  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
+    tst(cnt1, 0b10);
+    br(EQ, TAIL01);
+    ldrw(tmp1, Address(post(ary1, 4)));
+    ldrw(tmp2, Address(post(ary2, 4)));
+    cmp(tmp1, tmp2);
+    br(NE, DIFFER);
+  BIND(TAIL01);  // 0-1 chars left
+    tst(cnt1, 0b01);
+    br(EQ, SAME);
+    ldrh(tmp1, ary1);
+    ldrh(tmp2, ary2);
+    cmp(tmp1, tmp2);
+    br(NE, DIFFER);
+
+  BIND(SAME);
+    mov(result, true);
+  BIND(DIFFER);	// result already set
+  
+  BLOCK_COMMENT("} char_arrays_equals");
+}
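+
+// A hedged C model of the comparison strategy above (model only, not
+// the emitted code):
+//
+//   static bool model(const unsigned short *a, const unsigned short *b, int n) {
+//     int i = 0;
+//     for (; i + 4 <= n; i += 4)                        // NEXT: 8 bytes per step
+//       if (memcmp(a + i, b + i, 8) != 0) return false;
+//     if (n & 2) {                                      // TAIL03: 2-char tail
+//       if (memcmp(a + i, b + i, 4) != 0) return false;
+//       i += 2;
+//     }
+//     return (n & 1) ? a[i] == b[i] : true;             // TAIL01: final char
+//   }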
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -322,6 +322,29 @@
   enum operation { uxtb, uxth, uxtw, uxtx, sxtb, sxth, sxtw, sxtx };
 };
 
+// abs methods which cannot overflow and so are well-defined across
+// the entire domain of integer types.
+static inline unsigned int uabs(unsigned int n) {
+  union {
+    unsigned int result;
+    int value;
+  };
+  result = n;
+  if (value < 0) result = -result;
+  return result;
+}
+static inline unsigned long uabs(unsigned long n) {
+  union {
+    unsigned long result;
+    long value;
+  };
+  result = n;
+  if (value < 0) result = -result;
+  return result;
+}
+static inline unsigned long uabs(long n) { return uabs((unsigned long)n); }
+static inline unsigned long uabs(int n) { return uabs((unsigned int)n); }
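
The point of going through unsigned arithmetic: plain abs() is undefined on the most negative value, since -INT_MIN and -LONG_MIN are not representable, while unsigned negation simply wraps and keeps the magnitude. A tiny illustration (not HotSpot code; assumes 32-bit int):

#include <cassert>
#include <climits>

int main() {
  // abs(INT_MIN) overflows int and is undefined behaviour; the same
  // negation done on the unsigned type is fully defined and yields
  // the correct magnitude, which is what uabs() above relies on.
  unsigned int u = (unsigned int)INT_MIN;   // 0x80000000 with 32-bit int
  assert(0u - u == 0x80000000u);
  return 0;
}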
+
 // Addressing modes
 class Address VALUE_OBJ_CLASS_SPEC {
  public:
@@ -419,15 +442,16 @@
     }
   }
 
-  Register base() {
-    guarantee((_mode == base_plus_offset | _mode == base_plus_offset_reg),
+  Register base() const {
+    guarantee((_mode == base_plus_offset || _mode == base_plus_offset_reg
+               || _mode == post),
 	      "wrong mode");
     return _base;
   }
-  long offset() {
+  long offset() const {
     return _offset;
   }
-  Register index() {
+  Register index() const {
     return _index;
   }
   mode getMode() const {
@@ -548,7 +572,7 @@
   static bool offset_ok_for_immed(long offset, int shift = 0) {
     unsigned mask = (1 << shift) - 1;
     if (offset < 0 || offset & mask) {
-      return (abs(offset) < (1 << (20 - 12))); // Unscaled offset
+      return (uabs(offset) < (1 << (20 - 12))); // Unscaled offset
     } else {
       return ((offset >> shift) < (1 << (21 - 10 + 1))); // Scaled, unsigned offset
     }
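
The two arms mirror the two A64 immediate addressing forms: LDUR/STUR take a signed 9-bit byte offset, hence the magnitude test against 256 (and the reason plain abs() had to go -- offset is a long), while LDR/STR take an unsigned 12-bit offset scaled by the access size. An illustrative restatement (names mine, not HotSpot code):

#include <cassert>

// shift is log2 of the access size: 0 = byte, 1 = halfword,
// 2 = word, 3 = doubleword.
static bool offset_fits(long offset, int shift) {
  unsigned long mask = (1UL << shift) - 1;
  unsigned long mag  = offset < 0 ? -(unsigned long)offset : (unsigned long)offset;
  if (offset < 0 || (offset & mask))
    return mag < 256;                              // LDUR/STUR: signed imm9, unscaled
  return ((unsigned long)offset >> shift) < 4096;  // LDR/STR: unsigned imm12, scaled
}

int main() {
  assert(offset_fits(4, 2));       // ldr  w0, [x1, #4]
  assert(offset_fits(-8, 3));      // ldur x0, [x1, #-8]
  assert(!offset_fits(32768, 3));  // 32768 >> 3 == 4096: out of imm12 range
  return 0;
}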
@@ -1250,12 +1274,6 @@
       f(size & 0b01, 31, 30), f(0b011, 29, 27), f(0b00, 25, 24);
       long offset = (adr.target() - pc()) >> 2;
       sf(offset, 23, 5);
-#if 0
-      Relocation* reloc = adr.rspec().reloc();
-      relocInfo::relocType rtype = (relocInfo::relocType) reloc->type();
-      assert(rtype == relocInfo::internal_word_type,
-	     "only internal_word_type relocs make sense here");
-#endif
       // code_section()->relocate(pc(), adr.rspec());
       relocate(pc(), adr.rspec());
       return;
@@ -1855,7 +1873,7 @@
  * We just use FloatRegister in the following. They are exactly the same
  * as SIMD registers.
  */
-public:
+ public:
 
   enum SIMD_Arrangement {
        T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
@@ -1865,7 +1883,136 @@
        S32, D64, Q128
   };
 
-  void v_shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+ private:
+
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int op1, int op2) {
+    starti;
+    f(0, 31), f((int)T & 1, 30);
+    f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+             int imm, int op1, int op2) {
+    starti;
+    f(0, 31), f((int)T & 1, 30);
+    f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+             Register Xm, int op1, int op2) {
+    starti;
+    f(0, 31), f((int)T & 1, 30);
+    f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12);
+    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  }
+
+  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) {
+    switch (a.getMode()) {
+    case Address::base_plus_offset:
+      guarantee(a.offset() == 0, "no offset allowed here");
+      ld_st(Vt, T, a.base(), op1, op2);
+      break;
+    case Address::post:
+      ld_st(Vt, T, a.base(), a.offset(), op1, op2);
+      break;
+    case Address::base_plus_offset_reg:
+      ld_st(Vt, T, a.base(), a.index(), op1, op2);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+  }
+
+ public:
+
+#define INSN1(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, SIMD_Arrangement T, const Address &a) {	\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+#define INSN2(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, const Address &a) { \
+    assert(Vt->successor() == Vt2, "Registers must be ordered");	\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+#define INSN3(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,	\
+            SIMD_Arrangement T, const Address &a) {			\
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3,		\
+           "Registers must be ordered");				\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+#define INSN4(NAME, op1, op2)						\
+  void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,	\
+            FloatRegister Vt4, SIMD_Arrangement T, const Address &a) {	\
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3 &&		\
+           Vt3->successor() == Vt4, "Registers must be ordered");	\
+    ld_st(Vt, T, a, op1, op2);						\
+  }
+
+  INSN1(ld1,  0b001100010, 0b0111);
+  INSN2(ld1,  0b001100010, 0b1010);
+  INSN3(ld1,  0b001100010, 0b0110);
+  INSN4(ld1,  0b001100010, 0b0010);
+
+  INSN2(ld2,  0b001100010, 0b1000);
+  INSN3(ld3,  0b001100010, 0b0100);
+  INSN4(ld4,  0b001100010, 0b0000);
+
+  INSN1(st1,  0b001100000, 0b0111);
+  INSN2(st1,  0b001100000, 0b1010);
+  INSN3(st1,  0b001100000, 0b0110);
+  INSN4(st1,  0b001100000, 0b0010);
+
+  INSN2(st2,  0b001100000, 0b1000);
+  INSN3(st3,  0b001100000, 0b0100);
+  INSN4(st4,  0b001100000, 0b0000);
+
+  INSN1(ld1r, 0b001101010, 0b1100);
+  INSN2(ld2r, 0b001101011, 0b1100);
+  INSN3(ld3r, 0b001101010, 0b1110);
+  INSN4(ld4r, 0b001101011, 0b1110);
+
+#undef INSN1
+#undef INSN2
+#undef INSN3
+#undef INSN4
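
Each mnemonic above is a thin wrapper over the single ld_st dispatcher, so one name now covers the plain, post-indexed and register-offset addressing forms, and the INSN2-INSN4 asserts enforce the consecutive register lists the hardware encoding requires. Illustrative calls, as they might appear in a stub (a sketch assuming the usual v0..v3 / r0 names from this port, not code from this changeset):

__ ld1(v0, __ T16B, Address(r0));                   // ld1 {v0.16b}, [x0]
__ ld1(v0, v1, __ T16B, Address(__ post(r0, 32)));  // ld1 {v0.16b, v1.16b}, [x0], #32
__ st1(v0, v1, v2, v3, __ T2D, Address(r0));        // st1 {v0.2d-v3.2d}, [x0]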
+
+#define INSN(NAME, opc)                                                                 \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    assert(T == T8B || T == T16B, "must be T8B or T16B");                               \
+    f(0, 31), f((int)T & 1, 30), f(opc, 29, 21);                                        \
+    rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);                              \
+  }
+
+  INSN(eor, 0b101110001);
+  INSN(orr, 0b001110101);
+  INSN(andr, 0b001110001);
+  INSN(bic, 0b001110011);
+  INSN(bif, 0b101110111);
+  INSN(bit, 0b101110101);
+  INSN(bsl, 0b101110011);
+  INSN(orn, 0b001110111);
+
+#undef INSN
+
+#define INSN(NAME, opc)                           \
+  void NAME(FloatRegister Vd, FloatRegister Vn) { \
+    starti;                                       \
+    f(opc, 31, 10), rf(Vn, 5), rf(Vd, 0);         \
+  }
+
+  INSN(aese, 0b0100111000101000010010);
+  INSN(aesd, 0b0100111000101000010110);
+  INSN(aesmc, 0b0100111000101000011010);
+  INSN(aesimc, 0b0100111000101000011110);
+
+#undef INSN
+
+  void shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
     starti;
     /* The encodings for the immh:immb fields (bits 22:16) are
      *   0001 xxx	8B/16B, shift = xxx
@@ -1878,7 +2025,7 @@
     f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
   }
 
-  void v_ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+  void ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
     starti;
     /* The encodings for the immh:immb fields (bits 22:16) are
      *   0001 xxx	8H, 8B/16b shift = xxx
@@ -1891,22 +2038,22 @@
     f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
     f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
   }
-  void v_ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,  SIMD_Arrangement Tb, int shift) {
-    v_ushll(Vd, Ta, Vn, Tb, shift);
+  void ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn,  SIMD_Arrangement Tb, int shift) {
+    ushll(Vd, Ta, Vn, Tb, shift);
   }
 
-  void v_uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T, int op = 0){
+  void uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T, int op = 0){
     starti;
     f(0, 31), f((T & 0x1), 30), f(0b001110, 29, 24), f((T >> 1), 23, 22), f(0, 21);
     rf(Vm, 16), f(0, 15), f(op, 14), f(0b0110, 13, 10), rf(Vn, 5), rf(Vd, 0);
   }
-  void v_uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T){
-    v_uzp1(Vd, Vn, Vm, T, 1);
+  void uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,  SIMD_Arrangement T){
+    uzp1(Vd, Vn, Vm, T, 1);
   }
  
   // Move from general purpose register
   //   mov  Vd.T[index], Rn
-  void v_mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
+  void mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
     starti;
     f(0b01001110000, 31, 21), f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16); 
     f(0b000111, 15, 10), rf(Xn, 5), rf(Vd, 0);
@@ -1914,7 +2061,7 @@
 
   // Move to general purpose register
   //   mov  Rd, Vn.T[index]
-  void v_mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
+  void mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
     starti;
     f(0, 31), f((T >= T1D) ? 1:0, 30), f(0b001110000, 29, 21);
     f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
@@ -1922,149 +2069,23 @@
   }
 
   // We do not handle the 1Q arrangement.
-  void v_pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+  void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
     starti;
     assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
     f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
     rf(Vn, 5), rf(Vd, 0);
   }
-  void v_pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
-    v_pmull(Vd, Ta, Vn, Vm, Tb);
-  }
-
-  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
-    starti;
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0,31), f((int)T & 1, 30), f(0b00110001000000, 29, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+  void pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+    pmull(Vd, Ta, Vn, Vm, Tb);
   }
 
-  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((8 << ((int)T & 1)) == imm, "size/imm mismatch");      
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((16 << ((int)T & 1)) == imm, "size/imm mismatch");     
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((24 << ((int)T & 1)) == imm, "size/imm mismatch");
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn, Register Xm) {
+  void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
     starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((32 << ((int)T & 1)) == imm, "size/imm mismatch");
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), f(0b11111, 20, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0, 31), f((int)T & 1, 30), f(0b001100110, 29, 21), rf(Xm, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-
-  void v_st1(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0111, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_st1(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b1010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }  
-  void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0110, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_st1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, FloatRegister Vt4, SIMD_Arrangement T, Register Xn) {
-    starti;
-    assert((Vt2->encoding_nocheck()) == ((Vt->encoding_nocheck() + 1) % 32), "Invalid Vt2");
-    assert((Vt3->encoding_nocheck()) == ((Vt->encoding_nocheck() + 2) % 32), "Invalid Vt3");
-    assert((Vt4->encoding_nocheck()) == ((Vt->encoding_nocheck() + 3) % 32), "Invalid Vt4");
-    f(0, 31), f((int)T & 1, 30), f(0b00110000000000, 29, 16), f(0b0010, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-
-  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b001101010000001100, 29, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm) {
-    starti;
-    f(0, 31), f((int)T & 1, 30), f(0b001101110, 29, 21), rf(Xm, 16);
-    f(0b1100, 15, 12), f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-  void v_ld1r(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm) {
-    starti;
-    assert((1 << ((int)T & 3)) == imm, "size/imm mismatch");
-    f(0, 31), f((int)T & 1, 30), f(0b001101110111111100, 29, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
-  }
-
-  void v_eor(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) {
-    starti;
-    assert(T == T8B || T == T16B, "must be T8B or T16B");
-    f(0, 31), f((int)T & 1, 30), f(0b101110001, 29, 21);
-    rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);
+    assert(T <= T8H, "must be one of T8B, T16B, T4H, T8H");
+    f(0, 31), f((int)T & 1, 30), f(0b101110, 29, 24);
+    f(T <= T16B ? 0b00 : 0b01, 23, 22), f(0b100000000010, 21, 10);
+    rf(Vn, 5), rf(Vd, 0);
   }
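
rev32 reverses the elements within each 32-bit container, so under T8B/T16B it byte-swaps every word and under T4H/T8H it swaps halfword pairs; the ISA only defines it for 8- and 16-bit elements, hence the assert. A scalar model of one 64-bit lane in the T8B case (illustrative only):

#include <cstdint>

// Mirror the bytes within each 32-bit element of v (the T8B case).
static uint64_t rev32_t8b(uint64_t v) {
  uint64_t out = 0;
  for (int i = 0; i < 8; i++) {
    int j = (i & ~3) | (3 - (i & 3));   // mirrored position within its word
    out |= ((v >> (8 * i)) & 0xff) << (8 * j);
  }
  return out;
}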
 
   // CRC32 instructions
@@ -2259,6 +2280,8 @@
 class MacroAssembler: public Assembler {
   friend class LIR_Assembler;
 
+  using Assembler::mov;
+
  protected:
 
   // Support for VM calls
@@ -2396,6 +2419,13 @@
     csincw(Rd, zr, zr, ~cond);
   }
 
+  void cneg(Register Rd, Register Rn, Assembler::Condition cond) {
+    csneg(Rd, Rn, Rn, ~cond);
+  }
+  void cnegw(Register Rd, Register Rn, Assembler::Condition cond) {
+    csnegw(Rd, Rn, Rn, ~cond);
+  }
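
csneg(Rd, Rn, Rm, cond) writes Rn when cond holds and -Rm otherwise, so passing the same source register twice under the inverted condition (~cond flips the low bit of the A64 condition encoding, which inverts it) gives a conditional negate. A minimal model (illustrative; bool stands in for the tested NZCV predicate):

#include <cassert>
#include <cstdint>

static int64_t csneg(bool cond, int64_t rn, int64_t rm) {
  return cond ? rn : -rm;          // the A64 csneg semantics
}
static int64_t cneg(bool cond, int64_t rn) {
  return csneg(!cond, rn, rn);     // invert, as ~cond does above
}

int main() {
  assert(cneg(true, 5)  == -5);    // condition holds: negate
  assert(cneg(false, 5) == 5);     // condition fails: pass through
  return 0;
}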
+
   inline void movw(Register Rd, Register Rn) {
     if (Rd == sp || Rn == sp) {
       addw(Rd, Rn, 0U);
@@ -2703,7 +2733,10 @@
 
   // Required platform-specific helpers for Label::patch_instructions.
   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
-  static void pd_patch_instruction(address branch, address target);
+  static int pd_patch_instruction_size(address branch, address target);
+  static void pd_patch_instruction(address branch, address target) {
+    pd_patch_instruction_size(branch, target);
+  }
   static address pd_call_destination(address branch) {
     unsigned insn = *(unsigned*)branch;
     return target_addr_for_insn(branch, insn);
@@ -2712,7 +2745,7 @@
   static void pd_print_patched_instruction(address branch);
 #endif
 
-  static void patch_oop(address insn_addr, address o);
+  static int patch_oop(address insn_addr, address o);
 
   // The following 4 methods return the offset of the appropriate move instruction
 
@@ -2909,19 +2942,6 @@
   void store_check_part_1(Register obj);
   void store_check_part_2(Register obj);
 
-  // currently unimplemented
-#if 0
-  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
-  void c2bool(Register x);
-
-  // C++ bool manipulation
-
-  void movbool(Register dst, Address src);
-  void movbool(Address dst, bool boolconst);
-  void movbool(Address dst, Register src);
-  void testbool(Register dst);
-#endif
-
   // oop manipulations
   void load_klass(Register dst, Register src);
   void store_klass(Register dst, Register src);
@@ -2932,7 +2952,6 @@
   void load_heap_oop_not_null(Register dst, Address src);
   void store_heap_oop(Address dst, Register src);
 
-  // currently unimplemented
   // Used for storing NULL. All other oop constants should be
   // stored using routines that take a jobject.
   void store_heap_oop_null(Address dst);
@@ -2957,23 +2976,12 @@
   void decode_heap_oop_not_null(Register dst, Register src);
 
   void set_narrow_oop(Register dst, jobject obj);
-  // currently unimplemented
-#if 0
-  void set_narrow_oop(Address dst, jobject obj);
-  void cmp_narrow_oop(Register dst, jobject obj);
-  void cmp_narrow_oop(Address dst, jobject obj);
-#endif
 
   // if heap base register is used - reinit it with the correct value
   void reinit_heapbase();
 
   DEBUG_ONLY(void verify_heapbase(const char* msg);)
 
-  // currently unimplemented
-#if 0
-  void int3();
-#endif
-
   void push_CPU_state();
   void pop_CPU_state() ;
 
@@ -3470,12 +3478,21 @@
         Register table0, Register table1, Register table2, Register table3,
         bool upper = false);
 
+  void string_indexof(Register str1, Register str2,
+                      Register cnt1, Register cnt2,
+                      Register tmp1, Register tmp2,
+                      Register tmp3, Register tmp4,
+                      int int_cnt1, Register result);
   void string_compare(Register str1, Register str2,
 		      Register cnt1, Register cnt2, Register result,
 		      Register tmp1);
   void string_equals(Register str1, Register str2,
 		     Register cnt, Register result,
 		     Register tmp1);
+  void char_arrays_equals(Register ary1, Register ary2,
+                          Register result, Register tmp1);
+  // an ISB may be needed after a runtime call that crossed a
+  // safepoint, in case code was patched while we were stopped
+  void maybe_isb() { isb(); }
 };
 
 #ifdef ASSERT
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -503,6 +503,7 @@
   __ str(r0, Address(rthread, JavaThread::saved_exception_pc_offset()));
   __ mov(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::get_poll_stub));
   __ blrt(rscratch1, 1, 0, 1);
+  __ maybe_isb();
   __ pop(0x3ffffffc, sp);          // integer registers except lr & sp & r0 & r1
   __ mov(rscratch1, r0);
   __ pop(0x3, sp);                 // r0 & r1
@@ -2681,6 +2682,7 @@
   if (info != NULL) {
     add_call_info_here(info);
   }
+  __ maybe_isb();
 }
 
 void LIR_Assembler::volatile_move_op(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info) {
@@ -2723,7 +2725,9 @@
 
 void LIR_Assembler::membar_storeload() { __ membar(MacroAssembler::StoreLoad); }
 
-void LIR_Assembler::get_thread(LIR_Opr result_reg) { Unimplemented(); }
+void LIR_Assembler::get_thread(LIR_Opr result_reg) {
+  __ mov(result_reg->as_register(), rthread);
+}
 
 
 void LIR_Assembler::peephole(LIR_List *lir) {
--- a/src/cpu/aarch64/vm/c1_LinearScan_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_LinearScan_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -35,1211 +35,5 @@
 //----------------------------------------------------------------------
 
 void LinearScan::allocate_fpu_stack() {
-  // First compute which FPU registers are live at the start of each basic block
-  // (To minimize the amount of work we have to do if we have to merge FPU stacks)
-  if (ComputeExactFPURegisterUsage) {
-    Interval* intervals_in_register, *intervals_in_memory;
-    create_unhandled_lists(&intervals_in_register, &intervals_in_memory, is_in_fpu_register, NULL);
-
-    // ignore memory intervals by overwriting intervals_in_memory
-    // the dummy interval is needed to enforce the walker to walk until the given id:
-    // without it, the walker stops when the unhandled-list is empty -> live information
-    // beyond this point would be incorrect.
-    Interval* dummy_interval = new Interval(any_reg);
-    dummy_interval->add_range(max_jint - 2, max_jint - 1);
-    dummy_interval->set_next(Interval::end());
-    intervals_in_memory = dummy_interval;
-
-    IntervalWalker iw(this, intervals_in_register, intervals_in_memory);
-
-    const int num_blocks = block_count();
-    for (int i = 0; i < num_blocks; i++) {
-      BlockBegin* b = block_at(i);
-
-      // register usage is only needed for merging stacks -> compute only
-      // when more than one predecessor.
-      // the block must not have any spill moves at the beginning (checked by assertions)
-      // spill moves would use intervals that are marked as handled and so the usage bit
-      // would been set incorrectly
-
-      // NOTE: the check for number_of_preds > 1 is necessary. A block with only one
-      //       predecessor may have spill moves at the begin of the block.
-      //       If an interval ends at the current instruction id, it is not possible
-      //       to decide if the register is live or not at the block begin -> the
-      //       register information would be incorrect.
-      if (b->number_of_preds() > 1) {
-        int id = b->first_lir_instruction_id();
-        BitMap regs(FrameMap::nof_fpu_regs);
-        regs.clear();
-
-        iw.walk_to(id);   // walk after the first instruction (always a label) of the block
-        assert(iw.current_position() == id, "did not walk completely to id");
-
-        // Only consider FPU values in registers
-        Interval* interval = iw.active_first(fixedKind);
-        while (interval != Interval::end()) {
-          int reg = interval->assigned_reg();
-          assert(reg >= pd_first_fpu_reg && reg <= pd_last_fpu_reg, "no fpu register");
-          assert(interval->assigned_regHi() == -1, "must not have hi register (doubles stored in one register)");
-          assert(interval->from() <= id && id < interval->to(), "interval out of range");
-
-#ifndef PRODUCT
-          if (TraceFPURegisterUsage) {
-            tty->print("fpu reg %d is live because of ", reg - pd_first_fpu_reg); interval->print();
-          }
-#endif
-
-          regs.set_bit(reg - pd_first_fpu_reg);
-          interval = interval->next();
-        }
-
-        b->set_fpu_register_usage(regs);
-
-#ifndef PRODUCT
-        if (TraceFPURegisterUsage) {
-          tty->print("FPU regs for block %d, LIR instr %d): ", b->block_id(), id); regs.print_on(tty); tty->print_cr("");
-        }
-#endif
-      }
-    }
-  }
-
-#ifndef TARGET_ARCH_aarch64
-  FpuStackAllocator alloc(ir()->compilation(), this);
-  _fpu_stack_allocator = &alloc;
-  alloc.allocate();
-  _fpu_stack_allocator = NULL;
-#endif
-}
-
-
-FpuStackAllocator::FpuStackAllocator(Compilation* compilation, LinearScan* allocator)
-  : _compilation(compilation)
-  , _lir(NULL)
-  , _pos(-1)
-  , _allocator(allocator)
-  , _sim(compilation)
-  , _temp_sim(compilation)
-{}
-
-void FpuStackAllocator::allocate() {
-  int num_blocks = allocator()->block_count();
-  for (int i = 0; i < num_blocks; i++) {
-    // Set up to process block
-    BlockBegin* block = allocator()->block_at(i);
-    intArray* fpu_stack_state = block->fpu_stack_state();
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->cr();
-      tty->print_cr("------- Begin of new Block %d -------", block->block_id());
-    }
-#endif
-
-    assert(fpu_stack_state != NULL ||
-           block->end()->as_Base() != NULL ||
-           block->is_set(BlockBegin::exception_entry_flag),
-           "FPU stack state must be present due to linear-scan order for FPU stack allocation");
-    // note: exception handler entries always start with an empty fpu stack
-    //       because stack merging would be too complicated
-
-    if (fpu_stack_state != NULL) {
-      sim()->read_state(fpu_stack_state);
-    } else {
-      sim()->clear();
-    }
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Reading FPU state for block %d:", block->block_id());
-      sim()->print();
-      tty->cr();
-    }
-#endif
-
-    allocate_block(block);
-    CHECK_BAILOUT();
-  }
-}
-
-void FpuStackAllocator::allocate_block(BlockBegin* block) {
-  bool processed_merge = false;
-  LIR_OpList* insts = block->lir()->instructions_list();
-  set_lir(block->lir());
-  set_pos(0);
-
-
-  // Note: insts->length() may change during loop
-  while (pos() < insts->length()) {
-    LIR_Op* op = insts->at(pos());
-    _debug_information_computed = false;
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      op->print();
-    }
-    check_invalid_lir_op(op);
-#endif
-
-    LIR_OpBranch* branch = op->as_OpBranch();
-    LIR_Op1* op1 = op->as_Op1();
-    LIR_Op2* op2 = op->as_Op2();
-    LIR_OpCall* opCall = op->as_OpCall();
-
-    if (branch != NULL && branch->block() != NULL) {
-      if (!processed_merge) {
-        // propagate stack at first branch to a successor
-        processed_merge = true;
-        bool required_merge = merge_fpu_stack_with_successors(block);
-
-        assert(!required_merge || branch->cond() == lir_cond_always, "splitting of critical edges should prevent FPU stack mismatches at cond branches");
-      }
-
-    } else if (op1 != NULL) {
-      handle_op1(op1);
-    } else if (op2 != NULL) {
-      handle_op2(op2);
-    } else if (opCall != NULL) {
-      handle_opCall(opCall);
-    }
-
-    compute_debug_information(op);
-
-    set_pos(1 + pos());
-  }
-
-  // Propagate stack when block does not end with branch
-  if (!processed_merge) {
-    merge_fpu_stack_with_successors(block);
-  }
-}
-
-
-void FpuStackAllocator::compute_debug_information(LIR_Op* op) {
-  if (!_debug_information_computed && op->id() != -1 && allocator()->has_info(op->id())) {
-    visitor.visit(op);
-
-    // exception handling
-    if (allocator()->compilation()->has_exception_handlers()) {
-      XHandlers* xhandlers = visitor.all_xhandler();
-      int n = xhandlers->length();
-      for (int k = 0; k < n; k++) {
-        allocate_exception_handler(xhandlers->handler_at(k));
-      }
-    } else {
-      assert(visitor.all_xhandler()->length() == 0, "missed exception handler");
-    }
-
-    // compute debug information
-    int n = visitor.info_count();
-    assert(n > 0, "should not visit operation otherwise");
-
-    for (int j = 0; j < n; j++) {
-      CodeEmitInfo* info = visitor.info_at(j);
-      // Compute debug information
-      allocator()->compute_debug_info(info, op->id());
-    }
-  }
-  _debug_information_computed = true;
-}
-
-void FpuStackAllocator::allocate_exception_handler(XHandler* xhandler) {
-  if (!sim()->is_empty()) {
-    LIR_List* old_lir = lir();
-    int old_pos = pos();
-    intArray* old_state = sim()->write_state();
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->cr();
-      tty->print_cr("------- begin of exception handler -------");
-    }
-#endif
-
-    if (xhandler->entry_code() == NULL) {
-      // need entry code to clear FPU stack
-      LIR_List* entry_code = new LIR_List(_compilation);
-      entry_code->jump(xhandler->entry_block());
-      xhandler->set_entry_code(entry_code);
-    }
-
-    LIR_OpList* insts = xhandler->entry_code()->instructions_list();
-    set_lir(xhandler->entry_code());
-    set_pos(0);
-
-    // Note: insts->length() may change during loop
-    while (pos() < insts->length()) {
-      LIR_Op* op = insts->at(pos());
-
-#ifndef PRODUCT
-      if (TraceFPUStack) {
-        op->print();
-      }
-      check_invalid_lir_op(op);
-#endif
-
-      switch (op->code()) {
-        case lir_move:
-          assert(op->as_Op1() != NULL, "must be LIR_Op1");
-          assert(pos() != insts->length() - 1, "must not be last operation");
-
-          handle_op1((LIR_Op1*)op);
-          break;
-
-        case lir_branch:
-          assert(op->as_OpBranch()->cond() == lir_cond_always, "must be unconditional branch");
-          assert(pos() == insts->length() - 1, "must be last operation");
-
-          // remove all remaining dead registers from FPU stack
-          clear_fpu_stack(LIR_OprFact::illegalOpr);
-          break;
-
-        default:
-          // other operations not allowed in exception entry code
-          ShouldNotReachHere();
-      }
-
-      set_pos(pos() + 1);
-    }
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->cr();
-      tty->print_cr("------- end of exception handler -------");
-    }
-#endif
-
-    set_lir(old_lir);
-    set_pos(old_pos);
-    sim()->read_state(old_state);
-  }
-}
-
-
-int FpuStackAllocator::fpu_num(LIR_Opr opr) {
-  assert(opr->is_fpu_register() && !opr->is_xmm_register(), "shouldn't call this otherwise");
-  return opr->is_single_fpu() ? opr->fpu_regnr() : opr->fpu_regnrLo();
-}
-
-int FpuStackAllocator::tos_offset(LIR_Opr opr) {
-  return sim()->offset_from_tos(fpu_num(opr));
-}
-
-
-LIR_Opr FpuStackAllocator::to_fpu_stack(LIR_Opr opr) {
-  assert(opr->is_fpu_register() && !opr->is_xmm_register(), "shouldn't call this otherwise");
-
-  int stack_offset = tos_offset(opr);
-  if (opr->is_single_fpu()) {
-    return LIR_OprFact::single_fpu(stack_offset)->make_fpu_stack_offset();
-  } else {
-    assert(opr->is_double_fpu(), "shouldn't call this otherwise");
-    return LIR_OprFact::double_fpu(stack_offset)->make_fpu_stack_offset();
-  }
-}
-
-LIR_Opr FpuStackAllocator::to_fpu_stack_top(LIR_Opr opr, bool dont_check_offset) {
-  assert(opr->is_fpu_register() && !opr->is_xmm_register(), "shouldn't call this otherwise");
-  assert(dont_check_offset || tos_offset(opr) == 0, "operand is not on stack top");
-
-  int stack_offset = 0;
-  if (opr->is_single_fpu()) {
-    return LIR_OprFact::single_fpu(stack_offset)->make_fpu_stack_offset();
-  } else {
-    assert(opr->is_double_fpu(), "shouldn't call this otherwise");
-    return LIR_OprFact::double_fpu(stack_offset)->make_fpu_stack_offset();
-  }
-}
-
-
-
-void FpuStackAllocator::insert_op(LIR_Op* op) {
-  lir()->insert_before(pos(), op);
-  set_pos(1 + pos());
-}
-
-
-void FpuStackAllocator::insert_exchange(int offset) {
-  if (offset > 0) {
-    LIR_Op1* fxch_op = new LIR_Op1(lir_fxch, LIR_OprFact::intConst(offset), LIR_OprFact::illegalOpr);
-    insert_op(fxch_op);
-    sim()->swap(offset);
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Exchanged register: %d         New state: ", sim()->get_slot(0)); sim()->print(); tty->cr();
-    }
-#endif
-
-  }
-}
-
-void FpuStackAllocator::insert_exchange(LIR_Opr opr) {
-  insert_exchange(tos_offset(opr));
-}
-
-
-void FpuStackAllocator::insert_free(int offset) {
-  // move stack slot to the top of stack and then pop it
-  insert_exchange(offset);
-
-  LIR_Op* fpop = new LIR_Op0(lir_fpop_raw);
-  insert_op(fpop);
-  sim()->pop();
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Inserted pop                   New state: "); sim()->print(); tty->cr();
-    }
-#endif
-}
-
-
-void FpuStackAllocator::insert_free_if_dead(LIR_Opr opr) {
-  if (sim()->contains(fpu_num(opr))) {
-    int res_slot = tos_offset(opr);
-    insert_free(res_slot);
-  }
-}
-
-void FpuStackAllocator::insert_free_if_dead(LIR_Opr opr, LIR_Opr ignore) {
-  if (fpu_num(opr) != fpu_num(ignore) && sim()->contains(fpu_num(opr))) {
-    int res_slot = tos_offset(opr);
-    insert_free(res_slot);
-  }
-}
-
-void FpuStackAllocator::insert_copy(LIR_Opr from, LIR_Opr to) {
-  int offset = tos_offset(from);
-  LIR_Op1* fld = new LIR_Op1(lir_fld, LIR_OprFact::intConst(offset), LIR_OprFact::illegalOpr);
-  insert_op(fld);
-
-  sim()->push(fpu_num(to));
-
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print("Inserted copy (%d -> %d)         New state: ", fpu_num(from), fpu_num(to)); sim()->print(); tty->cr();
-  }
-#endif
-}
-
-void FpuStackAllocator::do_rename(LIR_Opr from, LIR_Opr to) {
-  sim()->rename(fpu_num(from), fpu_num(to));
-}
-
-void FpuStackAllocator::do_push(LIR_Opr opr) {
-  sim()->push(fpu_num(opr));
-}
-
-void FpuStackAllocator::pop_if_last_use(LIR_Op* op, LIR_Opr opr) {
-  assert(op->fpu_pop_count() == 0, "fpu_pop_count alredy set");
-  assert(tos_offset(opr) == 0, "can only pop stack top");
-
-  if (opr->is_last_use()) {
-    op->set_fpu_pop_count(1);
-    sim()->pop();
-  }
-}
-
-void FpuStackAllocator::pop_always(LIR_Op* op, LIR_Opr opr) {
-  assert(op->fpu_pop_count() == 0, "fpu_pop_count alredy set");
-  assert(tos_offset(opr) == 0, "can only pop stack top");
-
-  op->set_fpu_pop_count(1);
-  sim()->pop();
-}
-
-void FpuStackAllocator::clear_fpu_stack(LIR_Opr preserve) {
-  int result_stack_size = (preserve->is_fpu_register() && !preserve->is_xmm_register() ? 1 : 0);
-  while (sim()->stack_size() > result_stack_size) {
-    assert(!sim()->slot_is_empty(0), "not allowed");
-
-    if (result_stack_size == 0 || sim()->get_slot(0) != fpu_num(preserve)) {
-      insert_free(0);
-    } else {
-      // move "preserve" to bottom of stack so that all other stack slots can be popped
-      insert_exchange(sim()->stack_size() - 1);
-    }
-  }
+  // No FPU stack on AArch64
 }
-
-
-void FpuStackAllocator::handle_op1(LIR_Op1* op1) {
-  LIR_Opr in  = op1->in_opr();
-  LIR_Opr res = op1->result_opr();
-
-  LIR_Opr new_in  = in;  // new operands relative to the actual fpu stack top
-  LIR_Opr new_res = res;
-
-  // Note: this switch is processed for all LIR_Op1, regardless if they have FPU-arguments,
-  //       so checks for is_float_kind() are necessary inside the cases
-  switch (op1->code()) {
-
-    case lir_return: {
-      // FPU-Stack must only contain the (optional) fpu return value.
-      // All remaining dead values are popped from the stack
-      // If the input operand is a fpu-register, it is exchanged to the bottom of the stack
-
-      clear_fpu_stack(in);
-      if (in->is_fpu_register() && !in->is_xmm_register()) {
-        new_in = to_fpu_stack_top(in);
-      }
-
-      break;
-    }
-
-    case lir_move: {
-      if (in->is_fpu_register() && !in->is_xmm_register()) {
-        if (res->is_xmm_register()) {
-          // move from fpu register to xmm register (necessary for operations that
-          // are not available in the SSE instruction set)
-          insert_exchange(in);
-          new_in = to_fpu_stack_top(in);
-          pop_always(op1, in);
-
-        } else if (res->is_fpu_register() && !res->is_xmm_register()) {
-          // move from fpu-register to fpu-register:
-          // * input and result register equal:
-          //   nothing to do
-          // * input register is last use:
-          //   rename the input register to result register -> input register
-          //   not present on fpu-stack afterwards
-          // * input register not last use:
-          //   duplicate input register to result register to preserve input
-          //
-          // Note: The LIR-Assembler does not produce any code for fpu register moves,
-          //       so input and result stack index must be equal
-
-          if (fpu_num(in) == fpu_num(res)) {
-            // nothing to do
-          } else if (in->is_last_use()) {
-            insert_free_if_dead(res);//, in);
-            do_rename(in, res);
-          } else {
-            insert_free_if_dead(res);
-            insert_copy(in, res);
-          }
-          new_in = to_fpu_stack(res);
-          new_res = new_in;
-
-        } else {
-          // move from fpu-register to memory
-          // input operand must be on top of stack
-
-          insert_exchange(in);
-
-          // create debug information here because afterwards the register may have been popped
-          compute_debug_information(op1);
-
-          new_in = to_fpu_stack_top(in);
-          pop_if_last_use(op1, in);
-        }
-
-      } else if (res->is_fpu_register() && !res->is_xmm_register()) {
-        // move from memory/constant to fpu register
-        // result is pushed on the stack
-
-        insert_free_if_dead(res);
-
-        // create debug information before register is pushed
-        compute_debug_information(op1);
-
-        do_push(res);
-        new_res = to_fpu_stack_top(res);
-      }
-      break;
-    }
-
-    case lir_neg: {
-      if (in->is_fpu_register() && !in->is_xmm_register()) {
-        assert(res->is_fpu_register() && !res->is_xmm_register(), "must be");
-        assert(in->is_last_use(), "old value gets destroyed");
-
-        insert_free_if_dead(res, in);
-        insert_exchange(in);
-        new_in = to_fpu_stack_top(in);
-
-        do_rename(in, res);
-        new_res = to_fpu_stack_top(res);
-      }
-      break;
-    }
-
-    case lir_convert: {
-      Bytecodes::Code bc = op1->as_OpConvert()->bytecode();
-      switch (bc) {
-        case Bytecodes::_d2f:
-        case Bytecodes::_f2d:
-          assert(res->is_fpu_register(), "must be");
-          assert(in->is_fpu_register(), "must be");
-
-          if (!in->is_xmm_register() && !res->is_xmm_register()) {
-            // this is quite the same as a move from fpu-register to fpu-register
-            // Note: input and result operands must have different types
-            if (fpu_num(in) == fpu_num(res)) {
-              // nothing to do
-              new_in = to_fpu_stack(in);
-            } else if (in->is_last_use()) {
-              insert_free_if_dead(res);//, in);
-              new_in = to_fpu_stack(in);
-              do_rename(in, res);
-            } else {
-              insert_free_if_dead(res);
-              insert_copy(in, res);
-              new_in = to_fpu_stack_top(in, true);
-            }
-            new_res = to_fpu_stack(res);
-          }
-
-          break;
-
-        case Bytecodes::_i2f:
-        case Bytecodes::_l2f:
-        case Bytecodes::_i2d:
-        case Bytecodes::_l2d:
-          assert(res->is_fpu_register(), "must be");
-          if (!res->is_xmm_register()) {
-            insert_free_if_dead(res);
-            do_push(res);
-            new_res = to_fpu_stack_top(res);
-          }
-          break;
-
-        case Bytecodes::_f2i:
-        case Bytecodes::_d2i:
-          assert(in->is_fpu_register(), "must be");
-          if (!in->is_xmm_register()) {
-            insert_exchange(in);
-            new_in = to_fpu_stack_top(in);
-
-            // TODO: update registes of stub
-          }
-          break;
-
-        case Bytecodes::_f2l:
-        case Bytecodes::_d2l:
-          assert(in->is_fpu_register(), "must be");
-          if (!in->is_xmm_register()) {
-            insert_exchange(in);
-            new_in = to_fpu_stack_top(in);
-            pop_always(op1, in);
-          }
-          break;
-
-        case Bytecodes::_i2l:
-        case Bytecodes::_l2i:
-        case Bytecodes::_i2b:
-        case Bytecodes::_i2c:
-        case Bytecodes::_i2s:
-          // no fpu operands
-          break;
-
-        default:
-          ShouldNotReachHere();
-      }
-      break;
-    }
-
-    case lir_roundfp: {
-      assert(in->is_fpu_register() && !in->is_xmm_register(), "input must be in register");
-      assert(res->is_stack(), "result must be on stack");
-
-      insert_exchange(in);
-      new_in = to_fpu_stack_top(in);
-      pop_if_last_use(op1, in);
-      break;
-    }
-
-    default: {
-      assert(!in->is_float_kind() && !res->is_float_kind(), "missed a fpu-operation");
-    }
-  }
-
-  op1->set_in_opr(new_in);
-  op1->set_result_opr(new_res);
-}
-
-void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
-  LIR_Opr left  = op2->in_opr1();
-  if (!left->is_float_kind()) {
-    return;
-  }
-  if (left->is_xmm_register()) {
-    return;
-  }
-
-  LIR_Opr right = op2->in_opr2();
-  LIR_Opr res   = op2->result_opr();
-  LIR_Opr new_left  = left;  // new operands relative to the actual fpu stack top
-  LIR_Opr new_right = right;
-  LIR_Opr new_res   = res;
-
-  assert(!left->is_xmm_register() && !right->is_xmm_register() && !res->is_xmm_register(), "not for xmm registers");
-
-  switch (op2->code()) {
-    case lir_cmp:
-    case lir_cmp_fd2i:
-    case lir_ucmp_fd2i: {
-      assert(left->is_fpu_register(), "invalid LIR");
-      assert(right->is_fpu_register(), "invalid LIR");
-
-      // the left-hand side must be on top of stack.
-      // the right-hand side is never popped, even if is_last_use is set
-      insert_exchange(left);
-      new_left = to_fpu_stack_top(left);
-      new_right = to_fpu_stack(right);
-      pop_if_last_use(op2, left);
-      break;
-    }
-
-    case lir_mul_strictfp:
-    case lir_div_strictfp: {
-      assert(op2->tmp1_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot");
-      insert_free_if_dead(op2->tmp1_opr());
-      assert(sim()->stack_size() <= 7, "at least one stack slot must be free");
-      // fall-through: continue with the normal handling of lir_mul and lir_div
-    }
-    case lir_add:
-    case lir_sub:
-    case lir_mul:
-    case lir_div: {
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(left->is_equal(res), "must be");
-
-      // either the left-hand or the right-hand side must be on top of stack
-      // (if right is not a register, left must be on top)
-      if (!right->is_fpu_register()) {
-        insert_exchange(left);
-        new_left = to_fpu_stack_top(left);
-      } else {
-        // no exchange necessary if right is alredy on top of stack
-        if (tos_offset(right) == 0) {
-          new_left = to_fpu_stack(left);
-          new_right = to_fpu_stack_top(right);
-        } else {
-          insert_exchange(left);
-          new_left = to_fpu_stack_top(left);
-          new_right = to_fpu_stack(right);
-        }
-
-        if (right->is_last_use()) {
-          op2->set_fpu_pop_count(1);
-
-          if (tos_offset(right) == 0) {
-            sim()->pop();
-          } else {
-            // if left is on top of stack, the result is placed in the stack
-            // slot of right, so a renaming from right to res is necessary
-            assert(tos_offset(left) == 0, "must be");
-            sim()->pop();
-            do_rename(right, res);
-          }
-        }
-      }
-      new_res = to_fpu_stack(res);
-
-      break;
-    }
-
-    case lir_rem: {
-      assert(left->is_fpu_register(), "must be");
-      assert(right->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(left->is_equal(res), "must be");
-
-      // Must bring both operands to top of stack with following operand ordering:
-      // * fpu stack before rem: ... right left
-      // * fpu stack after rem:  ... left
-      if (tos_offset(right) != 1) {
-        insert_exchange(right);
-        insert_exchange(1);
-      }
-      insert_exchange(left);
-      assert(tos_offset(right) == 1, "check");
-      assert(tos_offset(left) == 0, "check");
-
-      new_left = to_fpu_stack_top(left);
-      new_right = to_fpu_stack(right);
-
-      op2->set_fpu_pop_count(1);
-      sim()->pop();
-      do_rename(right, res);
-
-      new_res = to_fpu_stack_top(res);
-      break;
-    }
-
-    case lir_abs:
-    case lir_sqrt: {
-      // Right argument appears to be unused
-      assert(right->is_illegal(), "must be");
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(left->is_last_use(), "old value gets destroyed");
-
-      insert_free_if_dead(res, left);
-      insert_exchange(left);
-      do_rename(left, res);
-
-      new_left = to_fpu_stack_top(res);
-      new_res = new_left;
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      break;
-    }
-
-    case lir_log:
-    case lir_log10: {
-      // log and log10 need one temporary fpu stack slot, so
-      // there is one temporary registers stored in temp of the
-      // operation. the stack allocator must guarantee that the stack
-      // slots are really free, otherwise there might be a stack
-      // overflow.
-      assert(right->is_illegal(), "must be");
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(op2->tmp1_opr()->is_fpu_register(), "must be");
-
-      insert_free_if_dead(op2->tmp1_opr());
-      insert_free_if_dead(res, left);
-      insert_exchange(left);
-      do_rename(left, res);
-
-      new_left = to_fpu_stack_top(res);
-      new_res = new_left;
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      assert(sim()->stack_size() <= 7, "at least one stack slot must be free");
-      break;
-    }
-
-
-    case lir_tan:
-    case lir_sin:
-    case lir_cos:
-    case lir_exp: {
-      // sin, cos and exp need two temporary fpu stack slots, so there are two temporary
-      // registers (stored in right and temp of the operation).
-      // the stack allocator must guarantee that the stack slots are really free,
-      // otherwise there might be a stack overflow.
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      // assert(left->is_last_use(), "old value gets destroyed");
-      assert(right->is_fpu_register(), "right is used as the first temporary register");
-      assert(op2->tmp1_opr()->is_fpu_register(), "temp is used as the second temporary register");
-      assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
-
-      insert_free_if_dead(right);
-      insert_free_if_dead(op2->tmp1_opr());
-
-      insert_free_if_dead(res, left);
-      insert_exchange(left);
-      do_rename(left, res);
-
-      new_left = to_fpu_stack_top(res);
-      new_res = new_left;
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
-      break;
-    }
-
-    case lir_pow: {
-      // pow needs two temporary fpu stack slots, so there are two temporary
-      // registers (stored in tmp1 and tmp2 of the operation).
-      // the stack allocator must guarantee that the stack slots are really free,
-      // otherwise there might be a stack overflow.
-      assert(left->is_fpu_register(), "must be");
-      assert(right->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-
-      assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register");
-      assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register");
-      assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers");
-      assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers");
-      assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
-      assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers");
-
-      insert_free_if_dead(op2->tmp1_opr());
-      insert_free_if_dead(op2->tmp2_opr());
-
-      // Must bring both operands to top of stack with following operand ordering:
-      // * fpu stack before pow: ... right left
-      // * fpu stack after pow:  ... left
-
-      insert_free_if_dead(res, right);
-
-      if (tos_offset(right) != 1) {
-        insert_exchange(right);
-        insert_exchange(1);
-      }
-      insert_exchange(left);
-      assert(tos_offset(right) == 1, "check");
-      assert(tos_offset(left) == 0, "check");
-
-      new_left = to_fpu_stack_top(left);
-      new_right = to_fpu_stack(right);
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
-
-      sim()->pop();
-
-      do_rename(right, res);
-
-      new_res = to_fpu_stack_top(res);
-      break;
-    }
-
-    default: {
-      assert(false, "missed a fpu-operation");
-    }
-  }
-
-  op2->set_in_opr1(new_left);
-  op2->set_in_opr2(new_right);
-  op2->set_result_opr(new_res);
-}
-
-void FpuStackAllocator::handle_opCall(LIR_OpCall* opCall) {
-  LIR_Opr res = opCall->result_opr();
-
-  // clear fpu-stack before call
-  // it may contain dead values that could not have been remved by previous operations
-  clear_fpu_stack(LIR_OprFact::illegalOpr);
-  assert(sim()->is_empty(), "fpu stack must be empty now");
-
-  // compute debug information before (possible) fpu result is pushed
-  compute_debug_information(opCall);
-
-  if (res->is_fpu_register() && !res->is_xmm_register()) {
-    do_push(res);
-    opCall->set_result_opr(to_fpu_stack_top(res));
-  }
-}
-
-#ifndef PRODUCT
-void FpuStackAllocator::check_invalid_lir_op(LIR_Op* op) {
-  switch (op->code()) {
-    case lir_24bit_FPU:
-    case lir_reset_FPU:
-    case lir_ffree:
-      assert(false, "operations not allowed in lir. If one of these operations is needed, check if they have fpu operands");
-      break;
-
-    case lir_fpop_raw:
-    case lir_fxch:
-    case lir_fld:
-      assert(false, "operations only inserted by FpuStackAllocator");
-      break;
-  }
-}
-#endif
-
-
-void FpuStackAllocator::merge_insert_add(LIR_List* instrs, FpuStackSim* cur_sim, int reg) {
-  LIR_Op1* move = new LIR_Op1(lir_move, LIR_OprFact::doubleConst(0), LIR_OprFact::double_fpu(reg)->make_fpu_stack_offset());
-
-  instrs->instructions_list()->push(move);
-
-  cur_sim->push(reg);
-  move->set_result_opr(to_fpu_stack(move->result_opr()));
-
-  #ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Added new register: %d         New state: ", reg); cur_sim->print(); tty->cr();
-    }
-  #endif
-}
-
-void FpuStackAllocator::merge_insert_xchg(LIR_List* instrs, FpuStackSim* cur_sim, int slot) {
-  assert(slot > 0, "no exchange necessary");
-
-  LIR_Op1* fxch = new LIR_Op1(lir_fxch, LIR_OprFact::intConst(slot));
-  instrs->instructions_list()->push(fxch);
-  cur_sim->swap(slot);
-
-  #ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Exchanged register: %d         New state: ", cur_sim->get_slot(slot)); cur_sim->print(); tty->cr();
-    }
-  #endif
-}
-
-void FpuStackAllocator::merge_insert_pop(LIR_List* instrs, FpuStackSim* cur_sim) {
-  int reg = cur_sim->get_slot(0);
-
-  LIR_Op* fpop = new LIR_Op0(lir_fpop_raw);
-  instrs->instructions_list()->push(fpop);
-  cur_sim->pop(reg);
-
-  #ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Removed register: %d           New state: ", reg); cur_sim->print(); tty->cr();
-    }
-  #endif
-}
-
-bool FpuStackAllocator::merge_rename(FpuStackSim* cur_sim, FpuStackSim* sux_sim, int start_slot, int change_slot) {
-  int reg = cur_sim->get_slot(change_slot);
-
-  for (int slot = start_slot; slot >= 0; slot--) {
-    int new_reg = sux_sim->get_slot(slot);
-
-    if (!cur_sim->contains(new_reg)) {
-      cur_sim->set_slot(change_slot, new_reg);
-
-      #ifndef PRODUCT
-        if (TraceFPUStack) {
-          tty->print("Renamed register %d to %d       New state: ", reg, new_reg); cur_sim->print(); tty->cr();
-        }
-      #endif
-
-      return true;
-    }
-  }
-  return false;
-}
-
-
-void FpuStackAllocator::merge_fpu_stack(LIR_List* instrs, FpuStackSim* cur_sim, FpuStackSim* sux_sim) {
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->cr();
-    tty->print("before merging: pred: "); cur_sim->print(); tty->cr();
-    tty->print("                 sux: "); sux_sim->print(); tty->cr();
-  }
-
-  int slot;
-  for (slot = 0; slot < cur_sim->stack_size(); slot++) {
-    assert(!cur_sim->slot_is_empty(slot), "not handled by algorithm");
-  }
-  for (slot = 0; slot < sux_sim->stack_size(); slot++) {
-    assert(!sux_sim->slot_is_empty(slot), "not handled by algorithm");
-  }
-#endif
-
-  // size difference between cur and sux that must be resolved by adding or removing values form the stack
-  int size_diff = cur_sim->stack_size() - sux_sim->stack_size();
-
-  if (!ComputeExactFPURegisterUsage) {
-    // add slots that are currently free, but used in successor
-    // When the exact FPU register usage is computed, the stack does
-    // not contain dead values at merging -> no values must be added
-
-    int sux_slot = sux_sim->stack_size() - 1;
-    while (size_diff < 0) {
-      assert(sux_slot >= 0, "slot out of bounds -> error in algorithm");
-
-      int reg = sux_sim->get_slot(sux_slot);
-      if (!cur_sim->contains(reg)) {
-        merge_insert_add(instrs, cur_sim, reg);
-        size_diff++;
-
-        if (sux_slot + size_diff != 0) {
-          merge_insert_xchg(instrs, cur_sim, sux_slot + size_diff);
-        }
-      }
-     sux_slot--;
-    }
-  }
-
-  assert(cur_sim->stack_size() >= sux_sim->stack_size(), "stack size must be equal or greater now");
-  assert(size_diff == cur_sim->stack_size() - sux_sim->stack_size(), "must be");
-
-  // stack merge algorithm:
-  // 1) as long as the current stack top is not in the right location (that meens
-  //    it should not be on the stack top), exchange it into the right location
-  // 2) if the stack top is right, but the remaining stack is not ordered correctly,
-  //    the stack top is exchanged away to get another value on top ->
-  //    now step 1) can be continued
-  // the stack can also contain unused items -> these items are removed from stack
-
-  int finished_slot = sux_sim->stack_size() - 1;
-  while (finished_slot >= 0 || size_diff > 0) {
-    while (size_diff > 0 || (cur_sim->stack_size() > 0 && cur_sim->get_slot(0) != sux_sim->get_slot(0))) {
-      int reg = cur_sim->get_slot(0);
-      if (sux_sim->contains(reg)) {
-        int sux_slot = sux_sim->offset_from_tos(reg);
-        merge_insert_xchg(instrs, cur_sim, sux_slot + size_diff);
-
-      } else if (!merge_rename(cur_sim, sux_sim, finished_slot, 0)) {
-        assert(size_diff > 0, "must be");
-
-        merge_insert_pop(instrs, cur_sim);
-        size_diff--;
-      }
-      assert(cur_sim->stack_size() == 0 || cur_sim->get_slot(0) != reg, "register must have been changed");
-    }
-
-    while (finished_slot >= 0 && cur_sim->get_slot(finished_slot) == sux_sim->get_slot(finished_slot)) {
-      finished_slot--;
-    }
-
-    if (finished_slot >= 0) {
-      int reg = cur_sim->get_slot(finished_slot);
-
-      if (sux_sim->contains(reg) || !merge_rename(cur_sim, sux_sim, finished_slot, finished_slot)) {
-        assert(sux_sim->contains(reg) || size_diff > 0, "must be");
-        merge_insert_xchg(instrs, cur_sim, finished_slot);
-      }
-      assert(cur_sim->get_slot(finished_slot) != reg, "register must have been changed");
-    }
-  }
-
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print("after merging:  pred: "); cur_sim->print(); tty->cr();
-    tty->print("                 sux: "); sux_sim->print(); tty->cr();
-    tty->cr();
-  }
-#endif
-  assert(cur_sim->stack_size() == sux_sim->stack_size(), "stack size must be equal now");
-}
-
-
-void FpuStackAllocator::merge_cleanup_fpu_stack(LIR_List* instrs, FpuStackSim* cur_sim, BitMap& live_fpu_regs) {
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->cr();
-    tty->print("before cleanup: state: "); cur_sim->print(); tty->cr();
-    tty->print("                live:  "); live_fpu_regs.print_on(tty); tty->cr();
-  }
-#endif
-
-  int slot = 0;
-  while (slot < cur_sim->stack_size()) {
-    int reg = cur_sim->get_slot(slot);
-    if (!live_fpu_regs.at(reg)) {
-      if (slot != 0) {
-        merge_insert_xchg(instrs, cur_sim, slot);
-      }
-      merge_insert_pop(instrs, cur_sim);
-    } else {
-      slot++;
-    }
-  }
-
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print("after cleanup:  state: "); cur_sim->print(); tty->cr();
-    tty->print("                live:  "); live_fpu_regs.print_on(tty); tty->cr();
-    tty->cr();
-  }
-
-  // check if fpu stack only contains live registers
-  for (unsigned int i = 0; i < live_fpu_regs.size(); i++) {
-    if (live_fpu_regs.at(i) != cur_sim->contains(i)) {
-      tty->print_cr("mismatch between required and actual stack content");
-      break;
-    }
-  }
-#endif
-}
-
-
-bool FpuStackAllocator::merge_fpu_stack_with_successors(BlockBegin* block) {
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print_cr("Propagating FPU stack state for B%d at LIR_Op position %d to successors:",
-                  block->block_id(), pos());
-    sim()->print();
-    tty->cr();
-  }
-#endif
-
-  bool changed = false;
-  int number_of_sux = block->number_of_sux();
-
-  if (number_of_sux == 1 && block->sux_at(0)->number_of_preds() > 1) {
-    // The successor has at least two incoming edges, so a stack merge will be necessary
-    // If this block is the first predecessor, cleanup the current stack and propagate it
-    // If this block is not the first predecessor, a stack merge will be necessary
-
-    BlockBegin* sux = block->sux_at(0);
-    intArray* state = sux->fpu_stack_state();
-    LIR_List* instrs = new LIR_List(_compilation);
-
-    if (state != NULL) {
-      // Merge with a successor that already has an FPU stack state
-      // the block must have only one successor because critical edges must have been split
-      FpuStackSim* cur_sim = sim();
-      FpuStackSim* sux_sim = temp_sim();
-      sux_sim->read_state(state);
-
-      merge_fpu_stack(instrs, cur_sim, sux_sim);
-
-    } else {
-      // propagate current FPU stack state to successor without state
-      // clean up stack first so that there are no dead values on the stack
-      if (ComputeExactFPURegisterUsage) {
-        FpuStackSim* cur_sim = sim();
-        BitMap live_fpu_regs = block->sux_at(0)->fpu_register_usage();
-        assert(live_fpu_regs.size() == FrameMap::nof_fpu_regs, "missing register usage");
-
-        merge_cleanup_fpu_stack(instrs, cur_sim, live_fpu_regs);
-      }
-
-      intArray* state = sim()->write_state();
-      if (TraceFPUStack) {
-        tty->print_cr("Setting FPU stack state of B%d (merge path)", sux->block_id());
-        sim()->print(); tty->cr();
-      }
-      sux->set_fpu_stack_state(state);
-    }
-
-    if (instrs->instructions_list()->length() > 0) {
-      lir()->insert_before(pos(), instrs);
-      set_pos(instrs->instructions_list()->length() + pos());
-      changed = true;
-    }
-
-  } else {
-    // Propagate unmodified stack to successors where a stack merge is not necessary
-    intArray* state = sim()->write_state();
-    for (int i = 0; i < number_of_sux; i++) {
-      BlockBegin* sux = block->sux_at(i);
-
-#ifdef ASSERT
-      for (int j = 0; j < sux->number_of_preds(); j++) {
-        assert(block == sux->pred_at(j), "all critical edges must be broken");
-      }
-
-      // check if new state is same
-      if (sux->fpu_stack_state() != NULL) {
-        intArray* sux_state = sux->fpu_stack_state();
-        assert(state->length() == sux_state->length(), "overwriting existing stack state");
-        for (int j = 0; j < state->length(); j++) {
-          assert(state->at(j) == sux_state->at(j), "overwriting existing stack state");
-        }
-      }
-#endif
-#ifndef PRODUCT
-      if (TraceFPUStack) {
-        tty->print_cr("Setting FPU stack state of B%d", sux->block_id());
-        sim()->print(); tty->cr();
-      }
-#endif
-
-      sux->set_fpu_stack_state(state);
-    }
-  }
-
-#ifndef PRODUCT
-  // assertions that FPU stack state conforms to all successors' states
-  intArray* cur_state = sim()->write_state();
-  for (int i = 0; i < number_of_sux; i++) {
-    BlockBegin* sux = block->sux_at(i);
-    intArray* sux_state = sux->fpu_stack_state();
-
-    assert(sux_state != NULL, "no fpu state");
-    assert(cur_state->length() == sux_state->length(), "incorrect length");
-    for (int i = 0; i < cur_state->length(); i++) {
-      assert(cur_state->at(i) == sux_state->at(i), "element not equal");
-    }
-  }
-#endif
-
-  return changed;
-}
--- a/src/cpu/aarch64/vm/c1_LinearScan_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_LinearScan_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -24,8 +24,8 @@
  *
  */
 
-#ifndef CPU_X86_VM_C1_LINEARSCAN_X86_HPP
-#define CPU_X86_VM_C1_LINEARSCAN_X86_HPP
+#ifndef CPU_AARCH64_VM_C1_LINEARSCAN_HPP
+#define CPU_AARCH64_VM_C1_LINEARSCAN_HPP
 
 inline bool LinearScan::is_processed_reg_num(int reg_num) {
   return reg_num <= FrameMap::last_cpu_reg() || reg_num >= pd_nof_cpu_regs_frame_map;
@@ -75,77 +75,4 @@
 
 
 
-class FpuStackAllocator VALUE_OBJ_CLASS_SPEC {
- private:
-  Compilation* _compilation;
-  LinearScan* _allocator;
-
-  LIR_OpVisitState visitor;
-
-  LIR_List* _lir;
-  int _pos;
-  FpuStackSim _sim;
-  FpuStackSim _temp_sim;
-
-  bool _debug_information_computed;
-
-  LinearScan*   allocator()                      { return _allocator; }
-  Compilation*  compilation() const              { return _compilation; }
-
-  // unified bailout support
-  void          bailout(const char* msg) const   { compilation()->bailout(msg); }
-  bool          bailed_out() const               { return compilation()->bailed_out(); }
-
-  int pos() { return _pos; }
-  void set_pos(int pos) { _pos = pos; }
-  LIR_Op* cur_op() { Unimplemented(); return lir()->instructions_list()->at(pos()); }
-  LIR_List* lir() { return _lir; }
-  void set_lir(LIR_List* lir) { _lir = lir; }
-  FpuStackSim* sim() { return &_sim; }
-  FpuStackSim* temp_sim() { return &_temp_sim; }
-
-  int fpu_num(LIR_Opr opr);
-  int tos_offset(LIR_Opr opr);
-  LIR_Opr to_fpu_stack_top(LIR_Opr opr, bool dont_check_offset = false);
-
-  // Helper functions for handling operations
-  void insert_op(LIR_Op* op);
-  void insert_exchange(int offset);
-  void insert_exchange(LIR_Opr opr);
-  void insert_free(int offset);
-  void insert_free_if_dead(LIR_Opr opr);
-  void insert_free_if_dead(LIR_Opr opr, LIR_Opr ignore);
-  void insert_copy(LIR_Opr from, LIR_Opr to);
-  void do_rename(LIR_Opr from, LIR_Opr to);
-  void do_push(LIR_Opr opr);
-  void pop_if_last_use(LIR_Op* op, LIR_Opr opr);
-  void pop_always(LIR_Op* op, LIR_Opr opr);
-  void clear_fpu_stack(LIR_Opr preserve);
-  void handle_op1(LIR_Op1* op1);
-  void handle_op2(LIR_Op2* op2);
-  void handle_opCall(LIR_OpCall* opCall);
-  void compute_debug_information(LIR_Op* op);
-  void allocate_exception_handler(XHandler* xhandler);
-  void allocate_block(BlockBegin* block);
-
-#ifndef PRODUCT
-  void check_invalid_lir_op(LIR_Op* op);
-#endif
-
-  // Helper functions for merging of fpu stacks
-  void merge_insert_add(LIR_List* instrs, FpuStackSim* cur_sim, int reg);
-  void merge_insert_xchg(LIR_List* instrs, FpuStackSim* cur_sim, int slot);
-  void merge_insert_pop(LIR_List* instrs, FpuStackSim* cur_sim);
-  bool merge_rename(FpuStackSim* cur_sim, FpuStackSim* sux_sim, int start_slot, int change_slot);
-  void merge_fpu_stack(LIR_List* instrs, FpuStackSim* cur_sim, FpuStackSim* sux_sim);
-  void merge_cleanup_fpu_stack(LIR_List* instrs, FpuStackSim* cur_sim, BitMap& live_fpu_regs);
-  bool merge_fpu_stack_with_successors(BlockBegin* block);
-
- public:
-  LIR_Opr to_fpu_stack(LIR_Opr opr); // used by LinearScan for creation of debug information
-
-  FpuStackAllocator(Compilation* compilation, LinearScan* allocator);
-  void allocate();
-};
-
-#endif // CPU_X86_VM_C1_LINEARSCAN_X86_HPP
+#endif // CPU_AARCH64_VM_C1_LINEARSCAN_HPP
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -41,7 +41,6 @@
 				  FloatRegister f0, FloatRegister f1,
 				  Register result)
 {
-  Label done;
   if (is_float) {
     fcmps(f0, f1);
   } else {
@@ -50,21 +49,14 @@
   if (unordered_result < 0) {
     // we want -1 for unordered or less than, 0 for equal and 1 for
     // greater than.
-    mov(result, (u_int64_t)-1L);
-    // for FP LT tests less than or unordered
-    br(Assembler::LT, done);
-    // install 0 for EQ otherwise 1
-    csinc(result, zr, zr, Assembler::EQ);
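+    // n.b. an unordered compare sets the flags so that NE and LT are
+    // both true: cset then yields 1 and cneg negates it to the
+    // required -1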
+    cset(result, NE);  // Not equal or unordered
+    cneg(result, result, LT);  // Less than or unordered
   } else {
     // we want -1 for less than, 0 for equal and 1 for unordered or
     // greater than.
-    mov(result, 1L);
-    // for FP HI tests greater than or unordered
-    br(Assembler::HI, done);
-    // install 0 for EQ otherwise ~0
-    csinv(result, zr, zr, Assembler::EQ);
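+    // n.b. an unordered compare sets C, so LO is false and the 1 from
+    // cset survives; only a genuine less-than is negated to -1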
+    cset(result, NE);  // Not equal or unordered
+    cneg(result, result, LO);  // Less than
   }
-  bind(done);
 }
 
 int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr, Register scratch, Label& slow_case) {
@@ -431,10 +423,6 @@
   }
 }
 
-
-void C1_MacroAssembler::unverified_entry(Register receiver, Register ic_klass) { Unimplemented(); }
-
-
 void C1_MacroAssembler::verified_entry() {
 }
 
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -80,6 +80,7 @@
   pop(r0, sp);
 #endif
   reset_last_Java_frame(true, true);
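+  // n.b. an isb may be needed here on real hardware so that we observe
+  // any code patched while we were blocked in the runtime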
+  maybe_isb();
 
   // check for pending exceptions
   { Label L;
@@ -569,6 +570,7 @@
   }
 #endif
   __ reset_last_Java_frame(true, false);
+  __ maybe_isb();
 
   // check for pending exceptions
   { Label L;
@@ -1235,6 +1237,12 @@
         __ lsr(card_addr, card_addr, CardTableModRefBS::card_shift);
 	unsigned long offset;
 	__ adrp(rscratch1, cardtable, offset);
+	assert((offset & 0x3ffL) == 0, "assumed offset aligned to 0x400");
+	// n.b. intra-page offset will never change even if this gets
+	// relocated so it is safe to omit the lea when offset == 0
+	if (offset != 0) {
+	  __ lea(rscratch1, Address(rscratch1, offset));
+	}
         __ add(card_addr, card_addr, rscratch1);
         __ ldrb(rscratch1, Address(card_addr, offset));
         __ cmpw(rscratch1, (int)G1SATBCardTableModRefBS::g1_young_card_val());
--- a/src/cpu/aarch64/vm/cppInterpreterGenerator_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/cppInterpreterGenerator_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -29,29 +29,7 @@
 
  protected:
 
-#if 0
-  address generate_asm_interpreter_entry(bool synchronized);
-  address generate_native_entry(bool synchronized);
-  address generate_abstract_entry(void);
-  address generate_math_entry(AbstractInterpreter::MethodKind kind);
-  address generate_empty_entry(void);
-  address generate_accessor_entry(void);
-  address generate_Reference_get_entry(void);
-  void lock_method(void);
-  void generate_stack_overflow_check(void);
-
-  void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue);
-  void generate_counter_overflow(Label* do_continue);
-#endif
-
   void generate_more_monitors();
   void generate_deopt_handling();
-#if 0
-  address generate_interpreter_frame_manager(bool synchronized); // C++ interpreter only
-  void generate_compute_interpreter_state(const Register state,
-                                          const Register prev_state,
-                                          const Register sender_sp,
-                                          bool native); // C++ interpreter only
-#endif
 
 #endif // CPU_AARCH64_VM_CPPINTERPRETERGENERATOR_AARCH64_HPP
--- a/src/cpu/aarch64/vm/frame_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/frame_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -810,7 +810,7 @@
 		   unsigned long bcx, unsigned long thread) {
   RegisterMap map((JavaThread*)thread, false);
   if (!reg_map) {
-    reg_map = (RegisterMap*)new char[sizeof map];
+    reg_map = (RegisterMap*)os::malloc(sizeof map, mtNone);
   }
   memcpy(reg_map, &map, sizeof map);
   {
--- a/src/cpu/aarch64/vm/icache_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/icache_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -32,7 +32,10 @@
 
 void ICacheStubGenerator::generate_icache_flush(
   		ICache::flush_icache_stub_t* flush_icache_stub) {
-  aarch64TestHook();
   // Give anyone who calls this a surprise
   *flush_icache_stub = (ICache::flush_icache_stub_t)NULL;
 }
+
+void ICache::initialize() {
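+  // n.b. the test hook now runs once at VM startup rather than when
+  // the (unused) icache flush stub is generated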
+  aarch64TestHook();
+}
--- a/src/cpu/aarch64/vm/icache_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/icache_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -33,7 +33,7 @@
 
 class ICache : public AbstractICache {
  public:
-  static void initialize() {}
+  static void initialize();
   static void invalidate_word(address addr) {
     __clear_cache((char *)addr, (char *)(addr + 3));
   }
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -54,10 +54,6 @@
 
 // Implementation of InterpreterMacroAssembler
 
-#ifdef CC_INTERP
-void InterpreterMacroAssembler::get_method(Register reg) { Unimplemented(); }
-#endif // CC_INTERP
-
 #ifndef CC_INTERP
 
 void InterpreterMacroAssembler::check_and_handle_popframe(Register java_thread) {
@@ -1413,6 +1409,7 @@
   // Note: No need to save/restore rbcp & rlocals pointer since these
   //       are callee saved registers and no blocking/ GC can happen
   //       in leaf calls.
+  //       also no need to restore the method register
 #ifdef ASSERT
   {
     Label L;
@@ -1456,6 +1453,8 @@
                                entry_point, number_of_arguments,
                      check_exceptions);
 // interpreter specific
+  // method oop may have moved so reload from interpreter stack frame
+  get_method(rmethod);
   restore_bcp();
   restore_locals();
   // reload the constant pool cache in case a PermGen GC moved it
--- a/src/cpu/aarch64/vm/javaFrameAnchor_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/javaFrameAnchor_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -42,25 +42,16 @@
   void clear(void) {
     // clearing _last_Java_sp must be first
     _last_Java_sp = NULL;
-    // fence?
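+    // n.b. the release orders the NULL store above before the stores
+    // below, so an observer that still sees a non-NULL sp also sees
+    // the matching fp and pc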
+    OrderAccess::release();
     _last_Java_fp = NULL;
     _last_Java_pc = NULL;
   }
 
   void copy(JavaFrameAnchor* src) {
-    // In order to make sure the transition state is valid for "this"
-    // We must clear _last_Java_sp before copying the rest of the new data
-    //
-    // Hack Alert: Temporary bugfix for 4717480/4721647
-    // To act like previous version (pd_cache_state) don't NULL _last_Java_sp
-    // unless the value is changing
-    //
-    if (_last_Java_sp != src->_last_Java_sp)
-      _last_Java_sp = NULL;
-
     _last_Java_fp = src->_last_Java_fp;
     _last_Java_pc = src->_last_Java_pc;
     // Must be last so profiler will always see valid frame if has_last_frame() is true
+    OrderAccess::release();
     _last_Java_sp = src->_last_Java_sp;
   }
 
--- a/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -128,10 +128,15 @@
     case T_DOUBLE:  slow_case_addr = jni_GetDoubleField_addr();  break;
     default:        ShouldNotReachHere();
   }
-  // tail call
-  __ lea(rscratch1, ExternalAddress(slow_case_addr));
-  __ br(rscratch1);
 
+  {
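+    // n.b. we can no longer just tail call the slow case: we need to
+    // issue an isb once it returns, hence the full frame and blr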
+    __ enter();
+    __ lea(rscratch1, ExternalAddress(slow_case_addr));
+    __ blr(rscratch1);
+    __ maybe_isb();
+    __ leave();
+    __ ret(lr);
+  }
   __ flush ();
 
   return fast_entry;
--- a/src/cpu/aarch64/vm/methodHandles_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/methodHandles_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -47,9 +47,9 @@
 
 void MethodHandles::load_klass_from_Class(MacroAssembler* _masm, Register klass_reg) {
   if (VerifyMethodHandles)
-    verify_klass(_masm, klass_reg, SystemDictionary::Class_klass(),
+    verify_klass(_masm, klass_reg, SystemDictionaryHandles::Class_klass(),
                  "MH argument is a Class");
-  __ ldr(klass_reg, Address(klass_reg, java_lang_Class::klass_offset_in_bytes()));
+  __ load_heap_oop(klass_reg, Address(klass_reg, java_lang_Class::klass_offset_in_bytes()));
 }
 
 #ifdef ASSERT
@@ -140,7 +140,7 @@
   __ load_heap_oop(method_temp, Address(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())));
   __ verify_oop(method_temp);
   // the following assumes that a methodOop is normally compressed in the vmtarget field:
-  __ ldr(method_temp, Address(method_temp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())));
+  __ load_heap_oop(method_temp, Address(method_temp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())));
   __ verify_oop(method_temp);
 
   if (VerifyMethodHandles && !for_compiler_entry) {
@@ -334,14 +334,14 @@
       if (VerifyMethodHandles) {
         verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
       }
-      __ ldr(rmethod, member_vmtarget);
+      __ load_heap_oop(rmethod, member_vmtarget);
       break;
 
     case vmIntrinsics::_linkToStatic:
       if (VerifyMethodHandles) {
         verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
       }
-      __ ldr(rmethod, member_vmtarget);
+      __ load_heap_oop(rmethod, member_vmtarget);
       break;
 
     case vmIntrinsics::_linkToVirtual:
--- a/src/cpu/aarch64/vm/methodHandles_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/methodHandles_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -29,7 +29,7 @@
 
 // Adapters
 static unsigned int adapter_code_size() {
-  32000 DEBUG_ONLY(+ 120000);
+  return 32000 DEBUG_ONLY(+ 120000);
 }
 
 public:
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -37,22 +37,17 @@
 #include "c1/c1_Runtime1.hpp"
 #endif
 
-void NativeInstruction::wrote(int offset) {
-  // FIXME: Native needs ISB here
-; }
-
-
 void NativeCall::verify() { ; }
 
 address NativeCall::destination() const {
   return instruction_address() + displacement();
 }
 
-void NativeCall::print() { Unimplemented(); }
-
 // Inserts a native call instruction at a given pc
 void NativeCall::insert(address code_pos, address entry) { Unimplemented(); }
 
+//-------------------------------------------------------------------
+
 void NativeMovConstReg::verify() {
   // make sure code pattern is actually mov reg64, imm64 instructions
 }
@@ -73,6 +68,7 @@
     *(intptr_t*)addr = x;
   } else {
     MacroAssembler::pd_patch_instruction(instruction_address(), (address)x);
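+    // n.b. flush the icache so that the patched instructions cannot
+    // be executed stale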
+    ICache::invalidate_range(instruction_address(), instruction_size);
   }
 };
 
@@ -83,12 +79,8 @@
 
 //-------------------------------------------------------------------
 
-int NativeMovRegMem::instruction_start() const { Unimplemented(); return 0; }
-
 address NativeMovRegMem::instruction_address() const      { return addr_at(instruction_offset); }
 
-address NativeMovRegMem::next_instruction_address() const { Unimplemented(); return 0; }
-
 int NativeMovRegMem::offset() const  {
   address pc = instruction_address();
   unsigned insn = *(unsigned*)pc;
@@ -108,6 +100,7 @@
     *(long*)addr = x;
   } else {
     MacroAssembler::pd_patch_instruction(pc, (address)intptr_t(x));
+    ICache::invalidate_range(instruction_address(), instruction_size);
   }
 }
 
@@ -117,23 +110,11 @@
 #endif
 }
 
-
-void NativeMovRegMem::print() { Unimplemented(); }
-
-//-------------------------------------------------------------------
-
-void NativeLoadAddress::verify() { Unimplemented(); }
-
-
-void NativeLoadAddress::print() { Unimplemented(); }
-
 //--------------------------------------------------------------------------------
 
 void NativeJump::verify() { ; }
 
 
-void NativeJump::insert(address code_pos, address entry) { Unimplemented(); }
-
 void NativeJump::check_verified_entry_alignment(address entry, address verified_entry) {
 }
 
@@ -156,8 +137,11 @@
     dest = instruction_address();
 
   MacroAssembler::pd_patch_instruction(instruction_address(), dest);
+  ICache::invalidate_range(instruction_address(), instruction_size);
 };
 
+//-------------------------------------------------------------------
+
 bool NativeInstruction::is_safepoint_poll() {
   // a safepoint_poll is implemented in two steps as either
   //
@@ -207,7 +191,7 @@
   return Instruction_aarch64::extract(int_at(0), 30, 23) == 0b11100101;
 }
 
-// MT safe inserting of a jump over an unknown instruction sequence (used by nmethod::makeZombie)
+// MT-safe insertion of a jump over a jump or a nop (used by nmethod::makeZombie)
 
 void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) {
   ptrdiff_t disp = dest - verified_entry;
@@ -221,15 +205,10 @@
   ICache::invalidate_range(verified_entry, instruction_size);
 }
 
-
-void NativePopReg::insert(address code_pos, Register reg) { Unimplemented(); }
-
-
 void NativeIllegalInstruction::insert(address code_pos) { Unimplemented(); }
 
 void NativeGeneralJump::verify() {  }
 
-
 void NativeGeneralJump::insert_unconditional(address code_pos, address entry) {
   ptrdiff_t disp = entry - code_pos;
   guarantee(disp < 1 << 27 && disp > - (1 << 27), "branch overflow");
@@ -242,7 +221,8 @@
 
 // MT-safe patching of a long jump instruction.
 void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
-  assert(nativeInstruction_at(instr_addr)->is_jump_or_nop(),
+  NativeGeneralJump* n_jump = (NativeGeneralJump*)instr_addr;
+  assert(n_jump->is_jump_or_nop(),
 	 "Aarch64 cannot replace non-jump with jump");
   uint32_t instr = *(uint32_t*)code_buffer;
   *(uint32_t*)instr_addr = instr;
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -57,7 +57,6 @@
   enum { instruction_size = 4 };
   inline bool is_nop();
   bool is_dtrace_trap();
-  inline bool is_call();
   inline bool is_illegal();
   inline bool is_return();
   bool is_jump();
@@ -81,10 +80,10 @@
   oop  oop_at (int offset) const       { return *(oop*) addr_at(offset); }
 
 
-  void set_char_at(int offset, char c)        { *addr_at(offset) = (u_char)c; wrote(offset); }
-  void set_int_at(int offset, jint  i)        { *(jint*)addr_at(offset) = i;  wrote(offset); }
-  void set_ptr_at (int offset, intptr_t  ptr) { *(intptr_t*) addr_at(offset) = ptr;  wrote(offset); }
-  void set_oop_at (int offset, oop  o)        { *(oop*) addr_at(offset) = o;  wrote(offset); }
+  void set_char_at(int offset, char c)        { *addr_at(offset) = (u_char)c; }
+  void set_int_at(int offset, jint  i)        { *(jint*)addr_at(offset) = i; }
+  void set_ptr_at (int offset, intptr_t  ptr) { *(intptr_t*) addr_at(offset) = ptr; }
+  void set_oop_at (int offset, oop  o)        { *(oop*) addr_at(offset) = o; }
 
   // This doesn't really do anything on AArch64, but it is the place where
   // cache invalidation belongs, generically:
@@ -143,6 +142,7 @@
     offset &= (1 << 26) - 1; // mask off insn part
     insn |= offset;
     set_int_at(displacement_offset, insn);
+    ICache::invalidate_range(instruction_address(), instruction_size);
   }
 
   // Similar to replace_mt_safe, but just changes the destination.  The
@@ -175,11 +175,6 @@
     return is_call_at(return_address - NativeCall::return_address_offset);
   }
 
-  static bool is_call_to(address instr, address target) {
-    return nativeInstruction_at(instr)->is_call() &&
-      nativeCall_at(instr)->destination() == target;
-  }
-
   // MT-safe patching of a call instruction.
   static void insert(address code_pos, address entry);
 
@@ -345,9 +340,6 @@
 
   // unit test stuff
   static void test() {}
-
- private:
-  friend NativeLoadAddress* nativeLoadAddress_at (address address) { Unimplemented(); return 0; }
 };
 
 class NativeJump: public NativeInstruction {
@@ -434,10 +426,6 @@
  public:
 };
 
-inline bool NativeInstruction::is_illegal()      { Unimplemented(); return false; }
-inline bool NativeInstruction::is_call()         { Unimplemented(); return false; }
-inline bool NativeInstruction::is_return()       { Unimplemented(); return false; }
-
 inline bool NativeInstruction::is_nop()         {
   uint32_t insn = *(uint32_t*)addr_at(0);
   return insn == 0xd503201f;
@@ -466,8 +454,4 @@
   return is_nop() || is_jump();
 }
 
-inline bool NativeInstruction::is_cond_jump()    { Unimplemented(); return false; }
-
-inline bool NativeInstruction::is_mov_literal64() { Unimplemented(); return false; }
-
 #endif // CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP
--- a/src/cpu/aarch64/vm/register_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/register_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -174,44 +174,6 @@
 CONSTANT_REGISTER_DECLARATION(FloatRegister, v30    , (30));
 CONSTANT_REGISTER_DECLARATION(FloatRegister, v31    , (31));
 
-// #ifndef DONT_USE_REGISTER_DEFINES
-#if 0
-#define fnoreg ((FloatRegister)(fnoreg_FloatRegisterEnumValue))
-#define v0     ((FloatRegister)(    v0_FloatRegisterEnumValue))
-#define v1     ((FloatRegister)(    v1_FloatRegisterEnumValue))
-#define v2     ((FloatRegister)(    v2_FloatRegisterEnumValue))
-#define v3     ((FloatRegister)(    v3_FloatRegisterEnumValue))
-#define v4     ((FloatRegister)(    v4_FloatRegisterEnumValue))
-#define v5     ((FloatRegister)(    v5_FloatRegisterEnumValue))
-#define v6     ((FloatRegister)(    v6_FloatRegisterEnumValue))
-#define v7     ((FloatRegister)(    v7_FloatRegisterEnumValue))
-#define v8     ((FloatRegister)(    v8_FloatRegisterEnumValue))
-#define v9     ((FloatRegister)(    v9_FloatRegisterEnumValue))
-#define v10    ((FloatRegister)(   v10_FloatRegisterEnumValue))
-#define v11    ((FloatRegister)(   v11_FloatRegisterEnumValue))
-#define v12    ((FloatRegister)(   v12_FloatRegisterEnumValue))
-#define v13    ((FloatRegister)(   v13_FloatRegisterEnumValue))
-#define v14    ((FloatRegister)(   v14_FloatRegisterEnumValue))
-#define v15    ((FloatRegister)(   v15_FloatRegisterEnumValue))
-#define v16    ((FloatRegister)(   v16_FloatRegisterEnumValue))
-#define v17    ((FloatRegister)(   v17_FloatRegisterEnumValue))
-#define v18    ((FloatRegister)(   v18_FloatRegisterEnumValue))
-#define v19    ((FloatRegister)(   v19_FloatRegisterEnumValue))
-#define v20    ((FloatRegister)(   v20_FloatRegisterEnumValue))
-#define v21    ((FloatRegister)(   v21_FloatRegisterEnumValue))
-#define v22    ((FloatRegister)(   v22_FloatRegisterEnumValue))
-#define v23    ((FloatRegister)(   v23_FloatRegisterEnumValue))
-#define v24    ((FloatRegister)(   v24_FloatRegisterEnumValue))
-#define v25    ((FloatRegister)(   v25_FloatRegisterEnumValue))
-#define v26    ((FloatRegister)(   v26_FloatRegisterEnumValue))
-#define v27    ((FloatRegister)(   v27_FloatRegisterEnumValue))
-#define v28    ((FloatRegister)(   v28_FloatRegisterEnumValue))
-#define v29    ((FloatRegister)(   v29_FloatRegisterEnumValue))
-#define v30    ((FloatRegister)(   v30_FloatRegisterEnumValue))
-#define v31    ((FloatRegister)(   v31_FloatRegisterEnumValue))
-#endif // 0
-//#endif // DONT_USE_REGISTER_DEFINES
-
 // Need to know the total number of registers of all sorts for SharedInfo.
 // Define a class that exports it.
 class ConcreteRegisterImpl : public AbstractRegisterImpl {
--- a/src/cpu/aarch64/vm/relocInfo_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/relocInfo_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -34,23 +34,30 @@
 
 
 void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
+  if (verify_only) {
+    return;
+  }
+
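+  // n.b. each patch routine reports how many bytes it wrote so that
+  // we can invalidate exactly the modified range below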
+  int bytes;
+
   switch(type()) {
   case relocInfo::oop_type:
     {
       oop_Relocation *reloc = (oop_Relocation *)this;
       if (NativeInstruction::is_ldr_literal_at(addr())) {
 	address constptr = (address)code()->oop_addr_at(reloc->oop_index());
-	MacroAssembler::pd_patch_instruction(addr(), constptr);
+	bytes = MacroAssembler::pd_patch_instruction_size(addr(), constptr);
 	assert(*(address*)constptr == x, "error in oop relocation");
       } else{
-	MacroAssembler::patch_oop(addr(), x);
+	bytes = MacroAssembler::patch_oop(addr(), x);
       }
     }
     break;
   default:
-    MacroAssembler::pd_patch_instruction(addr(), x);
+    bytes = MacroAssembler::pd_patch_instruction_size(addr(), x);
     break;
   }
+  ICache::invalidate_range(addr(), bytes);
 }
 
 address Relocation::pd_call_destination(address orig_addr) {
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -319,6 +319,8 @@
   __ mov(c_rarg1, lr);
   __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
   __ blrt(rscratch1, 2, 0, 0);
+  __ maybe_isb();
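+  // n.b. fixup_callers_callsite may have patched our caller, so make
+  // sure later accesses are not reordered ahead of this point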
+  __ membar(Assembler::LoadLoad | Assembler::LoadStore);
 
   __ pop_CPU_state();
   // restore sp
@@ -1171,6 +1173,7 @@
     __ mov(rscratch2, (gpargs << 6) | (fpargs << 2) | type);
     __ blrt(rscratch1, rscratch2);
     // __ blrt(rscratch1, gpargs, fpargs, type);
+    __ maybe_isb();
   }
 }
 
@@ -1977,6 +1980,7 @@
       __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
     }
     __ blrt(rscratch1, 1, 0, 1);
+    __ maybe_isb();
     // Restore any method result value
     restore_native_result(masm, ret_type, stack_slots);
 
@@ -2594,11 +2598,6 @@
   }
 #endif
 
-  // TODO check various assumptions here
-  //
-  // call unimplemented to make sure we actually check this later
-  // __ call_Unimplemented();
-
   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
 
   address start = __ pc();
@@ -2842,6 +2841,8 @@
 
   __ reset_last_Java_frame(false, true);
 
+  __ maybe_isb();
+
   __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
   __ cbz(rscratch1, noException);
 
@@ -2911,6 +2912,8 @@
 
   oop_maps->add_gc_map( __ offset() - start, map);
 
+  __ maybe_isb();
+
   // r0 contains the address we are going to jump to assuming no exception got installed
 
   // clear last_Java_sp
@@ -3033,7 +3036,8 @@
   __ mov(c_rarg0, rthread);
   __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
   __ blrt(rscratch1, 1, 0, MacroAssembler::ret_type_integral);
-
+  __ maybe_isb();
+
   // Set an oopmap for the call site.  This oopmap will only be used if we
   // are unwinding the stack.  Hence, all locations will be dead.
   // Callee-saved registers will be the same as the frame above (i.e.,
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -78,10 +78,6 @@
 
 // Stub Code definitions
 
-#if 0
-static address handle_unsafe_access() { Unimplemented(); return 0; }
-#endif
-
 class StubGenerator: public StubCodeGenerator {
  private:
 
@@ -603,159 +599,6 @@
     return start;
   }
 
-  // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
-  //
-  // Arguments :
-  //    c_rarg0: exchange_value
-  //    c_rarg0: dest
-  //
-  // Result:
-  //    *dest <- ex, return (orig *dest)
-
-  // NOTE: not sure this is actually needed but if so it looks like it
-  // is called from os-specific code i.e. it needs an x86 prolog
-
-  address generate_atomic_xchg() { return 0; }
-
-  // Support for intptr_t atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest)
-  //
-  // Arguments :
-  //    c_rarg0: exchange_value
-  //    c_rarg1: dest
-  //
-  // Result:
-  //    *dest <- ex, return (orig *dest)
-
-  // NOTE: not sure this is actually needed but if so it looks like it
-  // is called from os-specific code i.e. it needs an x86 prolog
-
-  address generate_atomic_xchg_ptr() { return 0; }
-
-  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
-  //                                         jint compare_value)
-  //
-  // Arguments :
-  //    c_rarg0: exchange_value
-  //    c_rarg1: dest
-  //    c_rarg2: compare_value
-  //
-  // Result:
-  //    if ( compare_value == *dest ) {
-  //       *dest = exchange_value
-  //       return compare_value;
-  //    else
-  //       return *dest;
-  address generate_atomic_cmpxchg() { return 0; }
-
-  // Support for jint atomic::atomic_cmpxchg_long(jlong exchange_value,
-  //                                             volatile jlong* dest,
-  //                                             jlong compare_value)
-  // Arguments :
-  //    c_rarg0: exchange_value
-  //    c_rarg1: dest
-  //    c_rarg2: compare_value
-  //
-  // Result:
-  //    if ( compare_value == *dest ) {
-  //       *dest = exchange_value
-  //       return compare_value;
-  //    else
-  //       return *dest;
-
-  // NOTE: not sure this is actually needed but if so it looks like it
-  // is called from os-specific code i.e. it needs an x86 prolog
-
-  address generate_atomic_cmpxchg_long() { return 0; }
-
-  // Support for jint atomic::add(jint add_value, volatile jint* dest)
-  //
-  // Arguments :
-  //    c_rarg0: add_value
-  //    c_rarg1: dest
-  //
-  // Result:
-  //    *dest += add_value
-  //    return *dest;
-
-  // NOTE: not sure this is actually needed but if so it looks like it
-  // is called from os-specific code i.e. it needs an x86 prolog
-
-  address generate_atomic_add() { return 0; }
-
-  // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
-  //
-  // Arguments :
-  //    c_rarg0: add_value
-  //    c_rarg1: dest
-  //
-  // Result:
-  //    *dest += add_value
-  //    return *dest;
-
-  // NOTE: not sure this is actually needed but if so it looks like it
-  // is called from os-specific code i.e. it needs an x86 prolog
-
-  address generate_atomic_add_ptr() { return 0; }
-
-  // Support for intptr_t OrderAccess::fence()
-  //
-  // Arguments :
-  //
-  // Result:
-
-  // NOTE: this is called from C code so it needs an x86 prolog
-  // or else we need to fiddle it with inline asm for now
-
-  address generate_orderaccess_fence() { return 0; }
-
-  // Support for intptr_t get_previous_fp()
-  //
-  // This routine is used to find the previous frame pointer for the
-  // caller (current_frame_guess). This is used as part of debugging
-  // when ps() is seemingly lost trying to find frames.
-  // This code assumes that the caller (current_frame_guess) has a frame.
-
-  // NOTE: this is called from C code in os_windows.cpp with AMD64. other
-  // builds use inline asm -- so we should be ok for aarch64
-
-  address generate_get_previous_fp() { return 0; }
-
-  // Support for intptr_t get_previous_sp()
-  //
-  // This routine is used to find the previous stack pointer for the
-  // caller.
-
-  // NOTE: this is called from C code in os_windows.cpp with AMD64. other
-  // builds use inline asm -- so we should be ok for aarch64
-
-  address generate_get_previous_sp() { return 0; }
-
-  // NOTE: these fixup routines appear only to be called from the
-  // opto code (they are mentioned in x86_64.ad) so we can do
-  // without them for now on aarch64
-
-  address generate_f2i_fixup() { Unimplemented(); return 0; }
-
-  address generate_f2l_fixup() { Unimplemented(); return 0; }
-
-  address generate_d2i_fixup() { Unimplemented(); return 0; }
-
-  address generate_d2l_fixup() { Unimplemented(); return 0; }
-
-  // The following routine generates a subroutine to throw an
-  // asynchronous UnknownError when an unsafe access gets a fault that
-  // could not be reasonably prevented by the programmer.  (Example:
-  // SIGBUS/OBJERR.)
-
-  // NOTE: this is used by the signal handler code as a return address
-  // to re-enter Java execution so it needs an x86 prolog which will
-  // reenter the simulator executing the generated handler code. so
-  // the prolog needs to adjust the sim's restart pc to enter the
-  // generated code at the start position then return from native to
-  // simulated execution.
-
-  address generate_handler_for_unsafe_access() { return 0; }
-
   // Non-destructive plausibility checks for oops
   //
   // Arguments:
@@ -835,29 +678,7 @@
     return start;
   }
 
-  //
-  // Verify that a register contains clean 32-bits positive value
-  // (high 32-bits are 0) so it could be used in 64-bits shifts.
-  //
-  //  Input:
-  //    Rint  -  32-bits value
-  //    Rtmp  -  scratch
-  //
-  void assert_clean_int(Register Rint, Register Rtmp) { Unimplemented(); }
-
-  //  Generate overlap test for array copy stubs
-  //
-  //  Input:
-  //     c_rarg0 - from
-  //     c_rarg1 - to
-  //     c_rarg2 - element count
-  //
-  //  Output:
-  //     r0   - &from[element count - 1]
-  //
-  void array_overlap_test(address no_overlap_target, int sf) { Unimplemented(); }
   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
-  void array_overlap_test(address no_overlap_target, Label* NOLp, int sf) { Unimplemented(); }
 
   // Generate code for an array write pre barrier
   //
@@ -1096,7 +917,7 @@
 
   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
     bool is_backwards = step < 0;
-    size_t granularity = abs(step);
+    size_t granularity = uabs(step);
     int direction = is_backwards ? -1 : 1;
     int unit = wordSize * direction;
 
@@ -1152,7 +973,7 @@
 		   Register count, Register tmp, int step) {
     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
     bool is_backwards = step < 0;
-    int granularity = abs(step);
+    int granularity = uabs(step);
     const Register t0 = r3, t1 = r4;
 
     if (is_backwards) {
@@ -1748,23 +1569,6 @@
     return start;
   }
 
-  //
-  //  Generate 'unsafe' array copy stub
-  //  Though just as safe as the other stubs, it takes an unscaled
-  //  size_t argument instead of an element count.
-  //
-  //  Input:
-  //    c_rarg0   - source array address
-  //    c_rarg1   - destination array address
-  //    c_rarg2   - byte count, treated as ssize_t, can be zero
-  //
-  // Examines the alignment of the operands and dispatches
-  // to a long, int, short, or byte copy loop.
-  //
-  address generate_unsafe_copy(const char *name,
-                               address byte_copy_entry, address short_copy_entry,
-                               address int_copy_entry, address long_copy_entry) { Unimplemented(); return 0; }
-
   // Perform range checks on the proposed arraycopy.
   // Kills temp, but nothing else.
   // Also, clean the sign bits of src_pos and dst_pos.
@@ -1776,28 +1580,6 @@
                               Register temp,
                               Label& L_failed) { Unimplemented(); }
 
-  //
-  //  Generate generic array copy stubs
-  //
-  //  Input:
-  //    c_rarg0    -  src oop
-  //    c_rarg1    -  src_pos (32-bits)
-  //    c_rarg2    -  dst oop
-  //    c_rarg3    -  dst_pos (32-bits)
-  // not Win64
-  //    c_rarg4    -  element count (32-bits)
-  // Win64
-  //    rsp+40     -  element count (32-bits)
-  //
-  //  Output:
-  //    r0 ==  0  -  success
-  //    r0 == -1^K - failure, where K is partial transfer count
-  //
-  address generate_generic_copy(const char *name,
-                                address byte_copy_entry, address short_copy_entry,
-                                address int_copy_entry, address oop_copy_entry,
-                                address long_copy_entry, address checkcast_copy_entry) { Unimplemented(); return 0; }
-
   // These stubs get called from some dumb test routine.
   // I'll write them properly when they're called from
   // something that's actually doing something.
@@ -1894,7 +1676,413 @@
                                                                         /*dest_uninitialized*/true);
   }
 
-  void generate_math_stubs() { Unimplemented(); }
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    Label L_doLast;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+    __ enter();
+
+    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+    __ ld1(v0, __ T16B, from); // get 16 bytes of input
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+    __ aese(v0, v3);
+    __ aesmc(v0, v0);
+    __ aese(v0, v4);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+    __ aese(v0, v3);
+    __ aesmc(v0, v0);
+    __ aese(v0, v4);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
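+    // n.b. the expanded key holds 4 * (rounds + 1) ints: 44 for
+    // AES-128, 52 for AES-192 and 60 for AES-256, so the compares
+    // below skip the extra rounds for the shorter keys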
+    __ cmpw(keylen, 44);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 52);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+    __ aesmc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ BIND(L_doLast);
+
+    __ aese(v0, v1);
+    __ aesmc(v0, v0);
+    __ aese(v0, v2);
+
+    __ ld1(v1, __ T16B, key);
+    __ rev32(v1, __ T16B, v1);
+    __ eor(v0, __ T16B, v0, v1);
+
+    __ st1(v0, __ T16B, to);
+
+    __ mov(r0, 0);
+
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES, "need AES instructions");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+    __ ld1(v0, __ T16B, from); // get 16 bytes of input
+
+    __ ld1(v5, __ T16B, __ post(key, 16));
+    __ rev32(v5, __ T16B, v5);
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v3);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v4);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+    __ rev32(v3, __ T16B, v3);
+    __ rev32(v4, __ T16B, v4);
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v3);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v4);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 44);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ cmpw(keylen, 52);
+    __ br(Assembler::EQ, L_doLast);
+
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+    __ aesimc(v0, v0);
+
+    __ ld1(v1, v2, __ T16B, __ post(key, 32));
+    __ rev32(v1, __ T16B, v1);
+    __ rev32(v2, __ T16B, v2);
+
+    __ BIND(L_doLast);
+
+    __ aesd(v0, v1);
+    __ aesimc(v0, v0);
+    __ aesd(v0, v2);
+
+    __ eor(v0, __ T16B, v0, v5);
+
+    __ st1(v0, __ T16B, to);
+
+    __ mov(r0, 0);
+
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   x0        - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {
+    assert(UseAES, "need AES instructions");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+
+    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+      __ enter();
+
+      __ mov(rscratch2, len_reg);
+      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+      __ ld1(v0, __ T16B, rvec);
+
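+      // n.b. keylen selects the schedule: CC (44 ints) means AES-128,
+      // EQ (52 ints) means AES-192, otherwise all 60 ints of an
+      // AES-256 schedule are loaded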
+      __ cmpw(keylen, 52);
+      __ br(Assembler::CC, L_loadkeys_44);
+      __ br(Assembler::EQ, L_loadkeys_52);
+
+      __ ld1(v17, v18, __ T16B, __ post(key, 32));
+      __ rev32(v17, __ T16B, v17);
+      __ rev32(v18, __ T16B, v18);
+    __ BIND(L_loadkeys_52);
+      __ ld1(v19, v20, __ T16B, __ post(key, 32));
+      __ rev32(v19, __ T16B, v19);
+      __ rev32(v20, __ T16B, v20);
+    __ BIND(L_loadkeys_44);
+      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
+      __ rev32(v21, __ T16B, v21);
+      __ rev32(v22, __ T16B, v22);
+      __ rev32(v23, __ T16B, v23);
+      __ rev32(v24, __ T16B, v24);
+      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
+      __ rev32(v25, __ T16B, v25);
+      __ rev32(v26, __ T16B, v26);
+      __ rev32(v27, __ T16B, v27);
+      __ rev32(v28, __ T16B, v28);
+      __ ld1(v29, v30, v31, __ T16B, key);
+      __ rev32(v29, __ T16B, v29);
+      __ rev32(v30, __ T16B, v30);
+      __ rev32(v31, __ T16B, v31);
+
+    __ BIND(L_aes_loop);
+      __ ld1(v1, __ T16B, __ post(from, 16));
+      __ eor(v0, __ T16B, v0, v1);
+
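+      // n.b. the flags set by the keylen compare above are still
+      // valid here: none of the intervening instructions touches them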
+      __ br(Assembler::CC, L_rounds_44);
+      __ br(Assembler::EQ, L_rounds_52);
+
+      __ aese(v0, v17); __ aesmc(v0, v0);
+      __ aese(v0, v18); __ aesmc(v0, v0);
+    __ BIND(L_rounds_52);
+      __ aese(v0, v19); __ aesmc(v0, v0);
+      __ aese(v0, v20); __ aesmc(v0, v0);
+    __ BIND(L_rounds_44);
+      __ aese(v0, v21); __ aesmc(v0, v0);
+      __ aese(v0, v22); __ aesmc(v0, v0);
+      __ aese(v0, v23); __ aesmc(v0, v0);
+      __ aese(v0, v24); __ aesmc(v0, v0);
+      __ aese(v0, v25); __ aesmc(v0, v0);
+      __ aese(v0, v26); __ aesmc(v0, v0);
+      __ aese(v0, v27); __ aesmc(v0, v0);
+      __ aese(v0, v28); __ aesmc(v0, v0);
+      __ aese(v0, v29); __ aesmc(v0, v0);
+      __ aese(v0, v30);
+      __ eor(v0, __ T16B, v0, v31);
+
+      __ st1(v0, __ T16B, __ post(to, 16));
+      __ sub(len_reg, len_reg, 16);
+      __ cbnz(len_reg, L_aes_loop);
+
+      __ st1(v0, __ T16B, rvec);
+
+      __ mov(r0, rscratch2);
+
+      __ leave();
+      __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   x0        - input length
+  //
+  address generate_cipherBlockChaining_decryptAESCrypt() {
+    assert(UseAES, "need AES instructions");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+
+    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+    const Register keylen      = rscratch1;
+
+    address start = __ pc();
+      __ enter();
+
+      __ mov(rscratch2, len_reg);
+      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+      __ ld1(v2, __ T16B, rvec);
+
+      __ ld1(v31, __ T16B, __ post(key, 16));
+      __ rev32(v31, __ T16B, v31);
+
+      __ cmpw(keylen, 52);
+      __ br(Assembler::CC, L_loadkeys_44);
+      __ br(Assembler::EQ, L_loadkeys_52);
+
+      __ ld1(v17, v18, __ T16B, __ post(key, 32));
+      __ rev32(v17, __ T16B, v17);
+      __ rev32(v18, __ T16B, v18);
+    __ BIND(L_loadkeys_52);
+      __ ld1(v19, v20, __ T16B, __ post(key, 32));
+      __ rev32(v19, __ T16B, v19);
+      __ rev32(v20, __ T16B, v20);
+    __ BIND(L_loadkeys_44);
+      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
+      __ rev32(v21, __ T16B, v21);
+      __ rev32(v22, __ T16B, v22);
+      __ rev32(v23, __ T16B, v23);
+      __ rev32(v24, __ T16B, v24);
+      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
+      __ rev32(v25, __ T16B, v25);
+      __ rev32(v26, __ T16B, v26);
+      __ rev32(v27, __ T16B, v27);
+      __ rev32(v28, __ T16B, v28);
+      __ ld1(v29, v30, __ T16B, key);
+      __ rev32(v29, __ T16B, v29);
+      __ rev32(v30, __ T16B, v30);
+
+    __ BIND(L_aes_loop);
+      __ ld1(v0, __ T16B, __ post(from, 16));
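+      // n.b. orr of a register with itself is a vector move: keep the
+      // ciphertext block as the chaining value for the next iteration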
+      __ orr(v1, __ T16B, v0, v0);
+
+      __ br(Assembler::CC, L_rounds_44);
+      __ br(Assembler::EQ, L_rounds_52);
+
+      __ aesd(v0, v17); __ aesimc(v0, v0);
+      __ aesd(v0, v18); __ aesimc(v0, v0);
+    __ BIND(L_rounds_52);
+      __ aesd(v0, v19); __ aesimc(v0, v0);
+      __ aesd(v0, v20); __ aesimc(v0, v0);
+    __ BIND(L_rounds_44);
+      __ aesd(v0, v21); __ aesimc(v0, v0);
+      __ aesd(v0, v22); __ aesimc(v0, v0);
+      __ aesd(v0, v23); __ aesimc(v0, v0);
+      __ aesd(v0, v24); __ aesimc(v0, v0);
+      __ aesd(v0, v25); __ aesimc(v0, v0);
+      __ aesd(v0, v26); __ aesimc(v0, v0);
+      __ aesd(v0, v27); __ aesimc(v0, v0);
+      __ aesd(v0, v28); __ aesimc(v0, v0);
+      __ aesd(v0, v29); __ aesimc(v0, v0);
+      __ aesd(v0, v30);
+      __ eor(v0, __ T16B, v0, v31);
+      __ eor(v0, __ T16B, v0, v2);
+
+      __ st1(v0, __ T16B, __ post(to, 16));
+      __ orr(v2, __ T16B, v1, v1);
+
+      __ sub(len_reg, len_reg, 16);
+      __ cbnz(len_reg, L_aes_loop);
+
+      __ st1(v2, __ T16B, rvec);
+
+      __ mov(r0, rscratch2);
+
+      __ leave();
+      __ ret(lr);
+
+    return start;
+  }
 
   // AARCH64 use safefetch stubs unless we are building for the simulator
   // in which case the x86 asm code in linux_aarch64.S is used
@@ -2005,38 +2193,6 @@
   // otherwise assume that stack unwinding will be initiated, so
   // caller saved registers were assumed volatile in the compiler.
 
-  // NOTE: this needs carefully checking to see where the generated
-  // code gets called from for each generated error
-  //
-  // WrongMethodTypeException : jumped to directly from generated method
-  // handle code.
-  //
-  // StackOverflowError : jumped to directly from generated code in
-  // cpp and template interpreter. the generated code address also
-  // appears to be returned from the signal handler as the re-entry
-  // address for Java execution to continue from. This means it needs
-  // to be enterable from x86 code. Hmm, we may need to expose both an
-  // x86 prolog and the address of the generated ARM code and clients
-  // will have to be modified to pick the correct one.
-  //
-  // AbstractMethodError : never jumped to from generated code but the
-  // generated code address appears to be returned from the signal
-  // handler as the re-entry address for Java execution to continue
-  // from. This means it needs to be enterable from x86 code. So, we
-  // will need to provide this one with an x86 prolog as per
-  // StackOverflowError
-  //
-  // IncompatibleClassChangeError : only appears to be jumped to
-  // directly from vtableStubs code
-  //
-  // NullPointerException : never jumped to from generated code but
-  // the generated code address appears to be returned from the signal
-  // handler as the re-entry address for Java execution to continue
-  // from. This means it needs to be enterable from x86 code. So, we
-  // will need to provide this one with an x86 prolog as per
-  // StackOverflowError
-
-
   address generate_throw_exception(const char* name,
                                    address runtime_entry,
                                    Register arg1 = noreg,
@@ -2100,6 +2256,7 @@
     oop_maps->add_gc_map(the_pc - start, map);
 
     __ reset_last_Java_frame(true, true);
+    __ maybe_isb();
 
     __ leave();
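
maybe_isb() is assumed here to emit an instruction synchronization barrier
only when the VM runs on real hardware; under the x86-hosted builtin
simulator the runtime call executes natively and no isb is needed. A
plausible sketch under that assumption:

    // Sketch only: synchronize the instruction stream after returning
    // from a VM runtime call, except when built for the simulator.
    void MacroAssembler::maybe_isb() {
    #ifndef BUILTIN_SIM
      isb();
    #endif
    }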
 
@@ -2142,22 +2299,6 @@
     // is referenced by megamorphic call
     StubRoutines::_catch_exception_entry = generate_catch_exception();
 
-    // atomic calls
-    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
-    StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
-    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
-    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
-    StubRoutines::_atomic_add_entry          = generate_atomic_add();
-    StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
-    StubRoutines::_fence_entry               = generate_orderaccess_fence();
-
-    StubRoutines::_handler_for_unsafe_access_entry =
-      generate_handler_for_unsafe_access();
-
-    // platform dependent
-    StubRoutines::aarch64::_get_previous_fp_entry = generate_get_previous_fp();
-    StubRoutines::aarch64::_get_previous_sp_entry = generate_get_previous_sp();
-
     // Build this early so it's available for the interpreter.
     StubRoutines::_throw_StackOverflowError_entry =
       generate_throw_exception("StackOverflowError throw_exception",
@@ -2197,6 +2338,13 @@
     generate_arraycopy_stubs();
 
 #ifndef BUILTIN_SIM
+    if (UseAESIntrinsics) {
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -325,6 +325,7 @@
   address entry = __ pc();
   __ push(state);
   __ call_VM(noreg, runtime_entry);
+  __ membar(Assembler::AnyAny);
   __ dispatch_via(vtos, Interpreter::_normal_table.table_for(vtos));
   return entry;
 }
@@ -1049,6 +1050,7 @@
 
   // Call the native method.
   __ blrt(r10, rscratch1);
+  __ maybe_isb();
   __ get_method(rmethod);
   // result potentially in r0 or v0
 
@@ -1106,6 +1108,7 @@
     __ mov(c_rarg0, rthread);
     __ mov(rscratch2, CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans));
     __ blrt(rscratch2, 1, 0, 0);
+    __ maybe_isb();
     __ get_method(rmethod);
     __ reinit_heapbase();
     __ bind(Continue);
--- a/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -1604,6 +1604,12 @@
 
 void TemplateTable::branch(bool is_jsr, bool is_wide)
 {
+  // We might be moving to a safepoint.  The thread which calls
+  // Interpreter::notice_safepoints() will effectively flush its cache
+  // when it makes a system call, but we need to do something to
+  // ensure that we see the changed dispatch table.
+  __ membar(MacroAssembler::LoadLoad);
+
   __ profile_taken_branch(r0, r1);
   const ByteSize be_offset = methodOopDesc::backedge_counter_offset() +
                              InvocationCounter::counter_offset();
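
The concern in the comment above is that the interpreter could keep using a
cached, pre-safepoint dispatch table after another thread has installed the
safepoint table; the LoadLoad membar orders the table load with the loads
that precede it. In C++11 terms the pattern is roughly this (names
hypothetical, illustration only):

    #include <atomic>

    // The safepoint thread publishes a new table; the dispatch path must
    // not read a stale pointer. membar(LoadLoad) stands in for the
    // acquire ordering below.
    static std::atomic<const void* const*> dispatch_table;

    static const void* const* table_for_dispatch() {
      return dispatch_table.load(std::memory_order_acquire);
    }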
@@ -1867,6 +1873,12 @@
 
 void TemplateTable::ret() {
   transition(vtos, vtos);
+  // We might be moving to a safepoint.  The thread which calls
+  // Interpreter::notice_safepoints() will effectively flush its cache
+  // when it makes a system call, but we need to do something to
+  // ensure that we see the changed dispatch table.
+  __ membar(MacroAssembler::LoadLoad);
+
   locals_index(r1);
   __ ldr(r1, aaddress(r1)); // get return bci, compute return bcp
   __ profile_ret(r1, r2);
@@ -3379,6 +3391,8 @@
 
   // continue
   __ bind(done);
+  // Must prevent reordering of stores for object initialization with stores that publish the new object.
+  __ membar(Assembler::StoreStore);
 }
 
 void TemplateTable::newarray() {
@@ -3387,6 +3401,7 @@
   __ mov(c_rarg2, r0);
   call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::newarray),
           c_rarg1, c_rarg2);
+  __ membar(Assembler::StoreStore);
 }
 
 void TemplateTable::anewarray() {
@@ -3396,6 +3411,7 @@
   __ mov(c_rarg3, r0);
   call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::anewarray),
           c_rarg1, c_rarg2, c_rarg3);
+  __ membar(Assembler::StoreStore);
 }
 
 void TemplateTable::arraylength() {
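
The StoreStore barriers added above (after _new, newarray and anewarray)
enforce the usual safe-publication rule: every store that initializes the
new object must become visible before the store that makes it reachable
from another thread. A minimal sketch with hypothetical types:

    #include <atomic>

    struct Obj { int header; int field; };
    static std::atomic<Obj*> shared_slot;   // hypothetical published slot

    static void publish(Obj* o) {
      o->header = 1;                        // initializing stores
      o->field  = 0;
      std::atomic_thread_fence(std::memory_order_release); // ~ StoreStore
      shared_slot.store(o, std::memory_order_relaxed);     // publish
    }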
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -47,6 +47,10 @@
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
 
+#ifndef HWCAP_AES
+#define HWCAP_AES   (1<<3)
+#endif
+
 #ifndef HWCAP_CRC32
 #define HWCAP_CRC32 (1<<7)
 #endif
@@ -105,10 +109,14 @@
   _supports_atomic_getset8 = true;
   _supports_atomic_getadd8 = true;
 
-  FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+  if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
+    FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+  if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
+    FLAG_SET_DEFAULT(AllocatePrefetchStepSize, 64);
   FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
   FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
   FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
+  FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
 
 #ifndef BUILTIN_SIM
   unsigned long auxv = getauxval(AT_HWCAP);
@@ -118,11 +126,32 @@
   if (UseCRC32 && (auxv & HWCAP_CRC32) == 0) {
     warning("UseCRC32 specified, but not supported on this CPU");
   }
+  if (auxv & HWCAP_AES) {
+    UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
+    UseAESIntrinsics =
+        UseAESIntrinsics || (UseAES && FLAG_IS_DEFAULT(UseAESIntrinsics));
+    if (UseAESIntrinsics && !UseAES) {
+      warning("UseAESIntrinsics enabled, but UseAES not, enabling");
+      UseAES = true;
+    }
+  } else {
+    if (UseAES) {
+      warning("UseAES specified, but not supported on this CPU");
+    }
+    if (UseAESIntrinsics) {
+      warning("UseAESIntrinsics specified, but not supported on this CPU");
+    }
+  }
 #endif
 
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
     UseCRC32Intrinsics = true;
   }
+#ifdef COMPILER2
+  if (FLAG_IS_DEFAULT(OptoScheduling)) {
+    OptoScheduling = true;
+  }
+#endif
 }
 
 void VM_Version::initialize() {
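
The AES probe follows the same HWCAP pattern as the existing CRC32 check:
read the capability bitmask from the auxiliary vector and test the bit,
defining the constant locally when the libc headers predate it. A
standalone sketch of the probe:

    #include <sys/auxv.h>          // getauxval, AT_HWCAP (Linux)

    #ifndef HWCAP_AES
    #define HWCAP_AES (1 << 3)     // bit assignment from the kernel ABI
    #endif

    static bool cpu_supports_aes() {
      return (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
    }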
--- a/src/cpu/aarch64/vm/vtableStubs_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/cpu/aarch64/vm/vtableStubs_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -58,7 +58,8 @@
 
 #ifndef PRODUCT
   if (CountCompiledCalls) {
-    __ increment(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ lea(r19, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ incrementw(Address(r19));
   }
 #endif
 
@@ -73,12 +74,14 @@
   if (DebugVtables) {
     Label L;
     // check offset vs vtable length
-    __ ldrw(rscratch1, Address(r0, instanceKlass::vtable_length_offset() * wordSize));
+    __ ldrw(rscratch1, Address(r19, instanceKlass::vtable_length_offset() * wordSize));
     __ cmpw(rscratch1, vtable_index * vtableEntry::size());
     __ br(Assembler::GT, L);
+    __ enter();
     __ mov(r2, vtable_index);
     __ call_VM(noreg,
                CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), j_rarg0, r2);
+    __ leave();
     __ bind(L);
   }
 #endif // PRODUCT
@@ -109,9 +112,6 @@
                   (int)(s->code_end() - __ pc()));
   }
   guarantee(__ pc() <= s->code_end(), "overflowed buffer");
-  // shut the door on sizing bugs
-  int slop = 3;  // 32-bit offset is this much larger than an 8-bit one
-  assert(vtable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
 
   s->set_exception_points(npe_addr, ame_addr);
   return s;
@@ -130,7 +130,8 @@
 
 #ifndef PRODUCT
   if (CountCompiledCalls) {
-    __ increment(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ lea(r10, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ incrementw(Address(r10));
   }
 #endif
 
@@ -190,9 +191,6 @@
                   (int)(s->code_end() - __ pc()));
   }
   guarantee(__ pc() <= s->code_end(), "overflowed buffer");
-  // shut the door on sizing bugs
-  int slop = 3;  // 32-bit offset is this much larger than an 8-bit one
-  assert(itable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
 
   s->set_exception_points(npe_addr, ame_addr);
   return s;
@@ -200,8 +198,48 @@
 
 
 int VtableStub::pd_code_size_limit(bool is_vtable_stub) {
-  // FIXME
-  return 200;
+  int size = DebugVtables ? 216 : 0;
+  if (CountCompiledCalls)
+    size += 6 * 4;
+  // FIXME
+  if (is_vtable_stub)
+    size += 52;
+  else
+    size += 104;
+  return size;
+
+  // In order to tune these parameters, run the JVM with VM options
+  // +PrintMiscellaneous and +WizardMode to see information about
+  // actual itable stubs.  Run it with -Xmx31G -XX:+UseCompressedOops.
+  //
+  // If Universe::narrow_klass_base is nonzero, decoding a compressed
+  // class can take several instructions.
+  //
+  // The JVM98 app. _202_jess has a megamorphic interface call.
+  // The itable code looks like this:
+  // Decoding VtableStub itbl[1]@12
+  //     ldr     w10, [x1,#8]
+  //     lsl     x10, x10, #3
+  //     ldr     w11, [x10,#280]
+  //     add     x11, x10, x11, uxtx #3
+  //     add     x11, x11, #0x1b8
+  //     ldr     x12, [x11]
+  //     cmp     x9, x12
+  //     b.eq    success
+  // loop:
+  //     cbz     x12, throw_icce
+  //     add     x11, x11, #0x10
+  //     ldr     x12, [x11]
+  //     cmp     x9, x12
+  //     b.ne    loop
+  // success:
+  //     ldr     x11, [x11,#8]
+  //     ldr     x12, [x10,x11]
+  //     ldr     x8, [x12,#72]
+  //     br      x8
+  // throw_icce:
+  //     b       throw_ICCE_entry
 }
 
 int VtableStub::pd_code_alignment() { return 4; }
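
The itable disassembly above is easier to follow as pseudo-C++: the stub
scans the receiver klass's itable for an entry whose interface klass
matches, a NULL sentinel routes to throw_ICCE_entry, and a match indirects
through that interface's method table. Roughly, with a hypothetical entry
layout matching the 16-byte stride in the listing:

    struct ItableEntry { void* iface; long offset; };  // hypothetical

    static void* itable_lookup(char* klass, ItableEntry* e, void* target,
                               int slot) {
      for (;; e++) {                                // loop:
        if (e->iface == nullptr) return nullptr;    // cbz  -> throw_icce
        if (e->iface == target) break;              // b.eq -> success
      }
      void** methods = (void**)(klass + e->offset); // per-interface table
      return methods[slot];                         // ldr x8, ...; br x8
    }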
--- a/src/os_cpu/linux_aarch64/vm/atomic_linux_aarch64.inline.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/os_cpu/linux_aarch64/vm/atomic_linux_aarch64.inline.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -31,6 +31,10 @@
 
 // Implementation of class atomic
 
+#define FULL_MEM_BARRIER  __sync_synchronize()
+#define READ_MEM_BARRIER  __atomic_thread_fence(__ATOMIC_ACQUIRE);
+#define WRITE_MEM_BARRIER __atomic_thread_fence(__ATOMIC_RELEASE);
+
 inline void Atomic::store    (jbyte    store_value, jbyte*    dest) { *dest = store_value; }
 inline void Atomic::store    (jshort   store_value, jshort*   dest) { *dest = store_value; }
 inline void Atomic::store    (jint     store_value, jint*     dest) { *dest = store_value; }
@@ -71,7 +75,9 @@
 
 inline jint Atomic::xchg (jint exchange_value, volatile jint* dest)
 {
- return __sync_lock_test_and_set (dest, exchange_value);
+  jint res = __sync_lock_test_and_set (dest, exchange_value);
+  FULL_MEM_BARRIER;
+  return res;
 }
 
 inline void* Atomic::xchg_ptr(void* exchange_value, volatile void* dest)
@@ -111,7 +117,9 @@
 
 inline intptr_t Atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest)
 {
- return __sync_lock_test_and_set (dest, exchange_value);
+  intptr_t res = __sync_lock_test_and_set (dest, exchange_value);
+  FULL_MEM_BARRIER;
+  return res;
 }
 
 inline jlong Atomic::cmpxchg (jlong exchange_value, volatile jlong* dest, jlong compare_value)
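
The trailing FULL_MEM_BARRIER matters because GCC documents
__sync_lock_test_and_set as an acquire barrier only, while HotSpot expects
Atomic::xchg to act as a full two-way fence. In isolation the pattern is:

    // Restore full-fence semantics on top of the acquire-only builtin.
    static int xchg_full_fence(volatile int* dest, int value) {
      int res = __sync_lock_test_and_set(dest, value); // acquire only
      __sync_synchronize();                            // FULL_MEM_BARRIER
      return res;
    }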
--- a/src/os_cpu/linux_aarch64/vm/globals_linux_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/os_cpu/linux_aarch64/vm/globals_linux_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -41,4 +41,6 @@
 // Only used on 64 bit Windows platforms
 define_pd_global(bool, UseVectoredExceptions,    false);
 
+extern __thread Thread *aarch64_currentThread;
+
 #endif // OS_CPU_LINUX_AARCH64_VM_GLOBALS_LINUX_AARCH64_HPP
--- a/src/os_cpu/linux_aarch64/vm/orderAccess_linux_aarch64.inline.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/os_cpu/linux_aarch64/vm/orderAccess_linux_aarch64.inline.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -27,13 +27,10 @@
 #define OS_CPU_LINUX_AARCH64_VM_ORDERACCESS_LINUX_AARCH64_INLINE_HPP
 
 #include "runtime/atomic.hpp"
+#include "atomic_linux_aarch64.inline.hpp"
 #include "runtime/orderAccess.hpp"
 #include "vm_version_aarch64.hpp"
 
-#define FULL_MEM_BARRIER  __sync_synchronize()
-#define READ_MEM_BARRIER  __atomic_thread_fence(__ATOMIC_ACQUIRE);
-#define WRITE_MEM_BARRIER __atomic_thread_fence(__ATOMIC_RELEASE);
-
 // Implementation of class OrderAccess.
 
 inline void OrderAccess::loadload()   { acquire(); }
--- a/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -26,32 +26,6 @@
 #include "runtime/threadLocalStorage.hpp"
 #include "thread_linux.inline.hpp"
 
-// Map stack pointer (%esp) to thread pointer for faster TLS access
-//
-// Here we use a flat table for better performance. Getting current thread
-// is down to one memory access (read _sp_map[%esp>>12]) in generated code
-// and two in runtime code (-fPIC code needs an extra load for _sp_map).
-//
-// This code assumes stack page is not shared by different threads. It works
-// in 32-bit VM when page size is 4K (or a multiple of 4K, if that matters).
-//
-// Notice that _sp_map is allocated in the bss segment, which is ZFOD
-// (zero-fill-on-demand). While it reserves 4M address space upfront,
-// actual memory pages are committed on demand.
-//
-// If an application creates and destroys a lot of threads, usually the
-// stack space freed by a thread will soon get reused by new thread
-// (this is especially true in NPTL or LinuxThreads in fixed-stack mode).
-// No memory page in _sp_map is wasted.
-//
-// However, it's still possible that we might end up populating &
-// committing a large fraction of the 4M table over time, but the actual
-// amount of live data in the table could be quite small. The max wastage
-// is less than 4M bytes. If it becomes an issue, we could use madvise()
-// with MADV_DONTNEED to reclaim unused (i.e. all-zero) pages in _sp_map.
-// MADV_DONTNEED on Linux keeps the virtual memory mapping, but zaps the
-// physical memory page (i.e. similar to MADV_FREE on Solaris).
-
 void ThreadLocalStorage::generate_code_for_get_thread() {
     // nothing we can do here for user-level thread
 }
@@ -59,6 +33,9 @@
 void ThreadLocalStorage::pd_init() {
 }
 
+__thread Thread *aarch64_currentThread;
+
 void ThreadLocalStorage::pd_set_thread(Thread* thread) {
   os::thread_local_storage_at_put(ThreadLocalStorage::thread_index(), thread);
+  aarch64_currentThread = thread;
 }
--- a/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -29,8 +29,8 @@
 
 public:
 
-   static Thread* thread() {
-     return (Thread*) os::thread_local_storage_at(thread_index());
+  static Thread* thread() {
+    return aarch64_currentThread;
    }
 
 #endif // OS_CPU_LINUX_AARCH64_VM_THREADLS_LINUX_AARCH64_HPP
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp	Thu Dec 04 14:30:02 2014 +0000
@@ -130,7 +130,6 @@
   // The alignment used for eden and survivors within the young gen
   // and for boundary between young gen and old gen.
   size_t intra_heap_alignment() const { return 64 * K * HeapWordSize; }
-
   size_t capacity() const;
   size_t used() const;
 
--- a/src/share/vm/memory/collectorPolicy.cpp	Fri Nov 28 03:10:21 2014 +0000
+++ b/src/share/vm/memory/collectorPolicy.cpp	Thu Dec 04 14:30:02 2014 +0000
@@ -73,7 +73,7 @@
   }
   PermSize = MAX2(min_alignment(), align_size_down_(PermSize, min_alignment()));
   // Don't increase Perm size limit above specified.
-  MaxPermSize = align_size_down(MaxPermSize, max_alignment());
+  MaxPermSize = MAX2(max_alignment(), align_size_down_(MaxPermSize, max_alignment()));
   if (PermSize > MaxPermSize) {
     PermSize = MaxPermSize;
   }
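
The MAX2 clamp covers the case where the user-supplied MaxPermSize is
smaller than max_alignment(): plain align_size_down would round it to zero,
while the clamped form keeps at least one alignment unit. A worked example
with hypothetical numbers (4M alignment, -XX:MaxPermSize=3m):

    #include <cstddef>

    static size_t align_down_clamped(size_t v, size_t align) {
      size_t rounded = v & ~(align - 1);        // align_size_down_
      return rounded > align ? rounded : align; // MAX2(align, rounded)
    }
    // align_down_clamped(3u << 20, 4u << 20) == 4M; the unclamped
    // rounding alone would yield 0.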