changeset 8679:2ee4407fe4e4 icedtea-3.0.0pre06

Merge aarch64 port to jdk8u60-b24
author andrew
date Fri, 02 Oct 2015 04:37:30 +0100
parents 1993eaeabefc (current diff) ff13d8140756 (diff)
children 3b05ef40e997
files .hgtags THIRD_PARTY_README agent/src/os/linux/LinuxDebuggerLocal.c agent/src/os/linux/libproc.h agent/src/os/linux/ps_proc.c agent/src/share/classes/sun/jvm/hotspot/gc_implementation/g1/HeapRegionManager.java make/Makefile make/defs.make make/hotspot_version make/linux/makefiles/buildtree.make make/linux/makefiles/defs.make make/linux/makefiles/gcc.make make/linux/makefiles/mapfile-vers-debug make/linux/makefiles/sa.make make/linux/makefiles/saproc.make make/linux/makefiles/vm.make make/windows/makefiles/sa.make src/cpu/aarch64/vm/c1_LinearScan_aarch64.cpp src/cpu/sparc/vm/sparc.ad src/cpu/x86/vm/assembler_x86.cpp src/cpu/x86/vm/assembler_x86.hpp src/cpu/x86/vm/globals_x86.hpp src/cpu/x86/vm/macroAssembler_x86.cpp src/cpu/x86/vm/macroAssembler_x86.hpp src/cpu/x86/vm/x86_64.ad src/cpu/zero/vm/cppInterpreter_zero.cpp src/os/bsd/vm/os_bsd.cpp src/os/linux/vm/os_linux.cpp src/os_cpu/linux_x86/vm/os_linux_x86.cpp src/share/vm/c1/c1_GraphBuilder.cpp src/share/vm/c1/c1_LIR.cpp src/share/vm/c1/c1_LIR.hpp src/share/vm/c1/c1_LIRGenerator.cpp src/share/vm/c1/c1_LIRGenerator.hpp src/share/vm/c1/c1_LinearScan.cpp src/share/vm/c1/c1_Runtime1.cpp src/share/vm/ci/ciEnv.cpp src/share/vm/ci/ciMethod.cpp src/share/vm/ci/ciMethod.hpp src/share/vm/ci/ciMethodData.cpp src/share/vm/classfile/classFileParser.cpp src/share/vm/classfile/classLoaderData.cpp src/share/vm/classfile/classLoaderData.hpp src/share/vm/classfile/defaultMethods.cpp src/share/vm/classfile/dictionary.cpp src/share/vm/classfile/dictionary.hpp src/share/vm/classfile/javaClasses.cpp src/share/vm/classfile/metadataOnStackMark.cpp src/share/vm/classfile/symbolTable.cpp src/share/vm/classfile/systemDictionary.cpp src/share/vm/classfile/systemDictionary.hpp src/share/vm/classfile/verifier.cpp src/share/vm/classfile/vmSymbols.hpp src/share/vm/code/codeCache.cpp src/share/vm/code/dependencies.cpp src/share/vm/code/nmethod.cpp src/share/vm/code/nmethod.hpp src/share/vm/code/relocInfo.cpp src/share/vm/code/vmreg.hpp src/share/vm/compiler/compileBroker.cpp src/share/vm/compiler/compileBroker.hpp src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp src/share/vm/gc_implementation/g1/concurrentMark.cpp src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp src/share/vm/gc_implementation/g1/g1AllocRegion.hpp src/share/vm/gc_implementation/g1/g1CardCounts.cpp src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp src/share/vm/gc_implementation/g1/g1EvacFailure.hpp src/share/vm/gc_implementation/g1/g1MarkSweep.cpp src/share/vm/gc_implementation/g1/g1RemSet.cpp src/share/vm/gc_implementation/g1/g1RemSet.hpp src/share/vm/gc_implementation/g1/g1SATBCardTableModRefBS.cpp src/share/vm/gc_implementation/g1/g1SATBCardTableModRefBS.hpp src/share/vm/gc_implementation/g1/heapRegion.cpp src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp src/share/vm/gc_implementation/parNew/parNewGeneration.cpp src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp src/share/vm/gc_implementation/parallelScavenge/psPromotionManager.inline.hpp src/share/vm/gc_implementation/parallelScavenge/psScavenge.cpp src/share/vm/gc_interface/collectedHeap.cpp src/share/vm/gc_interface/collectedHeap.hpp 
src/share/vm/interpreter/abstractInterpreter.hpp src/share/vm/interpreter/bytecodeInterpreter.cpp src/share/vm/interpreter/interpreterRuntime.cpp src/share/vm/interpreter/linkResolver.cpp src/share/vm/interpreter/templateTable.hpp src/share/vm/memory/allocation.inline.hpp src/share/vm/memory/cardTableModRefBS.cpp src/share/vm/memory/cardTableModRefBS.hpp src/share/vm/memory/collectorPolicy.cpp src/share/vm/memory/defNewGeneration.cpp src/share/vm/memory/filemap.hpp src/share/vm/memory/genCollectedHeap.cpp src/share/vm/memory/genCollectedHeap.hpp src/share/vm/memory/metadataFactory.hpp src/share/vm/memory/metaspace.cpp src/share/vm/memory/metaspace.hpp src/share/vm/memory/metaspaceShared.cpp src/share/vm/memory/metaspaceShared.hpp src/share/vm/memory/referenceProcessor.cpp src/share/vm/memory/universe.cpp src/share/vm/oops/constantPool.cpp src/share/vm/oops/cpCache.cpp src/share/vm/oops/cpCache.hpp src/share/vm/oops/instanceKlass.cpp src/share/vm/oops/instanceKlass.hpp src/share/vm/oops/klassVtable.cpp src/share/vm/oops/method.cpp src/share/vm/oops/method.hpp src/share/vm/oops/methodData.cpp src/share/vm/oops/methodData.hpp src/share/vm/oops/oop.inline.hpp src/share/vm/oops/typeArrayOop.hpp src/share/vm/opto/c2_globals.hpp src/share/vm/opto/c2compiler.cpp src/share/vm/opto/callGenerator.cpp src/share/vm/opto/callGenerator.hpp src/share/vm/opto/compile.cpp src/share/vm/opto/compile.hpp src/share/vm/opto/doCall.cpp src/share/vm/opto/escape.cpp src/share/vm/opto/gcm.cpp src/share/vm/opto/graphKit.cpp src/share/vm/opto/ifnode.cpp src/share/vm/opto/lcm.cpp src/share/vm/opto/library_call.cpp src/share/vm/opto/locknode.hpp src/share/vm/opto/loopTransform.cpp src/share/vm/opto/loopopts.cpp src/share/vm/opto/machnode.hpp src/share/vm/opto/matcher.cpp src/share/vm/opto/memnode.cpp src/share/vm/opto/memnode.hpp src/share/vm/opto/node.hpp src/share/vm/opto/output.hpp src/share/vm/opto/parse.hpp src/share/vm/opto/parse1.cpp src/share/vm/opto/parse2.cpp src/share/vm/opto/phaseX.cpp src/share/vm/opto/regmask.cpp src/share/vm/opto/regmask.hpp src/share/vm/opto/runtime.cpp src/share/vm/opto/runtime.hpp src/share/vm/opto/subnode.cpp src/share/vm/opto/type.cpp src/share/vm/prims/jni.cpp src/share/vm/prims/jvm.cpp src/share/vm/prims/jvm.h src/share/vm/prims/jvmtiRedefineClasses.cpp src/share/vm/prims/jvmtiTagMap.cpp src/share/vm/prims/unsafe.cpp src/share/vm/prims/whitebox.cpp src/share/vm/runtime/advancedThresholdPolicy.cpp src/share/vm/runtime/arguments.cpp src/share/vm/runtime/arguments.hpp src/share/vm/runtime/deoptimization.cpp src/share/vm/runtime/frame.cpp src/share/vm/runtime/frame.hpp src/share/vm/runtime/globals.cpp src/share/vm/runtime/globals.hpp src/share/vm/runtime/java.cpp src/share/vm/runtime/os.hpp src/share/vm/runtime/safepoint.cpp src/share/vm/runtime/sharedRuntime.cpp src/share/vm/runtime/stubRoutines.cpp src/share/vm/runtime/stubRoutines.hpp src/share/vm/runtime/sweeper.cpp src/share/vm/runtime/thread.cpp src/share/vm/runtime/thread.hpp src/share/vm/runtime/vframeArray.cpp src/share/vm/runtime/virtualspace.cpp src/share/vm/runtime/vmStructs.cpp src/share/vm/runtime/vm_operations.hpp src/share/vm/runtime/vm_version.cpp src/share/vm/services/attachListener.cpp src/share/vm/services/management.cpp src/share/vm/trace/trace.xml src/share/vm/utilities/array.hpp src/share/vm/utilities/elfFile.cpp src/share/vm/utilities/globalDefinitions.hpp src/share/vm/utilities/macros.hpp src/share/vm/utilities/ostream.cpp src/share/vm/utilities/vmError.cpp test/TEST.groups 
test/compiler/whitebox/CompilerWhiteBoxTest.java test/compiler/whitebox/IsMethodCompilableTest.java test/testlibrary/com/oracle/java/testlibrary/Platform.java test/testlibrary/whitebox/sun/hotspot/WhiteBox.java
diffstat 98 files changed, 7610 insertions(+), 3138 deletions(-)
--- a/.hgtags	Wed Sep 30 16:43:15 2015 +0100
+++ b/.hgtags	Fri Oct 02 04:37:30 2015 +0100
@@ -586,6 +586,9 @@
 6b93bf9ea3ea57ed0fe53cfedb2f9ab912c324e5 jdk8u40-b12
 521e269ae1daa9df1cb0835b97aa76bdf340fcb2 hs25.40-b17
 86307d47790785398d0695acc361bccaefe25f94 jdk8u40-b13
+b280f4f4f11916e202aaa4d458630d4c26b59e2a jdk8u40-b12-aarch64
+26fc60dd5da8d3f1554fb8f2553f050839a539c6 jdk8u40-b12-aarch64-1262
+d7c03eb8b2c2bc4d34438699f07609ba4c4bca5c jdk8u40-b12-aarch64-1263
 4d5dc0d0f8799fafa1135d51d85edd4edd566501 hs25.40-b18
 b8ca8ec1daea70f7c0d519e866f9f147ec247055 jdk8u40-b14
 eb16b24e2eba9bdf04a9b377bebc2db9f713ff5e jdk8u40-b15
@@ -696,3 +699,6 @@
 878cb0df27c22c6b1e9f4add1eb3da3edc8ab51d jdk8u60-b22
 0e4094950cd312c8f95c7f37336606323fe049fe jdk8u60-b23
 d89ceecf1bad55e1aee2932b8895d60fc64c15db hs25.60-b23
+fb157d537278cda4150740e27bb57cd8694e15bf jdk8u60-b24
+11098f828fb815a467e77729f2055d6b1575ad3e aarch64-jdk8u60-b24
+8ec803e97a0d578eaeaf8375ee295a5928eb546f aarch64-jdk8u60-b24.2
--- a/agent/make/Makefile	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/make/Makefile	Fri Oct 02 04:37:30 2015 +0100
@@ -58,11 +58,13 @@
 sun.jvm.hotspot.debugger.dummy \
 sun.jvm.hotspot.debugger.linux \
 sun.jvm.hotspot.debugger.linux.amd64 \
+sun.jvm.hotspot.debugger.linux.aarch64 \
 sun.jvm.hotspot.debugger.linux.x86 \
 sun.jvm.hotspot.debugger.posix \
 sun.jvm.hotspot.debugger.posix.elf \
 sun.jvm.hotspot.debugger.proc \
 sun.jvm.hotspot.debugger.proc.amd64 \
+sun.jvm.hotspot.debugger.proc.aarch64 \
 sun.jvm.hotspot.debugger.proc.sparc \
 sun.jvm.hotspot.debugger.proc.x86 \
 sun.jvm.hotspot.debugger.remote \
@@ -88,11 +90,13 @@
 sun.jvm.hotspot.prims \
 sun.jvm.hotspot.runtime \
 sun.jvm.hotspot.runtime.amd64 \
+sun.jvm.hotspot.runtime.aarch64 \
 sun.jvm.hotspot.runtime.bsd \
 sun.jvm.hotspot.runtime.bsd_amd64 \
 sun.jvm.hotspot.runtime.bsd_x86 \
 sun.jvm.hotspot.runtime.linux \
 sun.jvm.hotspot.runtime.linux_amd64 \
+sun.jvm.hotspot.runtime.linux_aarch64 \
 sun.jvm.hotspot.runtime.linux_sparc \
 sun.jvm.hotspot.runtime.linux_x86 \
 sun.jvm.hotspot.runtime.posix \
@@ -143,12 +147,13 @@
 sun/jvm/hotspot/debugger/dummy/*.java \
 sun/jvm/hotspot/debugger/linux/*.java \
 sun/jvm/hotspot/debugger/linux/x86/*.java \
+sun/jvm/hotspot/debugger/linux/aarch64/*.java \
 sun/jvm/hotspot/debugger/posix/*.java \
 sun/jvm/hotspot/debugger/posix/elf/*.java \
 sun/jvm/hotspot/debugger/proc/*.java \
-sun/jvm/hotspot/debugger/proc/amd64/*.java \
 sun/jvm/hotspot/debugger/proc/sparc/*.java \
 sun/jvm/hotspot/debugger/proc/x86/*.java \
+sun/jvm/hotspot/debugger/proc/aarch64/*.java \
 sun/jvm/hotspot/debugger/remote/*.java \
 sun/jvm/hotspot/debugger/remote/amd64/*.java \
 sun/jvm/hotspot/debugger/remote/sparc/*.java \
@@ -169,11 +174,13 @@
 sun/jvm/hotspot/prims/*.java \
 sun/jvm/hotspot/runtime/*.java \
 sun/jvm/hotspot/runtime/amd64/*.java \
+sun/jvm/hotspot/runtime/aarch64/*.java \
 sun/jvm/hotspot/runtime/bsd/*.java \
 sun/jvm/hotspot/runtime/bsd_amd64/*.java \
 sun/jvm/hotspot/runtime/bsd_x86/*.java \
 sun/jvm/hotspot/runtime/linux/*.java \
 sun/jvm/hotspot/runtime/linux_amd64/*.java \
+sun/jvm/hotspot/runtime/linux_aarch64/*.java \
 sun/jvm/hotspot/runtime/linux_sparc/*.java \
 sun/jvm/hotspot/runtime/linux_x86/*.java \
 sun/jvm/hotspot/runtime/posix/*.java \
--- a/agent/src/os/linux/LinuxDebuggerLocal.c	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/os/linux/LinuxDebuggerLocal.c	Fri Oct 02 04:37:30 2015 +0100
@@ -49,6 +49,10 @@
 #include "sun_jvm_hotspot_debugger_sparc_SPARCThreadContext.h"
 #endif
 
+#ifdef aarch64
+#include "sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext.h"
+#endif
+
 static jfieldID p_ps_prochandle_ID = 0;
 static jfieldID threadList_ID = 0;
 static jfieldID loadObjectList_ID = 0;
@@ -353,7 +357,7 @@
 #define NPRGREG sun_jvm_hotspot_debugger_amd64_AMD64ThreadContext_NPRGREG
 #endif
 #ifdef aarch64
-#define NPRGREG 32
+#define NPRGREG sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_NPRGREG
 #endif
 #if defined(sparc) || defined(sparcv9)
 #define NPRGREG sun_jvm_hotspot_debugger_sparc_SPARCThreadContext_NPRGREG
@@ -462,6 +466,13 @@
 
 #define REG_INDEX(reg) sun_jvm_hotspot_debugger_aarch64_AARCH64ThreadContext_##reg
 
+  {
+    int i;
+    for (i = 0; i < 31; i++)
+      regs[i] = gregs.regs[i];
+    regs[REG_INDEX(SP)] = gregs.sp;
+    regs[REG_INDEX(PC)] = gregs.pc;
+  }
 #endif /* aarch64 */
 
 
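The hunk above replaces the hard-coded register count of 32 with the constant emitted by javah from AARCH64ThreadContext, where NPRGREG is 33 (x0..x30 plus SP and PC). A minimal sanity-check sketch, assuming the SA classes are on the classpath (this checker is hypothetical, not part of the changeset):

    import sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext;

    public class NPRGREGCheck {
        public static void main(String[] args) {
            // 31 general-purpose registers (x0..x30) plus SP and PC
            int expected = 31 + 2;
            if (AARCH64ThreadContext.NPRGREG != expected) {
                throw new AssertionError("NPRGREG mismatch: " + AARCH64ThreadContext.NPRGREG);
            }
            System.out.println("NPRGREG = " + AARCH64ThreadContext.NPRGREG);
        }
    }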
--- a/agent/src/os/linux/Makefile	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/os/linux/Makefile	Fri Oct 02 04:37:30 2015 +0100
@@ -53,14 +53,15 @@
         $(JAVAH) -jni -classpath ../../../build/classes -d $(ARCH) \
 		sun.jvm.hotspot.debugger.x86.X86ThreadContext \
 		sun.jvm.hotspot.debugger.sparc.SPARCThreadContext \
-		sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext 
+		sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext \
+		sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext 
         $(GCC) $(CFLAGS) $< -o $@
 
 $(ARCH)/sadis.o:  ../../share/native/sadis.c
         $(JAVAH) -jni -classpath ../../../build/classes -d $(ARCH) \
                 sun.jvm.hotspot.asm.Disassembler
         $(GCC) $(CFLAGS) $< -o $@
- 
+
 $(ARCH)/%.o: %.c
         $(GCC) $(CFLAGS) $< -o $@
 
--- a/agent/src/os/linux/libproc.h	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/os/linux/libproc.h	Fri Oct 02 04:37:30 2015 +0100
@@ -40,10 +40,6 @@
 #include "asm/ptrace.h"
 #endif
 
-#if defined(aarch64)
-#include "asm/ptrace.h"
-#endif
-
 /************************************************************************************
 
 0. This is very minimal subset of Solaris libproc just enough for current application.
--- a/agent/src/share/classes/sun/jvm/hotspot/HSDB.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/HSDB.java	Fri Oct 02 04:37:30 2015 +0100
@@ -985,19 +985,15 @@
                                                      curFrame.getFP(),
                                                      anno));
             } else {
-              if (VM.getVM().getCPU().equals("x86") || VM.getVM().getCPU().equals("amd64")) {
-                // For C2, which has null frame pointers on x86/amd64
-                CodeBlob cb = VM.getVM().getCodeCache().findBlob(curFrame.getPC());
-                Address sp = curFrame.getSP();
-                if (Assert.ASSERTS_ENABLED) {
-                  Assert.that(cb.getFrameSize() > 0, "CodeBlob must have non-zero frame size");
-                }
-                annoPanel.addAnnotation(new Annotation(sp,
-                                                       sp.addOffsetTo(cb.getFrameSize()),
-                                                       anno));
-              } else {
-                Assert.that(VM.getVM().getCPU().equals("ia64"), "only ia64 should reach here");
+              // For C2, which has null frame pointers on x86/amd64/aarch64
+              CodeBlob cb = VM.getVM().getCodeCache().findBlob(curFrame.getPC());
+              Address sp = curFrame.getSP();
+              if (Assert.ASSERTS_ENABLED) {
+                Assert.that(cb.getFrameSize() > 0, "CodeBlob must have non-zero frame size");
               }
+              annoPanel.addAnnotation(new Annotation(sp,
+                                                     sp.addOffsetTo(cb.getFrameSize()),
+                                                     anno));
             }
 
             // Add interpreter frame annotations
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/MachineDescriptionAARCH64.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2003, 2008, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger;
+
+public class MachineDescriptionAARCH64 extends MachineDescriptionTwosComplement implements MachineDescription {
+  public long getAddressSize() {
+    return 8;
+  }
+
+  public boolean isLP64() {
+    return true;
+  }
+
+  public boolean isBigEndian() {
+    return false;
+  }
+}
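For context, machine descriptions like the one above are selected by CPU string when the agent attaches. A minimal sketch of that dispatch, using a hypothetical helper (the real selection lives in HotSpotAgent and is outside this hunk):

    import sun.jvm.hotspot.debugger.DebuggerException;
    import sun.jvm.hotspot.debugger.MachineDescription;
    import sun.jvm.hotspot.debugger.MachineDescriptionAARCH64;

    class MachineDescriptionSelector {  // hypothetical helper
        static MachineDescription forCpu(String cpu) {
            if (cpu.equals("aarch64")) {
                // 8-byte addresses, LP64, little-endian (see class above)
                return new MachineDescriptionAARCH64();
            }
            throw new DebuggerException("unsupported cpu: " + cpu);
        }
    }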
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/aarch64/AARCH64ThreadContext.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.cdbg.*;
+
+/** Specifies the thread context on aarch64 platforms; only a sub-portion
+ * of the context is guaranteed to be present on all operating
+ * systems. */
+
+public abstract class AARCH64ThreadContext implements ThreadContext {
+    // Taken from /usr/include/asm/sigcontext.h on Linux/AARCH64.
+
+    // NOTE: the indices for the various registers must be maintained as
+    // listed across various operating systems. However, only a small
+    // subset of the registers' values are guaranteed to be present (and
+    // must be present for the SA's stack walking to work)
+
+    public static final int R0 = 0;
+    public static final int R1 = 1;
+    public static final int R2 = 2;
+    public static final int R3 = 3;
+    public static final int R4 = 4;
+    public static final int R5 = 5;
+    public static final int R6 = 6;
+    public static final int R7 = 7;
+    public static final int R8 = 8;
+    public static final int R9 = 9;
+    public static final int R10 = 10;
+    public static final int R11 = 11;
+    public static final int R12 = 12;
+    public static final int R13 = 13;
+    public static final int R14 = 14;
+    public static final int R15 = 15;
+    public static final int R16 = 16;
+    public static final int R17 = 17;
+    public static final int R18 = 18;
+    public static final int R19 = 19;
+    public static final int R20 = 20;
+    public static final int R21 = 21;
+    public static final int R22 = 22;
+    public static final int R23 = 23;
+    public static final int R24 = 24;
+    public static final int R25 = 25;
+    public static final int R26 = 26;
+    public static final int R27 = 27;
+    public static final int R28 = 28;
+    public static final int FP = 29;
+    public static final int LR = 30;
+    public static final int SP = 31;
+    public static final int PC = 32;
+
+    public static final int NPRGREG = 33;
+
+    private long[] data;
+
+    public AARCH64ThreadContext() {
+        data = new long[NPRGREG];
+    }
+
+    public int getNumRegisters() {
+        return NPRGREG;
+    }
+
+    public String getRegisterName(int index) {
+        switch (index) {
+        case LR: return "lr";
+        case SP: return "sp";
+        case PC: return "pc";
+        default:
+            return "r" + index;
+        }
+    }
+
+    public void setRegister(int index, long value) {
+        data[index] = value;
+    }
+
+    public long getRegister(int index) {
+        return data[index];
+    }
+
+    public CFrame getTopFrame(Debugger dbg) {
+        return null;
+    }
+
+    /** This can't be implemented in this class since we would have to
+     * tie the implementation to, for example, the debugging system */
+    public abstract void setRegisterAsAddress(int index, Address value);
+
+    /** This can't be implemented in this class since we would have to
+     * tie the implementation to, for example, the debugging system */
+    public abstract Address getRegisterAsAddress(int index);
+}
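A short usage sketch for the context class above: the register indices are fixed, so a generic dump loop works for any concrete subclass (the Linux, proc, and remote subclasses appear later in this changeset). Hypothetical code, not part of the patch:

    import sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext;

    class RegisterDump {  // hypothetical helper
        static void dump(AARCH64ThreadContext ctx) {
            for (int i = 0; i < ctx.getNumRegisters(); i++) {
                // prints r0..r29, then lr, sp, pc, per getRegisterName above
                System.out.println(ctx.getRegisterName(i) + " = 0x"
                                   + Long.toHexString(ctx.getRegister(i)));
            }
        }
    }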
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxCDebugger.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxCDebugger.java	Fri Oct 02 04:37:30 2015 +0100
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,6 +34,8 @@
 import sun.jvm.hotspot.debugger.sparc.*;
 import sun.jvm.hotspot.debugger.linux.x86.*;
 import sun.jvm.hotspot.debugger.linux.amd64.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.aarch64.*;
 import sun.jvm.hotspot.debugger.linux.sparc.*;
 import sun.jvm.hotspot.utilities.*;
 
@@ -106,6 +109,13 @@
        Address pc  = context.getRegisterAsAddress(SPARCThreadContext.R_O7);
        if (pc == null) return null;
        return new LinuxSPARCCFrame(dbg, sp, pc, LinuxDebuggerLocal.getAddressSize());
+    } else if (cpu.equals("aarch64")) {
+       AARCH64ThreadContext context = (AARCH64ThreadContext) thread.getContext();
+       Address fp = context.getRegisterAsAddress(AARCH64ThreadContext.FP);
+       if (fp == null) return null;
+       Address pc  = context.getRegisterAsAddress(AARCH64ThreadContext.PC);
+       if (pc == null) return null;
+       return new LinuxAARCH64CFrame(dbg, fp, pc);
     } else {
        // Runtime exception thrown by LinuxThreadContextFactory if unknown cpu
        ThreadContext context = (ThreadContext) thread.getContext();
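With the aarch64 branch above in place, callers can obtain top native frames for every thread through the generic CDebugger interface. A hedged sketch (this driver is an assumption, not part of the changeset):

    import java.util.List;
    import sun.jvm.hotspot.debugger.ThreadProxy;
    import sun.jvm.hotspot.debugger.cdbg.CDebugger;
    import sun.jvm.hotspot.debugger.cdbg.CFrame;

    class TopFrames {  // hypothetical driver
        static void printTopFrames(CDebugger cdbg) {
            List threads = cdbg.getThreadList();
            for (Object t : threads) {
                ThreadProxy thread = (ThreadProxy) t;
                CFrame top = cdbg.topFrameForThread(thread);
                System.out.println(thread + " -> "
                                   + (top == null ? "no frame" : top.pc()));
            }
        }
    }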
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/aarch64/LinuxAARCH64CFrame.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.linux.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.*;
+import sun.jvm.hotspot.debugger.cdbg.*;
+import sun.jvm.hotspot.debugger.cdbg.basic.*;
+
+final public class LinuxAARCH64CFrame extends BasicCFrame {
+   public LinuxAARCH64CFrame(LinuxDebugger dbg, Address fp, Address pc) {
+      super(dbg.getCDebugger());
+      this.fp = fp;
+      this.pc = pc;
+      this.dbg = dbg;
+   }
+
+   // override base class impl to avoid ELF parsing
+   public ClosestSymbol closestSymbolToPC() {
+      // try native lookup in debugger.
+      return dbg.lookup(dbg.getAddressValue(pc()));
+   }
+
+   public Address pc() {
+      return pc;
+   }
+
+   public Address localVariableBase() {
+      return fp;
+   }
+
+   public CFrame sender(ThreadProxy thread) {
+      AARCH64ThreadContext context = (AARCH64ThreadContext) thread.getContext();
+      Address rsp = context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+
+      if ((fp == null) || fp.lessThan(rsp)) {
+        return null;
+      }
+
+      // Check alignment of fp
+      if (dbg.getAddressValue(fp) % (2 * ADDRESS_SIZE) != 0) {
+        return null;
+      }
+
+      Address nextFP = fp.getAddressAt(0 * ADDRESS_SIZE);
+      if (nextFP == null || nextFP.lessThanOrEqual(fp)) {
+        return null;
+      }
+      Address nextPC  = fp.getAddressAt(1 * ADDRESS_SIZE);
+      if (nextPC == null) {
+        return null;
+      }
+      return new LinuxAARCH64CFrame(dbg, nextFP, nextPC);
+   }
+
+   // package/class internals only
+   private static final int ADDRESS_SIZE = 8;
+   private Address pc;
+   private Address sp;
+   private Address fp;
+   private LinuxDebugger dbg;
+}
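The sender() method above ends the walk on a null, descending, or misaligned frame pointer, so iterating it is safe even on a damaged stack. A minimal walk loop, assuming a top frame obtained as in the LinuxCDebugger hunk (sketch only):

    import sun.jvm.hotspot.debugger.ThreadProxy;
    import sun.jvm.hotspot.debugger.cdbg.CFrame;

    class NativeStackWalk {  // hypothetical helper
        static void walk(CFrame top, ThreadProxy thread) {
            for (CFrame f = top; f != null; f = f.sender(thread)) {
                System.out.println("pc = " + f.pc()
                                   + "  fp = " + f.localVariableBase());
            }
        }
    }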
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/aarch64/LinuxAARCH64ThreadContext.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.linux.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.linux.*;
+
+public class LinuxAARCH64ThreadContext extends AARCH64ThreadContext {
+  private LinuxDebugger debugger;
+
+  public LinuxAARCH64ThreadContext(LinuxDebugger debugger) {
+    super();
+    this.debugger = debugger;
+  }
+
+  public void setRegisterAsAddress(int index, Address value) {
+    setRegister(index, debugger.getAddressValue(value));
+  }
+
+  public Address getRegisterAsAddress(int index) {
+    return debugger.newAddress(getRegister(index));
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcDebuggerLocal.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcDebuggerLocal.java	Fri Oct 02 04:37:30 2015 +0100
@@ -31,9 +31,11 @@
 import sun.jvm.hotspot.debugger.*;
 import sun.jvm.hotspot.debugger.cdbg.*;
 import sun.jvm.hotspot.debugger.proc.amd64.*;
+import sun.jvm.hotspot.debugger.proc.aarch64.*;
 import sun.jvm.hotspot.debugger.proc.sparc.*;
 import sun.jvm.hotspot.debugger.proc.x86.*;
 import sun.jvm.hotspot.debugger.amd64.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
 import sun.jvm.hotspot.debugger.sparc.*;
 import sun.jvm.hotspot.debugger.x86.*;
 import sun.jvm.hotspot.utilities.*;
@@ -86,6 +88,10 @@
             threadFactory = new ProcAMD64ThreadFactory(this);
             pcRegIndex = AMD64ThreadContext.RIP;
             fpRegIndex = AMD64ThreadContext.RBP;
+        } else if (cpu.equals("aarch64")) {
+            threadFactory = new ProcAARCH64ThreadFactory(this);
+            pcRegIndex = AARCH64ThreadContext.PC;
+            fpRegIndex = AARCH64ThreadContext.FP;
         } else {
           try {
             Class tfc = Class.forName("sun.jvm.hotspot.debugger.proc." +
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64Thread.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.proc.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class ProcAARCH64Thread implements ThreadProxy {
+    private ProcDebugger debugger;
+    private int         id;
+
+    public ProcAARCH64Thread(ProcDebugger debugger, Address addr) {
+        this.debugger = debugger;
+
+        // FIXME: the size here should be configurable. However, making it
+        // so would produce a dependency on the "types" package from the
+        // debugger package, which is not desired.
+        this.id       = (int) addr.getCIntegerAt(0, 4, true);
+    }
+
+    public ProcAARCH64Thread(ProcDebugger debugger, long id) {
+        this.debugger = debugger;
+        this.id = (int) id;
+    }
+
+    public ThreadContext getContext() throws IllegalThreadStateException {
+        ProcAARCH64ThreadContext context = new ProcAARCH64ThreadContext(debugger);
+        long[] regs = debugger.getThreadIntegerRegisterSet(id);
+        if (Assert.ASSERTS_ENABLED) {
+            Assert.that(regs.length == AARCH64ThreadContext.NPRGREG, "size mismatch");
+        }
+        for (int i = 0; i < regs.length; i++) {
+            context.setRegister(i, regs[i]);
+        }
+        return context;
+    }
+
+    public boolean canSetContext() throws DebuggerException {
+        return false;
+    }
+
+    public void setContext(ThreadContext context)
+    throws IllegalThreadStateException, DebuggerException {
+        throw new DebuggerException("Unimplemented");
+    }
+
+    public String toString() {
+        return "t@" + id;
+    }
+
+    public boolean equals(Object obj) {
+        if ((obj == null) || !(obj instanceof ProcAARCH64Thread)) {
+            return false;
+        }
+
+        return (((ProcAARCH64Thread) obj).id == id);
+    }
+
+    public int hashCode() {
+        return id;
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64ThreadContext.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.proc.*;
+
+public class ProcAARCH64ThreadContext extends AARCH64ThreadContext {
+    private ProcDebugger debugger;
+
+    public ProcAARCH64ThreadContext(ProcDebugger debugger) {
+        super();
+        this.debugger = debugger;
+    }
+
+    public void setRegisterAsAddress(int index, Address value) {
+        setRegister(index, debugger.getAddressValue(value));
+    }
+
+    public Address getRegisterAsAddress(int index) {
+        return debugger.newAddress(getRegister(index));
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/aarch64/ProcAARCH64ThreadFactory.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.proc.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.proc.*;
+
+public class ProcAARCH64ThreadFactory implements ProcThreadFactory {
+    private ProcDebugger debugger;
+
+    public ProcAARCH64ThreadFactory(ProcDebugger debugger) {
+        this.debugger = debugger;
+    }
+
+    public ThreadProxy createThreadWrapper(Address threadIdentifierAddr) {
+        return new ProcAARCH64Thread(debugger, threadIdentifierAddr);
+    }
+
+    public ThreadProxy createThreadWrapper(long id) {
+        return new ProcAARCH64Thread(debugger, id);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64Thread.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.remote.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class RemoteAARCH64Thread extends RemoteThread  {
+  public RemoteAARCH64Thread(RemoteDebuggerClient debugger, Address addr) {
+     super(debugger, addr);
+  }
+
+  public RemoteAARCH64Thread(RemoteDebuggerClient debugger, long id) {
+     super(debugger, id);
+  }
+
+  public ThreadContext getContext() throws IllegalThreadStateException {
+    RemoteAARCH64ThreadContext context = new RemoteAARCH64ThreadContext(debugger);
+    long[] regs = (addr != null)? debugger.getThreadIntegerRegisterSet(addr) :
+                                  debugger.getThreadIntegerRegisterSet(id);
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(regs.length == AARCH64ThreadContext.NPRGREG, "size of register set must match");
+    }
+    for (int i = 0; i < regs.length; i++) {
+      context.setRegister(i, regs[i]);
+    }
+    return context;
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64ThreadContext.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.debugger.remote.*;
+
+public class RemoteAARCH64ThreadContext extends AARCH64ThreadContext {
+  private RemoteDebuggerClient debugger;
+
+  public RemoteAARCH64ThreadContext(RemoteDebuggerClient debugger) {
+    super();
+    this.debugger = debugger;
+  }
+
+  public void setRegisterAsAddress(int index, Address value) {
+    setRegister(index, debugger.getAddressValue(value));
+  }
+
+  public Address getRegisterAsAddress(int index) {
+    return debugger.newAddress(getRegister(index));
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/aarch64/RemoteAARCH64ThreadFactory.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.debugger.remote.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.remote.*;
+
+public class RemoteAARCH64ThreadFactory implements RemoteThreadFactory {
+  private RemoteDebuggerClient debugger;
+
+  public RemoteAARCH64ThreadFactory(RemoteDebuggerClient debugger) {
+    this.debugger = debugger;
+  }
+
+  public ThreadProxy createThreadWrapper(Address threadIdentifierAddr) {
+    return new RemoteAARCH64Thread(debugger, threadIdentifierAddr);
+  }
+
+  public ThreadProxy createThreadWrapper(long id) {
+    return new RemoteAARCH64Thread(debugger, id);
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/runtime/Threads.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/Threads.java	Fri Oct 02 04:37:30 2015 +0100
@@ -35,6 +35,7 @@
 import sun.jvm.hotspot.runtime.linux_x86.LinuxX86JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_amd64.LinuxAMD64JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.linux_sparc.LinuxSPARCJavaThreadPDAccess;
+import sun.jvm.hotspot.runtime.linux_aarch64.LinuxAARCH64JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.bsd_x86.BsdX86JavaThreadPDAccess;
 import sun.jvm.hotspot.runtime.bsd_amd64.BsdAMD64JavaThreadPDAccess;
 import sun.jvm.hotspot.utilities.*;
@@ -87,6 +88,8 @@
                 access = new LinuxAMD64JavaThreadPDAccess();
             } else if (cpu.equals("sparc")) {
                 access = new LinuxSPARCJavaThreadPDAccess();
+            } else if (cpu.equals("aarch64")) {
+                access = new LinuxAARCH64JavaThreadPDAccess();
             } else {
               try {
                 access = (JavaThreadPDAccess)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64CurrentFrameGuess.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.code.*;
+import sun.jvm.hotspot.interpreter.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.runtime.aarch64.*;
+
+/** <P> Should be able to be used on all aarch64 platforms we support
+    (Linux/aarch64) to implement JavaThread's "currentFrameGuess()"
+    functionality. Input is an AARCH64ThreadContext; output is SP, FP,
+    and PC for an AARCH64Frame. Instantiation of the AARCH64Frame is
+    left to the caller, since we may need to subclass AARCH64Frame to
+    support signal handler frames on Unix platforms. </P>
+
+    <P> Algorithm is to walk up the stack within a given range (say,
+    512K at most) looking for a plausible PC and SP for a Java frame,
+    also considering those coming in from the context. If we find a PC
+    that belongs to the VM (i.e., in generated code like the
+    interpreter or CodeCache) then we try to find an associated FP.
+    We repeat this until we either find a complete frame or run out of
+    stack to look at. </P> */
+
+public class AARCH64CurrentFrameGuess {
+  private AARCH64ThreadContext context;
+  private JavaThread       thread;
+  private Address          spFound;
+  private Address          fpFound;
+  private Address          pcFound;
+
+  private static final boolean DEBUG = System.getProperty("sun.jvm.hotspot.runtime.aarch64.AARCH64Frame.DEBUG")
+                                       != null;
+
+  public AARCH64CurrentFrameGuess(AARCH64ThreadContext context,
+                              JavaThread thread) {
+    this.context = context;
+    this.thread  = thread;
+  }
+
+  /** Returns false if not able to find a frame within a reasonable range. */
+  public boolean run(long regionInBytesToSearch) {
+    Address sp  = context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+    Address pc  = context.getRegisterAsAddress(AARCH64ThreadContext.PC);
+    Address fp  = context.getRegisterAsAddress(AARCH64ThreadContext.FP);
+    if (sp == null) {
+      // Bail out if no last java frame either
+      if (thread.getLastJavaSP() != null) {
+        setValues(thread.getLastJavaSP(), thread.getLastJavaFP(), null);
+        return true;
+      }
+      return false;
+    }
+    Address end = sp.addOffsetTo(regionInBytesToSearch);
+    VM vm       = VM.getVM();
+
+    setValues(null, null, null); // Assume we're not going to find anything
+
+    if (vm.isJavaPCDbg(pc)) {
+      if (vm.isClientCompiler()) {
+        // If the topmost frame is a Java frame, we are (pretty much)
+        // guaranteed to have a viable FP. We should be more robust
+        // than this (we have the potential for losing entire threads'
+        // stack traces) but need to see how much work we really have
+        // to do here. Searching the stack for an (SP, FP) pair is
+        // hard since it's easy to misinterpret inter-frame stack
+        // pointers as base-of-frame pointers; we also don't know the
+        // sizes of C1 frames (not registered in the nmethod) so can't
+        // derive them from SP.
+
+        setValues(sp, fp, pc);
+        return true;
+      } else {
+        if (vm.getInterpreter().contains(pc)) {
+          if (DEBUG) {
+            System.out.println("CurrentFrameGuess: choosing interpreter frame: sp = " +
+                               sp + ", fp = " + fp + ", pc = " + pc);
+          }
+          setValues(sp, fp, pc);
+          return true;
+        }
+
+        // For the server compiler, FP is not guaranteed to be valid
+        // for compiled code. In addition, an earlier attempt at a
+        // non-searching algorithm (see below) failed because the
+        // stack pointer from the thread context was pointing
+        // (considerably) beyond the ostensible end of the stack, into
+        // garbage; walking from the topmost frame back caused a crash.
+        //
+        // This algorithm takes the current PC as a given and tries to
+        // find the correct corresponding SP by walking up the stack
+        // and repeatedly performing stackwalks (very inefficient).
+        //
+        // FIXME: there is something wrong with stackwalking across
+        // adapter frames...this is likely to be the root cause of the
+        // failure with the simpler algorithm below.
+
+        for (long offset = 0;
+             offset < regionInBytesToSearch;
+             offset += vm.getAddressSize()) {
+          try {
+            Address curSP = sp.addOffsetTo(offset);
+            Frame frame = new AARCH64Frame(curSP, null, pc);
+            RegisterMap map = thread.newRegisterMap(false);
+            while (frame != null) {
+              if (frame.isEntryFrame() && frame.entryFrameIsFirst()) {
+                // We were able to traverse all the way to the
+                // bottommost Java frame.
+                // This sp looks good. Keep it.
+                if (DEBUG) {
+                  System.out.println("CurrentFrameGuess: Choosing sp = " + curSP + ", pc = " + pc);
+                }
+                setValues(curSP, null, pc);
+                return true;
+              }
+              frame = frame.sender(map);
+            }
+          } catch (Exception e) {
+            if (DEBUG) {
+              System.out.println("CurrentFrameGuess: Exception " + e + " at offset " + offset);
+            }
+            // Bad SP. Try another.
+          }
+        }
+
+        // Were not able to find a plausible SP to go with this PC.
+        // Bail out.
+        return false;
+
+        /*
+        // Original algorithm which does not work because SP was
+        // pointing beyond where it should have:
+
+        // For the server compiler, FP is not guaranteed to be valid
+        // for compiled code. We see whether the PC is in the
+        // interpreter and take care of that, otherwise we run code
+        // (unfortunately) duplicated from AARCH64Frame.senderForCompiledFrame.
+
+        CodeCache cc = vm.getCodeCache();
+        if (cc.contains(pc)) {
+          CodeBlob cb = cc.findBlob(pc);
+
+          // See if we can derive a frame pointer from SP and PC
+          // NOTE: This is the code duplicated from AARCH64Frame
+          Address saved_fp = null;
+          int llink_offset = cb.getLinkOffset();
+          if (llink_offset >= 0) {
+            // Restore base-pointer, since next frame might be an interpreter frame.
+            Address fp_addr = sp.addOffsetTo(VM.getVM().getAddressSize() * llink_offset);
+            saved_fp = fp_addr.getAddressAt(0);
+          }
+
+          setValues(sp, saved_fp, pc);
+          return true;
+        }
+        */
+      }
+    } else {
+      // If the current program counter was not known to us as a Java
+      // PC, we currently assume that we are in the run-time system
+      // and attempt to look to thread-local storage for saved SP and
+      // FP. Note that if these are null (because we were, in fact,
+      // in Java code, i.e., vtable stubs or similar, and the SA
+      // didn't have enough insight into the target VM to understand
+      // that) then we are going to lose the entire stack trace for
+      // the thread, which is sub-optimal. FIXME.
+
+      if (DEBUG) {
+        System.out.println("CurrentFrameGuess: choosing last Java frame: sp = " +
+                           thread.getLastJavaSP() + ", fp = " + thread.getLastJavaFP());
+      }
+      if (thread.getLastJavaSP() == null) {
+        return false; // No known Java frames on stack
+      }
+
+      // The runtime has a nasty habit of not saving fp in the frame
+      // anchor, leaving us to grovel about in the stack to find a
+      // plausible address.  Fortunately, this only happens in
+      // compiled code; there we always have a valid PC, and we always
+      // push LR and FP onto the stack as a pair, with FP at the lower
+      // address.
+      pc = thread.getLastJavaPC();
+      fp = thread.getLastJavaFP();
+      sp = thread.getLastJavaSP();
+
+      if (fp == null) {
+        CodeCache cc = vm.getCodeCache();
+        if (cc.contains(pc)) {
+          CodeBlob cb = cc.findBlob(pc);
+          if (DEBUG) {
+            System.out.println("FP is null.  Found blob frame size " + cb.getFrameSize());
+          }
+          // See if we can derive a frame pointer from SP and PC
+          long link_offset = cb.getFrameSize() - 2 * VM.getVM().getAddressSize();
+          if (link_offset >= 0) {
+            fp = sp.addOffsetTo(link_offset);
+          }
+        }
+      }
+
+      setValues(sp, fp, null);
+
+      return true;
+    }
+  }
+
+  public Address getSP() { return spFound; }
+  public Address getFP() { return fpFound; }
+  /** May be null if getting values from thread-local storage; take
+      care to call the correct AARCH64Frame constructor to recover this if
+      necessary */
+  public Address getPC() { return pcFound; }
+
+  private void setValues(Address sp, Address fp, Address pc) {
+    spFound = sp;
+    fpFound = fp;
+    pcFound = pc;
+  }
+}
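Platform access code typically drives the guesser as sketched below, mirroring the existing x86 pattern; the scan bound and the helper class are assumptions, and the real wiring lives in LinuxAARCH64JavaThreadPDAccess elsewhere in this merge:

    import sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext;
    import sun.jvm.hotspot.runtime.Frame;
    import sun.jvm.hotspot.runtime.JavaThread;
    import sun.jvm.hotspot.runtime.aarch64.AARCH64CurrentFrameGuess;
    import sun.jvm.hotspot.runtime.aarch64.AARCH64Frame;

    class CurrentFrameHelper {  // hypothetical caller
        static Frame currentFrame(AARCH64ThreadContext context, JavaThread thread) {
            AARCH64CurrentFrameGuess guesser = new AARCH64CurrentFrameGuess(context, thread);
            if (!guesser.run(1024 * 1024)) {    // scan bound is an assumption
                return null;                    // no plausible frame found
            }
            if (guesser.getPC() == null) {
                // values came from the frame anchor; use the pc-less constructor
                return new AARCH64Frame(guesser.getSP(), guesser.getFP());
            }
            return new AARCH64Frame(guesser.getSP(), guesser.getFP(), guesser.getPC());
        }
    }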
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64Frame.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import java.util.*;
+import sun.jvm.hotspot.code.*;
+import sun.jvm.hotspot.compiler.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.oops.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.utilities.*;
+
+/** Specialization of and implementation of abstract methods of the
+    Frame class for the aarch64 family of CPUs. */
+
+public class AARCH64Frame extends Frame {
+  private static final boolean DEBUG;
+  static {
+    DEBUG = System.getProperty("sun.jvm.hotspot.runtime.aarch64.AARCH64Frame.DEBUG") != null;
+  }
+
+  // All frames
+  private static final int LINK_OFFSET                =  0;
+  private static final int RETURN_ADDR_OFFSET         =  1;
+  private static final int SENDER_SP_OFFSET           =  2;
+
+  // Interpreter frames
+  private static final int INTERPRETER_FRAME_MIRROR_OFFSET    =  2; // for native calls only
+  private static final int INTERPRETER_FRAME_SENDER_SP_OFFSET = -1;
+  private static final int INTERPRETER_FRAME_LAST_SP_OFFSET   = INTERPRETER_FRAME_SENDER_SP_OFFSET - 1;
+  private static final int INTERPRETER_FRAME_METHOD_OFFSET    = INTERPRETER_FRAME_LAST_SP_OFFSET - 1;
+  private static       int INTERPRETER_FRAME_MDX_OFFSET;         // Non-core builds only
+  private static       int INTERPRETER_FRAME_CACHE_OFFSET;
+  private static       int INTERPRETER_FRAME_LOCALS_OFFSET;
+  private static       int INTERPRETER_FRAME_BCX_OFFSET;
+  private static       int INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+  private static       int INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET;
+  private static       int INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET;
+
+  // Entry frames
+  private static       int ENTRY_FRAME_CALL_WRAPPER_OFFSET = -8;
+
+  // Native frames
+  private static final int NATIVE_FRAME_INITIAL_PARAM_OFFSET =  2;
+
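+  // The AArch64 frame pointer lives in r29; this VMReg is used below when
+  // recording the saved-FP slot in a RegisterMap.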
+  private static VMReg fp = new VMReg(29);
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    INTERPRETER_FRAME_MDX_OFFSET                  = INTERPRETER_FRAME_METHOD_OFFSET - 1;
+    INTERPRETER_FRAME_CACHE_OFFSET                = INTERPRETER_FRAME_MDX_OFFSET - 1;
+    INTERPRETER_FRAME_LOCALS_OFFSET               = INTERPRETER_FRAME_CACHE_OFFSET - 1;
+    INTERPRETER_FRAME_BCX_OFFSET                  = INTERPRETER_FRAME_LOCALS_OFFSET - 1;
+    INTERPRETER_FRAME_INITIAL_SP_OFFSET           = INTERPRETER_FRAME_BCX_OFFSET - 1;
+    INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET    = INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+    INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET = INTERPRETER_FRAME_INITIAL_SP_OFFSET;
+  }
+
+
+  // an additional field beyond sp and pc:
+  Address raw_fp; // frame pointer
+  private Address raw_unextendedSP;
+
+  private AARCH64Frame() {
+  }
+
+  private void adjustForDeopt() {
+    if (pc != null) {
+      // Look for a deopt pc and if it is deopted convert to original pc
+      CodeBlob cb = VM.getVM().getCodeCache().findBlob(pc);
+      if (cb != null && cb.isJavaMethod()) {
+        NMethod nm = (NMethod) cb;
+        if (pc.equals(nm.deoptHandlerBegin())) {
+          if (Assert.ASSERTS_ENABLED) {
+            Assert.that(this.getUnextendedSP() != null, "null SP in Java frame");
+          }
+          // adjust pc if frame is deoptimized.
+          pc = this.getUnextendedSP().getAddressAt(nm.origPCOffset());
+          deoptimized = true;
+        }
+      }
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_fp, Address pc) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_sp;
+    this.raw_fp = raw_fp;
+    this.pc = pc;
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, fp, pc): " + this);
+      dumpStack();
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_fp) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_sp;
+    this.raw_fp = raw_fp;
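+    // No PC was supplied: the caller's return address sits one word below
+    // the stack pointer, so recover the PC from sp - wordSize.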
+    this.pc = raw_sp.getAddressAt(-1 * VM.getVM().getAddressSize());
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, fp): " + this);
+      dumpStack();
+    }
+  }
+
+  public AARCH64Frame(Address raw_sp, Address raw_unextendedSp, Address raw_fp, Address pc) {
+    this.raw_sp = raw_sp;
+    this.raw_unextendedSP = raw_unextendedSp;
+    this.raw_fp = raw_fp;
+    this.pc = pc;
+    adjustUnextendedSP();
+
+    // Frame must be fully constructed before this call
+    adjustForDeopt();
+
+    if (DEBUG) {
+      System.out.println("AARCH64Frame(sp, unextendedSP, fp, pc): " + this);
+      dumpStack();
+    }
+
+  }
+
+  public Object clone() {
+    AARCH64Frame frame = new AARCH64Frame();
+    frame.raw_sp = raw_sp;
+    frame.raw_unextendedSP = raw_unextendedSP;
+    frame.raw_fp = raw_fp;
+    frame.pc = pc;
+    frame.deoptimized = deoptimized;
+    return frame;
+  }
+
+  public boolean equals(Object arg) {
+    if (arg == null) {
+      return false;
+    }
+
+    if (!(arg instanceof AARCH64Frame)) {
+      return false;
+    }
+
+    AARCH64Frame other = (AARCH64Frame) arg;
+
+    return (AddressOps.equal(getSP(), other.getSP()) &&
+            AddressOps.equal(getUnextendedSP(), other.getUnextendedSP()) &&
+            AddressOps.equal(getFP(), other.getFP()) &&
+            AddressOps.equal(getPC(), other.getPC()));
+  }
+
+  public int hashCode() {
+    if (raw_sp == null) {
+      return 0;
+    }
+
+    return raw_sp.hashCode();
+  }
+
+  public String toString() {
+    return "sp: " + (getSP() == null? "null" : getSP().toString()) +
+         ", unextendedSP: " + (getUnextendedSP() == null? "null" : getUnextendedSP().toString()) +
+         ", fp: " + (getFP() == null? "null" : getFP().toString()) +
+         ", pc: " + (pc == null? "null" : pc.toString());
+  }
+
+  // accessors for the instance variables
+  public Address getFP() { return raw_fp; }
+  public Address getSP() { return raw_sp; }
+  public Address getID() { return raw_sp; }
+
+  // FIXME: not implemented yet
+  public boolean isSignalHandlerFrameDbg() { return false; }
+  public int     getSignalNumberDbg()      { return 0;     }
+  public String  getSignalNameDbg()        { return null;  }
+
+  public boolean isInterpretedFrameValid() {
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(isInterpretedFrame(), "Not an interpreted frame");
+    }
+
+    // These are reasonable sanity checks
+    if (getFP() == null || getFP().andWithMask(0x3) != null) {
+      return false;
+    }
+
+    if (getSP() == null || getSP().andWithMask(0x3) != null) {
+      return false;
+    }
+
+    if (getFP().addOffsetTo(INTERPRETER_FRAME_INITIAL_SP_OFFSET * VM.getVM().getAddressSize()).lessThan(getSP())) {
+      return false;
+    }
+
+    // These are hacks to keep us out of trouble.
+    // The problem with these is that they mask other problems
+    if (getFP().lessThanOrEqual(getSP())) {
+      // this attempts to deal with unsigned comparison above
+      return false;
+    }
+
+    if (getFP().minus(getSP()) > 4096 * VM.getVM().getAddressSize()) {
+      // stack frames shouldn't be large.
+      return false;
+    }
+
+    return true;
+  }
+
+  // FIXME: not applicable in current system
+  //  void    patch_pc(Thread* thread, address pc);
+
+  public Frame sender(RegisterMap regMap, CodeBlob cb) {
+    AARCH64RegisterMap map = (AARCH64RegisterMap) regMap;
+
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+
+    // Default is we don't have to follow them; the sender_for_xxx methods
+    // will update it accordingly.
+    map.setIncludeArgumentOops(false);
+
+    if (isEntryFrame())       return senderForEntryFrame(map);
+    if (isInterpretedFrame()) return senderForInterpreterFrame(map);
+
+    if (cb == null) {
+      cb = VM.getVM().getCodeCache().findBlob(getPC());
+    } else {
+      if (Assert.ASSERTS_ENABLED) {
+        Assert.that(cb.equals(VM.getVM().getCodeCache().findBlob(getPC())), "Must be the same");
+      }
+    }
+
+    if (cb != null) {
+      return senderForCompiledFrame(map, cb);
+    }
+
+    // Must be native-compiled frame, i.e. the marshaling code for native
+    // methods that exists in the core system.
+    return new AARCH64Frame(getSenderSP(), getLink(), getSenderPC());
+  }
+
+  private Frame senderForEntryFrame(AARCH64RegisterMap map) {
+    if (DEBUG) {
+      System.out.println("senderForEntryFrame");
+    }
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+    // Java frame called from C; skip all C frames and return top C
+    // frame of that chunk as the sender
+    AARCH64JavaCallWrapper jcw = (AARCH64JavaCallWrapper) getEntryFrameCallWrapper();
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(!entryFrameIsFirst(), "next Java fp must be non zero");
+      Assert.that(jcw.getLastJavaSP().greaterThan(getSP()), "must be above this frame on stack");
+    }
+    AARCH64Frame fr;
+    if (jcw.getLastJavaPC() != null) {
+      fr = new AARCH64Frame(jcw.getLastJavaSP(), jcw.getLastJavaFP(), jcw.getLastJavaPC());
+    } else {
+      fr = new AARCH64Frame(jcw.getLastJavaSP(), jcw.getLastJavaFP());
+    }
+    map.clear();
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map.getIncludeArgumentOops(), "should be set by clear");
+    }
+    return fr;
+  }
+
+  //------------------------------------------------------------------------------
+  // frame::adjust_unextended_sp
+  private void adjustUnextendedSP() {
+    // If we are returning to a compiled MethodHandle call site, the
+    // saved_fp will in fact be a saved value of the unextended SP.  The
+    // simplest way to tell whether we are returning to such a call site
+    // is as follows:
+
+    CodeBlob cb = cb();
+    NMethod senderNm = (cb == null) ? null : cb.asNMethodOrNull();
+    if (senderNm != null) {
+      // If the sender PC is a deoptimization point, get the original
+      // PC.  For MethodHandle call site the unextended_sp is stored in
+      // saved_fp.
+      if (senderNm.isDeoptMhEntry(getPC())) {
+        // DEBUG_ONLY(verifyDeoptMhOriginalPc(senderNm, getFP()));
+        raw_unextendedSP = getFP();
+      }
+      else if (senderNm.isDeoptEntry(getPC())) {
+        // DEBUG_ONLY(verifyDeoptOriginalPc(senderNm, raw_unextendedSp));
+      }
+      else if (senderNm.isMethodHandleReturn(getPC())) {
+        raw_unextendedSP = getFP();
+      }
+    }
+  }
+
+  private Frame senderForInterpreterFrame(AARCH64RegisterMap map) {
+    if (DEBUG) {
+      System.out.println("senderForInterpreterFrame");
+    }
+    Address unextendedSP = addressOfStackSlot(INTERPRETER_FRAME_SENDER_SP_OFFSET).getAddressAt(0);
+    Address sp = addressOfStackSlot(SENDER_SP_OFFSET);
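+    // The sender's raw sp lies two slots above our fp (past the saved fp
+    // and return address); its possibly-extended sp was saved in our frame.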
+    // We do not need to update the callee-save register mapping because above
+    // us is either another interpreter frame or a converter-frame, but never
+    // directly a compiled frame.
+    // 11/24/04 SFG. With the removal of adapter frames this is no longer true.
+    // However, C2 no longer uses callee-save registers for Java calls, so
+    // there are no callee registers to find.
+
+    if (map.getUpdateMap())
+      updateMapWithSavedLink(map, addressOfStackSlot(LINK_OFFSET));
+
+    return new AARCH64Frame(sp, unextendedSP, getLink(), getSenderPC());
+  }
+
+  private void updateMapWithSavedLink(RegisterMap map, Address savedFPAddr) {
+    map.setLocation(fp, savedFPAddr);
+  }
+
+  private Frame senderForCompiledFrame(AARCH64RegisterMap map, CodeBlob cb) {
+    if (DEBUG) {
+      System.out.println("senderForCompiledFrame");
+    }
+
+    //
+    // NOTE: some of this code is (unfortunately) duplicated in AARCH64CurrentFrameGuess
+    //
+
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(map != null, "map must be set");
+    }
+
+    // frame owned by optimizing compiler
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(cb.getFrameSize() >= 0, "must have non-negative frame size");
+    }
+    Address senderSP = getUnextendedSP().addOffsetTo(cb.getFrameSize());
+
+    // The return_address is always the word on the stack
+    Address senderPC = senderSP.getAddressAt(-1 * VM.getVM().getAddressSize());
+
+    // This is the saved value of FP which may or may not really be an FP.
+    // It is only an FP if the sender is an interpreter frame.
+    Address savedFPAddr = senderSP.addOffsetTo(- SENDER_SP_OFFSET * VM.getVM().getAddressSize());
+
+    if (map.getUpdateMap()) {
+      // Tell GC to use argument oopmaps for some runtime stubs that need it.
+      // For C1, the runtime stub might not have oop maps, so set this flag
+      // outside of update_register_map.
+      map.setIncludeArgumentOops(cb.callerMustGCArguments());
+
+      if (cb.getOopMaps() != null) {
+        OopMapSet.updateRegisterMap(this, cb, map, true);
+      }
+
+      // Since the prolog does the save and restore of FP there is no oopmap
+      // for it so we must fill in its location as if there was an oopmap entry
+      // since if our caller was compiled code there could be live jvm state in it.
+      updateMapWithSavedLink(map, savedFPAddr);
+    }
+
+    return new AARCH64Frame(senderSP, savedFPAddr.getAddressAt(0), senderPC);
+  }
+
+  protected boolean hasSenderPD() {
+    return true;
+  }
+
+  public long frameSize() {
+    return (getSenderSP().minus(getSP()) / VM.getVM().getAddressSize());
+  }
+
+  public Address getLink() {
+    try {
+      if (DEBUG) {
+        System.out.println("Reading link at " + addressOfStackSlot(LINK_OFFSET)
+                           + " = " + addressOfStackSlot(LINK_OFFSET).getAddressAt(0));
+      }
+      return addressOfStackSlot(LINK_OFFSET).getAddressAt(0);
+    } catch (Exception e) {
+      if (DEBUG) {
+        System.out.println("Returning null");
+      }
+      return null;
+    }
+  }
+
+  // FIXME: not implementable yet
+  //inline void      frame::set_link(intptr_t* addr)  { *(intptr_t **)addr_at(link_offset) = addr; }
+
+  public Address getUnextendedSP() { return raw_unextendedSP; }
+
+  // Return address:
+  public Address getSenderPCAddr() { return addressOfStackSlot(RETURN_ADDR_OFFSET); }
+  public Address getSenderPC()     { return getSenderPCAddr().getAddressAt(0);      }
+
+  // Return the address of the native parameter at the given (zero-origin) index.
+  public Address getNativeParamAddr(int idx) {
+    return addressOfStackSlot(NATIVE_FRAME_INITIAL_PARAM_OFFSET + idx);
+  }
+
+  public Address getSenderSP()     { return addressOfStackSlot(SENDER_SP_OFFSET); }
+
+  public Address addressOfInterpreterFrameLocals() {
+    return addressOfStackSlot(INTERPRETER_FRAME_LOCALS_OFFSET);
+  }
+
+  private Address addressOfInterpreterFrameBCX() {
+    return addressOfStackSlot(INTERPRETER_FRAME_BCX_OFFSET);
+  }
+
+  public int getInterpreterFrameBCI() {
+    // FIXME: this is not atomic with respect to GC and is unsuitable
+    // for use in a non-debugging, or reflective, system. Need to
+    // figure out how to express this.
+    Address bcp = addressOfInterpreterFrameBCX().getAddressAt(0);
+    Address methodHandle = addressOfInterpreterFrameMethod().getAddressAt(0);
+    Method method = (Method)Metadata.instantiateWrapperFor(methodHandle);
+    return bcpToBci(bcp, method);
+  }
+
+  public Address addressOfInterpreterFrameMDX() {
+    return addressOfStackSlot(INTERPRETER_FRAME_MDX_OFFSET);
+  }
+
+  // FIXME
+  //inline int frame::interpreter_frame_monitor_size() {
+  //  return BasicObjectLock::size();
+  //}
+
+  // expression stack
+  // (the max_stack arguments are used by the GC; see class FrameClosure)
+
+  public Address addressOfInterpreterFrameExpressionStack() {
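+    // The expression stack starts one word below the monitor block and
+    // grows toward lower addresses (direction -1, see below).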
+    Address monitorEnd = interpreterFrameMonitorEnd().address();
+    return monitorEnd.addOffsetTo(-1 * VM.getVM().getAddressSize());
+  }
+
+  public int getInterpreterFrameExpressionStackDirection() { return -1; }
+
+  // top of expression stack
+  public Address addressOfInterpreterFrameTOS() {
+    return getSP();
+  }
+
+  /** Expression stack from top down */
+  public Address addressOfInterpreterFrameTOSAt(int slot) {
+    return addressOfInterpreterFrameTOS().addOffsetTo(slot * VM.getVM().getAddressSize());
+  }
+
+  public Address getInterpreterFrameSenderSP() {
+    if (Assert.ASSERTS_ENABLED) {
+      Assert.that(isInterpretedFrame(), "interpreted frame expected");
+    }
+    return addressOfStackSlot(INTERPRETER_FRAME_SENDER_SP_OFFSET).getAddressAt(0);
+  }
+
+  // Monitors
+  public BasicObjectLock interpreterFrameMonitorBegin() {
+    return new BasicObjectLock(addressOfStackSlot(INTERPRETER_FRAME_MONITOR_BLOCK_BOTTOM_OFFSET));
+  }
+
+  public BasicObjectLock interpreterFrameMonitorEnd() {
+    Address result = addressOfStackSlot(INTERPRETER_FRAME_MONITOR_BLOCK_TOP_OFFSET).getAddressAt(0);
+    if (Assert.ASSERTS_ENABLED) {
+      // make sure the pointer points inside the frame
+      Assert.that(AddressOps.gt(getFP(), result), "result must be < frame pointer");
+      Assert.that(AddressOps.lte(getSP(), result), "result must be >= stack pointer");
+    }
+    return new BasicObjectLock(result);
+  }
+
+  public int interpreterFrameMonitorSize() {
+    return BasicObjectLock.size();
+  }
+
+  // Method
+  public Address addressOfInterpreterFrameMethod() {
+    return addressOfStackSlot(INTERPRETER_FRAME_METHOD_OFFSET);
+  }
+
+  // Constant pool cache
+  public Address addressOfInterpreterFrameCPCache() {
+    return addressOfStackSlot(INTERPRETER_FRAME_CACHE_OFFSET);
+  }
+
+  // Entry frames
+  public JavaCallWrapper getEntryFrameCallWrapper() {
+    return new AARCH64JavaCallWrapper(addressOfStackSlot(ENTRY_FRAME_CALL_WRAPPER_OFFSET).getAddressAt(0));
+  }
+
+  protected Address addressOfSavedOopResult() {
+    // offset is 2 for the client compiler (C1) and 3 for the server compiler (C2)
+    return getSP().addOffsetTo((VM.getVM().isClientCompiler() ? 2 : 3) *
+                               VM.getVM().getAddressSize());
+  }
+
+  protected Address addressOfSavedReceiver() {
+    return getSP().addOffsetTo(-4 * VM.getVM().getAddressSize());
+  }
+
+  private void dumpStack() {
+    for (Address addr = getSP().addOffsetTo(-4 * VM.getVM().getAddressSize());
+         AddressOps.lt(addr, getSP());
+         addr = addr.addOffsetTo(VM.getVM().getAddressSize())) {
+      System.out.println(addr + ": " + addr.getAddressAt(0));
+    }
+    System.out.println("-----------------------");
+    for (Address addr = getSP();
+         AddressOps.lte(addr, getSP().addOffsetTo(20 * VM.getVM().getAddressSize()));
+         addr = addr.addOffsetTo(VM.getVM().getAddressSize())) {
+      System.out.println(addr + ": " + addr.getAddressAt(0));
+    }
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64JavaCallWrapper.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import java.util.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.runtime.*;
+
+public class AARCH64JavaCallWrapper extends JavaCallWrapper {
+  private static AddressField lastJavaFPField;
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    Type type = db.lookupType("JavaFrameAnchor");
+
+    lastJavaFPField  = type.getAddressField("_last_Java_fp");
+  }
+
+  public AARCH64JavaCallWrapper(Address addr) {
+    super(addr);
+  }
+
+  public Address getLastJavaFP() {
+    return lastJavaFPField.getValue(addr.addOffsetTo(anchorField.getOffset()));
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/aarch64/AARCH64RegisterMap.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.aarch64;
+
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.runtime.*;
+
+public class AARCH64RegisterMap extends RegisterMap {
+
+  /** This is the only public constructor */
+  public AARCH64RegisterMap(JavaThread thread, boolean updateMap) {
+    super(thread, updateMap);
+  }
+
+  protected AARCH64RegisterMap(RegisterMap map) {
+    super(map);
+  }
+
+  public Object clone() {
+    AARCH64RegisterMap retval = new AARCH64RegisterMap(this);
+    return retval;
+  }
+
+  // no PD state to clear or copy:
+  protected void clearPD() {}
+  protected void initializePD() {}
+  protected void initializeFromPD(RegisterMap map) {}
+  protected Address getLocationPD(VMReg reg) { return null; }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/agent/src/share/classes/sun/jvm/hotspot/runtime/linux_aarch64/LinuxAARCH64JavaThreadPDAccess.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Red Hat Inc.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package sun.jvm.hotspot.runtime.linux_aarch64;
+
+import java.io.*;
+import java.util.*;
+import sun.jvm.hotspot.debugger.*;
+import sun.jvm.hotspot.debugger.aarch64.*;
+import sun.jvm.hotspot.runtime.*;
+import sun.jvm.hotspot.runtime.aarch64.*;
+import sun.jvm.hotspot.types.*;
+import sun.jvm.hotspot.utilities.*;
+
+public class LinuxAARCH64JavaThreadPDAccess implements JavaThreadPDAccess {
+  private static AddressField  lastJavaFPField;
+  private static AddressField  osThreadField;
+
+  // Field from OSThread
+  private static CIntegerField osThreadThreadIDField;
+
+  // This is currently unneeded but is being kept in case we change
+  // the currentFrameGuess algorithm
+  private static final long GUESS_SCAN_RANGE = 128 * 1024;
+
+  static {
+    VM.registerVMInitializedObserver(new Observer() {
+        public void update(Observable o, Object data) {
+          initialize(VM.getVM().getTypeDataBase());
+        }
+      });
+  }
+
+  private static synchronized void initialize(TypeDataBase db) {
+    Type type = db.lookupType("JavaThread");
+    osThreadField           = type.getAddressField("_osthread");
+
+    Type anchorType = db.lookupType("JavaFrameAnchor");
+    lastJavaFPField         = anchorType.getAddressField("_last_Java_fp");
+
+    Type osThreadType = db.lookupType("OSThread");
+    osThreadThreadIDField   = osThreadType.getCIntegerField("_thread_id");
+  }
+
+  public Address getLastJavaFP(Address addr) {
+    return lastJavaFPField.getValue(addr.addOffsetTo(sun.jvm.hotspot.runtime.JavaThread.getAnchorField().getOffset()));
+  }
+
+  public Address getLastJavaPC(Address addr) {
+    return null;
+  }
+
+  public Address getBaseOfStackPointer(Address addr) {
+    return null;
+  }
+
+  public Frame getLastFramePD(JavaThread thread, Address addr) {
+    Address fp = thread.getLastJavaFP();
+    if (fp == null) {
+      return null; // no information
+    }
+    return new AARCH64Frame(thread.getLastJavaSP(), fp);
+  }
+
+  public RegisterMap newRegisterMap(JavaThread thread, boolean updateMap) {
+    return new AARCH64RegisterMap(thread, updateMap);
+  }
+
+  public Frame getCurrentFrameGuess(JavaThread thread, Address addr) {
+    ThreadProxy t = getThreadProxy(addr);
+    AARCH64ThreadContext context = (AARCH64ThreadContext) t.getContext();
+    AARCH64CurrentFrameGuess guesser = new AARCH64CurrentFrameGuess(context, thread);
+    if (!guesser.run(GUESS_SCAN_RANGE)) {
+      return null;
+    }
+    if (guesser.getPC() == null) {
+      return new AARCH64Frame(guesser.getSP(), guesser.getFP());
+    } else {
+      return new AARCH64Frame(guesser.getSP(), guesser.getFP(), guesser.getPC());
+    }
+  }
+
+  public void printThreadIDOn(Address addr, PrintStream tty) {
+    tty.print(getThreadProxy(addr));
+  }
+
+  public void printInfoOn(Address threadAddr, PrintStream tty) {
+    tty.print("Thread id: ");
+    printThreadIDOn(threadAddr, tty);
+//    tty.println("\nPostJavaState: " + getPostJavaState(threadAddr));
+  }
+
+  public Address getLastSP(Address addr) {
+    ThreadProxy t = getThreadProxy(addr);
+    AARCH64ThreadContext context = (AARCH64ThreadContext) t.getContext();
+    return context.getRegisterAsAddress(AARCH64ThreadContext.SP);
+  }
+
+  public ThreadProxy getThreadProxy(Address addr) {
+    // Addr is the address of the JavaThread.
+    // Fetch the OSThread (for now and for simplicity, not making a
+    // separate "OSThread" class in this package)
+    Address osThreadAddr = osThreadField.getValue(addr);
+    // Get the address of the _thread_id from the OSThread
+    Address threadIdAddr = osThreadAddr.addOffsetTo(osThreadThreadIDField.getOffset());
+
+    JVMDebugger debugger = VM.getVM().getDebugger();
+    return debugger.getThreadForIdentifierAddress(threadIdAddr);
+  }
+}
--- a/agent/src/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/agent/src/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java	Fri Oct 02 04:37:30 2015 +0100
@@ -63,7 +63,9 @@
       return "sparc";
     } else if (cpu.equals("ia64") || cpu.equals("amd64") || cpu.equals("x86_64")) {
       return cpu;
-    } else {
+    } else if (cpu.equals("aarch64")) {
+      return cpu;
+    } else {
       try {
         Class pic = Class.forName("sun.jvm.hotspot.utilities.PlatformInfoClosed");
         AltPlatformInfo api = (AltPlatformInfo)pic.newInstance();
--- a/make/defs.make	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/defs.make	Fri Oct 02 04:37:30 2015 +0100
@@ -322,7 +322,7 @@
     endif
   endif
 
-  LP64_ARCH = sparcv9 amd64 ia64 ppc64 aarch64 zero
+  LP64_ARCH += sparcv9 amd64 ia64 ppc64 aarch64 zero
 endif
 
 # Required make macro settings for all platforms
--- a/make/linux/makefiles/defs.make	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/linux/makefiles/defs.make	Fri Oct 02 04:37:30 2015 +0100
@@ -305,6 +305,8 @@
                         $(EXPORT_LIB_DIR)/sa-jdi.jar
 ADD_SA_BINARIES/sparc = $(EXPORT_JRE_LIB_ARCH_DIR)/libsaproc.$(LIBRARY_SUFFIX) \
                         $(EXPORT_LIB_DIR)/sa-jdi.jar
+ADD_SA_BINARIES/aarch64 = $(EXPORT_JRE_LIB_ARCH_DIR)/libsaproc.$(LIBRARY_SUFFIX) \
+                        $(EXPORT_LIB_DIR)/sa-jdi.jar
 ifeq ($(ENABLE_FULL_DEBUG_SYMBOLS),1)
   ifeq ($(ZIP_DEBUGINFO_FILES),1)
     ADD_SA_BINARIES/x86   += $(EXPORT_JRE_LIB_ARCH_DIR)/libsaproc.diz
--- a/make/linux/makefiles/gcc.make	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/linux/makefiles/gcc.make	Fri Oct 02 04:37:30 2015 +0100
@@ -345,7 +345,7 @@
   DEBUG_CFLAGS/amd64 = -g
   DEBUG_CFLAGS/aarch64 = -g
   DEBUG_CFLAGS/ppc64 = -g
-  DEBUG_CFLAGS/zero  = -g
+  DEBUG_CFLAGS/zero = -g
   DEBUG_CFLAGS += $(DEBUG_CFLAGS/$(BUILDARCH))
   ifeq ($(DEBUG_CFLAGS/$(BUILDARCH)),)
       ifeq ($(USE_CLANG), true)
@@ -361,7 +361,7 @@
     FASTDEBUG_CFLAGS/amd64 = -g
     FASTDEBUG_CFLAGS/aarch64 = -g
     FASTDEBUG_CFLAGS/ppc64 = -g
-    FASTDEBUG_CFLAGS/zero  = -g
+    FASTDEBUG_CFLAGS/zero = -g
     FASTDEBUG_CFLAGS += $(FASTDEBUG_CFLAGS/$(BUILDARCH))
     ifeq ($(FASTDEBUG_CFLAGS/$(BUILDARCH)),)
       ifeq ($(USE_CLANG), true)
@@ -376,7 +376,7 @@
     OPT_CFLAGS/amd64 = -g
     OPT_CFLAGS/aarch64 = -g
     OPT_CFLAGS/ppc64 = -g
-    OPT_CFLAGS/zero  = -g
+    OPT_CFLAGS/zero = -g
     OPT_CFLAGS += $(OPT_CFLAGS/$(BUILDARCH))
     ifeq ($(OPT_CFLAGS/$(BUILDARCH)),)
       ifeq ($(USE_CLANG), true)
--- a/make/linux/makefiles/sa.make	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/linux/makefiles/sa.make	Fri Oct 02 04:37:30 2015 +0100
@@ -62,8 +62,7 @@
 
 all: 
 	if [ -d $(AGENT_DIR) -a "$(SRCARCH)" != "ia64" \
-             -a "$(SRCARCH)" != "zero" \
-             -a "$(SRCARCH)" != "aarch64" ] ; then \
+             -a "$(SRCARCH)" != "zero" ] ; then \
 	   $(MAKE) -f sa.make $(GENERATED)/sa-jdi.jar; \
 	fi
 
@@ -109,6 +108,7 @@
 	$(QUIETLY) $(REMOTE) $(RUN.JAR) uf $@ -C $(AGENT_SRC_DIR) META-INF/services/com.sun.jdi.connect.Connector
 	$(QUIETLY) $(REMOTE) $(RUN.JAVAH) -classpath $(SA_CLASSDIR) -d $(GENERATED) -jni sun.jvm.hotspot.debugger.x86.X86ThreadContext
 	$(QUIETLY) $(REMOTE) $(RUN.JAVAH) -classpath $(SA_CLASSDIR) -d $(GENERATED) -jni sun.jvm.hotspot.debugger.amd64.AMD64ThreadContext
+	$(QUIETLY) $(REMOTE) $(RUN.JAVAH) -classpath $(SA_CLASSDIR) -d $(GENERATED) -jni sun.jvm.hotspot.debugger.aarch64.AARCH64ThreadContext
 	$(QUIETLY) $(REMOTE) $(RUN.JAVAH) -classpath $(SA_CLASSDIR) -d $(GENERATED) -jni sun.jvm.hotspot.debugger.sparc.SPARCThreadContext
 	$(QUIETLY) $(REMOTE) $(RUN.JAVAH) -classpath $(SA_CLASSDIR) -d $(GENERATED) -jni sun.jvm.hotspot.asm.Disassembler
 
--- a/make/linux/makefiles/saproc.make	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/linux/makefiles/saproc.make	Fri Oct 02 04:37:30 2015 +0100
@@ -63,7 +63,7 @@
 # also, we don't build SA on Itanium or zero.
 
 ifneq ($(wildcard $(AGENT_DIR)),)
-ifneq ($(filter-out ia64 zero aarch64,$(SRCARCH)),)
+ifneq ($(filter-out ia64 zero,$(SRCARCH)),)
   BUILDLIBSAPROC = $(LIBSAPROC)
 endif
 endif
--- a/make/linux/makefiles/vm.make	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/linux/makefiles/vm.make	Fri Oct 02 04:37:30 2015 +0100
@@ -295,7 +295,7 @@
 mapfile : $(MAPFILE) vm.def mapfile_ext
 	rm -f $@
 	awk '{ if ($$0 ~ "INSERT VTABLE SYMBOLS HERE")	\
-                 { system ("cat mapfile_ext"); system ("cat vm.def"); } \
+                 { system ("cat mapfile_ext"); system ("cat vm.def"); }             \
                else					\
                  { print $$0 }				\
              }' > $@ < $(MAPFILE)
--- a/make/sa.files	Wed Sep 30 16:43:15 2015 +0100
+++ b/make/sa.files	Fri Oct 02 04:37:30 2015 +0100
@@ -43,6 +43,7 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/compiler/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/bsd/x86/*.java \
@@ -52,17 +53,20 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/x86/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/linux/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/posix/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/posix/elf/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/proc/x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/x86/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/remote/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/win32/coff/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/debugger/windbg/*.java \
@@ -83,11 +87,13 @@
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/prims/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd_amd64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/bsd_x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_amd64/*.java \
+$(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_aarch64/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_x86/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/linux_sparc/*.java \
 $(AGENT_SRC_DIR)/sun/jvm/hotspot/runtime/posix/*.java \
--- a/src/cpu/aarch64/vm/aarch64.ad	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad	Fri Oct 02 04:37:30 2015 +0100
@@ -162,70 +162,165 @@
 // the platform ABI treats v8-v15 as callee save). float registers
 // v16-v31 are SOC as per the platform spec
 
-  reg_def V0   ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()         );
-  reg_def V0_H ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next() );
-  reg_def V1   ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()         );
-  reg_def V1_H ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next() );
-  reg_def V2   ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()         );
-  reg_def V2_H ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next() );
-  reg_def V3   ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()         );
-  reg_def V3_H ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next() );
-  reg_def V4   ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()         );
-  reg_def V4_H ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next() );
-  reg_def V5   ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()         );
-  reg_def V5_H ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next() );
-  reg_def V6   ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()         );
-  reg_def V6_H ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next() );
-  reg_def V7   ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()         );
-  reg_def V7_H ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next() );
-  reg_def V8   ( SOC, SOE, Op_RegF,  8, v8->as_VMReg()         );
-  reg_def V8_H ( SOC, SOE, Op_RegF,  8, v8->as_VMReg()->next() );
-  reg_def V9   ( SOC, SOE, Op_RegF,  9, v9->as_VMReg()         );
-  reg_def V9_H ( SOC, SOE, Op_RegF,  9, v9->as_VMReg()->next() );
-  reg_def V10  ( SOC, SOE, Op_RegF, 10, v10->as_VMReg()        );
-  reg_def V10_H( SOC, SOE, Op_RegF, 10, v10->as_VMReg()->next());
-  reg_def V11  ( SOC, SOE, Op_RegF, 11, v11->as_VMReg()        );
-  reg_def V11_H( SOC, SOE, Op_RegF, 11, v11->as_VMReg()->next());
-  reg_def V12  ( SOC, SOE, Op_RegF, 12, v12->as_VMReg()        );
-  reg_def V12_H( SOC, SOE, Op_RegF, 12, v12->as_VMReg()->next());
-  reg_def V13  ( SOC, SOE, Op_RegF, 13, v13->as_VMReg()        );
-  reg_def V13_H( SOC, SOE, Op_RegF, 13, v13->as_VMReg()->next());
-  reg_def V14  ( SOC, SOE, Op_RegF, 14, v14->as_VMReg()        );
-  reg_def V14_H( SOC, SOE, Op_RegF, 14, v14->as_VMReg()->next());
-  reg_def V15  ( SOC, SOE, Op_RegF, 15, v15->as_VMReg()        );
-  reg_def V15_H( SOC, SOE, Op_RegF, 15, v15->as_VMReg()->next());
-  reg_def V16  ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()        );
-  reg_def V16_H( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next());
-  reg_def V17  ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()        );
-  reg_def V17_H( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next());
-  reg_def V18  ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()        );
-  reg_def V18_H( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next());
-  reg_def V19  ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()        );
-  reg_def V19_H( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next());
-  reg_def V20  ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()        );
-  reg_def V20_H( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next());
-  reg_def V21  ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()        );
-  reg_def V21_H( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next());
-  reg_def V22  ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()        );
-  reg_def V22_H( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next());
-  reg_def V23  ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()        );
-  reg_def V23_H( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next());
-  reg_def V24  ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()        );
-  reg_def V24_H( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next());
-  reg_def V25  ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()        );
-  reg_def V25_H( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next());
-  reg_def V26  ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()        );
-  reg_def V26_H( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next());
-  reg_def V27  ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()        );
-  reg_def V27_H( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next());
-  reg_def V28  ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()        );
-  reg_def V28_H( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next());
-  reg_def V29  ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()        );
-  reg_def V29_H( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next());
-  reg_def V30  ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()        );
-  reg_def V30_H( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next());
-  reg_def V31  ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()        );
-  reg_def V31_H( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next());
+  reg_def V0   ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()          );
+  reg_def V0_H ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next()  );
+  reg_def V0_J ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next(2) );
+  reg_def V0_K ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next(3) );
+
+  reg_def V1   ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()          );
+  reg_def V1_H ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next()  );
+  reg_def V1_J ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next(2) );
+  reg_def V1_K ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next(3) );
+
+  reg_def V2   ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()          );
+  reg_def V2_H ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next()  );
+  reg_def V2_J ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next(2) );
+  reg_def V2_K ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next(3) );
+
+  reg_def V3   ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()          );
+  reg_def V3_H ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next()  );
+  reg_def V3_J ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next(2) );
+  reg_def V3_K ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next(3) );
+
+  reg_def V4   ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()          );
+  reg_def V4_H ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next()  );
+  reg_def V4_J ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next(2) );
+  reg_def V4_K ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next(3) );
+
+  reg_def V5   ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()          );
+  reg_def V5_H ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next()  );
+  reg_def V5_J ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next(2) );
+  reg_def V5_K ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next(3) );
+
+  reg_def V6   ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()          );
+  reg_def V6_H ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next()  );
+  reg_def V6_J ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next(2) );
+  reg_def V6_K ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next(3) );
+
+  reg_def V7   ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()          );
+  reg_def V7_H ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next()  );
+  reg_def V7_J ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next(2) );
+  reg_def V7_K ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next(3) );
+
+  reg_def V8   ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()          );
+  reg_def V8_H ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()->next()  );
+  reg_def V8_J ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()->next(2) );
+  reg_def V8_K ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()->next(3) );
+
+  reg_def V9   ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()          );
+  reg_def V9_H ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()->next()  );
+  reg_def V9_J ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()->next(2) );
+  reg_def V9_K ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()->next(3) );
+
+  reg_def V10  ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()         );
+  reg_def V10_H( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next() );
+  reg_def V10_J( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(2));
+  reg_def V10_K( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(3));
+
+  reg_def V11  ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()         );
+  reg_def V11_H( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next() );
+  reg_def V11_J( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(2));
+  reg_def V11_K( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(3));
+
+  reg_def V12  ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()         );
+  reg_def V12_H( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next() );
+  reg_def V12_J( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(2));
+  reg_def V12_K( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(3));
+
+  reg_def V13  ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()         );
+  reg_def V13_H( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next() );
+  reg_def V13_J( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(2));
+  reg_def V13_K( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(3));
+
+  reg_def V14  ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()         );
+  reg_def V14_H( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next() );
+  reg_def V14_J( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(2));
+  reg_def V14_K( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(3));
+
+  reg_def V15  ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()         );
+  reg_def V15_H( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next() );
+  reg_def V15_J( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(2));
+  reg_def V15_K( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(3));
+
+  reg_def V16  ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()         );
+  reg_def V16_H( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next() );
+  reg_def V16_J( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(2));
+  reg_def V16_K( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(3));
+
+  reg_def V17  ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()         );
+  reg_def V17_H( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next() );
+  reg_def V17_J( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(2));
+  reg_def V17_K( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(3));
+
+  reg_def V18  ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()         );
+  reg_def V18_H( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next() );
+  reg_def V18_J( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(2));
+  reg_def V18_K( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(3));
+
+  reg_def V19  ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()         );
+  reg_def V19_H( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next() );
+  reg_def V19_J( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(2));
+  reg_def V19_K( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(3));
+
+  reg_def V20  ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()         );
+  reg_def V20_H( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next() );
+  reg_def V20_J( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(2));
+  reg_def V20_K( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(3));
+
+  reg_def V21  ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()         );
+  reg_def V21_H( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next() );
+  reg_def V21_J( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(2));
+  reg_def V21_K( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(3));
+
+  reg_def V22  ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()         );
+  reg_def V22_H( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next() );
+  reg_def V22_J( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(2));
+  reg_def V22_K( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(3));
+
+  reg_def V23  ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()         );
+  reg_def V23_H( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next() );
+  reg_def V23_J( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(2));
+  reg_def V23_K( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(3));
+
+  reg_def V24  ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()         );
+  reg_def V24_H( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next() );
+  reg_def V24_J( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(2));
+  reg_def V24_K( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(3));
+
+  reg_def V25  ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()         );
+  reg_def V25_H( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next() );
+  reg_def V25_J( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(2));
+  reg_def V25_K( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(3));
+
+  reg_def V26  ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()         );
+  reg_def V26_H( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next() );
+  reg_def V26_J( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(2));
+  reg_def V26_K( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(3));
+
+  reg_def V27  ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()         );
+  reg_def V27_H( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next() );
+  reg_def V27_J( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(2));
+  reg_def V27_K( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(3));
+
+  reg_def V28  ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()         );
+  reg_def V28_H( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next() );
+  reg_def V28_J( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(2));
+  reg_def V28_K( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(3));
+
+  reg_def V29  ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()         );
+  reg_def V29_H( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next() );
+  reg_def V29_J( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(2));
+  reg_def V29_K( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(3));
+
+  reg_def V30  ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()         );
+  reg_def V30_H( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next() );
+  reg_def V30_J( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(2));
+  reg_def V30_K( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(3));
+
+  reg_def V31  ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()         );
+  reg_def V31_H( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next() );
+  reg_def V31_J( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(2));
+  reg_def V31_K( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(3));
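+
+  // The _J and _K slots expose the upper 64 bits of each SIMD register,
+  // so the allocator can hand out v0-v31 as 128-bit (Op_VecX) vectors as
+  // well as 64-bit (Op_VecD) ones.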
 
 // ----------------------------
 // Special Registers
@@ -292,42 +387,42 @@
 alloc_class chunk1(
 
     // no save
-    V16, V16_H,
-    V17, V17_H,
-    V18, V18_H,
-    V19, V19_H,
-    V20, V20_H,
-    V21, V21_H,
-    V22, V22_H,
-    V23, V23_H,
-    V24, V24_H,
-    V25, V25_H,
-    V26, V26_H,
-    V27, V27_H,
-    V28, V28_H,
-    V29, V29_H,
-    V30, V30_H,
-    V31, V31_H,
+    V16, V16_H, V16_J, V16_K,
+    V17, V17_H, V17_J, V17_K,
+    V18, V18_H, V18_J, V18_K,
+    V19, V19_H, V19_J, V19_K,
+    V20, V20_H, V20_J, V20_K,
+    V21, V21_H, V21_J, V21_K,
+    V22, V22_H, V22_J, V22_K,
+    V23, V23_H, V23_J, V23_K,
+    V24, V24_H, V24_J, V24_K,
+    V25, V25_H, V25_J, V25_K,
+    V26, V26_H, V26_J, V26_K,
+    V27, V27_H, V27_J, V27_K,
+    V28, V28_H, V28_J, V28_K,
+    V29, V29_H, V29_J, V29_K,
+    V30, V30_H, V30_J, V30_K,
+    V31, V31_H, V31_J, V31_K,
 
     // arg registers
-    V0, V0_H,
-    V1, V1_H,
-    V2, V2_H,
-    V3, V3_H,
-    V4, V4_H,
-    V5, V5_H,
-    V6, V6_H,
-    V7, V7_H,
+    V0, V0_H, V0_J, V0_K,
+    V1, V1_H, V1_J, V1_K,
+    V2, V2_H, V2_J, V2_K,
+    V3, V3_H, V3_J, V3_K,
+    V4, V4_H, V4_J, V4_K,
+    V5, V5_H, V5_J, V5_K,
+    V6, V6_H, V6_J, V6_K,
+    V7, V7_H, V7_J, V7_K,
 
     // non-volatiles
-    V8, V8_H,
-    V9, V9_H,
-    V10, V10_H,
-    V11, V11_H,
-    V12, V12_H,
-    V13, V13_H,
-    V14, V14_H,
-    V15, V15_H,
+    V8, V8_H, V8_J, V8_K,
+    V9, V9_H, V9_J, V9_K,
+    V10, V10_H, V10_J, V10_K,
+    V11, V11_H, V11_J, V11_K,
+    V12, V12_H, V12_J, V12_K,
+    V13, V13_H, V13_J, V13_K,
+    V14, V14_H, V14_J, V14_K,
+    V15, V15_H, V15_J, V15_K,
 );
 
 alloc_class chunk2(RFLAGS);
@@ -381,6 +476,9 @@
 // Singleton class for R2 int register
 reg_class int_r2_reg(R2);
 
+// Singleton class for R3 int register
+reg_class int_r3_reg(R3);
+
 // Singleton class for R4 int register
 reg_class int_r4_reg(R4);
 
@@ -447,7 +545,7 @@
     R26
  /* R27, */			// heapbase
  /* R28, */			// thread
- /* R29, */			// fp
+    R29,                        // fp
  /* R30, */			// lr
  /* R31 */			// sp
 );
@@ -481,7 +579,7 @@
     R26, R26_H,
  /* R27, R27_H,	*/		// heapbase
  /* R28, R28_H, */		// thread
- /* R29, R29_H, */		// fp
+    R29, R29_H,                 // fp
  /* R30, R30_H, */		// lr
  /* R31, R31_H */		// sp
 );
@@ -698,6 +796,98 @@
     V31, V31_H
 );
 
+// Class for all 64bit vector registers
+reg_class vectord_reg(
+    V0, V0_H,
+    V1, V1_H,
+    V2, V2_H,
+    V3, V3_H,
+    V4, V4_H,
+    V5, V5_H,
+    V6, V6_H,
+    V7, V7_H,
+    V8, V8_H,
+    V9, V9_H,
+    V10, V10_H,
+    V11, V11_H,
+    V12, V12_H,
+    V13, V13_H,
+    V14, V14_H,
+    V15, V15_H,
+    V16, V16_H,
+    V17, V17_H,
+    V18, V18_H,
+    V19, V19_H,
+    V20, V20_H,
+    V21, V21_H,
+    V22, V22_H,
+    V23, V23_H,
+    V24, V24_H,
+    V25, V25_H,
+    V26, V26_H,
+    V27, V27_H,
+    V28, V28_H,
+    V29, V29_H,
+    V30, V30_H,
+    V31, V31_H
+);
+
+// Class for all 128bit vector registers
+reg_class vectorx_reg(
+    V0, V0_H, V0_J, V0_K,
+    V1, V1_H, V1_J, V1_K,
+    V2, V2_H, V2_J, V2_K,
+    V3, V3_H, V3_J, V3_K,
+    V4, V4_H, V4_J, V4_K,
+    V5, V5_H, V5_J, V5_K,
+    V6, V6_H, V6_J, V6_K,
+    V7, V7_H, V7_J, V7_K,
+    V8, V8_H, V8_J, V8_K,
+    V9, V9_H, V9_J, V9_K,
+    V10, V10_H, V10_J, V10_K,
+    V11, V11_H, V11_J, V11_K,
+    V12, V12_H, V12_J, V12_K,
+    V13, V13_H, V13_J, V13_K,
+    V14, V14_H, V14_J, V14_K,
+    V15, V15_H, V15_J, V15_K,
+    V16, V16_H, V16_J, V16_K,
+    V17, V17_H, V17_J, V17_K,
+    V18, V18_H, V18_J, V18_K,
+    V19, V19_H, V19_J, V19_K,
+    V20, V20_H, V20_J, V20_K,
+    V21, V21_H, V21_J, V21_K,
+    V22, V22_H, V22_J, V22_K,
+    V23, V23_H, V23_J, V23_K,
+    V24, V24_H, V24_J, V24_K,
+    V25, V25_H, V25_J, V25_K,
+    V26, V26_H, V26_J, V26_K,
+    V27, V27_H, V27_J, V27_K,
+    V28, V28_H, V28_J, V28_K,
+    V29, V29_H, V29_J, V29_K,
+    V30, V30_H, V30_J, V30_K,
+    V31, V31_H, V31_J, V31_K
+);
+
+// Class for 128 bit register v0
+reg_class v0_reg(
+    V0, V0_H
+);
+
+// Class for 128 bit register v1
+reg_class v1_reg(
+    V1, V1_H
+);
+
+// Class for 128 bit register v2
+reg_class v2_reg(
+    V2, V2_H
+);
+
+// Class for 128 bit register v3
+reg_class v3_reg(
+    V3, V3_H
+);
+
 // Singleton class for condition codes
 reg_class int_flags(RFLAGS);
 
@@ -772,62 +962,10 @@
   }
 };
 
-  bool followed_by_ordered_store(const Node *barrier);
-  bool preceded_by_ordered_load(const Node *barrier);
-
 %}
 
 source %{
 
-  // AArch64 has load acquire and store release instructions which we
-  // use for ordered memory accesses, e.g. for volatiles.  The ideal
-  // graph generator also inserts memory barriers around volatile
-  // accesses, and we don't want to generate both barriers and acq/rel
-  // instructions.  So, when we emit a MemBarAcquire we look back in
-  // the ideal graph for an ordered load and only emit the barrier if
-  // we don't find one.
-
-bool preceded_by_ordered_load(const Node *barrier) {
-  Node *x = barrier->lookup(TypeFunc::Parms);
-
-  if (! x)
-    return false;
-
-  if (x->is_DecodeNarrowPtr())
-    x = x->in(1);
-
-  if (x->is_Load())
-    return ! x->as_Load()->is_unordered();
-
-  return false;
-}
-
-bool followed_by_ordered_store(const Node *barrier) {
-
-  // Find following mem node.
-  //
-  Node *mem_proj = NULL;
-  for (DUIterator_Fast imax, i = barrier->fast_outs(imax); i < imax; i++) {
-    mem_proj = barrier->fast_out(i);      // Throw out-of-bounds if proj not found
-    assert(mem_proj->is_Proj(), "only projections here");
-    ProjNode *proj = mem_proj->as_Proj();
-    if (proj->_con == TypeFunc::Memory &&
-        !Compile::current()->node_arena()->contains(mem_proj)) // Unmatched old-space only
-      break;
-  }
-  assert(mem_proj->as_Proj()->_con == TypeFunc::Memory, "Graph broken");
-
-  // Search behind Proj.
-  for (DUIterator_Fast jmax, j = mem_proj->fast_outs(jmax); j < jmax; j++) {
-    Node *x = mem_proj->fast_out(j);
-    if (x->is_Store() && ! x->as_Store()->is_unordered()) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
 #define __ _masm.
 
 // advance declarations for helper functions to convert register
@@ -1107,7 +1245,7 @@
   }
 
   // we have 32 float registers * 4 slots each
-  if (reg < 60 + 64) {
+  if (reg < 60 + 128) {
     return rc_float;
   }
 
@@ -1143,258 +1281,128 @@
     return 0;            // Self copy, no move.
   }
 
-  switch (src_lo_rc) {
-  case rc_int:
-    if (dst_lo_rc == rc_int) {	// gpr --> gpr copy
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ mov(as_Register(Matcher::_regEncode[dst_lo]),
-                 as_Register(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("mov  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
-        }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ movw(as_Register(Matcher::_regEncode[dst_lo]),
-                  as_Register(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("movw  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
+  bool is64 = (src_lo & 1) == 0 && src_lo + 1 == src_hi &&
+              (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi;
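+  // is64: both source and destination occupy an aligned slot pair, i.e.
+  // the value being moved is 64 bits wide.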
+  int src_offset = ra_->reg2offset(src_lo);
+  int dst_offset = ra_->reg2offset(dst_lo);
+
+  if (bottom_type()->isa_vect() != NULL) {
+    uint ireg = ideal_reg();
+    assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector");
+    if (cbuf) {
+      MacroAssembler _masm(cbuf);
+      assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity");
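+      // Vector contents never live in general registers; the only legal
+      // moves are fpu<->fpu, fpu<->stack and stack<->stack.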
+      if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
+        // stack->stack
+        assert((src_offset & 7) == 0 && (dst_offset & 7) == 0, "unaligned stack offset");
+        if (ireg == Op_VecD) {
+          __ unspill(rscratch1, true, src_offset);
+          __ spill(rscratch1, true, dst_offset);
+        } else {
+          __ spill_copy128(src_offset, dst_offset);
         }
-      }
-    } else if (dst_lo_rc == rc_float) { // gpr --> fpr copy
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
-                   as_Register(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("fmovd  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
-        }
+      } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
+        __ mov(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+               ireg == Op_VecD ? __ T8B : __ T16B,
+               as_FloatRegister(Matcher::_regEncode[src_lo]));
+      } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) {
+        __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]),
+                       ireg == Op_VecD ? __ D : __ Q,
+                       ra_->reg2offset(dst_lo));
+      } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) {
+        __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+                       ireg == Op_VecD ? __ D : __ Q,
+                       ra_->reg2offset(src_lo));
       } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
-                   as_Register(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("fmovs  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
-        }
-      }
-    } else {			// gpr --> stack spill
-      assert(dst_lo_rc == rc_stack, "spill to bad register class");
-      int dst_offset = ra_->reg2offset(dst_lo);
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ str(as_Register(Matcher::_regEncode[src_lo]),
-	         Address(sp, dst_offset));
-        } else if (st) {
-          st->print("str  %s, [sp, #%d]\t# spill",
-                    Matcher::regName[src_lo],
-		    dst_offset);
-        }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ strw(as_Register(Matcher::_regEncode[src_lo]),
-	         Address(sp, dst_offset));
-        } else if (st) {
-          st->print("strw  %s, [sp, #%d]\t# spill",
-                    Matcher::regName[src_lo],
-		    dst_offset);
-        }
+        ShouldNotReachHere();
       }
     }
-    return 4;
-  case rc_float:
-    if (dst_lo_rc == rc_int) {	// fpr --> gpr copy
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ fmovd(as_Register(Matcher::_regEncode[dst_lo]),
-                   as_FloatRegister(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("fmovd  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
-        }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ fmovs(as_Register(Matcher::_regEncode[dst_lo]),
-                   as_FloatRegister(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("fmovs  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
+  } else if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    switch (src_lo_rc) {
+    case rc_int:
+      if (dst_lo_rc == rc_int) {  // gpr --> gpr copy
+        if (is64) {
+            __ mov(as_Register(Matcher::_regEncode[dst_lo]),
+                   as_Register(Matcher::_regEncode[src_lo]));
+        } else {
+            __ movw(as_Register(Matcher::_regEncode[dst_lo]),
+                    as_Register(Matcher::_regEncode[src_lo]));
         }
-      }
-    } else if (dst_lo_rc == rc_float) { // fpr --> fpr copy
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
-                   as_FloatRegister(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("fmovd  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
-        }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
-                   as_FloatRegister(Matcher::_regEncode[src_lo]));
-        } else if (st) {
-          st->print("fmovs  %s, %s\t# shuffle",
-                    Matcher::regName[dst_lo],
-                    Matcher::regName[src_lo]);
+      } else if (dst_lo_rc == rc_float) { // gpr --> fpr copy
+        if (is64) {
+            __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+                     as_Register(Matcher::_regEncode[src_lo]));
+        } else {
+            __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+                     as_Register(Matcher::_regEncode[src_lo]));
         }
-      }
-    } else {			// fpr --> stack spill
-      assert(dst_lo_rc == rc_stack, "spill to bad register class");
-      int dst_offset = ra_->reg2offset(dst_lo);
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ strd(as_FloatRegister(Matcher::_regEncode[src_lo]),
-	         Address(sp, dst_offset));
-        } else if (st) {
-          st->print("strd  %s, [sp, #%d]\t# spill",
-                    Matcher::regName[src_lo],
-		    dst_offset);
-        }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ strs(as_FloatRegister(Matcher::_regEncode[src_lo]),
-	         Address(sp, dst_offset));
-        } else if (st) {
-          st->print("strs  %s, [sp, #%d]\t# spill",
-                    Matcher::regName[src_lo],
-		    dst_offset);
-        }
+      } else {                    // gpr --> stack spill
+        assert(dst_lo_rc == rc_stack, "spill to bad register class");
+        __ spill(as_Register(Matcher::_regEncode[src_lo]), is64, dst_offset);
       }
-    }
-    return 4;
-  case rc_stack:
-    int src_offset = ra_->reg2offset(src_lo);
-    if (dst_lo_rc == rc_int) {	// stack --> gpr load
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ ldr(as_Register(Matcher::_regEncode[dst_lo]),
-                 Address(sp, src_offset));
-        } else if (st) {
-          st->print("ldr  %s, [sp, %d]\t# restore",
-                    Matcher::regName[dst_lo],
-		    src_offset);
+      break;
+    case rc_float:
+      if (dst_lo_rc == rc_int) {  // fpr --> gpr copy
+        if (is64) {
+            __ fmovd(as_Register(Matcher::_regEncode[dst_lo]),
+                     as_FloatRegister(Matcher::_regEncode[src_lo]));
+        } else {
+            __ fmovs(as_Register(Matcher::_regEncode[dst_lo]),
+                     as_FloatRegister(Matcher::_regEncode[src_lo]));
         }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ ldrw(as_Register(Matcher::_regEncode[dst_lo]),
-                  Address(sp, src_offset));
-        } else if (st) {
-          st->print("ldr  %s, [sp, %d]\t# restore",
-                    Matcher::regName[dst_lo],
-                   src_offset);
-        }
-      }
-      return 4;
-    } else if (dst_lo_rc == rc_float) { // stack --> fpr load
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ ldrd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
-                 Address(sp, src_offset));
-        } else if (st) {
-          st->print("ldrd  %s, [sp, %d]\t# restore",
-                    Matcher::regName[dst_lo],
-		    src_offset);
+      } else if (dst_lo_rc == rc_float) { // fpr --> fpr copy
+        if (is64) {
+            __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+                     as_FloatRegister(Matcher::_regEncode[src_lo]));
+        } else {
+            __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+                     as_FloatRegister(Matcher::_regEncode[src_lo]));
         }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ ldrs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
-                  Address(sp, src_offset));
-        } else if (st) {
-          st->print("ldrs  %s, [sp, %d]\t# restore",
-                    Matcher::regName[dst_lo],
-                   src_offset);
-        }
+      } else {                    // fpr --> stack spill
+        assert(dst_lo_rc == rc_stack, "spill to bad register class");
+        __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]),
+                 is64 ? __ D : __ S, dst_offset);
       }
-      return 4;
-    } else {			// stack --> stack copy
-      assert(dst_lo_rc == rc_stack, "spill to bad register class");
-      int dst_offset = ra_->reg2offset(dst_lo);
-      if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
-          (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
-          // 64 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ ldr(rscratch1, Address(sp, src_offset));
-          __ str(rscratch1, Address(sp, dst_offset));
-        } else if (st) {
-          st->print("ldr  rscratch1, [sp, %d]\t# mem-mem spill",
-		    src_offset);
-          st->print("\n\t");
-          st->print("str  rscratch1, [sp, %d]",
-		    dst_offset);
-        }
-      } else {
-        // 32 bit
-        if (cbuf) {
-          MacroAssembler _masm(cbuf);
-          __ ldrw(rscratch1, Address(sp, src_offset));
-          __ strw(rscratch1, Address(sp, dst_offset));
-        } else if (st) {
-          st->print("ldrw  rscratch1, [sp, %d]\t# mem-mem spill",
-		    src_offset);
-          st->print("\n\t");
-          st->print("strw  rscratch1, [sp, %d]",
-		    dst_offset);
-        }
+      break;
+    case rc_stack:
+      if (dst_lo_rc == rc_int) {  // stack --> gpr load
+        __ unspill(as_Register(Matcher::_regEncode[dst_lo]), is64, src_offset);
+      } else if (dst_lo_rc == rc_float) { // stack --> fpr load
+        __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+                   is64 ? __ D : __ S, src_offset);
+      } else {                    // stack --> stack copy
+        assert(dst_lo_rc == rc_stack, "spill to bad register class");
+        __ unspill(rscratch1, is64, src_offset);
+        __ spill(rscratch1, is64, dst_offset);
       }
-      return 8;
+      break;
+    default:
+      assert(false, "bad rc_class for spill");
+      ShouldNotReachHere();
     }
   }
 
-  assert(false," bad rc_class for spill ");
-  Unimplemented();
+  if (st) {
+    st->print("spill ");
+    if (src_lo_rc == rc_stack) {
+      st->print("[sp, #%d] -> ", ra_->reg2offset(src_lo));
+    } else {
+      st->print("%s -> ", Matcher::regName[src_lo]);
+    }
+    if (dst_lo_rc == rc_stack) {
+      st->print("[sp, #%d]", ra_->reg2offset(dst_lo));
+    } else {
+      st->print("%s", Matcher::regName[dst_lo]);
+    }
+    if (bottom_type()->isa_vect() != NULL) {
+      st->print("\t# vector spill size = %d", ideal_reg()==Op_VecD ? 64:128);
+    } else {
+      st->print("\t# spill size = %d", is64 ? 64:32);
+    }
+  }
+
   return 0;
 
 }
@@ -1413,7 +1421,7 @@
 }
 
 uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
-  return implementation(NULL, ra_, true, NULL);
+  return MachNode::size(ra_);
 }
 
 //=============================================================================
@@ -1567,8 +1575,12 @@
 
 // Vector width in bytes.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
-  // TODO fixme
-  return 0;
+  int size = MIN2(16, (int)MaxVectorSize);
+  // Minimum 2 values in vector
+  if (size < 2*type2aelembytes(bt)) size = 0;
+  // But never < 4
+  if (size < 4) size = 0;
+  return size;
 }
 
 // Limits on vector size (number of elements) loaded into vector.
@@ -1576,22 +1588,24 @@
   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 }
 const int Matcher::min_vector_size(const BasicType bt) {
-  int max_size = max_vector_size(bt);
-  // Min size which can be loaded into vector is 4 bytes.
-  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
-  return MIN2(size,max_size);
+  // For the moment limit the vector size to 8 bytes
+  int size = 8 / type2aelembytes(bt);
+  if (size < 2) size = 2;
+  return size;
 }
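+// Worked example: T_INT gives 8/4 = 2 elements and T_BYTE gives 8/1 = 8,
+// while T_LONG gives 8/8 = 1, raised to the two-element minimum by the clamp.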
 
 // Vector ideal reg.
 const int Matcher::vector_ideal_reg(int len) {
-  // TODO fixme
-  return Op_RegD;
+  switch (len) {
+    case  8: return Op_VecD;
+    case 16: return Op_VecX;
+  }
+  ShouldNotReachHere();
+  return 0;
 }
 
-// Only lowest bits of xmm reg are used for vector shift count.
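+// On AArch64 the variable vector shift count is held in a vector register,
+// hence Op_VecX rather than a scalar ideal register.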
 const int Matcher::vector_shift_count_ideal_reg(int size) {
-  // TODO fixme
-  return Op_RegL;
+  return Op_VecX;
 }
 
 // AES support not yet implemented
@@ -1601,9 +1615,7 @@
 
-// x86 supports misaligned vectors store/load.
+// AArch64 supports misaligned vectors store/load.
 const bool Matcher::misaligned_vectors_ok() {
-  // TODO fixme
-  // return !AlignVector; // can be changed by flag
-  return false;
+  return !AlignVector; // can be changed by flag
 }
 
 // false => size gets scaled to BytesPerLong, ok.
@@ -1746,7 +1758,7 @@
 }
 
 const RegMask Matcher::method_handle_invoke_SP_save_mask() {
-  return RegMask();
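+  // rfp is used to save and restore SP around a method handle invoke.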
+  return FP_REG_mask();
 }
 
 // helper for encoding java_to_runtime calls on sim
@@ -1802,6 +1814,8 @@
 
 typedef void (MacroAssembler::* mem_insn)(Register Rt, const Address &adr);
 typedef void (MacroAssembler::* mem_float_insn)(FloatRegister Rt, const Address &adr);
+typedef void (MacroAssembler::* mem_vector_insn)(FloatRegister Rt,
+                                  MacroAssembler::SIMD_RegVariant T, const Address &adr);
 
   // Used for all non-volatile memory accesses.  The use of
   // $mem->opcode() to discover whether this pattern uses sign-extended
@@ -1820,6 +1834,8 @@
     case INDINDEXSCALEDI2L:
     case INDINDEXSCALEDOFFSETI2LN:
     case INDINDEXSCALEDI2LN:
+    case INDINDEXOFFSETI2L:
+    case INDINDEXOFFSETI2LN:
       scale = Address::sxtw(size);
       break;
     default:
@@ -1867,6 +1883,18 @@
     }
   }
 
+  static void loadStore(MacroAssembler masm, mem_vector_insn insn,
+                         FloatRegister reg, MacroAssembler::SIMD_RegVariant T,
+                         int opcode, Register base, int index, int size, int disp)
+  {
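+    // As with the scalar loadStore helpers above: index == -1 selects plain
+    // base + displacement addressing; otherwise a register index scaled by
+    // lsl(size) is used and the displacement must be zero.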
+    if (index == -1) {
+      (masm.*insn)(reg, T, Address(base, disp));
+    } else {
+      assert(disp == 0, "unsupported address mode");
+      (masm.*insn)(reg, T, Address(base, as_Register(index), Address::lsl(size)));
+    }
+  }
+
 %}
 
 
@@ -1998,6 +2026,24 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
+  enc_class aarch64_enc_ldrvS(vecD dst, memory mem) %{
+    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::S,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_ldrvD(vecD dst, memory mem) %{
+    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::D,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_ldrvQ(vecX dst, memory mem) %{
+    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::Q,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   enc_class aarch64_enc_strb(iRegI src, memory mem) %{
     Register src_reg = as_Register($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::strb, src_reg, $mem->opcode(),
@@ -2066,6 +2112,24 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
+  enc_class aarch64_enc_strvS(vecD src, memory mem) %{
+    FloatRegister src_reg = as_FloatRegister($src$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::S,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_strvD(vecD src, memory mem) %{
+    FloatRegister src_reg = as_FloatRegister($src$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::D,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_strvQ(vecX src, memory mem) %{
+    FloatRegister src_reg = as_FloatRegister($src$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::Q,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   // END Non-volatile memory access
 
   // this encoding writes the address of the first instruction in the
@@ -2135,16 +2199,22 @@
   enc_class aarch64_enc_stlrb(iRegI src, memory mem) %{
     MOV_VOLATILE(as_Register($src$$reg), $mem$$base, $mem$$index, $mem$$scale, $mem$$disp,
 		 rscratch1, stlrb);
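+    // When VM_Version reports CPU_DMB_ATOMICS, a trailing full barrier is
+    // emitted after the releasing store rather than relying on the stlr
+    // ordering alone; the same pattern repeats for the other stlr forms.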
+    if (VM_Version::cpu_cpuFeatures() & VM_Version::CPU_DMB_ATOMICS)
+      __ dmb(__ ISH);
   %}
 
   enc_class aarch64_enc_stlrh(iRegI src, memory mem) %{
     MOV_VOLATILE(as_Register($src$$reg), $mem$$base, $mem$$index, $mem$$scale, $mem$$disp,
 		 rscratch1, stlrh);
+    if (VM_Version::cpu_cpuFeatures() & VM_Version::CPU_DMB_ATOMICS)
+      __ dmb(__ ISH);
   %}
 
   enc_class aarch64_enc_stlrw(iRegI src, memory mem) %{
     MOV_VOLATILE(as_Register($src$$reg), $mem$$base, $mem$$index, $mem$$scale, $mem$$disp,
 		 rscratch1, stlrw);
+    if (VM_Version::cpu_cpuFeatures() & VM_Version::CPU_DMB_ATOMICS)
+      __ dmb(__ ISH);
   %}
 
 
@@ -2235,6 +2305,8 @@
     }
     MOV_VOLATILE(src_reg, $mem$$base, $mem$$index, $mem$$scale, $mem$$disp,
 		 rscratch1, stlr);
+    if (VM_Version::cpu_cpuFeatures() & VM_Version::CPU_DMB_ATOMICS)
+      __ dmb(__ ISH);
   %}
 
   enc_class aarch64_enc_fstlrs(vRegF src, memory mem) %{
@@ -2245,6 +2317,8 @@
     }
     MOV_VOLATILE(rscratch2, $mem$$base, $mem$$index, $mem$$scale, $mem$$disp,
 		 rscratch1, stlrw);
+    if (VM_Version::cpu_cpuFeatures() & VM_Version::CPU_DMB_ATOMICS)
+      __ dmb(__ ISH);
   %}
 
   enc_class aarch64_enc_fstlrd(vRegD src, memory mem) %{
@@ -2255,6 +2329,8 @@
     }
     MOV_VOLATILE(rscratch2, $mem$$base, $mem$$index, $mem$$scale, $mem$$disp,
 		 rscratch1, stlr);
+    if (VM_Version::cpu_cpuFeatures() & VM_Version::CPU_DMB_ATOMICS)
+      __ dmb(__ ISH);
   %}
 
   // synchronized read/update encodings
@@ -2413,16 +2489,13 @@
     int disp = $mem$$disp;
     if (index == -1) {
       __ prfm(Address(base, disp), PLDL1KEEP);
-      __ nop();
     } else {
       Register index_reg = as_Register(index);
       if (disp == 0) {
-        // __ prfm(Address(base, index_reg, Address::lsl(scale)), PLDL1KEEP);
-        __ nop();
+        __ prfm(Address(base, index_reg, Address::lsl(scale)), PLDL1KEEP);
       } else {
         __ lea(rscratch1, Address(base, disp));
 	__ prfm(Address(rscratch1, index_reg, Address::lsl(scale)), PLDL1KEEP);
-        __ nop();
       }
     }
   %}
@@ -2440,11 +2513,9 @@
       Register index_reg = as_Register(index);
       if (disp == 0) {
         __ prfm(Address(base, index_reg, Address::lsl(scale)), PSTL1KEEP);
-        __ nop();
       } else {
         __ lea(rscratch1, Address(base, disp));
 	__ prfm(Address(rscratch1, index_reg, Address::lsl(scale)), PSTL1KEEP);
-        __ nop();
       }
     }
   %}
@@ -2457,7 +2528,6 @@
     int disp = $mem$$disp;
     if (index == -1) {
       __ prfm(Address(base, disp), PSTL1STRM);
-      __ nop();
     } else {
       Register index_reg = as_Register(index);
       if (disp == 0) {
@@ -2466,7 +2536,6 @@
       } else {
         __ lea(rscratch1, Address(base, disp));
 	__ prfm(Address(rscratch1, index_reg, Address::lsl(scale)), PSTL1STRM);
-        __ nop();
       }
     }
   %}
@@ -2979,7 +3048,8 @@
       __ ldxr(tmp, oop);
       __ cmp(tmp, disp_hdr);
       __ br(Assembler::NE, cas_failed);
-      __ stxr(tmp, box, oop);
+      // use stlxr to ensure update is immediately visible
+      __ stlxr(tmp, box, oop);
       __ cbzw(tmp, cont);
       __ b(retry_load);
     }
@@ -3028,7 +3098,8 @@
 	__ ldxr(rscratch1, tmp);
 	__ cmp(disp_hdr, rscratch1);
 	__ br(Assembler::NE, fail);
-	__ stxr(rscratch1, rthread, tmp);
+        // use stlxr to ensure update is immediately visible
+	__ stlxr(rscratch1, rthread, tmp);
 	__ cbnzw(rscratch1, retry_load);
 	__ bind(fail);
       }
@@ -3116,7 +3187,8 @@
 	__ ldxr(tmp, oop);
 	__ cmp(box, tmp);
 	__ br(Assembler::NE, cas_failed);
-	__ stxr(tmp, disp_hdr, oop);
+        // use stlxr to ensure update is immediately visible
+	__ stlxr(tmp, disp_hdr, oop);
 	__ cbzw(tmp, cont);
 	__ b(retry_load);
       }
@@ -3392,6 +3464,16 @@
   interface(CONST_INTER);
 %}
 
+operand immI_le_4()
+%{
+  predicate(n->get_int() <= 4);
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 operand immI_31()
 %{
   predicate(n->get_int() == 31);
@@ -4116,6 +4198,18 @@
   interface(REG_INTER);
 %}
 
+// Register R3 only
+operand iRegI_R3()
+%{
+  constraint(ALLOC_IN_RC(int_r3_reg));
+  match(RegI);
+  match(iRegINoSp);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
-// Register R2 only
+// Register R4 only
 operand iRegI_R4()
 %{
@@ -4185,6 +4279,62 @@
   interface(REG_INTER);
 %}
 
+operand vecD()
+%{
+  constraint(ALLOC_IN_RC(vectord_reg));
+  match(VecD);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX()
+%{
+  constraint(ALLOC_IN_RC(vectorx_reg));
+  match(VecX);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V0()
+%{
+  constraint(ALLOC_IN_RC(v0_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V1()
+%{
+  constraint(ALLOC_IN_RC(v1_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V2()
+%{
+  constraint(ALLOC_IN_RC(v2_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vRegD_V3()
+%{
+  constraint(ALLOC_IN_RC(v3_reg));
+  match(RegD);
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 // Flags register, used as output of signed compare instructions
 
 // note that on AArch64 we also use this register as the output for
@@ -4311,6 +4461,20 @@
   %}
 %}
 
+operand indIndexOffsetI2L(iRegP reg, iRegI ireg, immLU12 off)
+%{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP (AddP reg (ConvI2L ireg)) off);
+  op_cost(INSN_COST);
+  format %{ "$reg, $ireg, $off I2L" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($ireg);
+    scale(0x0);
+    disp($off);
+  %}
+%}
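+// (Matched through the INDINDEXOFFSETI2L cases added to the loadStore
+// switch above, where the 32-bit index is sign-extended via sxtw.)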
+
 operand indIndexScaledOffsetI2L(iRegP reg, iRegI ireg, immIScale scale, immLU12 off)
 %{
   constraint(ALLOC_IN_RC(ptr_reg));
@@ -4371,7 +4535,7 @@
 %{
   constraint(ALLOC_IN_RC(ptr_reg));
   match(AddP reg off);
-  op_cost(INSN_COST);
+  op_cost(0);
   format %{ "[$reg, $off]" %}
   interface(MEMORY_INTER) %{
     base($reg);
@@ -4441,6 +4605,21 @@
   %}
 %}
 
+operand indIndexOffsetI2LN(iRegN reg, iRegI ireg, immLU12 off)
+%{
+  predicate(Universe::narrow_oop_shift() == 0);
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP (AddP (DecodeN reg) (ConvI2L ireg)) off);
+  op_cost(INSN_COST);
+  format %{ "$reg, $ireg, $off I2L\t# narrow" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($ireg);
+    scale(0x0);
+    disp($off);
+  %}
+%}
+
 operand indIndexScaledOffsetI2LN(iRegN reg, iRegI ireg, immIScale scale, immLU12 off)
 %{
   predicate(Universe::narrow_oop_shift() == 0);
@@ -4692,6 +4871,7 @@
   interface(REG_INTER)
 %}
 
+opclass vmem(indirect, indIndex, indOffI, indOffL);
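+// (vmem restricts vector loads and stores to plain base, base + register
+// index, and base + offset addressing; see the ldrv/strv encodings above.)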
 
 //----------OPERAND CLASSES----------------------------------------------------
 // Operand Classes are groups of operands that are used as to simplify
@@ -4703,8 +4883,10 @@
 // memory is used to define read/write location for load/store
 // instruction defs. we can turn a memory op into an Address
 
-opclass memory(indirect, indIndexScaledOffsetI,  indIndexScaledOffsetL, indIndexScaledOffsetI2L, indIndexScaled, indIndexScaledI2L, indIndex, indOffI, indOffL,
-	       indirectN, indIndexScaledOffsetIN,  indIndexScaledOffsetLN, indIndexScaledOffsetI2LN, indIndexScaledN, indIndexScaledI2LN, indIndexN, indOffIN, indOffLN);
+opclass memory(indirect, indIndexScaledOffsetI, indIndexScaledOffsetL, indIndexOffsetI2L, indIndexScaledOffsetI2L, indIndexScaled, indIndexScaledI2L, indIndex, indOffI, indOffL,
+               indirectN, indIndexScaledOffsetIN, indIndexScaledOffsetLN, indIndexOffsetI2LN, indIndexScaledOffsetI2LN, indIndexScaledN, indIndexScaledI2LN, indIndexN, indOffIN, indOffLN);
 
 
 // iRegIorL2I is used for src inputs in rules for 32 bit int (I)
@@ -4720,7 +4902,6 @@
 // the result of the l2i as an iRegI input. That's a shame since the
 // movw is actually redundant but its not too costly.
 
-
 opclass iRegIorL2I(iRegI, iRegL2I);
 
 //----------PIPELINE-----------------------------------------------------------
@@ -4731,17 +4912,14 @@
 attributes %{
   // ARM instructions are of fixed length
-  fixed_size_instructions;        // Fixed size instructions TODO does
+  fixed_size_instructions;        // Fixed size instructions
-  // TODO does this relate to how many instructions can be scheduled
-  // at once? just guess 8 for now
-  max_instructions_per_bundle = 8;   // Up to 8 instructions per bundle
+  max_instructions_per_bundle = 2;   // A53 = 2, A57 = 4
   // ARM instructions come in 32-bit word units
   instruction_unit_size = 4;         // An instruction is 4 bytes long
-  // TODO identify correct cache line size  just guess 64 for now
   instruction_fetch_unit_size = 64;  // The processor fetches one line
   instruction_fetch_units = 1;       // of 64 bytes
 
   // List of nop instructions
-  //nops( MachNop );
+  nops( MachNop );
 %}
 
 // We don't use an actual pipeline model so don't care about resources
@@ -4751,21 +4929,386 @@
 //----------RESOURCES----------------------------------------------------------
 // Resources are the functional units available to the machine
 
-resources( D0, D1, D2, DECODE = D0 | D1 | D2,
-           MS0, MS1, MS2, MEM = MS0 | MS1 | MS2,
-           BR, FPU,
-           ALU0, ALU1, ALU2, ALU = ALU0 | ALU1 | ALU2);
+resources( INS0, INS1, INS01 = INS0 | INS1,
+           ALU0, ALU1, ALU = ALU0 | ALU1,
+           MAC,
+           DIV,
+           BRANCH,
+           LDST,
+           NEON_FP);
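+
+// INS0/INS1 model a two-wide in-order issue stage (A53-class, matching the
+// max_instructions_per_bundle setting above); the pipe classes below claim
+// these and the named units to describe each instruction pattern.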
 
 //----------PIPELINE DESCRIPTION-----------------------------------------------
 // Pipeline Description specifies the stages in the machine's pipeline
 
-// Generic P2/P3 pipeline
-pipe_desc(S0, S1, S2, S3, S4, S5);
+pipe_desc(ISS, EX1, EX2, WR);
 
 //----------PIPELINE CLASSES---------------------------------------------------
 // Pipeline Classes describe the stages in which input and output are
 // referenced by the hardware pipeline.
 
+//------- Integer ALU operations --------------------------
+
+// Integer ALU reg-reg operation
+// Operands needed in EX1, result generated in EX2
+// Eg.	ADD	x0, x1, x2
+pipe_class ialu_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  src2   : EX1(read);
+  INS01  : ISS; // Dual issue as instruction 0 or 1
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-reg operation with constant shift
+// Shifted register must be available in LATE_ISS instead of EX1
+// Eg.	ADD	x0, x1, x2, LSL #2
+pipe_class ialu_reg_reg_shift(iRegI dst, iRegI src1, iRegI src2, immI shift)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg operation with constant shift
+// Eg.	LSL	x0, x1, #shift
+pipe_class ialu_reg_shift(iRegI dst, iRegI src1)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-reg operation with variable shift
+// Both operands must be available in LATE_ISS instead of EX1
+// Result is available in EX1 instead of EX2
+// Eg.	LSLV	x0, x1, x2
+pipe_class ialu_reg_reg_vshift(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX1(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  ALU    : EX1;
+%}
+
+// Integer ALU reg-reg operation with extract
+// As for _vshift above, but result generated in EX2
+// Eg.	EXTR	x0, x1, x2, #N
+pipe_class ialu_reg_reg_extr(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS1   : ISS; // Can only dual issue as Instruction 1
+  ALU    : EX1;
+%}
+
+// Integer ALU reg operation
+// Eg.	NEG	x0, x1
+pipe_class ialu_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU reg-immediate operation
+// Eg.	ADD	x0, x1, #N
+pipe_class ialu_reg_imm(iRegI dst, iRegI src1)
+%{
+  single_instruction;
+  dst    : EX2(write);
+  src1   : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Integer ALU immediate operation (no source operands)
+// Eg.	MOV	x0, #N
+pipe_class ialu_imm(iRegI dst)
+%{
+  single_instruction;
+  dst    : EX1(write);
+  INS01  : ISS;
+  ALU    : EX1;
+%}
+
+//------- Compare operation -------------------------------
+
+// Compare reg-reg
+// Eg.	CMP	x0, x1
+pipe_class icmp_reg_reg(rFlagsReg cr, iRegI op1, iRegI op2)
+%{
+  single_instruction;
+//  fixed_latency(16);
+  cr     : EX2(write);
+  op1    : EX1(read);
+  op2    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Compare reg-immediate
+// Eg.	CMP	x0, #N
+pipe_class icmp_reg_imm(rFlagsReg cr, iRegI op1)
+%{
+  single_instruction;
+//  fixed_latency(16);
+  cr     : EX2(write);
+  op1    : EX1(read);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+//------- Conditional instructions ------------------------
+
+// Conditional no operands
+// Eg.	CSINC	x0, zr, zr, <cond>
+pipe_class icond_none(iRegI dst, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Conditional 2 operand
+// Eg.	CSEL	x0, x1, x2, <cond>
+pipe_class icond_reg_reg(iRegI dst, iRegI src1, iRegI src2, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  src1   : EX1(read);
+  src2   : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+// Conditional 1 operand
+// Eg.	CSEL	x0, x1, zr, <cond>
+pipe_class icond_reg(iRegI dst, iRegI src, rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  src    : EX1(read);
+  dst    : EX2(write);
+  INS01  : ISS;
+  ALU    : EX2;
+%}
+
+//------- Multiply pipeline operations --------------------
+
+// Multiply reg-reg
+// Eg.	MUL	w0, w1, w2
+pipe_class imul_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Multiply accumulate
+// Eg.	MADD	w0, w1, w2, w3
+pipe_class imac_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  src3   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Multiply reg-reg (64 bit)
+// Eg.	MUL	x0, x1, x2
+pipe_class lmul_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(3); // Maximum latency for 64 bit mul
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+// Multiply accumulate (64 bit)
+// Eg.	MADD	x0, x1, x2, x3
+pipe_class lmac_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3)
+%{
+  single_instruction;
+  fixed_latency(3); // Maximum latency for 64 bit mul
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  src3   : ISS(read);
+  INS01  : ISS;
+  MAC    : WR;
+%}
+
+//------- Divide pipeline operations --------------------
+
+// Eg.	SDIV	w0, w1, w2
+pipe_class idiv_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(8); // Maximum latency for 32 bit divide
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS0   : ISS; // Can only dual issue as instruction 0
+  DIV    : WR;
+%}
+
+// Eg.	SDIV	x0, x1, x2
+pipe_class ldiv_reg_reg(iRegI dst, iRegI src1, iRegI src2)
+%{
+  single_instruction;
+  fixed_latency(16); // Maximum latency for 64 bit divide
+  dst    : WR(write);
+  src1   : ISS(read);
+  src2   : ISS(read);
+  INS0   : ISS; // Can only dual issue as instruction 0
+  DIV    : WR;
+%}
+
+//------- Load pipeline operations ------------------------
+
+// Load - prefetch
+// Eg.	PRFM	<mem>
+pipe_class iload_prefetch(memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Load - reg, mem
+// Eg.	LDR	x0, <mem>
+pipe_class iload_reg_mem(iRegI dst, memory mem)
+%{
+  single_instruction;
+  dst    : WR(write);
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Load - reg, reg
+// Eg.	LDR	x0, [sp, x1]
+pipe_class iload_reg_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : WR(write);
+  src    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+//------- Store pipeline operations -----------------------
+
+// Store - zr, mem
+// Eg.	STR	zr, <mem>
+pipe_class istore_mem(memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Store - reg, mem
+// Eg.	STR	x0, <mem>
+pipe_class istore_reg_mem(iRegI src, memory mem)
+%{
+  single_instruction;
+  mem    : ISS(read);
+  src    : EX2(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+// Store - reg, reg
+// Eg. STR	x0, [sp, x1]
+pipe_class istore_reg_reg(iRegI dst, iRegI src)
+%{
+  single_instruction;
+  dst    : ISS(read);
+  src    : EX2(read);
+  INS01  : ISS;
+  LDST   : WR;
+%}
+
+//------- Branch pipeline operations ----------------------
+
+// Branch
+pipe_class pipe_branch()
+%{
+  single_instruction;
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+// Conditional branch
+pipe_class pipe_branch_cond(rFlagsReg cr)
+%{
+  single_instruction;
+  cr     : EX1(read);
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+// Compare & Branch
+// Eg.	CBZ/CBNZ
+pipe_class pipe_cmp_branch(iRegI op1)
+%{
+  single_instruction;
+  op1    : EX1(read);
+  INS01  : ISS;
+  BRANCH : EX1;
+%}
+
+//------- Synchronisation operations ----------------------
+
+// Any operation requiring serialization.
+// Eg.	DMB/Atomic Ops/Load Acquire/Str Release
+pipe_class pipe_serial()
+%{
+  single_instruction;
+  force_serialization;
+  fixed_latency(16);
+  INS01  : ISS(2); // Cannot dual issue with any other instruction
+  LDST   : WR;
+%}
+
+// Generic big/slow expanded idiom - also serialized
+pipe_class pipe_slow()
+%{
+  instruction_count(10);
+  multiple_bundles;
+  force_serialization;
+  fixed_latency(16);
+  INS01  : ISS(2); // Cannot dual issue with any other instruction
+  LDST   : WR;
+%}
+
 // Empty pipeline class
 pipe_class pipe_class_empty()
 %{
@@ -4787,13 +5330,6 @@
   fixed_latency(16);
 %}
 
-// Pipeline class for traps.
-pipe_class pipe_class_trap()
-%{
-  single_instruction;
-  fixed_latency(100);
-%}
-
 // Pipeline class for memory operations.
 pipe_class pipe_class_memory()
 %{
@@ -4810,7 +5346,7 @@
 
 // Define the class for the Nop node.
 define %{
-   MachNop = pipe_class_default;
+   MachNop = pipe_class_empty;
 %}
 
 %}
@@ -4844,168 +5380,168 @@
 instruct loadB(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadB mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsbw  $dst, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_ldrsbw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Byte (8 bit signed) into long
 instruct loadB2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadB mem)));
-  predicate(n->in(1)->as_Load()->is_unordered());
+  // predicate(n->in(1)->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsb  $dst, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_ldrsb(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Byte (8 bit unsigned)
 instruct loadUB(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadUB mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrbw  $dst, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_ldrb(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Byte (8 bit unsigned) into long
 instruct loadUB2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadUB mem)));
-  predicate(n->in(1)->as_Load()->is_unordered());
+  // predicate(n->in(1)->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrb  $dst, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_ldrb(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Short (16 bit signed)
 instruct loadS(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadS mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrshw  $dst, $mem\t# short" %}
 
   ins_encode(aarch64_enc_ldrshw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Short (16 bit signed) into long
 instruct loadS2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadS mem)));
-  predicate(n->in(1)->as_Load()->is_unordered());
+  // predicate(n->in(1)->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsh  $dst, $mem\t# short" %}
 
   ins_encode(aarch64_enc_ldrsh(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Char (16 bit unsigned)
 instruct loadUS(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadUS mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrh  $dst, $mem\t# short" %}
 
   ins_encode(aarch64_enc_ldrh(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Short/Char (16 bit unsigned) into long
 instruct loadUS2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadUS mem)));
-  predicate(n->in(1)->as_Load()->is_unordered());
+  // predicate(n->in(1)->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrh  $dst, $mem\t# short" %}
 
   ins_encode(aarch64_enc_ldrh(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Integer (32 bit signed)
 instruct loadI(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadI mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# int" %}
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Integer (32 bit signed) into long
 instruct loadI2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadI mem)));
-  predicate(n->in(1)->as_Load()->is_unordered());
+  // predicate(n->in(1)->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsw  $dst, $mem\t# int" %}
 
   ins_encode(aarch64_enc_ldrsw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Integer (32 bit unsigned) into long
 instruct loadUI2L(iRegLNoSp dst, memory mem, immL_32bits mask)
 %{
   match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
-  predicate(n->in(1)->in(1)->as_Load()->is_unordered());
+  // predicate(n->in(1)->in(1)->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# int" %}
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Long (64 bit signed)
 instruct loadL(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (LoadL mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldr  $dst, $mem\t# int" %}
 
   ins_encode(aarch64_enc_ldr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Range
@@ -5018,70 +5554,70 @@
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Pointer
 instruct loadP(iRegPNoSp dst, memory mem)
 %{
   match(Set dst (LoadP mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldr  $dst, $mem\t# ptr" %}
 
   ins_encode(aarch64_enc_ldr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Compressed Pointer
 instruct loadN(iRegNNoSp dst, memory mem)
 %{
   match(Set dst (LoadN mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# compressed ptr" %}
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Klass Pointer
 instruct loadKlass(iRegPNoSp dst, memory mem)
 %{
   match(Set dst (LoadKlass mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldr  $dst, $mem\t# class" %}
 
   ins_encode(aarch64_enc_ldr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Narrow Klass Pointer
 instruct loadNKlass(iRegNNoSp dst, memory mem)
 %{
   match(Set dst (LoadNKlass mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# compressed class ptr" %}
 
   ins_encode(aarch64_enc_ldrw(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_mem);
 %}
 
 // Load Float
 instruct loadF(vRegF dst, memory mem)
 %{
   match(Set dst (LoadF mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrs  $dst, $mem\t# float" %}
@@ -5095,7 +5631,7 @@
 instruct loadD(vRegD dst, memory mem)
 %{
   match(Set dst (LoadD mem));
-  predicate(n->as_Load()->is_unordered());
+  // predicate(n->as_Load()->is_unordered());
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrd  $dst, $mem\t# double" %}
@@ -5116,7 +5652,7 @@
 
   ins_encode( aarch64_enc_movw_imm(dst, src) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Long Constant
@@ -5129,7 +5665,7 @@
 
   ins_encode( aarch64_enc_mov_imm(dst, src) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Pointer Constant
@@ -5145,7 +5681,7 @@
 
   ins_encode(aarch64_enc_mov_p(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Null Pointer Constant
@@ -5159,7 +5695,7 @@
 
   ins_encode(aarch64_enc_mov_p0(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Pointer Constant One
@@ -5173,7 +5709,7 @@
 
   ins_encode(aarch64_enc_mov_p1(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Poll Page Constant
@@ -5187,7 +5723,7 @@
 
   ins_encode(aarch64_enc_mov_poll_page(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Byte Map Base Constant
@@ -5201,7 +5737,7 @@
 
   ins_encode(aarch64_enc_mov_byte_map_base(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Narrow Pointer Constant
@@ -5215,7 +5751,7 @@
 
   ins_encode(aarch64_enc_mov_n(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Narrow Null Pointer Constant
@@ -5229,7 +5765,7 @@
 
   ins_encode(aarch64_enc_mov_n0(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Narrow Klass Constant
@@ -5243,7 +5779,7 @@
 
   ins_encode(aarch64_enc_mov_nk(dst, con));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_imm);
 %}
 
 // Load Packed Float Constant
@@ -5319,62 +5855,62 @@
 
   ins_encode(aarch64_enc_strb0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Byte
-instruct storeB(iRegI src, memory mem)
+instruct storeB(iRegIorL2I src, memory mem)
 %{
   match(Set mem (StoreB mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strb  $src, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_strb(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 
 instruct storeimmB0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreB mem zero));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strb zr, $mem\t# byte" %}
 
   ins_encode(aarch64_enc_strb0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Char/Short
-instruct storeC(iRegI src, memory mem)
+instruct storeC(iRegIorL2I src, memory mem)
 %{
   match(Set mem (StoreC mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strh  $src, $mem\t# short" %}
 
   ins_encode(aarch64_enc_strh(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 instruct storeimmC0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreC mem zero));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strh  zr, $mem\t# short" %}
 
   ins_encode(aarch64_enc_strh0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Integer
@@ -5382,83 +5918,83 @@
 instruct storeI(iRegIorL2I src, memory mem)
 %{
   match(Set mem(StoreI mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strw  $src, $mem\t# int" %}
 
   ins_encode(aarch64_enc_strw(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 instruct storeimmI0(immI0 zero, memory mem)
 %{
   match(Set mem(StoreI mem zero));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strw  zr, $mem\t# int" %}
 
   ins_encode(aarch64_enc_strw0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Long (64 bit signed)
 instruct storeL(iRegL src, memory mem)
 %{
   match(Set mem (StoreL mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "str  $src, $mem\t# int" %}
 
   ins_encode(aarch64_enc_str(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 // Store Long (64 bit signed)
 instruct storeimmL0(immL0 zero, memory mem)
 %{
   match(Set mem (StoreL mem zero));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "str  zr, $mem\t# int" %}
 
   ins_encode(aarch64_enc_str0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Pointer
 instruct storeP(iRegP src, memory mem)
 %{
   match(Set mem (StoreP mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "str  $src, $mem\t# ptr" %}
 
   ins_encode(aarch64_enc_str(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 // Store Pointer
 instruct storeimmP0(immP0 zero, memory mem)
 %{
   match(Set mem (StoreP mem zero));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "str zr, $mem\t# ptr" %}
 
   ins_encode(aarch64_enc_str0(mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Save last Java PC to thread anchor
@@ -5488,7 +6024,7 @@
 
   ins_encode(aarch64_enc_save_pc());
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 instruct storeLastJavaPC_with_retaddr(thread_anchor_pc mem, immP_M2 dummy_m2)
@@ -5503,43 +6039,44 @@
 
   ins_encode(aarch64_enc_save_pc());
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_mem);
 %}
 
 // Store Compressed Pointer
 instruct storeN(iRegN src, memory mem)
 %{
   match(Set mem (StoreN mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strw  $src, $mem\t# compressed ptr" %}
 
   ins_encode(aarch64_enc_strw(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 instruct storeImmN0(iRegIHeapbase heapbase, immN0 zero, memory mem)
 %{
   match(Set mem (StoreN mem zero));
   predicate(Universe::narrow_oop_base() == NULL &&
-            Universe::narrow_klass_base() == NULL &&
-            n->as_Store()->is_unordered());
+            Universe::narrow_klass_base() == NULL //  &&
+            // n->as_Store()->is_unordered()
+            );
 
   ins_cost(INSN_COST);
   format %{ "strw  rheapbase, $mem\t# compressed ptr (rheapbase==0)" %}
 
   ins_encode(aarch64_enc_strw(heapbase, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 // Store Float
 instruct storeF(vRegF src, memory mem)
 %{
   match(Set mem (StoreF mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strs  $src, $mem\t# float" %}
@@ -5556,7 +6093,7 @@
 instruct storeD(vRegD src, memory mem)
 %{
   match(Set mem (StoreD mem src));
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
 
   ins_cost(INSN_COST);
   format %{ "strd  $src, $mem\t# double" %}
@@ -5569,7 +6106,7 @@
 // Store Compressed Klass Pointer
 instruct storeNKlass(iRegN src, memory mem)
 %{
-  predicate(n->as_Store()->is_unordered());
+//   predicate(n->as_Store()->is_unordered());
   match(Set mem (StoreNKlass mem src));
 
   ins_cost(INSN_COST);
@@ -5577,7 +6114,7 @@
 
   ins_encode(aarch64_enc_strw(src, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_mem);
 %}
 
 // TODO
@@ -5594,7 +6131,7 @@
 
   ins_encode( aarch64_enc_prefetchr(mem) );
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_prefetch);
 %}
 
 instruct prefetchw( memory mem ) %{
@@ -5605,7 +6142,7 @@
 
   ins_encode( aarch64_enc_prefetchw(mem) );
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_prefetch);
 %}
 
 instruct prefetchnta( memory mem ) %{
@@ -5616,376 +6153,70 @@
 
   ins_encode( aarch64_enc_prefetchnta(mem) );
 
-  ins_pipe(pipe_class_memory);
-%}
-
-//  ---------------- volatile loads and stores ----------------
-
-// Load Byte (8 bit signed)
-instruct loadB_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadB mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarsb  $dst, $mem\t# byte" %}
-
-  ins_encode(aarch64_enc_ldarsb(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Byte (8 bit signed) into long
-instruct loadB2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (ConvI2L (LoadB mem)));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarsb  $dst, $mem\t# byte" %}
-
-  ins_encode(aarch64_enc_ldarsb(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Byte (8 bit unsigned)
-instruct loadUB_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadUB mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarb  $dst, $mem\t# byte" %}
-
-  ins_encode(aarch64_enc_ldarb(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Byte (8 bit unsigned) into long
-instruct loadUB2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (ConvI2L (LoadUB mem)));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarb  $dst, $mem\t# byte" %}
-
-  ins_encode(aarch64_enc_ldarb(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Short (16 bit signed)
-instruct loadS_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadS mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarshw  $dst, $mem\t# short" %}
-
-  ins_encode(aarch64_enc_ldarshw(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-instruct loadUS_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadUS mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarhw  $dst, $mem\t# short" %}
-
-  ins_encode(aarch64_enc_ldarhw(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Short/Char (16 bit unsigned) into long
-instruct loadUS2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (ConvI2L (LoadUS mem)));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarh  $dst, $mem\t# short" %}
-
-  ins_encode(aarch64_enc_ldarh(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Short/Char (16 bit signed) into long
-instruct loadS2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (ConvI2L (LoadS mem)));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarh  $dst, $mem\t# short" %}
-
-  ins_encode(aarch64_enc_ldarsh(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Integer (32 bit signed)
-instruct loadI_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadI mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarw  $dst, $mem\t# int" %}
-
-  ins_encode(aarch64_enc_ldarw(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Integer (32 bit unsigned) into long
-instruct loadUI2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem, immL_32bits mask)
-%{
-  match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarw  $dst, $mem\t# int" %}
-
-  ins_encode(aarch64_enc_ldarw(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Long (64 bit signed)
-instruct loadL_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadL mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldar  $dst, $mem\t# int" %}
-
-  ins_encode(aarch64_enc_ldar(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Pointer
-instruct loadP_volatile(iRegPNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadP mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldar  $dst, $mem\t# ptr" %}
-
-  ins_encode(aarch64_enc_ldar(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Compressed Pointer
-instruct loadN_volatile(iRegNNoSp dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadN mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldarw  $dst, $mem\t# compressed ptr" %}
-
-  ins_encode(aarch64_enc_ldarw(dst, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Float
-instruct loadF_volatile(vRegF dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadF mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldars  $dst, $mem\t# float" %}
-
-  ins_encode( aarch64_enc_fldars(dst, mem) );
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Load Double
-instruct loadD_volatile(vRegD dst, /* sync_memory*/indirect mem)
-%{
-  match(Set dst (LoadD mem));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "ldard  $dst, $mem\t# double" %}
-
-  ins_encode( aarch64_enc_fldard(dst, mem) );
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Byte
-instruct storeB_volatile(iRegI src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreB mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlrb  $src, $mem\t# byte" %}
-
-  ins_encode(aarch64_enc_stlrb(src, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Char/Short
-instruct storeC_volatile(iRegI src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreC mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlrh  $src, $mem\t# short" %}
-
-  ins_encode(aarch64_enc_stlrh(src, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Integer
-
-instruct storeI_volatile(iRegIorL2I src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem(StoreI mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlrw  $src, $mem\t# int" %}
-
-  ins_encode(aarch64_enc_stlrw(src, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Long (64 bit signed)
-instruct storeL_volatile(iRegL src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreL mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlr  $src, $mem\t# int" %}
-
-  ins_encode(aarch64_enc_stlr(src, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Pointer
-instruct storeP_volatile(iRegP src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreP mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlr  $src, $mem\t# ptr" %}
-
-  ins_encode(aarch64_enc_stlr(src, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Compressed Pointer
-instruct storeN_volatile(iRegN src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreN mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlrw  $src, $mem\t# compressed ptr" %}
-
-  ins_encode(aarch64_enc_stlrw(src, mem));
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// Store Float
-instruct storeF_volatile(vRegF src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreF mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlrs  $src, $mem\t# float" %}
-
-  ins_encode( aarch64_enc_fstlrs(src, mem) );
-
-  ins_pipe(pipe_class_memory);
-%}
-
-// TODO
-// implement storeImmF0 and storeFImmPacked
-
-// Store Double
-instruct storeD_volatile(vRegD src, /* sync_memory*/indirect mem)
-%{
-  match(Set mem (StoreD mem src));
-
-  ins_cost(VOLATILE_REF_COST);
-  format %{ "stlrd  $src, $mem\t# double" %}
-
-  ins_encode( aarch64_enc_fstlrd(src, mem) );
-
-  ins_pipe(pipe_class_memory);
-%}
-
-//  ---------------- end of volatile loads and stores ----------------
+  ins_pipe(iload_prefetch);
+%}
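
 // A rough Java-level sketch (field name illustrative) of what selects the
 // volatile load/store rules removed above: a volatile read is implemented as
 // a load-acquire (ldar/ldarw) and a volatile write as a store-release
 // (stlr/stlrw):
 //
 //   volatile long counter;
 //   long v = counter;   // volatile LoadL  -> ldar
 //   counter = v + 1;    // volatile StoreL -> stlr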
 
 // ============================================================================
 // BSWAP Instructions
 
-instruct bytes_reverse_int(iRegINoSp dst) %{
-  match(Set dst (ReverseBytesI dst));
-
-  ins_cost(INSN_COST);
-  format %{ "revw  $dst, $dst" %}
-
-  ins_encode %{
-    __ revw(as_Register($dst$$reg), as_Register($dst$$reg));
-  %}
-
-  ins_pipe( pipe_class_default );
-%}
-
-instruct bytes_reverse_long(iRegLNoSp dst) %{
-  match(Set dst (ReverseBytesL dst));
-
-  ins_cost(INSN_COST);
-  format %{ "rev  $dst, $dst" %}
-
-  ins_encode %{
-    __ rev(as_Register($dst$$reg), as_Register($dst$$reg));
-  %}
-
-  ins_pipe( pipe_class_default );
-%}
-
-instruct bytes_reverse_unsigned_short(iRegINoSp dst) %{
-  match(Set dst (ReverseBytesUS dst));
-
-  ins_cost(INSN_COST);
-  format %{ "rev16w  $dst, $dst" %}
-
-  ins_encode %{
-    __ rev16w(as_Register($dst$$reg), as_Register($dst$$reg));
-  %}
-
-  ins_pipe( pipe_class_default );
-%}
-
-instruct bytes_reverse_short(iRegINoSp dst) %{
-  match(Set dst (ReverseBytesS dst));
-
-  ins_cost(INSN_COST);
-  format %{ "rev16w  $dst, $dst\n\t"
+instruct bytes_reverse_int(iRegINoSp dst, iRegIorL2I src) %{
+  match(Set dst (ReverseBytesI src));
+
+  ins_cost(INSN_COST);
+  format %{ "revw  $dst, $src" %}
+
+  ins_encode %{
+    __ revw(as_Register($dst$$reg), as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct bytes_reverse_long(iRegLNoSp dst, iRegL src) %{
+  match(Set dst (ReverseBytesL src));
+
+  ins_cost(INSN_COST);
+  format %{ "rev  $dst, $src" %}
+
+  ins_encode %{
+    __ rev(as_Register($dst$$reg), as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct bytes_reverse_unsigned_short(iRegINoSp dst, iRegIorL2I src) %{
+  match(Set dst (ReverseBytesUS src));
+
+  ins_cost(INSN_COST);
+  format %{ "rev16w  $dst, $src" %}
+
+  ins_encode %{
+    __ rev16w(as_Register($dst$$reg), as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct bytes_reverse_short(iRegINoSp dst, iRegIorL2I src) %{
+  match(Set dst (ReverseBytesS src));
+
+  ins_cost(INSN_COST);
+  format %{ "rev16w  $dst, $src\n\t"
             "sbfmw $dst, $dst, #0, #15" %}
 
   ins_encode %{
-    __ rev16w(as_Register($dst$$reg), as_Register($dst$$reg));
+    __ rev16w(as_Register($dst$$reg), as_Register($src$$reg));
     __ sbfmw(as_Register($dst$$reg), as_Register($dst$$reg), 0U, 15U);
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe(ialu_reg);
 %}
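
 // A minimal Java sketch of the intrinsics that produce the ReverseBytes*
 // nodes matched above (variable names illustrative):
 //
 //   int   i = Integer.reverseBytes(x);     // ReverseBytesI  -> revw
 //   long  l = Long.reverseBytes(y);        // ReverseBytesL  -> rev
 //   char  c = Character.reverseBytes(ch);  // ReverseBytesUS -> rev16w
 //   short s = Short.reverseBytes(sh);      // ReverseBytesS  -> rev16w + sbfmw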
 
 // ============================================================================
 // Zero Count Instructions
 
-instruct countLeadingZerosI(iRegI dst, iRegI src) %{
+instruct countLeadingZerosI(iRegINoSp dst, iRegIorL2I src) %{
   match(Set dst (CountLeadingZerosI src));
 
   ins_cost(INSN_COST);
@@ -5994,10 +6225,10 @@
     __ clzw(as_Register($dst$$reg), as_Register($src$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
-%}
-
-instruct countLeadingZerosL(iRegI dst, iRegL src) %{
+  ins_pipe(ialu_reg);
+%}
+
+instruct countLeadingZerosL(iRegINoSp dst, iRegL src) %{
   match(Set dst (CountLeadingZerosL src));
 
   ins_cost(INSN_COST);
@@ -6006,10 +6237,10 @@
     __ clz(as_Register($dst$$reg), as_Register($src$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
-%}
-
-instruct countTrailingZerosI(iRegI dst, iRegI src) %{
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosI(iRegINoSp dst, iRegIorL2I src) %{
   match(Set dst (CountTrailingZerosI src));
 
   ins_cost(INSN_COST * 2);
@@ -6020,10 +6251,10 @@
     __ clzw(as_Register($dst$$reg), as_Register($dst$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
-%}
-
-instruct countTrailingZerosL(iRegI dst, iRegL src) %{
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosL(iRegINoSp dst, iRegL src) %{
   match(Set dst (CountTrailingZerosL src));
 
   ins_cost(INSN_COST * 2);
@@ -6034,7 +6265,97 @@
     __ clz(as_Register($dst$$reg), as_Register($dst$$reg));
   %}
 
-  ins_pipe( pipe_class_default );
+  ins_pipe(ialu_reg);
+%}
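
 // Java sketch of the intrinsics matched by the zero-count rules above. The
 // trailing-zero forms cost INSN_COST * 2 because AArch64 has no ctz: the
 // (partly elided) encodings bit-reverse first, then count leading zeros:
 //
 //   int a = Integer.numberOfLeadingZeros(x);   // CountLeadingZerosI  -> clzw
 //   int b = Integer.numberOfTrailingZeros(x);  // CountTrailingZerosI -> rbitw; clzw
 //   int c = Long.numberOfTrailingZeros(y);     // CountTrailingZerosL -> rbit; clz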
+
+//---------- Population Count Instructions -------------------------------------
+//
+
+instruct popCountI(iRegINoSp dst, iRegIorL2I src, vRegF tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountI src));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "movw   $src, $src\n\t"
+            "mov    $tmp, $src\t# vector (1D)\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    __ movw($src$$Register, $src$$Register); // ensure top 32 bits 0
+    __ mov($tmp$$FloatRegister, __ T1D, 0, $src$$Register);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
+instruct popCountI_mem(iRegINoSp dst, memory mem, vRegF tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountI (LoadI mem)));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "ldrs   $tmp, $mem\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    FloatRegister tmp_reg = as_FloatRegister($tmp$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldrs, tmp_reg, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
+// Note: Long.bitCount(long) returns an int.
+instruct popCountL(iRegINoSp dst, iRegL src, vRegD tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountL src));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "mov    $tmp, $src\t# vector (1D)\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    __ mov($tmp$$FloatRegister, __ T1D, 0, $src$$Register);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
+%}
+
+instruct popCountL_mem(iRegINoSp dst, memory mem, vRegD tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountL (LoadL mem)));
+  effect(TEMP tmp);
+  ins_cost(INSN_COST * 13);
+
+  format %{ "ldrd   $tmp, $mem\n\t"
+            "cnt    $tmp, $tmp\t# vector (8B)\n\t"
+            "addv   $tmp, $tmp\t# vector (8B)\n\t"
+            "mov    $dst, $tmp\t# vector (1D)" %}
+  ins_encode %{
+    FloatRegister tmp_reg = as_FloatRegister($tmp$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldrd, tmp_reg, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+    __ cnt($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ addv($tmp$$FloatRegister, __ T8B, $tmp$$FloatRegister);
+    __ mov($dst$$Register, $tmp$$FloatRegister, __ T1D, 0);
+  %}
+
+  ins_pipe(pipe_class_default);
 %}
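
 // Java sketch for the PopCount* rules above, selected only when
 // UsePopCountInstruction is set. The initial movw in popCountI just
 // zero-extends $src in place (its low 32 bits are unchanged) before the
 // value is moved to a vector register for the NEON cnt/addv sequence:
 //
 //   int bi = Integer.bitCount(x);       // PopCountI -> popCountI
 //   int bl = Long.bitCount(y);          // PopCountL -> popCountL (int result)
 //   int bm = Integer.bitCount(arr[i]);  // PopCountI fed by a LoadI -> popCountI_mem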
 
 // ============================================================================
@@ -6049,21 +6370,7 @@
   ins_encode %{
     __ membar(Assembler::LoadLoad|Assembler::LoadStore);
   %}
-  ins_pipe(pipe_class_memory);
-%}
-
-instruct unnecessary_membar_acquire() %{
-  predicate(preceded_by_ordered_load(n));
-  match(MemBarAcquire);
-  ins_cost(0);
-
-  format %{ "membar_acquire (elided)" %}
-
-  ins_encode %{
-    __ block_comment("membar_acquire (elided)");
-  %}
-
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_acquire() %{
@@ -6076,7 +6383,7 @@
     __ membar(Assembler::LoadLoad|Assembler::LoadStore);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 
@@ -6090,7 +6397,7 @@
     __ membar(Assembler::LoadLoad|Assembler::LoadStore);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct store_fence() %{
@@ -6102,20 +6409,7 @@
   ins_encode %{
     __ membar(Assembler::LoadStore|Assembler::StoreStore);
   %}
-  ins_pipe(pipe_class_memory);
-%}
-
-instruct unnecessary_membar_release() %{
-  match(MemBarRelease);
-  predicate(followed_by_ordered_store(n));
-  ins_cost(0);
-
-  format %{ "membar_release (elided)" %}
-
-  ins_encode %{
-    __ block_comment("membar_release (elided)");
-  %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_release() %{
@@ -6127,7 +6421,7 @@
   ins_encode %{
     __ membar(Assembler::LoadStore|Assembler::StoreStore);
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_storestore() %{
@@ -6139,7 +6433,7 @@
   ins_encode %{
     __ membar(Assembler::StoreStore);
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_release_lock() %{
@@ -6152,7 +6446,7 @@
     __ membar(Assembler::LoadStore|Assembler::StoreStore);
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct membar_volatile() %{
@@ -6163,9 +6457,9 @@
 
   ins_encode %{
     __ membar(Assembler::StoreLoad);
-  %}
-
-  ins_pipe(pipe_class_memory);
+  %}
+
+  ins_pipe(pipe_serial);
 %}
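
 // Rough correspondence for the barrier flavours above, in the ARMv8
 // inner-shareable domain: LoadLoad|LoadStore is an acquire barrier
 // (dmb ishld), StoreStore alone can use dmb ishst, and anything involving
 // StoreLoad (membar_volatile) needs a full dmb ish. Java-level sketch of
 // where an acquire barrier appears:
 //
 //   volatile int flag;
 //   if (flag != 0) { ... }  // the volatile read is followed by MemBarAcquire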
 
 // ============================================================================
@@ -6183,7 +6477,7 @@
     }
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct castP2X(iRegLNoSp dst, iRegP src) %{
@@ -6198,7 +6492,7 @@
     }
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Convert oop into int for vectors alignment masking
@@ -6211,7 +6505,7 @@
     __ movw($dst$$Register, $src$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Convert compressed oop into int for vectors alignment masking
@@ -6227,7 +6521,7 @@
     __ movw($dst$$Register, $src$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 
@@ -6243,7 +6537,7 @@
     Register d = $dst$$Register;
     __ encode_heap_oop(d, s);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct encodeHeapOop_not_null(iRegNNoSp dst, iRegP src, rFlagsReg cr) %{
@@ -6254,7 +6548,7 @@
   ins_encode %{
     __ encode_heap_oop_not_null($dst$$Register, $src$$Register);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct decodeHeapOop(iRegPNoSp dst, iRegN src, rFlagsReg cr) %{
@@ -6268,7 +6562,7 @@
     Register d = $dst$$Register;
     __ decode_heap_oop(d, s);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct decodeHeapOop_not_null(iRegPNoSp dst, iRegN src, rFlagsReg cr) %{
@@ -6282,7 +6576,7 @@
     Register d = $dst$$Register;
     __ decode_heap_oop_not_null(d, s);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // n.b. AArch64 implementations of encode_klass_not_null and
@@ -6301,7 +6595,7 @@
     __ encode_klass_not_null(dst_reg, src_reg);
   %}
 
-   ins_pipe(pipe_class_default);
+   ins_pipe(ialu_reg);
 %}
 
 instruct decodeKlass_not_null(iRegPNoSp dst, iRegN src) %{
@@ -6320,7 +6614,7 @@
     }
   %}
 
-   ins_pipe(pipe_class_default);
+   ins_pipe(ialu_reg);
 %}
 
 instruct checkCastPP(iRegPNoSp dst)
@@ -6392,7 +6686,7 @@
 
   ins_encode(aarch64_enc_ldaxr(dst, mem));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 // Conditional-store of the updated heap-top.
@@ -6417,7 +6711,7 @@
 
   ins_encode(aarch64_enc_stlxr(newval, heap_top_ptr));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 // this has to be implemented as a CAS
@@ -6434,7 +6728,7 @@
 
   ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 // this has to be implemented as a CAS
@@ -6451,7 +6745,7 @@
 
   ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher
@@ -6471,7 +6765,7 @@
  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
@@ -6488,7 +6782,7 @@
  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
@@ -6505,7 +6799,7 @@
  ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
 
 instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
@@ -6522,7 +6816,7 @@
  ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval),
             aarch64_enc_cset_eq(res));
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_slow);
 %}
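
 // Java sketch for the CompareAndSwap rules above; the node is produced by
 // the Unsafe CAS intrinsics, typically reached via java.util.concurrent:
 //
 //   AtomicInteger ai = new AtomicInteger();
 //   boolean ok = ai.compareAndSet(0, 1);  // CompareAndSwapI
 //                                         // -> cmpxchgw + cset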
 
 
@@ -6532,7 +6826,7 @@
   ins_encode %{
     __ atomic_xchgw($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_setL(indirect mem, iRegLNoSp newv, iRegL prev) %{
@@ -6541,7 +6835,7 @@
   ins_encode %{
     __ atomic_xchg($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_setN(indirect mem, iRegNNoSp newv, iRegI prev) %{
@@ -6550,7 +6844,7 @@
   ins_encode %{
     __ atomic_xchgw($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_setP(indirect mem, iRegPNoSp newv, iRegP prev) %{
@@ -6559,7 +6853,7 @@
   ins_encode %{
     __ atomic_xchg($prev$$Register, $newv$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 
@@ -6570,7 +6864,7 @@
   ins_encode %{
     __ atomic_add($newval$$Register, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addL_no_res(indirect mem, Universe dummy, iRegL incr) %{
@@ -6581,7 +6875,7 @@
   ins_encode %{
     __ atomic_add(noreg, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addLi(indirect mem, iRegLNoSp newval, immLAddSub incr) %{
@@ -6591,7 +6885,7 @@
   ins_encode %{
     __ atomic_add($newval$$Register, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addLi_no_res(indirect mem, Universe dummy, immLAddSub incr) %{
@@ -6602,7 +6896,7 @@
   ins_encode %{
     __ atomic_add(noreg, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addI(indirect mem, iRegINoSp newval, iRegIorL2I incr) %{
@@ -6612,7 +6906,7 @@
   ins_encode %{
     __ atomic_addw($newval$$Register, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addI_no_res(indirect mem, Universe dummy, iRegIorL2I incr) %{
@@ -6623,7 +6917,7 @@
   ins_encode %{
     __ atomic_addw(noreg, $incr$$Register, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addIi(indirect mem, iRegINoSp newval, immIAddSub incr) %{
@@ -6633,7 +6927,7 @@
   ins_encode %{
     __ atomic_addw($newval$$Register, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
 
 instruct get_and_addIi_no_res(indirect mem, Universe dummy, immIAddSub incr) %{
@@ -6644,7 +6938,7 @@
   ins_encode %{
     __ atomic_addw(noreg, $incr$$constant, as_Register($mem$$base));
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial);
 %}
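
 // Java sketch for the get_and_set/get_and_add rules above; the *_no_res
 // variants match when the fetched previous value is dead, as in a bare
 // statement:
 //
 //   AtomicLong al = new AtomicLong();
 //   long prev = al.getAndSet(42L);  // GetAndSetL -> atomic_xchg
 //   long old  = al.getAndAdd(8L);   // GetAndAddL -> atomic_add (immediate form)
 //   al.getAndAdd(8L);               // result unused -> get_and_addLi_no_res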
 
 // ============================================================================
@@ -6660,7 +6954,7 @@
 // which throws a ShouldNotHappen. So, we have to provide two flavours
 // of each rule, one for a cmpOp and a second for a cmpOpU (sigh).
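 //
 // Java-level sketch (names illustrative): an ordinary signed ternary yields
 // the cmpOp form, while unsigned CmpU compares (which C2 creates internally,
 // e.g. when folding range checks) require the parallel cmpOpU rules:
 //
 //   int m = (a < b) ? a : b;  // CmpI + CMoveI -> cmovI_reg_reg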
 
-instruct cmovI_reg_reg(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, iRegI src1, iRegI src2) %{
+instruct cmovI_reg_reg(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2) %{
   match(Set dst (CMoveI (Binary cmp cr) (Binary src1 src2)));
 
   ins_cost(INSN_COST * 2);
@@ -6673,10 +6967,10 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUI_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, iRegI src1, iRegI src2) %{
+  ins_pipe(icond_reg_reg);
+%}
+
+instruct cmovUI_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2) %{
   match(Set dst (CMoveI (Binary cmp cr) (Binary src1 src2)));
 
   ins_cost(INSN_COST * 2);
@@ -6689,7 +6983,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
@@ -6701,68 +6995,68 @@
 // we ought to be able to cull one of each of these variants, as the ideal
 // transforms ought always to order the zero operand consistently (left or right?)
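 //
 // Java sketch of a shape that reaches these zero variants; the zero operand
 // is encoded directly against the zr register, saving a constant load:
 //
 //   int r = (a < 0) ? 0 : x;  // CMoveI with an immI0 arm -> cselw ..., zr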
 
-instruct cmovI_zero_reg(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, immI0 zero, iRegI src2) %{
-  match(Set dst (CMoveI (Binary cmp cr) (Binary zero src2)));
+instruct cmovI_zero_reg(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, immI0 zero, iRegIorL2I src) %{
+  match(Set dst (CMoveI (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, $src2, zr $cmp\t# signed, int"  %}
+  format %{ "cselw $dst, $src, zr $cmp\t# signed, int"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
-             as_Register($src2$$reg),
+             as_Register($src$$reg),
              zr,
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUI_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, immI0 zero, iRegI src2) %{
-  match(Set dst (CMoveI (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUI_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, immI0 zero, iRegIorL2I src) %{
+  match(Set dst (CMoveI (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, $src2, zr $cmp\t# unsigned, int"  %}
+  format %{ "cselw $dst, $src, zr $cmp\t# unsigned, int"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
-             as_Register($src2$$reg),
+             as_Register($src$$reg),
              zr,
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovI_reg_zero(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, iRegI src1, immI0 zero) %{
-  match(Set dst (CMoveI (Binary cmp cr) (Binary src1 zero)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovI_reg_zero(cmpOp cmp, rFlagsReg cr, iRegINoSp dst, iRegIorL2I src, immI0 zero) %{
+  match(Set dst (CMoveI (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, zr, $src1 $cmp\t# signed, int"  %}
+  format %{ "cselw $dst, zr, $src $cmp\t# signed, int"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
              zr,
-             as_Register($src1$$reg),
+             as_Register($src$$reg),
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUI_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, iRegI src1, immI0 zero) %{
-  match(Set dst (CMoveI (Binary cmp cr) (Binary src1 zero)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUI_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, iRegIorL2I src, immI0 zero) %{
+  match(Set dst (CMoveI (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, zr, $src1 $cmp\t# unsigned, int"  %}
+  format %{ "cselw $dst, zr, $src $cmp\t# unsigned, int"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
              zr,
-             as_Register($src1$$reg),
+             as_Register($src$$reg),
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 // special case for creating a boolean 0 or 1
@@ -6786,7 +7080,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_none);
 %}
 
 instruct cmovUI_reg_zero_one(cmpOpU cmp, rFlagsRegU cr, iRegINoSp dst, immI0 zero, immI_1 one) %{
@@ -6805,7 +7099,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_none);
 %}
 
 instruct cmovL_reg_reg(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, iRegL src1, iRegL src2) %{
@@ -6821,7 +7115,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUL_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, iRegL src1, iRegL src2) %{
@@ -6837,73 +7131,73 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
 
-instruct cmovL_reg_zero(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, iRegL src1, immL0 zero) %{
-  match(Set dst (CMoveL (Binary cmp cr) (Binary src1 zero)));
+instruct cmovL_reg_zero(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, iRegL src, immL0 zero) %{
+  match(Set dst (CMoveL (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, zr, $src1 $cmp\t# signed, long"  %}
+  format %{ "csel $dst, zr, $src $cmp\t# signed, long"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
             zr,
-            as_Register($src1$$reg),
+            as_Register($src$$reg),
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUL_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, iRegL src1, immL0 zero) %{
-  match(Set dst (CMoveL (Binary cmp cr) (Binary src1 zero)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUL_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, iRegL src, immL0 zero) %{
+  match(Set dst (CMoveL (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, zr, $src1 $cmp\t# unsigned, long"  %}
+  format %{ "csel $dst, zr, $src $cmp\t# unsigned, long"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
             zr,
-            as_Register($src1$$reg),
+            as_Register($src$$reg),
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovL_zero_reg(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, immL0 zero, iRegL src2) %{
-  match(Set dst (CMoveL (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovL_zero_reg(cmpOp cmp, rFlagsReg cr, iRegLNoSp dst, immL0 zero, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, $src2, zr $cmp\t# signed, long"  %}
+  format %{ "csel $dst, $src, zr $cmp\t# signed, long"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
-            as_Register($src2$$reg),
+            as_Register($src$$reg),
             zr,
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUL_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, immL0 zero, iRegL src2) %{
-  match(Set dst (CMoveL (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUL_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegLNoSp dst, immL0 zero, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, $src2, zr $cmp\t# unsigned, long"  %}
+  format %{ "csel $dst, $src, zr $cmp\t# unsigned, long"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
-            as_Register($src2$$reg),
+            as_Register($src$$reg),
             zr,
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovP_reg_reg(cmpOp cmp, rFlagsReg cr, iRegPNoSp dst, iRegP src1, iRegP src2) %{
@@ -6919,7 +7213,7 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUP_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, iRegP src1, iRegP src2) %{
@@ -6935,73 +7229,73 @@
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
 
-instruct cmovP_reg_zero(cmpOp cmp, rFlagsReg cr, iRegPNoSp dst, iRegP src1, immP0 zero) %{
-  match(Set dst (CMoveP (Binary cmp cr) (Binary src1 zero)));
+instruct cmovP_reg_zero(cmpOp cmp, rFlagsReg cr, iRegPNoSp dst, iRegP src, immP0 zero) %{
+  match(Set dst (CMoveP (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, zr, $src1 $cmp\t# signed, ptr"  %}
+  format %{ "csel $dst, zr, $src $cmp\t# signed, ptr"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
             zr,
-            as_Register($src1$$reg),
+            as_Register($src$$reg),
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUP_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, iRegP src1, immP0 zero) %{
-  match(Set dst (CMoveP (Binary cmp cr) (Binary src1 zero)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUP_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, iRegP src, immP0 zero) %{
+  match(Set dst (CMoveP (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, zr, $src1 $cmp\t# unsigned, ptr"  %}
+  format %{ "csel $dst, zr, $src $cmp\t# unsigned, ptr"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
             zr,
-            as_Register($src1$$reg),
+            as_Register($src$$reg),
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovP_zero_reg(cmpOp cmp, rFlagsReg cr, iRegPNoSp dst, immP0 zero, iRegP src2) %{
-  match(Set dst (CMoveP (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovP_zero_reg(cmpOp cmp, rFlagsReg cr, iRegPNoSp dst, immP0 zero, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, $src2, zr $cmp\t# signed, ptr"  %}
+  format %{ "csel $dst, $src, zr $cmp\t# signed, ptr"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
-            as_Register($src2$$reg),
+            as_Register($src$$reg),
             zr,
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUP_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, immP0 zero, iRegP src2) %{
-  match(Set dst (CMoveP (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUP_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegPNoSp dst, immP0 zero, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "csel $dst, $src2, zr $cmp\t# unsigned, ptr"  %}
+  format %{ "csel $dst, $src, zr $cmp\t# unsigned, ptr"  %}
 
   ins_encode %{
     __ csel(as_Register($dst$$reg),
-            as_Register($src2$$reg),
+            as_Register($src$$reg),
             zr,
             (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovN_reg_reg(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, iRegN src1, iRegN src2) %{
@@ -7017,7 +7311,7 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 instruct cmovUN_reg_reg(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, iRegN src1, iRegN src2) %{
@@ -7033,73 +7327,73 @@
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg_reg);
 %}
 
 // special cases where one arg is zero
 
-instruct cmovN_reg_zero(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, iRegN src1, immN0 zero) %{
-  match(Set dst (CMoveN (Binary cmp cr) (Binary src1 zero)));
+instruct cmovN_reg_zero(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, iRegN src, immN0 zero) %{
+  match(Set dst (CMoveN (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, zr, $src1 $cmp\t# signed, compressed ptr"  %}
+  format %{ "cselw $dst, zr, $src $cmp\t# signed, compressed ptr"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
              zr,
-             as_Register($src1$$reg),
+             as_Register($src$$reg),
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUN_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, iRegN src1, immN0 zero) %{
-  match(Set dst (CMoveN (Binary cmp cr) (Binary src1 zero)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUN_reg_zero(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, iRegN src, immN0 zero) %{
+  match(Set dst (CMoveN (Binary cmp cr) (Binary src zero)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, zr, $src1 $cmp\t# unsigned, compressed ptr"  %}
+  format %{ "cselw $dst, zr, $src $cmp\t# unsigned, compressed ptr"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
              zr,
-             as_Register($src1$$reg),
+             as_Register($src$$reg),
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovN_zero_reg(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, immN0 zero, iRegN src2) %{
-  match(Set dst (CMoveN (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovN_zero_reg(cmpOp cmp, rFlagsReg cr, iRegNNoSp dst, immN0 zero, iRegN src) %{
+  match(Set dst (CMoveN (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, $src2, zr $cmp\t# signed, compressed ptr"  %}
+  format %{ "cselw $dst, $src, zr $cmp\t# signed, compressed ptr"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
-             as_Register($src2$$reg),
+             as_Register($src$$reg),
              zr,
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmovUN_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, immN0 zero, iRegN src2) %{
-  match(Set dst (CMoveN (Binary cmp cr) (Binary zero src2)));
+  ins_pipe(icond_reg);
+%}
+
+instruct cmovUN_zero_reg(cmpOpU cmp, rFlagsRegU cr, iRegNNoSp dst, immN0 zero, iRegN src) %{
+  match(Set dst (CMoveN (Binary cmp cr) (Binary zero src)));
 
   ins_cost(INSN_COST * 2);
-  format %{ "cselw $dst, $src2, zr $cmp\t# unsigned, compressed ptr"  %}
+  format %{ "cselw $dst, $src, zr $cmp\t# unsigned, compressed ptr"  %}
 
   ins_encode %{
     __ cselw(as_Register($dst$$reg),
-             as_Register($src2$$reg),
+             as_Register($src$$reg),
              zr,
              (Assembler::Condition)$cmp$$cmpcode);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icond_reg);
 %}
 
 instruct cmovF_reg(cmpOp cmp, rFlagsReg cr, vRegF dst, vRegF src1,  vRegF src2)
@@ -7198,10 +7492,10 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct addI_reg_imm(iRegINoSp dst, iRegI src1, immIAddSub src2) %{
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct addI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immIAddSub src2) %{
   match(Set dst (AddI src1 src2));
 
   ins_cost(INSN_COST);
@@ -7212,7 +7506,7 @@
 
   ins_encode(aarch64_enc_addsubw_imm(dst, src1, src2));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 instruct addI_reg_imm_i2l(iRegINoSp dst, iRegL src1, immIAddSub src2) %{
@@ -7226,7 +7520,7 @@
 
   ins_encode(aarch64_enc_addsubw_imm(dst, src1, src2));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Pointer Addition
@@ -7242,13 +7536,13 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct addP_reg_reg_ext(iRegPNoSp dst, iRegP src1, iRegIorL2I src2) %{
   match(Set dst (AddP src1 (ConvI2L src2)));
 
-  ins_cost(INSN_COST);
+  ins_cost(1.9 * INSN_COST);
   format %{ "add $dst, $src1, $src2, sxtw\t# ptr" %}
 
   ins_encode %{
@@ -7257,7 +7551,7 @@
            as_Register($src2$$reg), ext::sxtw);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct addP_reg_reg_lsl(iRegPNoSp dst, iRegP src1, iRegL src2, immIScale scale) %{
@@ -7272,7 +7566,7 @@
 		   Address::lsl($scale$$constant)));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct addP_reg_reg_ext_shift(iRegPNoSp dst, iRegP src1, iRegIorL2I src2, immIScale scale) %{
@@ -7287,7 +7581,7 @@
 		   Address::sxtw($scale$$constant)));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct lshift_ext(iRegLNoSp dst, iRegIorL2I src, immI scale, rFlagsReg cr) %{
@@ -7302,7 +7596,7 @@
           $scale$$constant & 63, MIN(32, (-$scale$$constant) & 63));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
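
 // Java sketch for lshift_ext above, whose (elided) match fuses an int-to-long
 // conversion with a constant left shift into a single sbfiz, a common shape
 // in array address arithmetic:
 //
 //   long offset = (long) i << 3;  // LShiftL (ConvI2L i) 3 -> sbfiz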
 
 // Pointer Immediate Addition
@@ -7319,7 +7613,7 @@
 
   ins_encode( aarch64_enc_addsub_imm(dst, src1, src2) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Long Addition
@@ -7336,7 +7630,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // No constant pool entries required.
 // Long Immediate Addition.
@@ -7351,7 +7645,7 @@
 
   ins_encode( aarch64_enc_addsub_imm(dst, src1, src2) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Integer Subtraction
@@ -7367,7 +7661,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // Immediate Subtraction
@@ -7382,7 +7676,7 @@
 
   ins_encode(aarch64_enc_addsubw_imm(dst, src1, src2));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Long Subtraction
@@ -7399,7 +7693,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // No constant pool entries required.
 // Long Immediate Subtraction.
@@ -7414,7 +7708,7 @@
 
   ins_encode( aarch64_enc_addsub_imm(dst, src1, src2) );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Integer Negation (special case for sub)
@@ -7426,11 +7720,11 @@
   format %{ "negw $dst, $src\t# int" %}
 
   ins_encode %{
-    __ negsw(as_Register($dst$$reg),
-             as_Register($src$$reg));
-  %}
-
-  ins_pipe(pipe_class_default);
+    __ negw(as_Register($dst$$reg),
+	    as_Register($src$$reg));
+  %}
+
+  ins_pipe(ialu_reg);
 %}
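
 // n.b. the non-flag-setting negw is the right encoding here: negsw would
 // update the condition flags, but this rule declares no effect on rFlagsReg
 // and so must leave the flags untouched.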
 
 // Long Negation
@@ -7446,7 +7740,7 @@
 	   as_Register($src$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Integer Multiply
@@ -7463,7 +7757,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imul_reg_reg);
 %}
 
 instruct smulI(iRegLNoSp dst, iRegIorL2I src1, iRegIorL2I src2) %{
@@ -7478,7 +7772,7 @@
 	     as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imul_reg_reg);
 %}
 
 // Long Multiply
@@ -7495,7 +7789,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmul_reg_reg);
 %}
 
 instruct mulHiL_rReg(iRegLNoSp dst, iRegL src1, iRegL src2, rFlagsReg cr)
@@ -7511,7 +7805,7 @@
 	     as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmul_reg_reg);
 %}
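
 // MulHiL typically arises from C2's strength reduction of long division by
 // a constant (the magic-number multiply); divisor illustrative:
 //
 //   long q = x / 10L;  // becomes a MulHiL (smulh) plus shift/adjust code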
 
 // Combined Integer Multiply & Add/Sub
@@ -7529,7 +7823,7 @@
              as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imac_reg_reg);
 %}
 
 instruct msubI(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, iRegIorL2I src3) %{
@@ -7545,7 +7839,7 @@
              as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(imac_reg_reg);
 %}
 
 // Combined Long Multiply & Add/Sub
@@ -7563,7 +7857,7 @@
             as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmac_reg_reg);
 %}
 
 instruct msubL(iRegLNoSp dst, iRegL src1, iRegL src2, iRegL src3) %{
@@ -7579,7 +7873,7 @@
             as_Register($src3$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(lmac_reg_reg);
 %}
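
 // Java sketch for the combined multiply-accumulate rules above:
 //
 //   int  d1 = a + b * c;  // AddI with a MulI input -> maddw
 //   int  d2 = a - b * c;  // SubI with a MulI input -> msubw
 //   long d3 = p + q * r;  // AddL with a MulL input -> madd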
 
 // Integer Divide
@@ -7591,20 +7885,20 @@
   format %{ "sdivw  $dst, $src1, $src2" %}
 
   ins_encode(aarch64_enc_divw(dst, src1, src2));
-  ins_pipe(pipe_class_default);
-%}
-
-instruct signExtract(iRegINoSp dst, iRegI src, immI_31 div1, immI_31 div2) %{
-  match(Set dst (URShiftI (RShiftI src div1) div2));
-  ins_cost(INSN_COST);
-  format %{ "lsrw $dst, $src, $div1" %}
-  ins_encode %{
-    __ lsrw(as_Register($dst$$reg), as_Register($src$$reg), 31);
-  %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct div2Round(iRegINoSp dst, iRegI src, immI_31 div1, immI_31 div2) %{
+  ins_pipe(idiv_reg_reg);
+%}
+
+instruct signExtract(iRegINoSp dst, iRegIorL2I src1, immI_31 div1, immI_31 div2) %{
+  match(Set dst (URShiftI (RShiftI src1 div1) div2));
+  ins_cost(INSN_COST);
+  format %{ "lsrw $dst, $src1, $div1" %}
+  ins_encode %{
+    __ lsrw(as_Register($dst$$reg), as_Register($src1$$reg), 31);
+  %}
+  ins_pipe(ialu_reg_shift);
+%}
+
+instruct div2Round(iRegINoSp dst, iRegIorL2I src, immI_31 div1, immI_31 div2) %{
   match(Set dst (AddI src (URShiftI (RShiftI src div1) div2)));
   ins_cost(INSN_COST);
   format %{ "addw $dst, $src, LSR $div1" %}
@@ -7615,7 +7909,7 @@
 	      as_Register($src$$reg),
 	      Assembler::LSR, 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
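
 // Java sketch: division by two is strength-reduced to a shift-and-round
 // sequence, which these two rules pick apart; (x >> 31) >>> 31 is simply
 // x >>> 31, hence the single lsrw in signExtract:
 //
 //   int half = x / 2;  // becomes (x + ((x >> 31) >>> 31)) >> 1, where the
 //                      // add is matched by div2Round as addw ..., LSR #31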
 
 // Long Divide
@@ -7627,17 +7921,17 @@
   format %{ "sdiv   $dst, $src1, $src2" %}
 
   ins_encode(aarch64_enc_div(dst, src1, src2));
-  ins_pipe(pipe_class_default);
-%}
-
-instruct signExtractL(iRegLNoSp dst, iRegL src, immL_63 div1, immL_63 div2) %{
-  match(Set dst (URShiftL (RShiftL src div1) div2));
-  ins_cost(INSN_COST);
-  format %{ "lsr $dst, $src, $div1" %}
-  ins_encode %{
-    __ lsr(as_Register($dst$$reg), as_Register($src$$reg), 63);
-  %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ldiv_reg_reg);
+%}
+
+instruct signExtractL(iRegLNoSp dst, iRegL src1, immL_63 div1, immL_63 div2) %{
+  match(Set dst (URShiftL (RShiftL src1 div1) div2));
+  ins_cost(INSN_COST);
+  format %{ "lsr $dst, $src1, $div1" %}
+  ins_encode %{
+    __ lsr(as_Register($dst$$reg), as_Register($src1$$reg), 63);
+  %}
+  ins_pipe(ialu_reg_shift);
 %}
 
 instruct div2RoundL(iRegLNoSp dst, iRegL src, immL_63 div1, immL_63 div2) %{
@@ -7651,7 +7945,7 @@
 	      as_Register($src$$reg),
 	      Assembler::LSR, 63);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 // Integer Remainder
@@ -7664,7 +7958,7 @@
             "msubw($dst, rscratch1, $src2, $src1" %}
 
   ins_encode(aarch64_enc_modw(dst, src1, src2));
-  ins_pipe(pipe_class_default);
+  ins_pipe(idiv_reg_reg);
 %}
 
 // Long Remainder
@@ -7677,7 +7971,7 @@
             "msub($dst, rscratch1, $src2, $src1" %}
 
   ins_encode(aarch64_enc_mod(dst, src1, src2));
-  ins_pipe(pipe_class_default);
+  ins_pipe(ldiv_reg_reg);
 %}
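
 // Java sketch: with no hardware remainder instruction, a % b is computed as
 // a - (a / b) * b, i.e. a divide followed by a multiply-subtract:
 //
 //   int  r = a % b;  // ModI -> sdivw rscratch1, a, b; msubw r, rscratch1, b, a
 //   long s = p % q;  // ModL -> sdiv  rscratch1, p, q; msub  s, rscratch1, q, p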
 
 // Integer Shifts
@@ -7695,7 +7989,7 @@
              as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Left Immediate
@@ -7711,7 +8005,7 @@
             $src2$$constant & 0x1f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
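
 // n.b. the & 0x1f here (and & 0x3f in the long-shift rules below) mirrors
 // Java's shift semantics, which use only the low 5 (resp. 6) bits of the
 // shift count:
 //
 //   int a = x << 37;  // identical to x << 5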
 
 // Shift Right Logical Register
@@ -7727,7 +8021,7 @@
              as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Logical Immediate
@@ -7743,7 +8037,7 @@
             $src2$$constant & 0x1f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Right Arithmetic Register
@@ -7759,7 +8053,7 @@
              as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Arithmetic Immediate
@@ -7775,7 +8069,7 @@
             $src2$$constant & 0x1f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Combined Int Mask and Right Shift (using UBFM)
@@ -7796,7 +8090,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Left Immediate
@@ -7812,7 +8106,7 @@
             $src2$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Right Logical Register
@@ -7828,7 +8122,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Logical Immediate
@@ -7844,7 +8138,23 @@
            $src2$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
+%}
+
+// A special-case pattern for card table stores.
+instruct urShiftP_reg_imm(iRegLNoSp dst, iRegP src1, immI src2) %{
+  match(Set dst (URShiftL (CastP2X src1) src2));
+
+  ins_cost(INSN_COST);
+  format %{ "lsr $dst, p2x($src1), ($src2 & 0x3f)" %}
+
+  ins_encode %{
+    __ lsr(as_Register($dst$$reg),
+           as_Register($src1$$reg),
+           $src2$$constant & 0x3f);
+  %}
+
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Right Arithmetic Register
@@ -7860,7 +8170,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // Shift Right Arithmetic Immediate
@@ -7876,7 +8186,7 @@
            $src2$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // BEGIN This section of the file is automatically generated. Do not edit --------------
@@ -7895,10 +8205,10 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 instruct regI_not_reg(iRegINoSp dst,
-                         iRegI src1, immI_M1 m1,
+                         iRegIorL2I src1, immI_M1 m1,
                          rFlagsReg cr) %{
   match(Set dst (XorI src1 m1));
   ins_cost(INSN_COST);
@@ -7911,24 +8221,24 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct AndI_reg_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2, immI_M1 m1,
+                         iRegIorL2I src1, iRegIorL2I src2, immI_M1 m1,
                          rFlagsReg cr) %{
   match(Set dst (AndI src1 (XorI src2 m1)));
   ins_cost(INSN_COST);
-  format %{ "bic  $dst, $src1, $src2" %}
-
-  ins_encode %{
-    __ bic(as_Register($dst$$reg),
+  format %{ "bicw  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ bicw(as_Register($dst$$reg),
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AndL_reg_not_reg(iRegLNoSp dst,
@@ -7945,24 +8255,24 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct OrI_reg_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2, immI_M1 m1,
+                         iRegIorL2I src1, iRegIorL2I src2, immI_M1 m1,
                          rFlagsReg cr) %{
   match(Set dst (OrI src1 (XorI src2 m1)));
   ins_cost(INSN_COST);
-  format %{ "orn  $dst, $src1, $src2" %}
-
-  ins_encode %{
-    __ orn(as_Register($dst$$reg),
+  format %{ "ornw  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ ornw(as_Register($dst$$reg),
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct OrL_reg_not_reg(iRegLNoSp dst,
@@ -7979,24 +8289,24 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct XorI_reg_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2, immI_M1 m1,
+                         iRegIorL2I src1, iRegIorL2I src2, immI_M1 m1,
                          rFlagsReg cr) %{
   match(Set dst (XorI m1 (XorI src2 src1)));
   ins_cost(INSN_COST);
-  format %{ "eon  $dst, $src1, $src2" %}
-
-  ins_encode %{
-    __ eon(as_Register($dst$$reg),
+  format %{ "eonw  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ eonw(as_Register($dst$$reg),
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct XorL_reg_not_reg(iRegLNoSp dst,
@@ -8013,11 +8323,11 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AndI_reg_URShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (AndI src1 (XorI(URShiftI src2 src3) src4)));
   ins_cost(1.9 * INSN_COST);
@@ -8031,7 +8341,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_URShift_not_reg(iRegLNoSp dst,
@@ -8049,11 +8359,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_RShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (AndI src1 (XorI(RShiftI src2 src3) src4)));
   ins_cost(1.9 * INSN_COST);
@@ -8067,7 +8377,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_RShift_not_reg(iRegLNoSp dst,
@@ -8085,11 +8395,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_LShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (AndI src1 (XorI(LShiftI src2 src3) src4)));
   ins_cost(1.9 * INSN_COST);
@@ -8103,7 +8413,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_LShift_not_reg(iRegLNoSp dst,
@@ -8121,11 +8431,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_URShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (XorI src4 (XorI(URShiftI src2 src3) src1)));
   ins_cost(1.9 * INSN_COST);
@@ -8139,7 +8449,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_URShift_not_reg(iRegLNoSp dst,
@@ -8157,11 +8467,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_RShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (XorI src4 (XorI(RShiftI src2 src3) src1)));
   ins_cost(1.9 * INSN_COST);
@@ -8175,7 +8485,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_RShift_not_reg(iRegLNoSp dst,
@@ -8193,11 +8503,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_LShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (XorI src4 (XorI(LShiftI src2 src3) src1)));
   ins_cost(1.9 * INSN_COST);
@@ -8211,7 +8521,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_LShift_not_reg(iRegLNoSp dst,
@@ -8229,11 +8539,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_URShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (OrI src1 (XorI(URShiftI src2 src3) src4)));
   ins_cost(1.9 * INSN_COST);
@@ -8247,7 +8557,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_URShift_not_reg(iRegLNoSp dst,
@@ -8265,11 +8575,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_RShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (OrI src1 (XorI(RShiftI src2 src3) src4)));
   ins_cost(1.9 * INSN_COST);
@@ -8283,7 +8593,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_RShift_not_reg(iRegLNoSp dst,
@@ -8301,11 +8611,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_LShift_not_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, immI_M1 src4, rFlagsReg cr) %{
   match(Set dst (OrI src1 (XorI(LShiftI src2 src3) src4)));
   ins_cost(1.9 * INSN_COST);
@@ -8319,7 +8629,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_LShift_not_reg(iRegLNoSp dst,
@@ -8337,11 +8647,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_URShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (AndI src1 (URShiftI src2 src3)));
 
@@ -8356,7 +8666,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_URShift_reg(iRegLNoSp dst,
@@ -8375,11 +8685,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_RShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (AndI src1 (RShiftI src2 src3)));
 
@@ -8394,7 +8704,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_RShift_reg(iRegLNoSp dst,
@@ -8413,11 +8723,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndI_reg_LShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (AndI src1 (LShiftI src2 src3)));
 
@@ -8432,7 +8742,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AndL_reg_LShift_reg(iRegLNoSp dst,
@@ -8451,11 +8761,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_URShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (XorI src1 (URShiftI src2 src3)));
 
@@ -8470,7 +8780,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_URShift_reg(iRegLNoSp dst,
@@ -8489,11 +8799,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_RShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (XorI src1 (RShiftI src2 src3)));
 
@@ -8508,7 +8818,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_RShift_reg(iRegLNoSp dst,
@@ -8527,11 +8837,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorI_reg_LShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (XorI src1 (LShiftI src2 src3)));
 
@@ -8546,7 +8856,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct XorL_reg_LShift_reg(iRegLNoSp dst,
@@ -8565,11 +8875,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_URShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (OrI src1 (URShiftI src2 src3)));
 
@@ -8584,7 +8894,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_URShift_reg(iRegLNoSp dst,
@@ -8603,11 +8913,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_RShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (OrI src1 (RShiftI src2 src3)));
 
@@ -8622,7 +8932,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_RShift_reg(iRegLNoSp dst,
@@ -8641,11 +8951,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrI_reg_LShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (OrI src1 (LShiftI src2 src3)));
 
@@ -8660,7 +8970,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct OrL_reg_LShift_reg(iRegLNoSp dst,
@@ -8679,11 +8989,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddI_reg_URShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (AddI src1 (URShiftI src2 src3)));
 
@@ -8698,7 +9008,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddL_reg_URShift_reg(iRegLNoSp dst,
@@ -8717,11 +9027,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddI_reg_RShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (AddI src1 (RShiftI src2 src3)));
 
@@ -8736,7 +9046,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddL_reg_RShift_reg(iRegLNoSp dst,
@@ -8755,11 +9065,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddI_reg_LShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (AddI src1 (LShiftI src2 src3)));
 
@@ -8774,7 +9084,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct AddL_reg_LShift_reg(iRegLNoSp dst,
@@ -8793,11 +9103,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubI_reg_URShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (SubI src1 (URShiftI src2 src3)));
 
@@ -8812,7 +9122,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubL_reg_URShift_reg(iRegLNoSp dst,
@@ -8831,11 +9141,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubI_reg_RShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (SubI src1 (RShiftI src2 src3)));
 
@@ -8850,7 +9160,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubL_reg_RShift_reg(iRegLNoSp dst,
@@ -8869,11 +9179,11 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubI_reg_LShift_reg(iRegINoSp dst,
-                         iRegI src1, iRegI src2,
+                         iRegIorL2I src1, iRegIorL2I src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst (SubI src1 (LShiftI src2 src3)));
 
@@ -8888,7 +9198,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 instruct SubL_reg_LShift_reg(iRegLNoSp dst,
@@ -8907,7 +9217,7 @@
               $src3$$constant & 0x3f);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}
 
 
@@ -8932,12 +9242,12 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Left followed by Shift Right.
 // This idiom is used by the compiler for the i2b bytecode etc.
-instruct sbfmwI(iRegINoSp dst, iRegI src, immI lshift_count, immI rshift_count)
+instruct sbfmwI(iRegINoSp dst, iRegIorL2I src, immI lshift_count, immI rshift_count)
 %{
   match(Set dst (RShiftI (LShiftI src lshift_count) rshift_count));
   // Make sure we are not going to exceed what sbfmw can do.
@@ -8955,7 +9265,7 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
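 
 // For illustration only: in Java terms this rule matches the sign-extending
 // narrowing that i2b performs, e.g.
 //   int i2b(int x) { return (byte) x; }   // lowered to (x << 24) >> 24
 // and collapses it to a single "sbfmw dst, src, #0, #7" (i.e. sxtb).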
 
 // Shift Left followed by Shift Right.
@@ -8978,12 +9288,12 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Shift Left followed by Shift Right.
  // This idiom is used by the compiler for the i2c bytecode and similar
  // zero-extensions.
-instruct ubfmwI(iRegINoSp dst, iRegI src, immI lshift_count, immI rshift_count)
+instruct ubfmwI(iRegINoSp dst, iRegIorL2I src, immI lshift_count, immI rshift_count)
 %{
   match(Set dst (URShiftI (LShiftI src lshift_count) rshift_count));
   // Make sure we are not going to exceed what ubfmw can do.
@@ -9001,11 +9311,11 @@
 	    r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
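 
 // For illustration only: the unsigned pairing corresponds to zero-extension,
 // e.g.
 //   int i2c(int x) { return (char) x; }   // lowered to (x << 16) >>> 16
 // which becomes a single "ubfmw dst, src, #0, #15" (i.e. uxth).
 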
 // Bitfield extract with shift & mask
 
-instruct ubfxwI(iRegINoSp dst, iRegI src, immI rshift, immI_bitmask mask)
+instruct ubfxwI(iRegINoSp dst, iRegIorL2I src, immI rshift, immI_bitmask mask)
 %{
   match(Set dst (AndI (URShiftI src rshift) mask));
 
@@ -9018,7 +9328,7 @@
     __ ubfxw(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
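 
 // For illustration only: a shift-and-mask such as
 //   (x >>> 3) & 0x1f
 // becomes a single "ubfxw dst, src, #3, #5" (lsb 3, width 5); the
 // immI_bitmask operand restricts the mask to a contiguous run of
 // low-order ones.
 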
 instruct ubfxL(iRegLNoSp dst, iRegL src, immI rshift, immL_bitmask mask)
 %{
@@ -9033,7 +9343,7 @@
     __ ubfx(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // We can use ubfx when extending an And with a mask when we know mask
@@ -9051,7 +9361,7 @@
     __ ubfx(as_Register($dst$$reg),
 	    as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Rotations
@@ -9068,10 +9378,10 @@
     __ extr(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 63);
   %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct extrOrI(iRegINoSp dst, iRegI src1, iRegI src2, immI lshift, immI rshift, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg_extr);
+%}
+
+instruct extrOrI(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI lshift, immI rshift, rFlagsReg cr)
 %{
   match(Set dst (OrI (LShiftI src1 lshift) (URShiftI src2 rshift)));
   predicate(0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 31));
@@ -9083,7 +9393,7 @@
     __ extrw(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
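 
 // For illustration only: with src1 == src2 the extract is a rotate, e.g.
 //   (x >>> r) | (x << (32 - r))   // Integer.rotateRight(x, r)
 // becomes "extrw dst, x, x, #r"; the predicate requires the two shift
 // counts to sum to the operand width.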
 
 instruct extrAddL(iRegLNoSp dst, iRegL src1, iRegL src2, immI lshift, immI rshift, rFlagsReg cr)
@@ -9098,10 +9408,10 @@
     __ extr(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 63);
   %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct extrAddI(iRegINoSp dst, iRegI src1, iRegI src2, immI lshift, immI rshift, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg_extr);
+%}
+
+instruct extrAddI(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI lshift, immI rshift, rFlagsReg cr)
 %{
   match(Set dst (AddI (LShiftI src1 lshift) (URShiftI src2 rshift)));
   predicate(0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 31));
@@ -9113,13 +9423,13 @@
     __ extrw(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 
 
 // rol expander
 
-instruct rolL_rReg(iRegL dst, iRegL src, iRegI shift, rFlagsReg cr)
+instruct rolL_rReg(iRegLNoSp dst, iRegL src, iRegI shift, rFlagsReg cr)
 %{
   effect(DEF dst, USE src, USE shift);
 
@@ -9130,12 +9440,12 @@
     __ rorv(as_Register($dst$$reg), as_Register($src$$reg),
 	    rscratch1);
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
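 
 // For illustration only: AArch64 has no rol instruction, so the expander
 // negates the shift count into rscratch1 and rotates right instead,
 // relying on rol(x, s) == ror(x, (64 - s) & 63).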
 
 // rol expander
 
-instruct rolI_rReg(iRegI dst, iRegI src, iRegI shift, rFlagsReg cr)
+instruct rolI_rReg(iRegINoSp dst, iRegI src, iRegI shift, rFlagsReg cr)
 %{
   effect(DEF dst, USE src, USE shift);
 
@@ -9146,10 +9456,10 @@
     __ rorvw(as_Register($dst$$reg), as_Register($src$$reg),
 	    rscratch1);
     %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct rolL_rReg_Var_C_64(iRegL dst, iRegL src, iRegI shift, immI_64 c_64, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg_vshift);
+%}
+
+instruct rolL_rReg_Var_C_64(iRegLNoSp dst, iRegL src, iRegI shift, immI_64 c_64, rFlagsReg cr)
 %{
   match(Set dst (OrL (LShiftL src shift) (URShiftL src (SubI c_64 shift))));
 
@@ -9158,7 +9468,7 @@
   %}
 %}
 
-instruct rolL_rReg_Var_C0(iRegL dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
+instruct rolL_rReg_Var_C0(iRegLNoSp dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
 %{
   match(Set dst (OrL (LShiftL src shift) (URShiftL src (SubI c0 shift))));
 
@@ -9167,7 +9477,7 @@
   %}
 %}
 
-instruct rolI_rReg_Var_C_32(iRegL dst, iRegL src, iRegI shift, immI_32 c_32, rFlagsReg cr)
+instruct rolI_rReg_Var_C_32(iRegLNoSp dst, iRegL src, iRegI shift, immI_32 c_32, rFlagsReg cr)
 %{
   match(Set dst (OrI (LShiftI src shift) (URShiftI src (SubI c_32 shift))));
 
@@ -9176,7 +9486,7 @@
   %}
 %}
 
-instruct rolI_rReg_Var_C0(iRegL dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
+instruct rolI_rReg_Var_C0(iRegLNoSp dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
 %{
   match(Set dst (OrI (LShiftI src shift) (URShiftI src (SubI c0 shift))));
 
@@ -9187,7 +9497,7 @@
 
 // ror expander
 
-instruct rorL_rReg(iRegL dst, iRegL src, iRegI shift, rFlagsReg cr)
+instruct rorL_rReg(iRegLNoSp dst, iRegL src, iRegI shift, rFlagsReg cr)
 %{
   effect(DEF dst, USE src, USE shift);
 
@@ -9197,12 +9507,12 @@
     __ rorv(as_Register($dst$$reg), as_Register($src$$reg),
 	    as_Register($shift$$reg));
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}
 
 // ror expander
 
-instruct rorI_rReg(iRegI dst, iRegI src, iRegI shift, rFlagsReg cr)
+instruct rorI_rReg(iRegINoSp dst, iRegI src, iRegI shift, rFlagsReg cr)
 %{
   effect(DEF dst, USE src, USE shift);
 
@@ -9212,10 +9522,10 @@
     __ rorvw(as_Register($dst$$reg), as_Register($src$$reg),
 	    as_Register($shift$$reg));
     %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct rorL_rReg_Var_C_64(iRegL dst, iRegL src, iRegI shift, immI_64 c_64, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg_vshift);
+%}
+
+instruct rorL_rReg_Var_C_64(iRegLNoSp dst, iRegL src, iRegI shift, immI_64 c_64, rFlagsReg cr)
 %{
   match(Set dst (OrL (URShiftL src shift) (LShiftL src (SubI c_64 shift))));
 
@@ -9224,7 +9534,7 @@
   %}
 %}
 
-instruct rorL_rReg_Var_C0(iRegL dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
+instruct rorL_rReg_Var_C0(iRegLNoSp dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
 %{
   match(Set dst (OrL (URShiftL src shift) (LShiftL src (SubI c0 shift))));
 
@@ -9233,7 +9543,7 @@
   %}
 %}
 
-instruct rorI_rReg_Var_C_32(iRegL dst, iRegL src, iRegI shift, immI_32 c_32, rFlagsReg cr)
+instruct rorI_rReg_Var_C_32(iRegLNoSp dst, iRegL src, iRegI shift, immI_32 c_32, rFlagsReg cr)
 %{
   match(Set dst (OrI (URShiftI src shift) (LShiftI src (SubI c_32 shift))));
 
@@ -9242,7 +9552,7 @@
   %}
 %}
 
-instruct rorI_rReg_Var_C0(iRegL dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
+instruct rorI_rReg_Var_C0(iRegLNoSp dst, iRegL src, iRegI shift, immI0 c0, rFlagsReg cr)
 %{
   match(Set dst (OrI (URShiftI src shift) (LShiftI src (SubI c0 shift))));
 
@@ -9263,7 +9573,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %};
 
 instruct SubExtI(iRegLNoSp dst, iRegL src1, iRegIorL2I src2, rFlagsReg cr)
@@ -9276,11 +9586,11 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %};
 
 
-instruct AddExtI_sxth(iRegINoSp dst, iRegI src1, iRegI src2, immI_16 lshift, immI_16 rshift, rFlagsReg cr)
+instruct AddExtI_sxth(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_16 lshift, immI_16 rshift, rFlagsReg cr)
 %{
   match(Set dst (AddI src1 (RShiftI (LShiftI src2 lshift) rshift)));
   ins_cost(INSN_COST);
@@ -9290,10 +9600,10 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxth);
    %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct AddExtI_sxtb(iRegINoSp dst, iRegI src1, iRegI src2, immI_24 lshift, immI_24 rshift, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct AddExtI_sxtb(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_24 lshift, immI_24 rshift, rFlagsReg cr)
 %{
   match(Set dst (AddI src1 (RShiftI (LShiftI src2 lshift) rshift)));
   ins_cost(INSN_COST);
@@ -9303,10 +9613,10 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtb);
    %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct AddExtI_uxtb(iRegINoSp dst, iRegI src1, iRegI src2, immI_24 lshift, immI_24 rshift, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct AddExtI_uxtb(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_24 lshift, immI_24 rshift, rFlagsReg cr)
 %{
   match(Set dst (AddI src1 (URShiftI (LShiftI src2 lshift) rshift)));
   ins_cost(INSN_COST);
@@ -9316,7 +9626,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
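 
 // For illustration only: these shift-pair rules let an add absorb a
 // narrowing cast, e.g.
 //   a + (short) b   // lowered to AddI a ((b << 16) >> 16)
 // becomes "add wd, wa, wb, sxth" using the extended-register form.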
 
 instruct AddExtL_sxth(iRegLNoSp dst, iRegL src1, iRegL src2, immI_48 lshift, immI_48 rshift, rFlagsReg cr)
@@ -9329,7 +9639,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_sxtw(iRegLNoSp dst, iRegL src1, iRegL src2, immI_32 lshift, immI_32 rshift, rFlagsReg cr)
@@ -9342,7 +9652,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_sxtb(iRegLNoSp dst, iRegL src1, iRegL src2, immI_56 lshift, immI_56 rshift, rFlagsReg cr)
@@ -9355,7 +9665,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::sxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxtb(iRegLNoSp dst, iRegL src1, iRegL src2, immI_56 lshift, immI_56 rshift, rFlagsReg cr)
@@ -9368,11 +9678,11 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
-%}
-
-
-instruct AddExtI_uxtb_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_255 mask, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+
+instruct AddExtI_uxtb_and(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_255 mask, rFlagsReg cr)
 %{
   match(Set dst (AddI src1 (AndI src2 mask)));
   ins_cost(INSN_COST);
@@ -9382,10 +9692,10 @@
      __ addw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct AddExtI_uxth_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_65535 mask, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct AddExtI_uxth_and(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_65535 mask, rFlagsReg cr)
 %{
   match(Set dst (AddI src1 (AndI src2 mask)));
   ins_cost(INSN_COST);
@@ -9395,7 +9705,7 @@
      __ addw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxtb_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_255 mask, rFlagsReg cr)
@@ -9408,7 +9718,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxth_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_65535 mask, rFlagsReg cr)
@@ -9421,7 +9731,7 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct AddExtL_uxtw_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_4294967295 mask, rFlagsReg cr)
@@ -9434,10 +9744,10 @@
      __ add(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtw);
    %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct SubExtI_uxtb_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_255 mask, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct SubExtI_uxtb_and(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_255 mask, rFlagsReg cr)
 %{
   match(Set dst (SubI src1 (AndI src2 mask)));
   ins_cost(INSN_COST);
@@ -9447,10 +9757,10 @@
      __ subw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
-%}
-
-instruct SubExtI_uxth_and(iRegINoSp dst, iRegI src1, iRegI src2, immI_65535 mask, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct SubExtI_uxth_and(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, immI_65535 mask, rFlagsReg cr)
 %{
   match(Set dst (SubI src1 (AndI src2 mask)));
   ins_cost(INSN_COST);
@@ -9460,7 +9770,7 @@
      __ subw(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtL_uxtb_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_255 mask, rFlagsReg cr)
@@ -9473,7 +9783,7 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtb);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtL_uxth_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_65535 mask, rFlagsReg cr)
@@ -9486,7 +9796,7 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxth);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct SubExtL_uxtw_and(iRegLNoSp dst, iRegL src1, iRegL src2, immL_4294967295 mask, rFlagsReg cr)
@@ -9499,12 +9809,11 @@
      __ sub(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::uxtw);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // END This section of the file is automatically generated. Do not edit --------------
 
-
 // ============================================================================
 // Floating Point Arithmetic Instructions
 
@@ -9861,7 +10170,7 @@
 	    as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct andI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immILog src2, rFlagsReg cr) %{
@@ -9876,7 +10185,7 @@
 	    (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Or Instructions
@@ -9893,7 +10202,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct orI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immILog src2) %{
@@ -9908,7 +10217,7 @@
             (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Xor Instructions
@@ -9925,7 +10234,7 @@
             as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct xorI_reg_imm(iRegINoSp dst, iRegIorL2I src1, immILog src2) %{
@@ -9940,7 +10249,7 @@
             (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Long Logical Instructions
@@ -9958,7 +10267,7 @@
 	    as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct andL_reg_imm(iRegLNoSp dst, iRegL src1, immLLog src2, rFlagsReg cr) %{
@@ -9973,7 +10282,7 @@
             (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Or Instructions
@@ -9990,7 +10299,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct orL_reg_imm(iRegLNoSp dst, iRegL src1, immLLog src2) %{
@@ -10005,7 +10314,7 @@
            (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 // Xor Instructions
@@ -10022,7 +10331,7 @@
            as_Register($src2$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}
 
 instruct xorL_reg_imm(iRegLNoSp dst, iRegL src1, immLLog src2) %{
@@ -10037,7 +10346,7 @@
            (unsigned long)($src2$$constant));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_imm);
 %}
 
 instruct convI2L_reg_reg(iRegLNoSp dst, iRegIorL2I src)
@@ -10049,11 +10358,11 @@
   ins_encode %{
     __ sbfm($dst$$Register, $src$$Register, 0, 31);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // This pattern occurs in big-math (e.g. BigInteger) arithmetic.
-instruct convUI2L_reg_reg(iRegLNoSp dst, iRegI src, immL_32bits mask)
+instruct convUI2L_reg_reg(iRegLNoSp dst, iRegIorL2I src, immL_32bits mask)
 %{
   match(Set dst (AndL (ConvI2L src) mask));
 
@@ -10063,7 +10372,7 @@
     __ ubfm($dst$$Register, $src$$Register, 0, 31);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
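 
 // For illustration only: this is the unsigned int-to-long widening, e.g.
 //   long u = x & 0xffffffffL;   // Integer.toUnsignedLong(x)
 // performed by "ubfm dst, src, #0, #31" as a single uxtw-style move.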
 
 instruct convL2I_reg(iRegINoSp dst, iRegL src) %{
@@ -10076,10 +10385,10 @@
     __ movw(as_Register($dst$$reg), as_Register($src$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct convI2B(iRegINoSp dst, iRegI src, rFlagsReg cr)
+  ins_pipe(ialu_reg);
+%}
+
+instruct convI2B(iRegINoSp dst, iRegIorL2I src, rFlagsReg cr)
 %{
   match(Set dst (Conv2B src));
   effect(KILL cr);
@@ -10094,7 +10403,7 @@
     __ cset(as_Register($dst$$reg), Assembler::NE);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct convP2B(iRegINoSp dst, iRegP src, rFlagsReg cr)
@@ -10112,7 +10421,7 @@
     __ cset(as_Register($dst$$reg), Assembler::NE);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}
 
 instruct convD2F_reg(vRegF dst, vRegD src) %{
@@ -10167,7 +10476,7 @@
   ins_pipe(pipe_class_default);
 %}
 
-instruct convI2F_reg_reg(vRegF dst, iRegI src) %{
+instruct convI2F_reg_reg(vRegF dst, iRegIorL2I src) %{
   match(Set dst (ConvI2F src));
 
   ins_cost(INSN_COST * 5);
@@ -10219,7 +10528,7 @@
   ins_pipe(pipe_class_default);
 %}
 
-instruct convI2D_reg_reg(vRegD dst, iRegI src) %{
+instruct convI2D_reg_reg(vRegD dst, iRegIorL2I src) %{
   match(Set dst (ConvI2D src));
 
   ins_cost(INSN_COST * 5);
@@ -10261,7 +10570,7 @@
     __ ldrw($dst$$Register, Address(sp, $src$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_reg);
 
 %}
 
@@ -10297,7 +10606,7 @@
     __ ldr($dst$$Register, Address(sp, $src$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(iload_reg_reg);
 
 %}
 
@@ -10351,7 +10660,7 @@
     __ strw($src$$Register, Address(sp, $dst$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_reg);
 
 %}
 
@@ -10387,7 +10696,7 @@
     __ str($src$$Register, Address(sp, $dst$$disp));
   %}
 
-  ins_pipe(pipe_class_memory);
+  ins_pipe(istore_reg_reg);
 
 %}
 
@@ -10482,7 +10791,7 @@
 // ============================================================================
 // Overflow Math Instructions
 
-instruct overflowAddI_reg_reg(rFlagsReg cr, iRegI op1, iRegI op2)
+instruct overflowAddI_reg_reg(rFlagsReg cr, iRegIorL2I op1, iRegIorL2I op2)
 %{
   match(Set cr (OverflowAddI op1 op2));
 
@@ -10492,10 +10801,10 @@
     __ cmnw($op1$$Register, $op2$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowAddI_reg_imm(rFlagsReg cr, iRegI op1, immIAddSub op2)
+  ins_pipe(icmp_reg_reg);
+%}
+
+instruct overflowAddI_reg_imm(rFlagsReg cr, iRegIorL2I op1, immIAddSub op2)
 %{
   match(Set cr (OverflowAddI op1 op2));
 
@@ -10505,7 +10814,7 @@
     __ cmnw($op1$$Register, $op2$$constant);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icmp_reg_imm);
 %}
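 
 // For illustration only: these overflow rules back intrinsics such as
 // Math.addExact(int, int); "cmnw op1, op2" adds into the flags only, and
 // the user of cr branches or deoptimizes on VS (signed overflow).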
 
 instruct overflowAddL_reg_reg(rFlagsReg cr, iRegL op1, iRegL op2)
@@ -10518,7 +10827,7 @@
     __ cmn($op1$$Register, $op2$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct overflowAddL_reg_imm(rFlagsReg cr, iRegL op1, immLAddSub op2)
@@ -10531,10 +10840,10 @@
     __ cmn($op1$$Register, $op2$$constant);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowSubI_reg_reg(rFlagsReg cr, iRegI op1, iRegI op2)
+  ins_pipe(icmp_reg_imm);
+%}
+
+instruct overflowSubI_reg_reg(rFlagsReg cr, iRegIorL2I op1, iRegIorL2I op2)
 %{
   match(Set cr (OverflowSubI op1 op2));
 
@@ -10544,10 +10853,10 @@
     __ cmpw($op1$$Register, $op2$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowSubI_reg_imm(rFlagsReg cr, iRegI op1, immIAddSub op2)
+  ins_pipe(icmp_reg_reg);
+%}
+
+instruct overflowSubI_reg_imm(rFlagsReg cr, iRegIorL2I op1, immIAddSub op2)
 %{
   match(Set cr (OverflowSubI op1 op2));
 
@@ -10557,7 +10866,7 @@
     __ cmpw($op1$$Register, $op2$$constant);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct overflowSubL_reg_reg(rFlagsReg cr, iRegL op1, iRegL op2)
@@ -10570,7 +10879,7 @@
     __ cmp($op1$$Register, $op2$$Register);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct overflowSubL_reg_imm(rFlagsReg cr, iRegL op1, immLAddSub op2)
@@ -10583,36 +10892,36 @@
     __ cmp($op1$$Register, $op2$$constant);
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowNegI_reg(rFlagsReg cr, immI0 zero, iRegI op2)
-%{
-  match(Set cr (OverflowSubI zero op2));
-
-  format %{ "cmpw  zr, $op2\t# overflow check int" %}
-  ins_cost(INSN_COST);
-  ins_encode %{
-    __ cmpw(zr, $op2$$Register);
-  %}
-
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowNegL_reg(rFlagsReg cr, immI0 zero, iRegL op2)
-%{
-  match(Set cr (OverflowSubL zero op2));
-
-  format %{ "cmp   zr, $op2\t# overflow check long" %}
-  ins_cost(INSN_COST);
-  ins_encode %{
-    __ cmp(zr, $op2$$Register);
-  %}
-
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowMulI_reg(rFlagsReg cr, iRegI op1, iRegI op2)
+  ins_pipe(icmp_reg_imm);
+%}
+
+instruct overflowNegI_reg(rFlagsReg cr, immI0 zero, iRegIorL2I op1)
+%{
+  match(Set cr (OverflowSubI zero op1));
+
+  format %{ "cmpw  zr, $op1\t# overflow check int" %}
+  ins_cost(INSN_COST);
+  ins_encode %{
+    __ cmpw(zr, $op1$$Register);
+  %}
+
+  ins_pipe(icmp_reg_imm);
+%}
+
+instruct overflowNegL_reg(rFlagsReg cr, immI0 zero, iRegL op1)
+%{
+  match(Set cr (OverflowSubL zero op1));
+
+  format %{ "cmp   zr, $op1\t# overflow check long" %}
+  ins_cost(INSN_COST);
+  ins_encode %{
+    __ cmp(zr, $op1$$Register);
+  %}
+
+  ins_pipe(icmp_reg_imm);
+%}
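+
+// For illustration only: negation overflows only for the most negative
+// value, e.g. Math.negateExact(Integer.MIN_VALUE); "cmp zr, op1" computes
+// 0 - op1 into the flags and sets V exactly in that case.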
+
+instruct overflowMulI_reg(rFlagsReg cr, iRegIorL2I op1, iRegIorL2I op2)
 %{
   match(Set cr (OverflowMulI op1 op2));
 
@@ -10630,10 +10939,10 @@
     __ cmpw(rscratch1, 1);                             // 0x80000000 - 1 => VS
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct overflowMulI_reg_branch(cmpOp cmp, iRegI op1, iRegI op2, label labl, rFlagsReg cr)
+  ins_pipe(pipe_slow);
+%}
+
+instruct overflowMulI_reg_branch(cmpOp cmp, iRegIorL2I op1, iRegIorL2I op2, label labl, rFlagsReg cr)
 %{
   match(If cmp (OverflowMulI op1 op2));
   predicate(n->in(1)->as_Bool()->_test._test == BoolTest::overflow
@@ -10652,7 +10961,7 @@
     __ br(cond == Assembler::VS ? Assembler::NE : Assembler::EQ, *L);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_serial);
 %}
 
 instruct overflowMulL_reg(rFlagsReg cr, iRegL op1, iRegL op2)
@@ -10675,7 +10984,7 @@
     __ cmpw(rscratch1, 1);                             // 0x80000000 - 1 => VS
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_slow);
 %}
 
 instruct overflowMulL_reg_branch(cmpOp cmp, iRegL op1, iRegL op2, label labl, rFlagsReg cr)
@@ -10699,7 +11008,7 @@
     __ br(cond == Assembler::VS ? Assembler::NE : Assembler::EQ, *L);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_serial);
 %}
 
 // ============================================================================
@@ -10716,7 +11025,7 @@
 
   ins_encode(aarch64_enc_cmpw(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compI_reg_immI0(rFlagsReg cr, iRegI op1, immI0 zero)
@@ -10730,7 +11039,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, zero));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compI_reg_immIAddSub(rFlagsReg cr, iRegI op1, immIAddSub op2)
@@ -10744,7 +11053,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compI_reg_immI(rFlagsReg cr, iRegI op1, immI op2)
@@ -10758,7 +11067,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 // Unsigned compare Instructions; really, same as signed compare
@@ -10776,7 +11085,7 @@
 
   ins_encode(aarch64_enc_cmpw(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compU_reg_immI0(rFlagsRegU cr, iRegI op1, immI0 zero)
@@ -10790,7 +11099,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, zero));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compU_reg_immIAddSub(rFlagsRegU cr, iRegI op1, immIAddSub op2)
@@ -10804,7 +11113,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm_addsub(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compU_reg_immI(rFlagsRegU cr, iRegI op1, immI op2)
@@ -10818,7 +11127,7 @@
 
   ins_encode(aarch64_enc_cmpw_imm(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compL_reg_reg(rFlagsReg cr, iRegL op1, iRegL op2)
@@ -10832,7 +11141,7 @@
 
   ins_encode(aarch64_enc_cmp(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compL_reg_immI0(rFlagsReg cr, iRegL op1, immI0 zero)
@@ -10846,7 +11155,7 @@
 
   ins_encode(aarch64_enc_cmp_imm_addsub(op1, zero));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compL_reg_immLAddSub(rFlagsReg cr, iRegL op1, immLAddSub op2)
@@ -10860,7 +11169,7 @@
 
   ins_encode(aarch64_enc_cmp_imm_addsub(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compL_reg_immL(rFlagsReg cr, iRegL op1, immL op2)
@@ -10874,7 +11183,7 @@
 
   ins_encode(aarch64_enc_cmp_imm(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct compP_reg_reg(rFlagsRegU cr, iRegP op1, iRegP op2)
@@ -10888,7 +11197,7 @@
 
   ins_encode(aarch64_enc_cmpp(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct compN_reg_reg(rFlagsRegU cr, iRegN op1, iRegN op2)
@@ -10902,7 +11211,7 @@
 
   ins_encode(aarch64_enc_cmpn(op1, op2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_reg);
 %}
 
 instruct testP_reg(rFlagsRegU cr, iRegP op1, immP0 zero)
@@ -10916,7 +11225,7 @@
 
   ins_encode(aarch64_enc_testp(op1));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 instruct testN_reg(rFlagsRegU cr, iRegN op1, immN0 zero)
@@ -10930,7 +11239,7 @@
 
   ins_encode(aarch64_enc_testn(op1));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(icmp_reg_imm);
 %}
 
 // FP comparisons
@@ -10965,6 +11274,7 @@
 
   ins_pipe(pipe_class_compare);
 %}
 
 instruct compD_reg_reg(rFlagsReg cr, vRegD src1, vRegD src2)
 %{
@@ -11102,7 +11412,30 @@
 
 %}
 
-instruct cmpLTMask_reg_reg(iRegINoSp dst, iRegI p, iRegI q, rFlagsReg cr)
+// Manifest a CmpL result in an integer register.
+// (src1 < src2) ? -1 : ((src1 > src2) ? 1 : 0)
+instruct cmpL3_reg_reg(iRegINoSp dst, iRegL src1, iRegL src2, rFlagsReg flags)
+%{
+  match(Set dst (CmpL3 src1 src2));
+  effect(KILL flags);
+
+  ins_cost(INSN_COST * 6);
+  format %{
+      "cmp $src1, $src2\n\t"
+      "csetw $dst, ne\n\t"
+      "cnegw $dst, lt"
+  %}
+  ins_encode %{
+    __ cmp($src1$$Register, $src2$$Register);
+    __ csetw($dst$$Register, Assembler::NE);
+    __ cnegw($dst$$Register, $dst$$Register, Assembler::LT);
+  %}
+
+  ins_pipe(ialu_reg_reg);
+%}
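+
+// For illustration only: this is the lcmp / Long.compare(a, b) idiom.
+// After the compare, csetw leaves 0 on equality and 1 otherwise, and
+// cnegw negates the result when the comparison was "less than",
+// producing -1/0/1 in two instructions.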
+
+instruct cmpLTMask_reg_reg(iRegINoSp dst, iRegIorL2I p, iRegIorL2I q, rFlagsReg cr)
 %{
   match(Set dst (CmpLTMask p q));
   effect(KILL cr);
@@ -11120,10 +11453,10 @@
     __ subw(as_Register($dst$$reg), zr, as_Register($dst$$reg));
   %}
 
-  ins_pipe(pipe_class_default);
-%}
-
-instruct cmpLTMask_reg_zero(iRegINoSp dst, iRegI src, immI0 zero, rFlagsReg cr)
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct cmpLTMask_reg_zero(iRegINoSp dst, iRegIorL2I src, immI0 zero, rFlagsReg cr)
 %{
   match(Set dst (CmpLTMask src zero));
   effect(KILL cr);
@@ -11136,7 +11469,7 @@
     __ asrw(as_Register($dst$$reg), as_Register($src$$reg), 31);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
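 
 // For illustration only: against zero the mask is just the sign bit
 // broadcast, i.e.
 //   (src < 0) ? -1 : 0
 // which "asrw dst, src, #31" produces directly.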
 
 // ============================================================================
@@ -11164,8 +11497,9 @@
              Assembler::LT);
   %}
 
-  ins_pipe(pipe_class_compare);
-%}
+  ins_pipe(ialu_reg_reg);
+%}
 
 instruct maxI_rReg(iRegINoSp dst, iRegI src1, iRegI src2, rFlagsReg cr)
 %{
@@ -11189,7 +11523,7 @@
              Assembler::GT);
   %}
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(ialu_reg_reg);
 %}
 
 // ============================================================================
@@ -11207,7 +11541,7 @@
 
   ins_encode(aarch64_enc_b(lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // Conditional Near Branch
@@ -11228,7 +11562,7 @@
 
   ins_encode(aarch64_enc_br_con(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch_cond);
 %}
 
 // Conditional Near Branch Unsigned
@@ -11249,14 +11583,14 @@
 
   ins_encode(aarch64_enc_br_conU(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch_cond);
 %}
 
 // Make use of CBZ and CBNZ.  These instructions, as well as being
 // shorter than (cmp; branch), have the additional benefit of not
 // killing the flags.
 
-instruct cmpI_imm0_branch(cmpOp cmp, iRegI op1, immI0 op2, label labl, rFlagsReg cr) %{
+instruct cmpI_imm0_branch(cmpOp cmp, iRegIorL2I op1, immI0 op2, label labl, rFlagsReg cr) %{
   match(If cmp (CmpI op1 op2));
   predicate(n->in(1)->as_Bool()->_test._test == BoolTest::ne
 	    || n->in(1)->as_Bool()->_test._test == BoolTest::eq);
@@ -11272,7 +11606,7 @@
     else
       __ cbnzw($op1$$Register, *L);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_cmp_branch);
 %}
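 
 // For illustration only: a test such as
 //   if (x == 0) { ... }
 // compiles to a single "cbzw wx, L" here rather than "cmpw wx, #0" plus
 // "b.eq L", leaving the flags free for surrounding code.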
 
 instruct cmpL_imm0_branch(cmpOp cmp, iRegL op1, immL0 op2, label labl, rFlagsReg cr) %{
@@ -11291,7 +11625,7 @@
     else
       __ cbnz($op1$$Register, *L);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_cmp_branch);
 %}
 
 instruct cmpP_imm0_branch(cmpOp cmp, iRegP op1, immP0 op2, label labl, rFlagsReg cr) %{
@@ -11310,7 +11644,7 @@
     else
       __ cbnz($op1$$Register, *L);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_cmp_branch);
 %}
 
 // Conditional Far Branch
@@ -11331,7 +11665,7 @@
 
   ins_encode(aarch64_enc_br_con(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // counted loop end branch near Unsigned
@@ -11348,7 +11682,7 @@
 
   ins_encode(aarch64_enc_br_conU(cmp, lbl));
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // counted loop end branch far
@@ -11370,7 +11704,7 @@
 
   ins_encode(aarch64_enc_fast_lock(object, box, tmp, tmp2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(pipe_serial);
 %}
 
 instruct cmpFastUnlock(rFlagsReg cr, iRegP object, iRegP box, iRegPNoSp tmp, iRegPNoSp tmp2)
@@ -11383,7 +11717,7 @@
 
   ins_encode(aarch64_enc_fast_unlock(object, box, tmp, tmp2));
 
-  ins_pipe(pipe_class_compare);
+  ins_pipe(pipe_serial);
 %}
 
 
@@ -11403,7 +11737,7 @@
   ins_encode %{
     __ read_polling_page(as_Register($poll$$reg), relocInfo::poll_type);
   %}
-  ins_pipe(pipe_class_memory);
+  ins_pipe(pipe_serial); // ins_pipe(iload_reg_mem);
 %}
 
 
@@ -11430,6 +11764,8 @@
   ins_pipe(pipe_class_call);
 %}
 
 // Call Java Static Instruction (method handle version)
 
 instruct CallStaticJavaDirectHandle(method meth, iRegP_FP reg_mh_save)
@@ -11563,7 +11899,7 @@
 
   ins_encode( /*empty*/ );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_class_empty);
 %}
 
 // Rethrow exception: The exception oop will come in the first
@@ -11590,7 +11926,7 @@
 
   ins_encode( aarch64_enc_ret() );
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(pipe_branch);
 %}
 
 // Die now.
@@ -11662,6 +11998,44 @@
   ins_pipe(pipe_class_memory);
 %}
 
+instruct string_indexof(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
+       iRegI_R0 result, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result" %}
+
+  ins_encode %{
+    __ string_indexof($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, $cnt2$$Register,
+                      $tmp1$$Register, $tmp2$$Register,
+                      $tmp3$$Register, $tmp4$$Register,
+                      -1, $result$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
+instruct string_indexof_con(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
+                 immI_le_4 int_cnt2, iRegI_R0 result, iRegI tmp1, iRegI tmp2,
+                 iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result" %}
+
+  ins_encode %{
+    int icnt2 = (int)$int_cnt2$$constant;
+    __ string_indexof($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, zr,
+                      $tmp1$$Register, $tmp2$$Register,
+                      $tmp3$$Register, $tmp4$$Register,
+                      icnt2, $result$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
 instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
                         iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
 %{
@@ -11677,6 +12051,39 @@
   ins_pipe(pipe_class_memory);
 %}
 
+instruct array_equals(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
+                      iRegP_R10 tmp, rFlagsReg cr)
+%{
+  match(Set result (AryEq ary1 ary2));
+  effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr);
+
+  format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
+  ins_encode %{
+    __ char_arrays_equals($ary1$$Register, $ary2$$Register,
+                          $result$$Register, $tmp$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
+// encode char[] to byte[] in ISO_8859_1
+instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
+                          vRegD_V0 Vtmp1, vRegD_V1 Vtmp2,
+                          vRegD_V2 Vtmp3, vRegD_V3 Vtmp4,
+                          iRegI_R0 result, rFlagsReg cr)
+%{
+  match(Set result (EncodeISOArray src (Binary dst len)));
+  effect(USE_KILL src, USE_KILL dst, USE_KILL len,
+         KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr);
+
+  format %{ "Encode array $src,$dst,$len -> $result" %}
+  ins_encode %{
+    __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
+         $result$$Register, $Vtmp1$$FloatRegister,  $Vtmp2$$FloatRegister,
+         $Vtmp3$$FloatRegister,  $Vtmp4$$FloatRegister);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
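+
+// For illustration only: the EncodeISOArray node yields the number of
+// characters actually encoded; encoding is expected to stop at the first
+// char above 0xff, with the caller handling the remainder in a scalar loop.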
+
 // ============================================================================
 // This name is KNOWN by the ADLC and cannot be changed.
 // The ADLC forces a 'TypeRawPtr::BOTTOM' output type
@@ -11696,7 +12103,1363 @@
   ins_pipe(pipe_class_empty);
 %}
 
-
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load vector (32 bits)
+instruct loadV4(vecD dst, vmem mem)
+%{
+  predicate(n->as_LoadVector()->memory_size() == 4);
+  match(Set dst (LoadVector mem));
+  ins_cost(4 * INSN_COST);
+  format %{ "ldrs   $dst,$mem\t# vector (32 bits)" %}
+  ins_encode( aarch64_enc_ldrvS(dst, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Load vector (64 bits)
+instruct loadV8(vecD dst, vmem mem)
+%{
+  predicate(n->as_LoadVector()->memory_size() == 8);
+  match(Set dst (LoadVector mem));
+  ins_cost(4 * INSN_COST);
+  format %{ "ldrd   $dst,$mem\t# vector (64 bits)" %}
+  ins_encode( aarch64_enc_ldrvD(dst, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Load Vector (128 bits)
+instruct loadV16(vecX dst, vmem mem)
+%{
+  predicate(n->as_LoadVector()->memory_size() == 16);
+  match(Set dst (LoadVector mem));
+  ins_cost(4 * INSN_COST);
+  format %{ "ldrq   $dst,$mem\t# vector (128 bits)" %}
+  ins_encode( aarch64_enc_ldrvQ(dst, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Vector (32 bits)
+instruct storeV4(vecD src, vmem mem)
+%{
+  predicate(n->as_StoreVector()->memory_size() == 4);
+  match(Set mem (StoreVector mem src));
+  ins_cost(4 * INSN_COST);
+  format %{ "strs   $mem,$src\t# vector (32 bits)" %}
+  ins_encode( aarch64_enc_strvS(src, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Vector (64 bits)
+instruct storeV8(vecD src, vmem mem)
+%{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem src));
+  ins_cost(4 * INSN_COST);
+  format %{ "strd   $mem,$src\t# vector (64 bits)" %}
+  ins_encode( aarch64_enc_strvD(src, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Vector (128 bits)
+instruct storeV16(vecX src, vmem mem)
+%{
+  predicate(n->as_StoreVector()->memory_size() == 16);
+  match(Set mem (StoreVector mem src));
+  ins_cost(4 * INSN_COST);
+  format %{ "strq   $mem,$src\t# vector (128 bits)" %}
+  ins_encode( aarch64_enc_strvQ(src, mem) );
+  ins_pipe(pipe_class_memory);
+%}
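+
+// For illustration only: vecD operands occupy the low 64 bits of a SIMD
+// register (ldrd/strd), vecX operands the full 128 bits (ldrq/strq); the
+// memory_size() predicates above select the matching access width.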
+
+instruct replicate8B(vecD dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (8B)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T8B, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate16B(vecX dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (16B)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate8B_imm(vecD dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(8B)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T8B, $con$$constant & 0xff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate16B_imm(vecX dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateB con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(16B)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T16B, $con$$constant & 0xff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4S(vecD dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (4S)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T4H, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate8S(vecX dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (8S)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T8H, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4S_imm(vecD dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(4H)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T4H, $con$$constant & 0xffff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate8S_imm(vecX dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateS con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(8H)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T8H, $con$$constant & 0xffff);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2I(vecD dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2I)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2S, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4I(vecX dst, iRegIorL2I src)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (4I)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T4S, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2I_imm(vecD dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(2I)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T2S, $con$$constant);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4I_imm(vecX dst, immI con)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(4I)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T4S, $con$$constant);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2L(vecX dst, iRegL src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2L)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2D, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2L_zero(vecX dst, immI0 zero)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI zero));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $zero\t# vector(4I)" %}
+  ins_encode %{
+    __ eor(as_FloatRegister($dst$$reg), __ T16B,
+           as_FloatRegister($dst$$reg),
+           as_FloatRegister($dst$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
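+
+// For illustration only: instead of moving an immediate, the rule above
+// materializes the zero vector by XORing the destination with itself,
+// independent of the register's previous contents.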
+
+instruct replicate2F(vecD dst, vRegF src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2F)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2S,
+           as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4F(vecX dst, vRegF src)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (4F)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T4S,
+           as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2D(vecX dst, vRegD src)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2D)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2D,
+           as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// ====================VECTOR ARITHMETIC=======================================
+
+// --------------------------------- ADD --------------------------------------
+
+instruct vadd8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd16B(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (AddVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd4S(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (4H)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd8S(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (8H)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd2I(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd4I(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd2L(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVL src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (2L)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fadd  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fadd(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd4F(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fadd  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fadd(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd2D(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fadd  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fadd(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- SUB --------------------------------------
+
+instruct vsub8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub16B(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (SubVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub4S(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (4H)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub8S(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (SubVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (8H)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub2I(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub4I(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub2L(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVL src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (2L)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fsub  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fsub(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub4F(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fsub  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fsub(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub2D(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fsub  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fsub(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
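+// Note (illustrative, not a matcher rule): each SubV* pattern above
+// implements the element-wise loop that C2's superword (SLP) pass
+// vectorizes, e.g. for SubVI:
+//
+//   for (int i = 0; i < n; i++) { c[i] = a[i] - b[i]; }
+//
+// with two ints packed into a vecD register or four into a vecX.
+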
+// --------------------------------- MUL --------------------------------------
+
+instruct vmul4S(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (4H)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul8S(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (MulVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (8H)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul2I(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul4I(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fmul  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fmul(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul4F(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fmul  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fmul(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul2D(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fmul  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fmul(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
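+// Note: AdvSIMD MUL has no 2D (64-bit element) arrangement, so there is
+// no vmul2L rule here and C2's MulVL is not matched.
+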
+// --------------------------------- DIV --------------------------------------
+
+instruct vdiv2F(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fdiv  $dst,$src1,$src2\t# vector (2S)" %}
+  ins_encode %{
+    __ fdiv(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vdiv4F(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (DivVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fdiv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fdiv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vdiv2D(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fdiv  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fdiv(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- AND --------------------------------------
+
+instruct vand8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (AndV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "and  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ andr(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vand16B(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (AndV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "and  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ andr(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- OR ---------------------------------------
+
+instruct vor8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (OrV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "orr  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ orr(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vor16B(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (OrV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "orr  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ orr(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- XOR --------------------------------------
+
+instruct vxor8B(vecD dst, vecD src1, vecD src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 4 ||
+            n->as_Vector()->length_in_bytes() == 8);
+  match(Set dst (XorV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "xor  $dst,$src1,$src2\t# vector (8B)" %}
+  ins_encode %{
+    __ eor(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vxor16B(vecX dst, vecX src1, vecX src2)
+%{
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  match(Set dst (XorV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "xor  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ eor(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// ------------------------------ Shift ---------------------------------------
+
+instruct vshiftcntL(vecX dst, iRegIorL2I cnt) %{
+  match(Set dst (LShiftCntV cnt));
+  format %{ "dup  $dst, $cnt\t# shift count (vecX)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($cnt$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// Right shifts on aarch64 SIMD are implemented as a left shift by a negative amount
+instruct vshiftcntR(vecX dst, iRegIorL2I cnt) %{
+  match(Set dst (RShiftCntV cnt));
+  format %{ "dup  $dst, $cnt\t# shift count (vecX)\n\tneg  $dst, $dst\t# T16B" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($cnt$$reg));
+    __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($dst$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
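+
+// Note (illustrative): sshl/ushl shift each lane left when the
+// corresponding lane of the shift register is positive and right when
+// it is negative, so vshiftcntR only needs to negate the broadcast
+// count. For example, with every lane of v2 holding -3,
+// "sshl v0.8h, v1.8h, v2.8h" arithmetically shifts each halfword of
+// v1 right by 3.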
+
+instruct vsll8B(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVB src shift));
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (8B)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll16B(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVB src shift));
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (16B)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl8B(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (8B)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl16B(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (16B)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
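+    // A byte lane is 8 bits wide, so a left shift of 8 or more always
+    // yields zero; eor(dst, src, src) zeroes the destination because
+    // shl cannot encode a shift that large.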
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
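+    // Clamp to 7: an arithmetic shift by the full lane width merely
+    // replicates the sign bit. -sh & 7 then yields the value that
+    // sshr's immh:immb field decodes as a right shift of sh
+    // (the field holds 2 * esize - sh).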
+    if (sh >= 8) sh = 7;
+    sh = -sh & 7;
+    __ sshr(as_FloatRegister($dst$$reg), __ T8B,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) sh = 7;
+    sh = -sh & 7;
+    __ sshr(as_FloatRegister($dst$$reg), __ T16B,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4 ||
+            n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg), -sh & 7);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg), -sh & 7);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll4S(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (4H)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll8S(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (8H)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl4S(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (4H)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl8S(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (8H)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T4H,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T8H,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) sh = 15;
+    sh = -sh & 15;
+    __ sshr(as_FloatRegister($dst$$reg), __ T4H,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) sh = 15;
+    sh = -sh & 15;
+    __ sshr(as_FloatRegister($dst$$reg), __ T8H,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2 ||
+            n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T4H,
+             as_FloatRegister($src$$reg), -sh & 15);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T8H,
+             as_FloatRegister($src$$reg), -sh & 15);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll2I(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (2S)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll4I(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI src shift));
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (4S)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl2I(vecD dst, vecD src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (2S)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl4I(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (4S)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    __ shl(as_FloatRegister($dst$$reg), __ T2S,
+           as_FloatRegister($src$$reg),
+           (int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    __ shl(as_FloatRegister($dst$$reg), __ T4S,
+           as_FloatRegister($src$$reg),
+           (int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    __ sshr(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    __ sshr(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    __ ushr(as_FloatRegister($dst$$reg), __ T2S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    __ ushr(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll2L(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  match(Set dst (RShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (2D)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl2L(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (2D)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    __ shl(as_FloatRegister($dst$$reg), __ T2D,
+           as_FloatRegister($src$$reg),
+           (int)$shift$$constant & 63);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (RShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    __ sshr(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 63);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    __ ushr(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 63);
+  %}
+  ins_pipe(pipe_class_default);
+%}
 
 //----------PEEPHOLE RULES-----------------------------------------------------
 // These must follow all instruction definitions as they use the names
--- a/src/cpu/aarch64/vm/aarch64_ad.m4	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/aarch64_ad.m4	Fri Oct 02 04:37:30 2015 +0100
@@ -1,9 +1,36 @@
+dnl Copyright (c) 2014, Red Hat Inc. All rights reserved.
+dnl DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+dnl
+dnl This code is free software; you can redistribute it and/or modify it
+dnl under the terms of the GNU General Public License version 2 only, as
+dnl published by the Free Software Foundation.
+dnl
+dnl This code is distributed in the hope that it will be useful, but WITHOUT
+dnl ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+dnl FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl version 2 for more details (a copy is included in the LICENSE file that
+dnl accompanied this code).
+dnl
+dnl You should have received a copy of the GNU General Public License version
+dnl 2 along with this work; if not, write to the Free Software Foundation,
+dnl Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+dnl
+dnl Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+dnl or visit www.oracle.com if you need additional information or have any
+dnl questions.
+dnl
+dnl 
+dnl Process this file with m4 aarch64_ad.m4 to generate the arithmetic
+dnl and shift patterns used in aarch64.ad.
+dnl
 // BEGIN This section of the file is automatically generated. Do not edit --------------
-
+dnl
+define(`ORL2I', `ifelse($1,I,orL2I)')
+dnl
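+dnl For example, iRegI`'ORL2I(I) expands to iRegIorL2I, so integer
+dnl operands also match L2I (long-to-int) subtrees, while
+dnl iRegL`'ORL2I(L) expands to plain iRegL.
+dnl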
 define(`BASE_SHIFT_INSN',
 `
 instruct $2$1_reg_$4_reg(iReg$1NoSp dst,
-                         iReg$1 src1, iReg$1 src2,
+                         iReg$1`'ORL2I($1) src1, iReg$1`'ORL2I($1) src2,
                          immI src3, rFlagsReg cr) %{
   match(Set dst ($2$1 src1 ($4$1 src2 src3)));
 
@@ -15,15 +42,15 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}')dnl
 define(`BASE_INVERTED_INSN',
 `
 instruct $2$1_reg_not_reg(iReg$1NoSp dst,
-                         iReg$1 src1, iReg$1 src2, imm$1_M1 m1,
+                         iReg$1`'ORL2I($1) src1, iReg$1`'ORL2I($1) src2, imm$1_M1 m1,
                          rFlagsReg cr) %{
 dnl This ifelse is because hotspot reassociates (xor (xor ..)..)
 dnl into this canonical form.
@@ -40,12 +67,12 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')dnl
 define(`INVERTED_SHIFT_INSN',
 `
 instruct $2$1_reg_$4_not_reg(iReg$1NoSp dst,
-                         iReg$1 src1, iReg$1 src2,
+                         iReg$1`'ORL2I($1) src1, iReg$1`'ORL2I($1) src2,
                          immI src3, imm$1_M1 src4, rFlagsReg cr) %{
 dnl This ifelse is because hotspot reassociates (xor (xor ..)..)
 dnl into this canonical form.
@@ -60,14 +87,14 @@
               as_Register($src1$$reg),
               as_Register($src2$$reg),
               Assembler::$5,
-              $src3$$constant & 0x3f);
+              $src3$$constant & ifelse($1,I,0x1f,0x3f));
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_shift);
 %}')dnl
 define(`NOT_INSN',
 `instruct reg$1_not_reg(iReg$1NoSp dst,
-                         iReg$1 src1, imm$1_M1 m1,
+                         iReg$1`'ORL2I($1) src1, imm$1_M1 m1,
                          rFlagsReg cr) %{
   match(Set dst (Xor$1 src1 m1));
   ins_cost(INSN_COST);
@@ -80,7 +107,7 @@
               Assembler::LSL, 0);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg);
 %}')dnl
 dnl
 define(`BOTH_SHIFT_INSNS',
@@ -88,7 +115,7 @@
 BASE_SHIFT_INSN(L, $1, $2, $3, $4)')dnl
 dnl
 define(`BOTH_INVERTED_INSNS',
-`BASE_INVERTED_INSN(I, $1, $2, $3, $4)
+`BASE_INVERTED_INSN(I, $1, $2w, $3, $4)
 BASE_INVERTED_INSN(L, $1, $2, $3, $4)')dnl
 dnl
 define(`BOTH_INVERTED_SHIFT_INSNS',
@@ -124,7 +151,7 @@
 define(`BFM_INSN',`
 // Shift Left followed by Shift Right.
 // This idiom is used by the compiler for the i2b bytecode etc.
-instruct $4$1(iReg$1NoSp dst, iReg$1 src, immI lshift_count, immI rshift_count)
+instruct $4$1(iReg$1NoSp dst, iReg$1`'ORL2I($1) src, immI lshift_count, immI rshift_count)
 %{
   match(Set dst EXTEND($1, $3, src, lshift_count, rshift_count));
   // Make sure we are not going to exceed what $4 can do.
@@ -138,11 +165,11 @@
     int s = $2 - lshift;
     int r = (rshift - lshift) & $2;
     __ $4(as_Register($dst$$reg),
-	    as_Register($src$$reg),
-	    r, s);
+            as_Register($src$$reg),
+            r, s);
   %}
 
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}')
 BFM_INSN(L, 63, RShift, sbfm)
 BFM_INSN(I, 31, RShift, sbfmw)
@@ -151,7 +178,7 @@
 dnl
 // Bitfield extract with shift & mask
 define(`BFX_INSN',
-`instruct $3$1(iReg$1NoSp dst, iReg$1 src, immI rshift, imm$1_bitmask mask)
+`instruct $3$1(iReg$1NoSp dst, iReg$1`'ORL2I($1) src, immI rshift, imm$1_bitmask mask)
 %{
   match(Set dst (And$1 ($2$1 src rshift) mask));
 
@@ -162,9 +189,9 @@
     long mask = $mask$$constant;
     int width = exact_log2(mask+1);
     __ $3(as_Register($dst$$reg),
-	    as_Register($src$$reg), rshift, width);
+            as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}')
 BFX_INSN(I,URShift,ubfxw)
 BFX_INSN(L,URShift,ubfx)
@@ -182,15 +209,15 @@
     long mask = $mask$$constant;
     int width = exact_log2(mask+1);
     __ ubfx(as_Register($dst$$reg),
-	    as_Register($src$$reg), rshift, width);
+            as_Register($src$$reg), rshift, width);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_shift);
 %}
 
 // Rotations
 
 define(`EXTRACT_INSN',
-`instruct extr$3$1(iReg$1NoSp dst, iReg$1 src1, iReg$1 src2, immI lshift, immI rshift, rFlagsReg cr)
+`instruct extr$3$1(iReg$1NoSp dst, iReg$1`'ORL2I($1) src1, iReg$1`'ORL2I($1) src2, immI lshift, immI rshift, rFlagsReg cr)
 %{
   match(Set dst ($3$1 (LShift$1 src1 lshift) (URShift$1 src2 rshift)));
   predicate(0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & $2));
@@ -202,7 +229,7 @@
     __ $4(as_Register($dst$$reg), as_Register($src1$$reg), as_Register($src2$$reg),
             $rshift$$constant & $2);
   %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_extr);
 %}
 ')dnl
 EXTRACT_INSN(L, 63, Or, extr)
@@ -212,7 +239,7 @@
 define(`ROL_EXPAND', `
 // $2 expander
 
-instruct $2$1_rReg(iReg$1 dst, iReg$1 src, iRegI shift, rFlagsReg cr)
+instruct $2$1_rReg(iReg$1NoSp dst, iReg$1 src, iRegI shift, rFlagsReg cr)
 %{
   effect(DEF dst, USE src, USE shift);
 
@@ -221,14 +248,14 @@
   ins_encode %{
     __ subw(rscratch1, zr, as_Register($shift$$reg));
     __ $3(as_Register($dst$$reg), as_Register($src$$reg),
-	    rscratch1);
+            rscratch1);
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}')dnl
 define(`ROR_EXPAND', `
 // $2 expander
 
-instruct $2$1_rReg(iReg$1 dst, iReg$1 src, iRegI shift, rFlagsReg cr)
+instruct $2$1_rReg(iReg$1NoSp dst, iReg$1 src, iRegI shift, rFlagsReg cr)
 %{
   effect(DEF dst, USE src, USE shift);
 
@@ -236,12 +263,12 @@
   ins_cost(INSN_COST);
   ins_encode %{
     __ $3(as_Register($dst$$reg), as_Register($src$$reg),
-	    as_Register($shift$$reg));
+            as_Register($shift$$reg));
     %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg_vshift);
 %}')dnl
 define(ROL_INSN, `
-instruct $3$1_rReg_Var_C$2(iRegL dst, iRegL src, iRegI shift, immI$2 c$2, rFlagsReg cr)
+instruct $3$1_rReg_Var_C$2(iRegLNoSp dst, iRegL src, iRegI shift, immI$2 c$2, rFlagsReg cr)
 %{
   match(Set dst (Or$1 (LShift$1 src shift) (URShift$1 src (SubI c$2 shift))));
 
@@ -250,7 +277,7 @@
   %}
 %}')dnl
 define(ROR_INSN, `
-instruct $3$1_rReg_Var_C$2(iRegL dst, iRegL src, iRegI shift, immI$2 c$2, rFlagsReg cr)
+instruct $3$1_rReg_Var_C$2(iRegLNoSp dst, iRegL src, iRegI shift, immI$2 c$2, rFlagsReg cr)
 %{
   match(Set dst (Or$1 (URShift$1 src shift) (LShift$1 src (SubI c$2 shift))));
 
@@ -274,7 +301,7 @@
 // Add/subtract (extended)
 dnl ADD_SUB_EXTENDED(mode, size, add node, shift node, insn, shift type, wordsize)
 define(`ADD_SUB_CONV', `
-instruct $3Ext$1(iReg$2NoSp dst, iReg$2 src1, iReg$1orL2I src2, rFlagsReg cr)
+instruct $3Ext$1(iReg$2NoSp dst, iReg$2`'ORL2I($2) src1, iReg$1`'ORL2I($1) src2, rFlagsReg cr)
 %{
   match(Set dst ($3$2 src1 (ConvI2L src2)));
   ins_cost(INSN_COST);
@@ -284,13 +311,13 @@
      __ $4(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::$5);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')dnl
 ADD_SUB_CONV(I,L,Add,add,sxtw);
 ADD_SUB_CONV(I,L,Sub,sub,sxtw);
 dnl
 define(`ADD_SUB_EXTENDED', `
-instruct $3Ext$1_$6(iReg$1NoSp dst, iReg$1 src1, iReg$1 src2, immI_`'eval($7-$2) lshift, immI_`'eval($7-$2) rshift, rFlagsReg cr)
+instruct $3Ext$1_$6(iReg$1NoSp dst, iReg$1`'ORL2I($1) src1, iReg$1`'ORL2I($1) src2, immI_`'eval($7-$2) lshift, immI_`'eval($7-$2) rshift, rFlagsReg cr)
 %{
   match(Set dst ($3$1 src1 EXTEND($1, $4, src2, lshift, rshift)));
   ins_cost(INSN_COST);
@@ -300,7 +327,7 @@
      __ $5(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::$6);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')
 ADD_SUB_EXTENDED(I,16,Add,RShift,add,sxth,32)
 ADD_SUB_EXTENDED(I,8,Add,RShift,add,sxtb,32)
@@ -312,7 +339,7 @@
 dnl
 dnl ADD_SUB_ZERO_EXTEND(mode, size, add node, insn, shift type)
 define(`ADD_SUB_ZERO_EXTEND', `
-instruct $3Ext$1_$5_and(iReg$1NoSp dst, iReg$1 src1, iReg$1 src2, imm$1_$2 mask, rFlagsReg cr)
+instruct $3Ext$1_$5_and(iReg$1NoSp dst, iReg$1`'ORL2I($1) src1, iReg$1`'ORL2I($1) src2, imm$1_$2 mask, rFlagsReg cr)
 %{
   match(Set dst ($3$1 src1 (And$1 src2 mask)));
   ins_cost(INSN_COST);
@@ -322,7 +349,7 @@
      __ $4(as_Register($dst$$reg), as_Register($src1$$reg),
             as_Register($src2$$reg), ext::$5);
    %}
-  ins_pipe(pipe_class_default);
+  ins_pipe(ialu_reg_reg);
 %}')
 dnl
 ADD_SUB_ZERO_EXTEND(I,255,Add,addw,uxtb)
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -278,14 +278,14 @@
 
 // LoadStoreExclusiveOp
     __ stxr(r20, r21, r2);                             //	stxr	w20, x21, [x2]
-    __ stlxr(r7, r29, r7);                             //	stlxr	w7, x29, [x7]
+    __ stlxr(r5, r29, r7);                             //       stlxr   w5, x29, [x7]
     __ ldxr(r5, r16);                                  //	ldxr	x5, [x16]
     __ ldaxr(r27, r29);                                //	ldaxr	x27, [x29]
     __ stlr(r0, r29);                                  //	stlr	x0, [x29]
     __ ldar(r21, r28);                                 //	ldar	x21, [x28]
 
 // LoadStoreExclusiveOp
-    __ stxrw(r24, r24, r7);                            //	stxr	w24, w24, [x7]
+    __ stxrw(r21, r24, r7);                            //       stxr    w21, w24, [x7]
     __ stlxrw(r21, r26, r28);                          //	stlxr	w21, w26, [x28]
     __ ldxrw(r21, r6);                                 //	ldxr	w21, [x6]
     __ ldaxrw(r15, r30);                               //	ldaxr	w15, [x30]
@@ -312,11 +312,11 @@
     __ ldxp(r8, r2, r19);                              //	ldxp	x8, x2, [x19]
     __ ldaxp(r7, r19, r14);                            //	ldaxp	x7, x19, [x14]
     __ stxp(r8, r27, r28, r5);                         //	stxp	w8, x27, x28, [x5]
-    __ stlxp(r6, r8, r14, r6);                         //	stlxp	w6, x8, x14, [x6]
+    __ stlxp(r5, r8, r14, r6);                         //       stlxp   w5, x8, x14, [x6]
 
 // LoadStoreExclusiveOp
     __ ldxpw(r25, r4, r22);                            //	ldxp	w25, w4, [x22]
-    __ ldaxpw(r14, r14, r15);                          //	ldaxp	w14, w14, [x15]
+    __ ldaxpw(r13, r14, r15);                          //       ldaxp   w13, w14, [x15]
     __ stxpw(r20, r26, r8, r10);                       //	stxp	w20, w26, w8, [x10]
     __ stlxpw(r23, r18, r18, r18);                     //	stlxp	w23, w18, w18, [x18]
 
@@ -785,12 +785,12 @@
  24c:	d61f0040 	br	x2
  250:	d63f00a0 	blr	x5
  254:	c8147c55 	stxr	w20, x21, [x2]
- 258:	c807fcfd 	stlxr	w7, x29, [x7]
+ 258:   c805fcfd        stlxr   w5, x29, [x7]
  25c:	c85f7e05 	ldxr	x5, [x16]
  260:	c85fffbb 	ldaxr	x27, [x29]
  264:	c89fffa0 	stlr	x0, [x29]
  268:	c8dfff95 	ldar	x21, [x28]
- 26c:	88187cf8 	stxr	w24, w24, [x7]
+ 26c:   88157cf8        stxr    w21, w24, [x7]
  270:	8815ff9a 	stlxr	w21, w26, [x28]
  274:	885f7cd5 	ldxr	w21, [x6]
  278:	885fffcf 	ldaxr	w15, [x30]
@@ -811,9 +811,9 @@
  2b4:	c87f0a68 	ldxp	x8, x2, [x19]
  2b8:	c87fcdc7 	ldaxp	x7, x19, [x14]
  2bc:	c82870bb 	stxp	w8, x27, x28, [x5]
- 2c0:	c826b8c8 	stlxp	w6, x8, x14, [x6]
+ 2c0:   c825b8c8        stlxp   w5, x8, x14, [x6]
  2c4:	887f12d9 	ldxp	w25, w4, [x22]
- 2c8:	887fb9ee 	ldaxp	w14, w14, [x15]
+ 2c8:   887fb9ed        ldaxp   w13, w14, [x15]
  2cc:	8834215a 	stxp	w20, w26, w8, [x10]
  2d0:	8837ca52 	stlxp	w23, w18, w18, [x18]
  2d4:	f806317e 	str	x30, [x11,#99]
@@ -1101,14 +1101,14 @@
     0xd4063721,     0xd4035082,     0xd400bfe3,     0xd4282fc0,
     0xd444c320,     0xd503201f,     0xd69f03e0,     0xd6bf03e0,
     0xd5033fdf,     0xd5033f9f,     0xd5033abf,     0xd61f0040,
-    0xd63f00a0,     0xc8147c55,     0xc807fcfd,     0xc85f7e05,
-    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88187cf8,
+    0xd63f00a0,     0xc8147c55,     0xc805fcfd,     0xc85f7e05,
+    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88157cf8,
     0x8815ff9a,     0x885f7cd5,     0x885fffcf,     0x889ffc73,
     0x88dffc56,     0x48127c0f,     0x480bff85,     0x485f7cdd,
     0x485ffcf2,     0x489fff99,     0x48dffe62,     0x080a7c3e,
     0x0814fed5,     0x085f7c59,     0x085ffcb8,     0x089ffc70,
     0x08dfffb6,     0xc87f0a68,     0xc87fcdc7,     0xc82870bb,
-    0xc826b8c8,     0x887f12d9,     0x887fb9ee,     0x8834215a,
+    0xc825b8c8,     0x887f12d9,     0x887fb9ed,     0x8834215a,
     0x8837ca52,     0xf806317e,     0xb81b3337,     0x39000dc2,
     0x78005149,     0xf84391f4,     0xb85b220c,     0x385fd356,
     0x785d127e,     0x389f4149,     0x79801e3c,     0x79c014a3,
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -467,6 +467,11 @@
     case base_plus_offset:
       {
 	unsigned size = i->get(31, 30);
+        if (i->get(26, 26) && i->get(23, 23)) {
+          // SIMD Q Type - Size = 128 bits
+          assert(size == 0, "bad size");
+          size = 0b100;
+        }
 	unsigned mask = (1 << size) - 1;
 	if (_offset < 0 || _offset & mask)
 	  {
@@ -487,6 +492,11 @@
 	i->rf(_index, 16);
 	i->f(_ext.option(), 15, 13);
 	unsigned size = i->get(31, 30);
+        if (i->get(26, 26) && i->get(23, 23)) {
+          // SIMD Q Type - Size = 128 bits
+          assert(size == 0, "bad size");
+          size = 0b100;
+        }
 	if (size == 0) // It's a byte
 	  i->f(_ext.shift() >= 0, 12);
 	else {
@@ -1086,11 +1096,13 @@
 
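+// For the exclusive load/store forms below, the status register Rs must
+// not alias the data or base registers: the ARMv8 ARM classifies such
+// encodings as CONSTRAINED UNPREDICTABLE, so the guarantee checks
+// reject them at assembly time.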
 #define INSN4(NAME, sz, op, o0) /* Four registers */			\
   void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) {	\
+    guarantee(Rs != Rn && Rs != Rt1 && Rs != Rt2, "unpredictable instruction"); \
     load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0);			\
   }
 
 #define INSN3(NAME, sz, op, o0) /* Three registers */			\
   void NAME(Register Rs, Register Rt, Register Rn) {			\
+    guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction");       \
     load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0);	\
   }
 
@@ -1102,6 +1114,7 @@
 
 #define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
   void NAME(Register Rt1, Register Rt2, Register Rn) {			\
+    guarantee(Rt1 != Rt2, "unpredictable instruction");                 \
     load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0);	\
   }
 
@@ -1187,6 +1200,7 @@
 
   INSN(ldrs, 0b00, 1);
   INSN(ldrd, 0b01, 1);
+  INSN(ldrq, 0x10, 1);
 
 #undef INSN
 
@@ -1248,6 +1262,8 @@
   INSN(ldps, 0b00, 0b101, 1, 1, false);
   INSN(stpd, 0b01, 0b101, 1, 0, false);
   INSN(ldpd, 0b01, 0b101, 1, 1, false);
+  INSN(stpq, 0b10, 0b101, 1, 0, false);
+  INSN(ldpq, 0b10, 0b101, 1, 1, false);
 
 #undef INSN
 
@@ -1320,6 +1336,8 @@
   INSN(strs, 0b10, 0b00);
   INSN(ldrd, 0b11, 0b01);
   INSN(ldrs, 0b10, 0b01);
+  INSN(strq, 0b00, 0b10);
+  INSN(ldrq, 0x00, 0b11);
 
 #undef INSN
 
@@ -1454,7 +1472,7 @@
     f(op, 31, 29);
     f(0b11010000, 28, 21);
     f(0b000000, 15, 10);
-    rf(Rm, 16), rf(Rn, 5), rf(Rd, 0);
+    zrf(Rm, 16), zrf(Rn, 5), zrf(Rd, 0);
   }
 
   #define INSN(NAME, op)				\
@@ -1873,9 +1891,18 @@
   };
 
   enum SIMD_RegVariant {
-       S32, D64, Q128
+       B, H, S, D, Q
   };
 
+#define INSN(NAME, op)                                            \
+  void NAME(FloatRegister Rt, SIMD_RegVariant T, const Address &adr) {   \
+    ld_st2((Register)Rt, adr, (int)T & 3, op + ((T==Q) ? 0b10:0b00), 1); \
+  }
+
+  INSN(ldr, 1);
+  INSN(str, 0);
+
+#undef INSN
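+
+// Note (illustrative): the ldr/str overloads above emit scalar SIMD&FP
+// loads and stores sized by the register variant, e.g. ldr(v0, Q, adr)
+// produces a 128-bit "ldr q0, [...]": the size bits stay 0b00 and the
+// opc field gains 0b10, per the A64 LDR/STR (SIMD&FP) encoding.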
 
  private:
 
@@ -1982,14 +2009,120 @@
     rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);                              \
   }
 
-  INSN(eor, 0b101110001);
-  INSN(orr, 0b001110101);
+  INSN(eor,  0b101110001);
+  INSN(orr,  0b001110101);
   INSN(andr, 0b001110001);
-  INSN(bic, 0b001110011);
-  INSN(bif, 0b101110111);
-  INSN(bit, 0b101110101);
-  INSN(bsl, 0b101110011);
-  INSN(orn, 0b001110111);
+  INSN(bic,  0b001110011);
+  INSN(bif,  0b101110111);
+  INSN(bit,  0b101110101);
+  INSN(bsl,  0b101110011);
+  INSN(orn,  0b001110111);
+
+#undef INSN
+
+#define INSN(NAME, opc, opc2)                                                                 \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24);                        \
+    f((int)T >> 1, 23, 22), f(1, 21), rf(Vm, 16), f(opc2, 15, 10);                      \
+    rf(Vn, 5), rf(Vd, 0);                                                               \
+  }
+
+  INSN(addv, 0, 0b100001);
+  INSN(subv, 1, 0b100001);
+  INSN(mulv, 0, 0b100111);
+  INSN(sshl, 0, 0b010001);
+  INSN(ushl, 1, 0b010001);
+
+#undef INSN
+
+#define INSN(NAME, opc, opc2) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {                   \
+    starti;                                                                             \
+    f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24);                        \
+    f((int)T >> 1, 23, 22), f(opc2, 21, 10);                                            \
+    rf(Vn, 5), rf(Vd, 0);                                                               \
+  }
+
+  INSN(absr,  0, 0b100000101110);
+  INSN(negr,  1, 0b100000101110);
+  INSN(notr,  1, 0b100000010110);
+  INSN(addv,  0, 0b110001101110);
+  INSN(cls,   0, 0b100000010010);
+  INSN(clz,   1, 0b100000010010);
+  INSN(cnt,   0, 0b100000010110);
+
+#undef INSN
+
+#define INSN(NAME, op0, cmode0) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, unsigned imm8, unsigned lsl = 0) {   \
+    unsigned cmode = cmode0;                                                           \
+    unsigned op = op0;                                                                 \
+    starti;                                                                            \
+    assert(lsl == 0 ||                                                                 \
+           ((T == T4H || T == T8H) && lsl == 8) ||                                     \
+           ((T == T2S || T == T4S) && ((lsl >> 3) < 4)), "invalid shift");             \
+    cmode |= lsl >> 2;                                                                 \
+    if (T == T4H || T == T8H) cmode |= 0b1000;                                         \
+    if (!(T == T4H || T == T8H || T == T2S || T == T4S)) {                             \
+      assert(op == 0 && cmode0 == 0, "must be MOVI");                                  \
+      cmode = 0b1110;                                                                  \
+      if (T == T1D || T == T2D) op = 1;                                                \
+    }                                                                                  \
+    f(0, 31), f((int)T & 1, 30), f(op, 29), f(0b0111100000, 28, 19);                   \
+    f(imm8 >> 5, 18, 16), f(cmode, 15, 12), f(0x01, 11, 10), f(imm8 & 0b11111, 9, 5);  \
+    rf(Vd, 0);                                                                         \
+  }
+
+  INSN(movi, 0, 0);
+  INSN(orri, 0, 1);
+  INSN(mvni, 1, 0);
+  INSN(bici, 1, 1);
+
+#undef INSN
+
+#define INSN(NAME, op1, op2, op3) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");                    \
+    f(0, 31), f((int)T & 1, 30), f(op1, 29), f(0b01110, 28, 24), f(op2, 23);            \
+    f(T==T2D ? 1:0, 22); f(1, 21), rf(Vm, 16), f(op3, 15, 10), rf(Vn, 5), rf(Vd, 0);    \
+  }
+
+  INSN(fadd, 0, 0, 0b110101);
+  INSN(fdiv, 1, 0, 0b111111);
+  INSN(fmul, 1, 0, 0b110111);
+  INSN(fsub, 0, 1, 0b110101);
+
+#undef INSN
+
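+// ARMv8 Cryptography Extension SHA-1 and SHA-256 instructions; these
+// wrappers accept only the T4S arrangement, as the asserts enforce.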
+#define INSN(NAME, opc)                                                                 \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    assert(T == T4S, "arrangement must be T4S");                                        \
+    f(0b01011110000, 31, 21), rf(Vm, 16), f(opc, 15, 10), rf(Vn, 5), rf(Vd, 0);         \
+  }
+
+  INSN(sha1c,     0b000000);
+  INSN(sha1m,     0b001000);
+  INSN(sha1p,     0b000100);
+  INSN(sha1su0,   0b001100);
+  INSN(sha256h2,  0b010100);
+  INSN(sha256h,   0b010000);
+  INSN(sha256su1, 0b011000);
+
+#undef INSN
+
+#define INSN(NAME, opc)                                                                 \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {                   \
+    starti;                                                                             \
+    assert(T == T4S, "arrangement must be T4S");                                        \
+    f(0b0101111000101000, 31, 16), f(opc, 15, 10), rf(Vn, 5), rf(Vd, 0);                \
+  }
+
+  INSN(sha1h,     0b000010);
+  INSN(sha1su1,   0b000110);
+  INSN(sha256su0, 0b001010);
 
 #undef INSN
 
@@ -2006,19 +2139,40 @@
 
 #undef INSN
 
-  void shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+  void ins(FloatRegister Vd, SIMD_RegVariant T, FloatRegister Vn, int didx, int sidx) {
+    starti;
+    assert(T != Q, "invalid register variant");
+    f(0b01101110000, 31, 21), f(((didx<<1)|1)<<(int)T, 20, 16), f(0, 15);
+    f(sidx<<(int)T, 14, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
+  void umov(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) {
     starti;
-    /* The encodings for the immh:immb fields (bits 22:16) are
-     *   0001 xxx	8B/16B, shift = xxx
-     *   001x xxx	4H/8H,  shift = xxxx
-     *   01xx xxx	2S/4S,  shift = xxxxx
-     *   1xxx xxx	1D/2D,  shift = xxxxxx (1D is RESERVED)
-     */
-    assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");
-    f(0, 31), f(T & 1, 30), f(0b0011110, 29, 23), f((1 << ((T>>1)+3))|shift, 22, 16);
-    f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
+    f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21);
+    f(((idx<<1)|1)<<(int)T, 20, 16), f(0b001111, 15, 10);
+    rf(Vn, 5), rf(Rd, 0);
   }
 
+#define INSN(NAME, opc, opc2) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int shift){         \
+    starti;                                                                             \
+    /* The encodings for the immh:immb fields (bits 22:16) are                          \
+     *   0001 xxx       8B/16B, shift = xxx                                             \
+     *   001x xxx       4H/8H,  shift = xxxx                                            \
+     *   01xx xxx       2S/4S,  shift = xxxxx                                           \
+     *   1xxx xxx       1D/2D,  shift = xxxxxx (1D is RESERVED)                         \
+     */                                                                                 \
+    assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");                           \
+    f(0, 31), f(T & 1, 30), f(opc, 29), f(0b011110, 28, 23),                            \
+    f((1 << ((T>>1)+3))|shift, 22, 16); f(opc2, 15, 10), rf(Vn, 5), rf(Vd, 0);          \
+  }
+
+  INSN(shl,  0, 0b010101);
+  INSN(sshr, 0, 0b000001);
+  INSN(ushr, 1, 0b000001);
+
+#undef INSN
+
   void ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
     starti;
     /* The encodings for the immh:immb fields (bits 22:16) are
@@ -2073,6 +2227,15 @@
     pmull(Vd, Ta, Vn, Vm, Tb);
   }
 
+  void uqxtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
+    starti;
+    int size_b = (int)Tb >> 1;
+    int size_a = (int)Ta >> 1;
+    assert(size_b < 3 && size_b == size_a - 1, "Invalid size specifier");
+    f(0, 31), f(Tb & 1, 30), f(0b101110, 29, 24), f(size_b, 23, 22);
+    f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
   void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn)
   {
     starti;
@@ -2082,6 +2245,23 @@
     rf(Vn, 5), rf(Vd, 0);
   }
 
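+  // DUP (general): broadcast a general-purpose register into every
+  // lane of Vd (used by the vshiftcnt* rules in aarch64.ad); the
+  // element variant below replicates a single lane of Vn instead.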
+  void dup(FloatRegister Vd, SIMD_Arrangement T, Register Xs)
+  {
+    starti;
+    assert(T != T1D, "reserved encoding");
+    f(0,31), f((int)T & 1, 30), f(0b001110000, 29, 21);
+    f((1 << (T >> 1)), 20, 16), f(0b000011, 15, 10), rf(Xs, 5), rf(Vd, 0);
+  }
+
+  void dup(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int index = 0)
+  {
+    starti;
+    assert(T != T1D, "reserved encoding");
+    f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21);
+    f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+    f(0b000001, 15, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
   // CRC32 instructions
 #define INSN(NAME, sf, sz)                                                \
   void NAME(Register Rd, Register Rn, Register Rm) {                      \
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -237,7 +237,7 @@
 
   // build frame
   ciMethod* m = compilation()->method();
-  __ build_frame(initial_frame_size_in_bytes());
+  __ build_frame(initial_frame_size_in_bytes(), bang_size_in_bytes());
 
   // OSR buffer is
   //
@@ -354,7 +354,7 @@
 
 
 // This specifies the rsp decrement needed to build the frame
-int LIR_Assembler::initial_frame_size_in_bytes() {
+int LIR_Assembler::initial_frame_size_in_bytes() const {
   // if rounding, must let FrameMap know!
 
   // The frame_map records size in slots (32bit word)
@@ -518,6 +518,7 @@
   __ str(r0, Address(rthread, JavaThread::saved_exception_pc_offset()));
   __ mov(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::get_poll_stub));
   __ blrt(rscratch1, 1, 0, 1);
+  __ maybe_isb();
   __ pop(0x3ffffffc, sp);          // integer registers except lr & sp & r0 & r1
   __ mov(rscratch1, r0);
   __ pop(0x3, sp);                 // r0 & r1
@@ -557,9 +558,10 @@
     assert(os::is_poll_address(polling_page), "should be");
     unsigned long off;
     __ adrp(rscratch1, Address(polling_page, relocInfo::poll_type), off);
+    assert(off == 0, "must be");
     add_debug_info_for_branch(info);  // This isn't just debug info:
                                       // it's the oop map
-    __ ldrw(zr, Address(rscratch1, off));
+    __ read_polling_page(rscratch1, relocInfo::poll_type);
   } else {
     poll_for_safepoint(relocInfo::poll_type, info);
   }
@@ -659,6 +661,11 @@
       }
     }
     break;
+  case T_ADDRESS:
+    {
+      const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL);
+      reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false);
+    }
+    break;  // fall-through into T_INT would store the constant twice
   case T_INT:
   case T_FLOAT:
     {
@@ -899,7 +906,7 @@
     if (type == T_ARRAY || type == T_OBJECT) {
       __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix()));
       __ verify_oop(dest->as_register());
-    } else if (type == T_METADATA || type == T_DOUBLE) {
+    } else if (type == T_METADATA) {
       __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix()));
     } else {
       __ ldrw(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix()));
@@ -952,8 +959,9 @@
 }
 
 void LIR_Assembler::stack2stack(LIR_Opr src, LIR_Opr dest, BasicType type) {
+
   LIR_Opr temp;
-  if (type == T_LONG)
+  if (type == T_LONG || type == T_DOUBLE)
     temp = FrameMap::rscratch1_long_opr;
   else
     temp = FrameMap::rscratch1_opr;
@@ -2814,7 +2822,6 @@
 
           __ stop("unexpected profiling mismatch");
           __ bind(ok);
-          __ pop(tmp);
         }
 #endif
         // first time here. Set profile type.
@@ -2913,6 +2920,7 @@
   if (info != NULL) {
     add_call_info_here(info);
   }
+  __ maybe_isb();
 }
 
 void LIR_Assembler::volatile_move_op(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info) {
@@ -3000,6 +3008,7 @@
 
 
 void LIR_Assembler::peephole(LIR_List *lir) {
+#if 0 // this tableswitch peephole is disabled on aarch64
   if (tableswitch_count >= max_tableswitches)
     return;
 
@@ -3124,6 +3133,7 @@
   next_state:
     ;
   }
+#endif
 }
 
 void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp_op) {
--- a/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -1089,7 +1089,7 @@
 #endif
   CodeEmitInfo* info = state_for(x, x->state());
   LIR_Opr reg = result_register_for(x->type());
-  new_instance(reg, x->klass(),
+  new_instance(reg, x->klass(), x->is_unresolved(),
                        FrameMap::r2_oop_opr,
                        FrameMap::r5_oop_opr,
                        FrameMap::r4_oop_opr,
--- a/src/cpu/aarch64/vm/c1_LinearScan_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/c1_LinearScan_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -29,1218 +29,6 @@
 #include "c1/c1_LinearScan.hpp"
 #include "utilities/bitMap.inline.hpp"
 
-
-//----------------------------------------------------------------------
-// Allocation of FPU stack slots (Intel x86 only)
-//----------------------------------------------------------------------
-
 void LinearScan::allocate_fpu_stack() {
-  // First compute which FPU registers are live at the start of each basic block
-  // (To minimize the amount of work we have to do if we have to merge FPU stacks)
-  if (ComputeExactFPURegisterUsage) {
-    Interval* intervals_in_register, *intervals_in_memory;
-    create_unhandled_lists(&intervals_in_register, &intervals_in_memory, is_in_fpu_register, NULL);
-
-    // ignore memory intervals by overwriting intervals_in_memory
-    // the dummy interval is needed to enforce the walker to walk until the given id:
-    // without it, the walker stops when the unhandled-list is empty -> live information
-    // beyond this point would be incorrect.
-    Interval* dummy_interval = new Interval(any_reg);
-    dummy_interval->add_range(max_jint - 2, max_jint - 1);
-    dummy_interval->set_next(Interval::end());
-    intervals_in_memory = dummy_interval;
-
-    IntervalWalker iw(this, intervals_in_register, intervals_in_memory);
-
-    const int num_blocks = block_count();
-    for (int i = 0; i < num_blocks; i++) {
-      BlockBegin* b = block_at(i);
-
-      // register usage is only needed for merging stacks -> compute only
-      // when more than one predecessor.
-      // the block must not have any spill moves at the beginning (checked by assertions)
-      // spill moves would use intervals that are marked as handled and so the usage bit
-      // would been set incorrectly
-
-      // NOTE: the check for number_of_preds > 1 is necessary. A block with only one
-      //       predecessor may have spill moves at the begin of the block.
-      //       If an interval ends at the current instruction id, it is not possible
-      //       to decide if the register is live or not at the block begin -> the
-      //       register information would be incorrect.
-      if (b->number_of_preds() > 1) {
-        int id = b->first_lir_instruction_id();
-        BitMap regs(FrameMap::nof_fpu_regs);
-        regs.clear();
-
-        iw.walk_to(id);   // walk after the first instruction (always a label) of the block
-        assert(iw.current_position() == id, "did not walk completely to id");
-
-        // Only consider FPU values in registers
-        Interval* interval = iw.active_first(fixedKind);
-        while (interval != Interval::end()) {
-          int reg = interval->assigned_reg();
-          assert(reg >= pd_first_fpu_reg && reg <= pd_last_fpu_reg, "no fpu register");
-          assert(interval->assigned_regHi() == -1, "must not have hi register (doubles stored in one register)");
-          assert(interval->from() <= id && id < interval->to(), "interval out of range");
-
-#ifndef PRODUCT
-          if (TraceFPURegisterUsage) {
-            tty->print("fpu reg %d is live because of ", reg - pd_first_fpu_reg); interval->print();
-          }
-#endif
-
-          regs.set_bit(reg - pd_first_fpu_reg);
-          interval = interval->next();
-        }
-
-        b->set_fpu_register_usage(regs);
-
-#ifndef PRODUCT
-        if (TraceFPURegisterUsage) {
-          tty->print("FPU regs for block %d, LIR instr %d): ", b->block_id(), id); regs.print_on(tty); tty->print_cr("");
-        }
-#endif
-      }
-    }
-  }
-
-#ifndef TARGET_ARCH_aarch64
-  FpuStackAllocator alloc(ir()->compilation(), this);
-  _fpu_stack_allocator = &alloc;
-  alloc.allocate();
-  _fpu_stack_allocator = NULL;
-#endif
-}
-
-
-FpuStackAllocator::FpuStackAllocator(Compilation* compilation, LinearScan* allocator)
-  : _compilation(compilation)
-  , _lir(NULL)
-  , _pos(-1)
-  , _allocator(allocator)
-  , _sim(compilation)
-  , _temp_sim(compilation)
-{}
-
-void FpuStackAllocator::allocate() {
-  int num_blocks = allocator()->block_count();
-  for (int i = 0; i < num_blocks; i++) {
-    // Set up to process block
-    BlockBegin* block = allocator()->block_at(i);
-    intArray* fpu_stack_state = block->fpu_stack_state();
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->cr();
-      tty->print_cr("------- Begin of new Block %d -------", block->block_id());
-    }
-#endif
-
-    assert(fpu_stack_state != NULL ||
-           block->end()->as_Base() != NULL ||
-           block->is_set(BlockBegin::exception_entry_flag),
-           "FPU stack state must be present due to linear-scan order for FPU stack allocation");
-    // note: exception handler entries always start with an empty fpu stack
-    //       because stack merging would be too complicated
-
-    if (fpu_stack_state != NULL) {
-      sim()->read_state(fpu_stack_state);
-    } else {
-      sim()->clear();
-    }
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Reading FPU state for block %d:", block->block_id());
-      sim()->print();
-      tty->cr();
-    }
-#endif
-
-    allocate_block(block);
-    CHECK_BAILOUT();
-  }
-}
-
-void FpuStackAllocator::allocate_block(BlockBegin* block) {
-  bool processed_merge = false;
-  LIR_OpList* insts = block->lir()->instructions_list();
-  set_lir(block->lir());
-  set_pos(0);
-
-
-  // Note: insts->length() may change during loop
-  while (pos() < insts->length()) {
-    LIR_Op* op = insts->at(pos());
-    _debug_information_computed = false;
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      op->print();
-    }
-    check_invalid_lir_op(op);
-#endif
-
-    LIR_OpBranch* branch = op->as_OpBranch();
-    LIR_Op1* op1 = op->as_Op1();
-    LIR_Op2* op2 = op->as_Op2();
-    LIR_OpCall* opCall = op->as_OpCall();
-
-    if (branch != NULL && branch->block() != NULL) {
-      if (!processed_merge) {
-        // propagate stack at first branch to a successor
-        processed_merge = true;
-        bool required_merge = merge_fpu_stack_with_successors(block);
-
-        assert(!required_merge || branch->cond() == lir_cond_always, "splitting of critical edges should prevent FPU stack mismatches at cond branches");
-      }
-
-    } else if (op1 != NULL) {
-      handle_op1(op1);
-    } else if (op2 != NULL) {
-      handle_op2(op2);
-    } else if (opCall != NULL) {
-      handle_opCall(opCall);
-    }
-
-    compute_debug_information(op);
-
-    set_pos(1 + pos());
-  }
-
-  // Propagate stack when block does not end with branch
-  if (!processed_merge) {
-    merge_fpu_stack_with_successors(block);
-  }
-}
-
-
-void FpuStackAllocator::compute_debug_information(LIR_Op* op) {
-  if (!_debug_information_computed && op->id() != -1 && allocator()->has_info(op->id())) {
-    visitor.visit(op);
-
-    // exception handling
-    if (allocator()->compilation()->has_exception_handlers()) {
-      XHandlers* xhandlers = visitor.all_xhandler();
-      int n = xhandlers->length();
-      for (int k = 0; k < n; k++) {
-        allocate_exception_handler(xhandlers->handler_at(k));
-      }
-    } else {
-      assert(visitor.all_xhandler()->length() == 0, "missed exception handler");
-    }
-
-    // compute debug information
-    int n = visitor.info_count();
-    assert(n > 0, "should not visit operation otherwise");
-
-    for (int j = 0; j < n; j++) {
-      CodeEmitInfo* info = visitor.info_at(j);
-      // Compute debug information
-      allocator()->compute_debug_info(info, op->id());
-    }
-  }
-  _debug_information_computed = true;
-}
-
-void FpuStackAllocator::allocate_exception_handler(XHandler* xhandler) {
-  if (!sim()->is_empty()) {
-    LIR_List* old_lir = lir();
-    int old_pos = pos();
-    intArray* old_state = sim()->write_state();
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->cr();
-      tty->print_cr("------- begin of exception handler -------");
-    }
-#endif
-
-    if (xhandler->entry_code() == NULL) {
-      // need entry code to clear FPU stack
-      LIR_List* entry_code = new LIR_List(_compilation);
-      entry_code->jump(xhandler->entry_block());
-      xhandler->set_entry_code(entry_code);
-    }
-
-    LIR_OpList* insts = xhandler->entry_code()->instructions_list();
-    set_lir(xhandler->entry_code());
-    set_pos(0);
-
-    // Note: insts->length() may change during loop
-    while (pos() < insts->length()) {
-      LIR_Op* op = insts->at(pos());
-
-#ifndef PRODUCT
-      if (TraceFPUStack) {
-        op->print();
-      }
-      check_invalid_lir_op(op);
-#endif
-
-      switch (op->code()) {
-        case lir_move:
-          assert(op->as_Op1() != NULL, "must be LIR_Op1");
-          assert(pos() != insts->length() - 1, "must not be last operation");
-
-          handle_op1((LIR_Op1*)op);
-          break;
-
-        case lir_branch:
-          assert(op->as_OpBranch()->cond() == lir_cond_always, "must be unconditional branch");
-          assert(pos() == insts->length() - 1, "must be last operation");
-
-          // remove all remaining dead registers from FPU stack
-          clear_fpu_stack(LIR_OprFact::illegalOpr);
-          break;
-
-        default:
-          // other operations not allowed in exception entry code
-          ShouldNotReachHere();
-      }
-
-      set_pos(pos() + 1);
-    }
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->cr();
-      tty->print_cr("------- end of exception handler -------");
-    }
-#endif
-
-    set_lir(old_lir);
-    set_pos(old_pos);
-    sim()->read_state(old_state);
-  }
-}
-
-
-int FpuStackAllocator::fpu_num(LIR_Opr opr) {
-  assert(opr->is_fpu_register() && !opr->is_xmm_register(), "shouldn't call this otherwise");
-  return opr->is_single_fpu() ? opr->fpu_regnr() : opr->fpu_regnrLo();
-}
-
-int FpuStackAllocator::tos_offset(LIR_Opr opr) {
-  return sim()->offset_from_tos(fpu_num(opr));
-}
-
-
-LIR_Opr FpuStackAllocator::to_fpu_stack(LIR_Opr opr) {
-  assert(opr->is_fpu_register() && !opr->is_xmm_register(), "shouldn't call this otherwise");
-
-  int stack_offset = tos_offset(opr);
-  if (opr->is_single_fpu()) {
-    return LIR_OprFact::single_fpu(stack_offset)->make_fpu_stack_offset();
-  } else {
-    assert(opr->is_double_fpu(), "shouldn't call this otherwise");
-    return LIR_OprFact::double_fpu(stack_offset)->make_fpu_stack_offset();
-  }
-}
-
-LIR_Opr FpuStackAllocator::to_fpu_stack_top(LIR_Opr opr, bool dont_check_offset) {
-  assert(opr->is_fpu_register() && !opr->is_xmm_register(), "shouldn't call this otherwise");
-  assert(dont_check_offset || tos_offset(opr) == 0, "operand is not on stack top");
-
-  int stack_offset = 0;
-  if (opr->is_single_fpu()) {
-    return LIR_OprFact::single_fpu(stack_offset)->make_fpu_stack_offset();
-  } else {
-    assert(opr->is_double_fpu(), "shouldn't call this otherwise");
-    return LIR_OprFact::double_fpu(stack_offset)->make_fpu_stack_offset();
-  }
-}
-
-
-
-void FpuStackAllocator::insert_op(LIR_Op* op) {
-  lir()->insert_before(pos(), op);
-  set_pos(1 + pos());
-}
-
-
-void FpuStackAllocator::insert_exchange(int offset) {
-  if (offset > 0) {
-    LIR_Op1* fxch_op = new LIR_Op1(lir_fxch, LIR_OprFact::intConst(offset), LIR_OprFact::illegalOpr);
-    insert_op(fxch_op);
-    sim()->swap(offset);
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Exchanged register: %d         New state: ", sim()->get_slot(0)); sim()->print(); tty->cr();
-    }
-#endif
-
-  }
-}
-
-void FpuStackAllocator::insert_exchange(LIR_Opr opr) {
-  insert_exchange(tos_offset(opr));
-}
-
-
-void FpuStackAllocator::insert_free(int offset) {
-  // move stack slot to the top of stack and then pop it
-  insert_exchange(offset);
-
-  LIR_Op* fpop = new LIR_Op0(lir_fpop_raw);
-  insert_op(fpop);
-  sim()->pop();
-
-#ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Inserted pop                   New state: "); sim()->print(); tty->cr();
-    }
-#endif
-}
-
-
-void FpuStackAllocator::insert_free_if_dead(LIR_Opr opr) {
-  if (sim()->contains(fpu_num(opr))) {
-    int res_slot = tos_offset(opr);
-    insert_free(res_slot);
-  }
-}
-
-void FpuStackAllocator::insert_free_if_dead(LIR_Opr opr, LIR_Opr ignore) {
-  if (fpu_num(opr) != fpu_num(ignore) && sim()->contains(fpu_num(opr))) {
-    int res_slot = tos_offset(opr);
-    insert_free(res_slot);
-  }
-}
-
-void FpuStackAllocator::insert_copy(LIR_Opr from, LIR_Opr to) {
-  int offset = tos_offset(from);
-  LIR_Op1* fld = new LIR_Op1(lir_fld, LIR_OprFact::intConst(offset), LIR_OprFact::illegalOpr);
-  insert_op(fld);
-
-  sim()->push(fpu_num(to));
-
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print("Inserted copy (%d -> %d)         New state: ", fpu_num(from), fpu_num(to)); sim()->print(); tty->cr();
-  }
-#endif
-}
-
-void FpuStackAllocator::do_rename(LIR_Opr from, LIR_Opr to) {
-  sim()->rename(fpu_num(from), fpu_num(to));
-}
-
-void FpuStackAllocator::do_push(LIR_Opr opr) {
-  sim()->push(fpu_num(opr));
-}
-
-void FpuStackAllocator::pop_if_last_use(LIR_Op* op, LIR_Opr opr) {
-  assert(op->fpu_pop_count() == 0, "fpu_pop_count alredy set");
-  assert(tos_offset(opr) == 0, "can only pop stack top");
-
-  if (opr->is_last_use()) {
-    op->set_fpu_pop_count(1);
-    sim()->pop();
-  }
-}
-
-void FpuStackAllocator::pop_always(LIR_Op* op, LIR_Opr opr) {
-  assert(op->fpu_pop_count() == 0, "fpu_pop_count alredy set");
-  assert(tos_offset(opr) == 0, "can only pop stack top");
-
-  op->set_fpu_pop_count(1);
-  sim()->pop();
-}
-
-void FpuStackAllocator::clear_fpu_stack(LIR_Opr preserve) {
-  int result_stack_size = (preserve->is_fpu_register() && !preserve->is_xmm_register() ? 1 : 0);
-  while (sim()->stack_size() > result_stack_size) {
-    assert(!sim()->slot_is_empty(0), "not allowed");
-
-    if (result_stack_size == 0 || sim()->get_slot(0) != fpu_num(preserve)) {
-      insert_free(0);
-    } else {
-      // move "preserve" to bottom of stack so that all other stack slots can be popped
-      insert_exchange(sim()->stack_size() - 1);
-    }
-  }
+  // No FPU stack on AArch64
 }
-
-
-void FpuStackAllocator::handle_op1(LIR_Op1* op1) {
-  LIR_Opr in  = op1->in_opr();
-  LIR_Opr res = op1->result_opr();
-
-  LIR_Opr new_in  = in;  // new operands relative to the actual fpu stack top
-  LIR_Opr new_res = res;
-
-  // Note: this switch is processed for all LIR_Op1, regardless if they have FPU-arguments,
-  //       so checks for is_float_kind() are necessary inside the cases
-  switch (op1->code()) {
-
-    case lir_return: {
-      // FPU-Stack must only contain the (optional) fpu return value.
-      // All remaining dead values are popped from the stack
-      // If the input operand is a fpu-register, it is exchanged to the bottom of the stack
-
-      clear_fpu_stack(in);
-      if (in->is_fpu_register() && !in->is_xmm_register()) {
-        new_in = to_fpu_stack_top(in);
-      }
-
-      break;
-    }
-
-    case lir_move: {
-      if (in->is_fpu_register() && !in->is_xmm_register()) {
-        if (res->is_xmm_register()) {
-          // move from fpu register to xmm register (necessary for operations that
-          // are not available in the SSE instruction set)
-          insert_exchange(in);
-          new_in = to_fpu_stack_top(in);
-          pop_always(op1, in);
-
-        } else if (res->is_fpu_register() && !res->is_xmm_register()) {
-          // move from fpu-register to fpu-register:
-          // * input and result register equal:
-          //   nothing to do
-          // * input register is last use:
-          //   rename the input register to result register -> input register
-          //   not present on fpu-stack afterwards
-          // * input register not last use:
-          //   duplicate input register to result register to preserve input
-          //
-          // Note: The LIR-Assembler does not produce any code for fpu register moves,
-          //       so input and result stack index must be equal
-
-          if (fpu_num(in) == fpu_num(res)) {
-            // nothing to do
-          } else if (in->is_last_use()) {
-            insert_free_if_dead(res);//, in);
-            do_rename(in, res);
-          } else {
-            insert_free_if_dead(res);
-            insert_copy(in, res);
-          }
-          new_in = to_fpu_stack(res);
-          new_res = new_in;
-
-        } else {
-          // move from fpu-register to memory
-          // input operand must be on top of stack
-
-          insert_exchange(in);
-
-          // create debug information here because afterwards the register may have been popped
-          compute_debug_information(op1);
-
-          new_in = to_fpu_stack_top(in);
-          pop_if_last_use(op1, in);
-        }
-
-      } else if (res->is_fpu_register() && !res->is_xmm_register()) {
-        // move from memory/constant to fpu register
-        // result is pushed on the stack
-
-        insert_free_if_dead(res);
-
-        // create debug information before register is pushed
-        compute_debug_information(op1);
-
-        do_push(res);
-        new_res = to_fpu_stack_top(res);
-      }
-      break;
-    }
-
-    case lir_neg: {
-      if (in->is_fpu_register() && !in->is_xmm_register()) {
-        assert(res->is_fpu_register() && !res->is_xmm_register(), "must be");
-        assert(in->is_last_use(), "old value gets destroyed");
-
-        insert_free_if_dead(res, in);
-        insert_exchange(in);
-        new_in = to_fpu_stack_top(in);
-
-        do_rename(in, res);
-        new_res = to_fpu_stack_top(res);
-      }
-      break;
-    }
-
-    case lir_convert: {
-      Bytecodes::Code bc = op1->as_OpConvert()->bytecode();
-      switch (bc) {
-        case Bytecodes::_d2f:
-        case Bytecodes::_f2d:
-          assert(res->is_fpu_register(), "must be");
-          assert(in->is_fpu_register(), "must be");
-
-          if (!in->is_xmm_register() && !res->is_xmm_register()) {
-            // this is quite the same as a move from fpu-register to fpu-register
-            // Note: input and result operands must have different types
-            if (fpu_num(in) == fpu_num(res)) {
-              // nothing to do
-              new_in = to_fpu_stack(in);
-            } else if (in->is_last_use()) {
-              insert_free_if_dead(res);//, in);
-              new_in = to_fpu_stack(in);
-              do_rename(in, res);
-            } else {
-              insert_free_if_dead(res);
-              insert_copy(in, res);
-              new_in = to_fpu_stack_top(in, true);
-            }
-            new_res = to_fpu_stack(res);
-          }
-
-          break;
-
-        case Bytecodes::_i2f:
-        case Bytecodes::_l2f:
-        case Bytecodes::_i2d:
-        case Bytecodes::_l2d:
-          assert(res->is_fpu_register(), "must be");
-          if (!res->is_xmm_register()) {
-            insert_free_if_dead(res);
-            do_push(res);
-            new_res = to_fpu_stack_top(res);
-          }
-          break;
-
-        case Bytecodes::_f2i:
-        case Bytecodes::_d2i:
-          assert(in->is_fpu_register(), "must be");
-          if (!in->is_xmm_register()) {
-            insert_exchange(in);
-            new_in = to_fpu_stack_top(in);
-
-            // TODO: update registes of stub
-          }
-          break;
-
-        case Bytecodes::_f2l:
-        case Bytecodes::_d2l:
-          assert(in->is_fpu_register(), "must be");
-          if (!in->is_xmm_register()) {
-            insert_exchange(in);
-            new_in = to_fpu_stack_top(in);
-            pop_always(op1, in);
-          }
-          break;
-
-        case Bytecodes::_i2l:
-        case Bytecodes::_l2i:
-        case Bytecodes::_i2b:
-        case Bytecodes::_i2c:
-        case Bytecodes::_i2s:
-          // no fpu operands
-          break;
-
-        default:
-          ShouldNotReachHere();
-      }
-      break;
-    }
-
-    case lir_roundfp: {
-      assert(in->is_fpu_register() && !in->is_xmm_register(), "input must be in register");
-      assert(res->is_stack(), "result must be on stack");
-
-      insert_exchange(in);
-      new_in = to_fpu_stack_top(in);
-      pop_if_last_use(op1, in);
-      break;
-    }
-
-    default: {
-      assert(!in->is_float_kind() && !res->is_float_kind(), "missed a fpu-operation");
-    }
-  }
-
-  op1->set_in_opr(new_in);
-  op1->set_result_opr(new_res);
-}
-
-void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
-  LIR_Opr left  = op2->in_opr1();
-  if (!left->is_float_kind()) {
-    return;
-  }
-  if (left->is_xmm_register()) {
-    return;
-  }
-
-  LIR_Opr right = op2->in_opr2();
-  LIR_Opr res   = op2->result_opr();
-  LIR_Opr new_left  = left;  // new operands relative to the actual fpu stack top
-  LIR_Opr new_right = right;
-  LIR_Opr new_res   = res;
-
-  assert(!left->is_xmm_register() && !right->is_xmm_register() && !res->is_xmm_register(), "not for xmm registers");
-
-  switch (op2->code()) {
-    case lir_cmp:
-    case lir_cmp_fd2i:
-    case lir_ucmp_fd2i:
-    case lir_assert: {
-      assert(left->is_fpu_register(), "invalid LIR");
-      assert(right->is_fpu_register(), "invalid LIR");
-
-      // the left-hand side must be on top of stack.
-      // the right-hand side is never popped, even if is_last_use is set
-      insert_exchange(left);
-      new_left = to_fpu_stack_top(left);
-      new_right = to_fpu_stack(right);
-      pop_if_last_use(op2, left);
-      break;
-    }
-
-    case lir_mul_strictfp:
-    case lir_div_strictfp: {
-      assert(op2->tmp1_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot");
-      insert_free_if_dead(op2->tmp1_opr());
-      assert(sim()->stack_size() <= 7, "at least one stack slot must be free");
-      // fall-through: continue with the normal handling of lir_mul and lir_div
-    }
-    case lir_add:
-    case lir_sub:
-    case lir_mul:
-    case lir_div: {
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(left->is_equal(res), "must be");
-
-      // either the left-hand or the right-hand side must be on top of stack
-      // (if right is not a register, left must be on top)
-      if (!right->is_fpu_register()) {
-        insert_exchange(left);
-        new_left = to_fpu_stack_top(left);
-      } else {
-        // no exchange necessary if right is alredy on top of stack
-        if (tos_offset(right) == 0) {
-          new_left = to_fpu_stack(left);
-          new_right = to_fpu_stack_top(right);
-        } else {
-          insert_exchange(left);
-          new_left = to_fpu_stack_top(left);
-          new_right = to_fpu_stack(right);
-        }
-
-        if (right->is_last_use()) {
-          op2->set_fpu_pop_count(1);
-
-          if (tos_offset(right) == 0) {
-            sim()->pop();
-          } else {
-            // if left is on top of stack, the result is placed in the stack
-            // slot of right, so a renaming from right to res is necessary
-            assert(tos_offset(left) == 0, "must be");
-            sim()->pop();
-            do_rename(right, res);
-          }
-        }
-      }
-      new_res = to_fpu_stack(res);
-
-      break;
-    }
-
-    case lir_rem: {
-      assert(left->is_fpu_register(), "must be");
-      assert(right->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(left->is_equal(res), "must be");
-
-      // Must bring both operands to top of stack with following operand ordering:
-      // * fpu stack before rem: ... right left
-      // * fpu stack after rem:  ... left
-      if (tos_offset(right) != 1) {
-        insert_exchange(right);
-        insert_exchange(1);
-      }
-      insert_exchange(left);
-      assert(tos_offset(right) == 1, "check");
-      assert(tos_offset(left) == 0, "check");
-
-      new_left = to_fpu_stack_top(left);
-      new_right = to_fpu_stack(right);
-
-      op2->set_fpu_pop_count(1);
-      sim()->pop();
-      do_rename(right, res);
-
-      new_res = to_fpu_stack_top(res);
-      break;
-    }
-
-    case lir_abs:
-    case lir_sqrt: {
-      // Right argument appears to be unused
-      assert(right->is_illegal(), "must be");
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(left->is_last_use(), "old value gets destroyed");
-
-      insert_free_if_dead(res, left);
-      insert_exchange(left);
-      do_rename(left, res);
-
-      new_left = to_fpu_stack_top(res);
-      new_res = new_left;
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      break;
-    }
-
-    case lir_log:
-    case lir_log10: {
-      // log and log10 need one temporary fpu stack slot, so
-      // there is one temporary registers stored in temp of the
-      // operation. the stack allocator must guarantee that the stack
-      // slots are really free, otherwise there might be a stack
-      // overflow.
-      assert(right->is_illegal(), "must be");
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      assert(op2->tmp1_opr()->is_fpu_register(), "must be");
-
-      insert_free_if_dead(op2->tmp1_opr());
-      insert_free_if_dead(res, left);
-      insert_exchange(left);
-      do_rename(left, res);
-
-      new_left = to_fpu_stack_top(res);
-      new_res = new_left;
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      assert(sim()->stack_size() <= 7, "at least one stack slot must be free");
-      break;
-    }
-
-
-    case lir_tan:
-    case lir_sin:
-    case lir_cos:
-    case lir_exp: {
-      // sin, cos and exp need two temporary fpu stack slots, so there are two temporary
-      // registers (stored in right and temp of the operation).
-      // the stack allocator must guarantee that the stack slots are really free,
-      // otherwise there might be a stack overflow.
-      assert(left->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-      // assert(left->is_last_use(), "old value gets destroyed");
-      assert(right->is_fpu_register(), "right is used as the first temporary register");
-      assert(op2->tmp1_opr()->is_fpu_register(), "temp is used as the second temporary register");
-      assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
-
-      insert_free_if_dead(right);
-      insert_free_if_dead(op2->tmp1_opr());
-
-      insert_free_if_dead(res, left);
-      insert_exchange(left);
-      do_rename(left, res);
-
-      new_left = to_fpu_stack_top(res);
-      new_res = new_left;
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
-      break;
-    }
-
-    case lir_pow: {
-      // pow needs two temporary fpu stack slots, so there are two temporary
-      // registers (stored in tmp1 and tmp2 of the operation).
-      // the stack allocator must guarantee that the stack slots are really free,
-      // otherwise there might be a stack overflow.
-      assert(left->is_fpu_register(), "must be");
-      assert(right->is_fpu_register(), "must be");
-      assert(res->is_fpu_register(), "must be");
-
-      assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register");
-      assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register");
-      assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers");
-      assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers");
-      assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
-      assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers");
-
-      insert_free_if_dead(op2->tmp1_opr());
-      insert_free_if_dead(op2->tmp2_opr());
-
-      // Must bring both operands to top of stack with following operand ordering:
-      // * fpu stack before pow: ... right left
-      // * fpu stack after pow:  ... left
-
-      insert_free_if_dead(res, right);
-
-      if (tos_offset(right) != 1) {
-        insert_exchange(right);
-        insert_exchange(1);
-      }
-      insert_exchange(left);
-      assert(tos_offset(right) == 1, "check");
-      assert(tos_offset(left) == 0, "check");
-
-      new_left = to_fpu_stack_top(left);
-      new_right = to_fpu_stack(right);
-
-      op2->set_fpu_stack_size(sim()->stack_size());
-      assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
-
-      sim()->pop();
-
-      do_rename(right, res);
-
-      new_res = to_fpu_stack_top(res);
-      break;
-    }
-
-    default: {
-      assert(false, "missed a fpu-operation");
-    }
-  }
-
-  op2->set_in_opr1(new_left);
-  op2->set_in_opr2(new_right);
-  op2->set_result_opr(new_res);
-}
-
-void FpuStackAllocator::handle_opCall(LIR_OpCall* opCall) {
-  LIR_Opr res = opCall->result_opr();
-
-  // clear fpu-stack before call
-  // it may contain dead values that could not have been remved by previous operations
-  clear_fpu_stack(LIR_OprFact::illegalOpr);
-  assert(sim()->is_empty(), "fpu stack must be empty now");
-
-  // compute debug information before (possible) fpu result is pushed
-  compute_debug_information(opCall);
-
-  if (res->is_fpu_register() && !res->is_xmm_register()) {
-    do_push(res);
-    opCall->set_result_opr(to_fpu_stack_top(res));
-  }
-}
-
-#ifndef PRODUCT
-void FpuStackAllocator::check_invalid_lir_op(LIR_Op* op) {
-  switch (op->code()) {
-    case lir_24bit_FPU:
-    case lir_reset_FPU:
-    case lir_ffree:
-      assert(false, "operations not allowed in lir. If one of these operations is needed, check if they have fpu operands");
-      break;
-
-    case lir_fpop_raw:
-    case lir_fxch:
-    case lir_fld:
-      assert(false, "operations only inserted by FpuStackAllocator");
-      break;
-  }
-}
-#endif
-
-
-void FpuStackAllocator::merge_insert_add(LIR_List* instrs, FpuStackSim* cur_sim, int reg) {
-  LIR_Op1* move = new LIR_Op1(lir_move, LIR_OprFact::doubleConst(0), LIR_OprFact::double_fpu(reg)->make_fpu_stack_offset());
-
-  instrs->instructions_list()->push(move);
-
-  cur_sim->push(reg);
-  move->set_result_opr(to_fpu_stack(move->result_opr()));
-
-  #ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Added new register: %d         New state: ", reg); cur_sim->print(); tty->cr();
-    }
-  #endif
-}
-
-void FpuStackAllocator::merge_insert_xchg(LIR_List* instrs, FpuStackSim* cur_sim, int slot) {
-  assert(slot > 0, "no exchange necessary");
-
-  LIR_Op1* fxch = new LIR_Op1(lir_fxch, LIR_OprFact::intConst(slot));
-  instrs->instructions_list()->push(fxch);
-  cur_sim->swap(slot);
-
-  #ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Exchanged register: %d         New state: ", cur_sim->get_slot(slot)); cur_sim->print(); tty->cr();
-    }
-  #endif
-}
-
-void FpuStackAllocator::merge_insert_pop(LIR_List* instrs, FpuStackSim* cur_sim) {
-  int reg = cur_sim->get_slot(0);
-
-  LIR_Op* fpop = new LIR_Op0(lir_fpop_raw);
-  instrs->instructions_list()->push(fpop);
-  cur_sim->pop(reg);
-
-  #ifndef PRODUCT
-    if (TraceFPUStack) {
-      tty->print("Removed register: %d           New state: ", reg); cur_sim->print(); tty->cr();
-    }
-  #endif
-}
-
-bool FpuStackAllocator::merge_rename(FpuStackSim* cur_sim, FpuStackSim* sux_sim, int start_slot, int change_slot) {
-  int reg = cur_sim->get_slot(change_slot);
-
-  for (int slot = start_slot; slot >= 0; slot--) {
-    int new_reg = sux_sim->get_slot(slot);
-
-    if (!cur_sim->contains(new_reg)) {
-      cur_sim->set_slot(change_slot, new_reg);
-
-      #ifndef PRODUCT
-        if (TraceFPUStack) {
-          tty->print("Renamed register %d to %d       New state: ", reg, new_reg); cur_sim->print(); tty->cr();
-        }
-      #endif
-
-      return true;
-    }
-  }
-  return false;
-}
-
-
-void FpuStackAllocator::merge_fpu_stack(LIR_List* instrs, FpuStackSim* cur_sim, FpuStackSim* sux_sim) {
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->cr();
-    tty->print("before merging: pred: "); cur_sim->print(); tty->cr();
-    tty->print("                 sux: "); sux_sim->print(); tty->cr();
-  }
-
-  int slot;
-  for (slot = 0; slot < cur_sim->stack_size(); slot++) {
-    assert(!cur_sim->slot_is_empty(slot), "not handled by algorithm");
-  }
-  for (slot = 0; slot < sux_sim->stack_size(); slot++) {
-    assert(!sux_sim->slot_is_empty(slot), "not handled by algorithm");
-  }
-#endif
-
-  // size difference between cur and sux that must be resolved by adding or removing values form the stack
-  int size_diff = cur_sim->stack_size() - sux_sim->stack_size();
-
-  if (!ComputeExactFPURegisterUsage) {
-    // add slots that are currently free, but used in successor
-    // When the exact FPU register usage is computed, the stack does
-    // not contain dead values at merging -> no values must be added
-
-    int sux_slot = sux_sim->stack_size() - 1;
-    while (size_diff < 0) {
-      assert(sux_slot >= 0, "slot out of bounds -> error in algorithm");
-
-      int reg = sux_sim->get_slot(sux_slot);
-      if (!cur_sim->contains(reg)) {
-        merge_insert_add(instrs, cur_sim, reg);
-        size_diff++;
-
-        if (sux_slot + size_diff != 0) {
-          merge_insert_xchg(instrs, cur_sim, sux_slot + size_diff);
-        }
-      }
-     sux_slot--;
-    }
-  }
-
-  assert(cur_sim->stack_size() >= sux_sim->stack_size(), "stack size must be equal or greater now");
-  assert(size_diff == cur_sim->stack_size() - sux_sim->stack_size(), "must be");
-
-  // stack merge algorithm:
-  // 1) as long as the current stack top is not in the right location (that meens
-  //    it should not be on the stack top), exchange it into the right location
-  // 2) if the stack top is right, but the remaining stack is not ordered correctly,
-  //    the stack top is exchanged away to get another value on top ->
-  //    now step 1) can be continued
-  // the stack can also contain unused items -> these items are removed from stack
-
-  int finished_slot = sux_sim->stack_size() - 1;
-  while (finished_slot >= 0 || size_diff > 0) {
-    while (size_diff > 0 || (cur_sim->stack_size() > 0 && cur_sim->get_slot(0) != sux_sim->get_slot(0))) {
-      int reg = cur_sim->get_slot(0);
-      if (sux_sim->contains(reg)) {
-        int sux_slot = sux_sim->offset_from_tos(reg);
-        merge_insert_xchg(instrs, cur_sim, sux_slot + size_diff);
-
-      } else if (!merge_rename(cur_sim, sux_sim, finished_slot, 0)) {
-        assert(size_diff > 0, "must be");
-
-        merge_insert_pop(instrs, cur_sim);
-        size_diff--;
-      }
-      assert(cur_sim->stack_size() == 0 || cur_sim->get_slot(0) != reg, "register must have been changed");
-    }
-
-    while (finished_slot >= 0 && cur_sim->get_slot(finished_slot) == sux_sim->get_slot(finished_slot)) {
-      finished_slot--;
-    }
-
-    if (finished_slot >= 0) {
-      int reg = cur_sim->get_slot(finished_slot);
-
-      if (sux_sim->contains(reg) || !merge_rename(cur_sim, sux_sim, finished_slot, finished_slot)) {
-        assert(sux_sim->contains(reg) || size_diff > 0, "must be");
-        merge_insert_xchg(instrs, cur_sim, finished_slot);
-      }
-      assert(cur_sim->get_slot(finished_slot) != reg, "register must have been changed");
-    }
-  }
-
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print("after merging:  pred: "); cur_sim->print(); tty->cr();
-    tty->print("                 sux: "); sux_sim->print(); tty->cr();
-    tty->cr();
-  }
-#endif
-  assert(cur_sim->stack_size() == sux_sim->stack_size(), "stack size must be equal now");
-}
-
-
-void FpuStackAllocator::merge_cleanup_fpu_stack(LIR_List* instrs, FpuStackSim* cur_sim, BitMap& live_fpu_regs) {
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->cr();
-    tty->print("before cleanup: state: "); cur_sim->print(); tty->cr();
-    tty->print("                live:  "); live_fpu_regs.print_on(tty); tty->cr();
-  }
-#endif
-
-  int slot = 0;
-  while (slot < cur_sim->stack_size()) {
-    int reg = cur_sim->get_slot(slot);
-    if (!live_fpu_regs.at(reg)) {
-      if (slot != 0) {
-        merge_insert_xchg(instrs, cur_sim, slot);
-      }
-      merge_insert_pop(instrs, cur_sim);
-    } else {
-      slot++;
-    }
-  }
-
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print("after cleanup:  state: "); cur_sim->print(); tty->cr();
-    tty->print("                live:  "); live_fpu_regs.print_on(tty); tty->cr();
-    tty->cr();
-  }
-
-  // check if fpu stack only contains live registers
-  for (unsigned int i = 0; i < live_fpu_regs.size(); i++) {
-    if (live_fpu_regs.at(i) != cur_sim->contains(i)) {
-      tty->print_cr("mismatch between required and actual stack content");
-      break;
-    }
-  }
-#endif
-}
-
-
-bool FpuStackAllocator::merge_fpu_stack_with_successors(BlockBegin* block) {
-#ifndef PRODUCT
-  if (TraceFPUStack) {
-    tty->print_cr("Propagating FPU stack state for B%d at LIR_Op position %d to successors:",
-                  block->block_id(), pos());
-    sim()->print();
-    tty->cr();
-  }
-#endif
-
-  bool changed = false;
-  int number_of_sux = block->number_of_sux();
-
-  if (number_of_sux == 1 && block->sux_at(0)->number_of_preds() > 1) {
-    // The successor has at least two incoming edges, so a stack merge will be necessary
-    // If this block is the first predecessor, cleanup the current stack and propagate it
-    // If this block is not the first predecessor, a stack merge will be necessary
-
-    BlockBegin* sux = block->sux_at(0);
-    intArray* state = sux->fpu_stack_state();
-    LIR_List* instrs = new LIR_List(_compilation);
-
-    if (state != NULL) {
-      // Merge with a successors that already has a FPU stack state
-      // the block must only have one successor because critical edges must been split
-      FpuStackSim* cur_sim = sim();
-      FpuStackSim* sux_sim = temp_sim();
-      sux_sim->read_state(state);
-
-      merge_fpu_stack(instrs, cur_sim, sux_sim);
-
-    } else {
-      // propagate current FPU stack state to successor without state
-      // clean up stack first so that there are no dead values on the stack
-      if (ComputeExactFPURegisterUsage) {
-        FpuStackSim* cur_sim = sim();
-        BitMap live_fpu_regs = block->sux_at(0)->fpu_register_usage();
-        assert(live_fpu_regs.size() == FrameMap::nof_fpu_regs, "missing register usage");
-
-        merge_cleanup_fpu_stack(instrs, cur_sim, live_fpu_regs);
-      }
-
-      intArray* state = sim()->write_state();
-      if (TraceFPUStack) {
-        tty->print_cr("Setting FPU stack state of B%d (merge path)", sux->block_id());
-        sim()->print(); tty->cr();
-      }
-      sux->set_fpu_stack_state(state);
-    }
-
-    if (instrs->instructions_list()->length() > 0) {
-      lir()->insert_before(pos(), instrs);
-      set_pos(instrs->instructions_list()->length() + pos());
-      changed = true;
-    }
-
-  } else {
-    // Propagate unmodified Stack to successors where a stack merge is not necessary
-    intArray* state = sim()->write_state();
-    for (int i = 0; i < number_of_sux; i++) {
-      BlockBegin* sux = block->sux_at(i);
-
-#ifdef ASSERT
-      for (int j = 0; j < sux->number_of_preds(); j++) {
-        assert(block == sux->pred_at(j), "all critical edges must be broken");
-      }
-
-      // check if new state is same
-      if (sux->fpu_stack_state() != NULL) {
-        intArray* sux_state = sux->fpu_stack_state();
-        assert(state->length() == sux_state->length(), "overwriting existing stack state");
-        for (int j = 0; j < state->length(); j++) {
-          assert(state->at(j) == sux_state->at(j), "overwriting existing stack state");
-        }
-      }
-#endif
-#ifndef PRODUCT
-      if (TraceFPUStack) {
-        tty->print_cr("Setting FPU stack state of B%d", sux->block_id());
-        sim()->print(); tty->cr();
-      }
-#endif
-
-      sux->set_fpu_stack_state(state);
-    }
-  }
-
-#ifndef PRODUCT
-  // assertions that FPU stack state conforms to all successors' states
-  intArray* cur_state = sim()->write_state();
-  for (int i = 0; i < number_of_sux; i++) {
-    BlockBegin* sux = block->sux_at(i);
-    intArray* sux_state = sux->fpu_stack_state();
-
-    assert(sux_state != NULL, "no fpu state");
-    assert(cur_state->length() == sux_state->length(), "incorrect length");
-    for (int i = 0; i < cur_state->length(); i++) {
-      assert(cur_state->at(i) == sux_state->at(i), "element not equal");
-    }
-  }
-#endif
-
-  return changed;
-}
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -50,21 +50,14 @@
   if (unordered_result < 0) {
     // we want -1 for unordered or less than, 0 for equal and 1 for
     // greater than.
-    mov(result, (u_int64_t)-1L);
-    // for FP LT tests less than or unordered
-    br(Assembler::LT, done);
-    // install 0 for EQ otherwise 1
-    csinc(result, zr, zr, Assembler::EQ);
+    cset(result, NE);  // Not equal or unordered
+    cneg(result, result, LT);  // Less than or unordered
   } else {
     // we want -1 for less than, 0 for equal and 1 for unordered or
     // greater than.
-    mov(result, 1L);
-    // for FP HI tests greater than or unordered
-    br(Assembler::HI, done);
-    // install 0 for EQ otherwise ~0
-    csinv(result, zr, zr, Assembler::EQ);
+    cset(result, NE);  // Not equal or unordered
+    cneg(result, result, LO);  // Less than
   }
-  bind(done);
 }
 
 int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr, Register scratch, Label& slow_case) {
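The rewritten compare folds the old branchy sequence into two conditional instructions: after fcmp, cset leaves 1 for any not-equal outcome (which includes unordered), and cneg negates that when the ordering condition holds. LT is true for less-than or unordered while LO is true for less-than only, which is exactly the asymmetry the two branches need. The intended mapping, in plain C++ (illustrative, names mine):

    // unordered_is_less selects which extreme a NaN comparison collapses to.
    int fcmp_result(double a, double b, bool unordered_is_less) {
      if (a != a || b != b) return unordered_is_less ? -1 : 1;  // unordered
      if (a == b) return 0;
      return (a < b) ? -1 : 1;
    }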
@@ -409,15 +402,16 @@
 }
 
 
-void C1_MacroAssembler::build_frame(int framesize) {
+void C1_MacroAssembler::build_frame(int framesize, int bang_size_in_bytes) {
   // If we have to make this method not-entrant we'll overwrite its
   // first instruction with a jump.  For this action to be legal we
   // must ensure that this first instruction is a B, BL, NOP, BKPT,
   // SVC, HVC, or SMC.  Make it a NOP.
   nop();
+  assert(bang_size_in_bytes >= framesize, "stack bang size incorrect");
   // Make sure there is enough stack space for this method's activation.
   // Note that we do this before doing an enter().
-  generate_stack_overflow_check(framesize);
+  generate_stack_overflow_check(bang_size_in_bytes);
   MacroAssembler::build_frame(framesize + 2 * wordSize);
   if (NotifySimulator) {
     notify(Assembler::method_entry);
--- a/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -27,6 +27,8 @@
 #ifndef CPU_AARCH64_VM_C1_MACROASSEMBLER_AARCH64_HPP
 #define CPU_AARCH64_VM_C1_MACROASSEMBLER_AARCH64_HPP
 
+using MacroAssembler::build_frame;
+
 // C1_MacroAssembler contains high-level macros for C1
 
  private:
--- a/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -80,6 +80,7 @@
   pop(r0, sp);
 #endif
   reset_last_Java_frame(true, true);
+  maybe_isb();
 
   // check for pending exceptions
   { Label L;
@@ -376,7 +377,7 @@
   case handle_exception_nofpu_id:
   case handle_exception_id:
     // At this point all registers MAY be live.
-    oop_map = save_live_registers(sasm, id == handle_exception_nofpu_id);
+    oop_map = save_live_registers(sasm, id != handle_exception_nofpu_id);
     break;
   case handle_exception_from_callee_id: {
     // At this point all registers except exception oop (r0) and
@@ -440,7 +441,7 @@
   case handle_exception_nofpu_id:
   case handle_exception_id:
     // Restore the registers that were saved at the beginning.
-    restore_live_registers(sasm, id == handle_exception_nofpu_id);
+    restore_live_registers(sasm, id != handle_exception_nofpu_id);
     break;
   case handle_exception_from_callee_id:
     // WIN64_ONLY: No need to add frame::arg_reg_save_area_bytes to SP
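Both flips in this stub read correctly once the boolean argument is taken to mean save_fpu_registers; that is my inference from the id names, but it is the only reading under which the old code was a bug and the new code a fix:

    bool save_fpu_registers = (id != handle_exception_nofpu_id);  // nofpu => skip FPU state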
@@ -569,6 +570,7 @@
   }
 #endif
   __ reset_last_Java_frame(true, false);
+  __ maybe_isb();
 
   // check for pending exceptions
   { Label L;
--- a/src/cpu/aarch64/vm/compiledIC_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/compiledIC_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -47,34 +47,6 @@
   return is_icholder_entry(call->destination());
 }
 
-//-----------------------------------------------------------------------------
-// High-level access to an inline cache. Guaranteed to be MT-safe.
-
-CompiledIC::CompiledIC(nmethod* nm, NativeCall* call)
-  : _ic_call(call)
-{
-  address ic_call = call->instruction_address();
-
-  assert(ic_call != NULL, "ic_call address must be set");
-  assert(nm != NULL, "must pass nmethod");
-  assert(nm->contains(ic_call), "must be in nmethod");
-
-  // Search for the ic_call at the given address.
-  RelocIterator iter(nm, ic_call, ic_call+1);
-  bool ret = iter.next();
-  assert(ret == true, "relocInfo must exist at this address");
-  assert(iter.addr() == ic_call, "must find ic_call");
-  if (iter.type() == relocInfo::virtual_call_type) {
-    virtual_call_Relocation* r = iter.virtual_call_reloc();
-    _is_optimized = false;
-    _value = nativeMovConstReg_at(r->cached_value());
-  } else {
-    assert(iter.type() == relocInfo::opt_virtual_call_type, "must be a virtual call");
-    _is_optimized = true;
-    _value = NULL;
-  }
-}
-
 // ----------------------------------------------------------------------------
 
 #define __ _masm.
@@ -106,15 +78,13 @@
 #undef __
 
 int CompiledStaticCall::to_interp_stub_size() {
-  // count a mov mem --> to 4 movz/k and a branch
-  return 6 * NativeInstruction::instruction_size;
+  // count a mov mem --> to 3 movz/k and a branch
+  return 4 * NativeInstruction::instruction_size;
 }
 
 // Relocation entries for call stub, compiled java to interpreter.
 int CompiledStaticCall::reloc_to_interp_stub() {
-  // TODO fixme
-  // return a large number
-  return 5;
+  return 4; // 3 in emit_to_interp_stub + 1 in emit_call
 }
 
 void CompiledStaticCall::set_to_interpreted(methodHandle callee, address entry) {
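The shrink from 6 to 4 instructions is address-width arithmetic: materializing the metadata pointer takes a movz plus two movk (three 16-bit chunks, per the comment above), and one branch completes the stub. With 4-byte AArch64 instructions, as a check (illustrative):

    const int insn_bytes = 4;               // NativeInstruction::instruction_size
    const int stub_bytes = 4 * insn_bytes;  // 3 x movz/movk + 1 branch = 16 bytes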
@@ -130,18 +100,18 @@
 
   // Creation also verifies the object.
   NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
-  NativeJump*        jump          = nativeJump_at(method_holder->next_instruction_address());
+#ifndef PRODUCT
+  NativeGeneralJump* jump = nativeGeneralJump_at(method_holder->next_instruction_address());
 
   assert(method_holder->data() == 0 || method_holder->data() == (intptr_t)callee(),
          "a) MT-unsafe modification of inline cache");
-  assert(jump->jump_destination() == (address)-1 || jump->jump_destination() == entry,
+  assert(method_holder->data() == 0 || jump->jump_destination() == entry,
          "b) MT-unsafe modification of inline cache");
-
+#endif
   // Update stub.
   method_holder->set_data((intptr_t)callee());
-  method_holder->flush();
-  jump->set_jump_destination(entry);
-
+  NativeGeneralJump::insert_unconditional(method_holder->next_instruction_address(), entry);
+  ICache::invalidate_range(stub, to_interp_stub_size());
   // Update jump to call.
   set_destination_mt_safe(stub);
 }
@@ -153,9 +123,7 @@
   assert(stub != NULL, "stub not found");
   // Creation also verifies the object.
   NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
-  NativeJump*        jump          = nativeJump_at(method_holder->next_instruction_address());
   method_holder->set_data(0);
-  jump->set_jump_destination((address)-1);
 }
 
 //-----------------------------------------------------------------------------
--- a/src/cpu/aarch64/vm/frame_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/frame_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -124,7 +124,9 @@
     }
 
     intptr_t* sender_sp = NULL;
+    intptr_t* sender_unextended_sp = NULL;
     address   sender_pc = NULL;
+    intptr_t* saved_fp =  NULL;
 
     if (is_interpreted_frame()) {
       // fp must be safe
@@ -133,7 +135,12 @@
       }
 
       sender_pc = (address) this->fp()[return_addr_offset];
+      // for interpreted frames, the value below is the sender "raw" sp,
+      // which can be different from the sender unextended sp (the sp seen
+      // by the sender) because of current frame local variables
       sender_sp = (intptr_t*) addr_at(sender_sp_offset);
+      sender_unextended_sp = (intptr_t*) this->fp()[interpreter_frame_sender_sp_offset];
+      saved_fp = (intptr_t*) this->fp()[link_offset];
 
     } else {
       // must be some sort of compiled/runtime frame
@@ -145,7 +152,10 @@
       }
 
       sender_sp = _unextended_sp + _cb->frame_size();
+      sender_unextended_sp = sender_sp;
       sender_pc = (address) *(sender_sp-1);
+      // Note: frame::sender_sp_offset is only valid for compiled frame
+      saved_fp = (intptr_t*) *(sender_sp - frame::sender_sp_offset);
     }
 
 
@@ -156,7 +166,6 @@
       // only if the sender is interpreted/call_stub (c1 too?) are we certain that the saved fp
       // is really a frame pointer.
 
-      intptr_t *saved_fp = (intptr_t*)*(sender_sp - frame::sender_sp_offset);
       bool saved_fp_safe = ((address)saved_fp < thread->stack_base()) && (saved_fp > sender_sp);
 
       if (!saved_fp_safe) {
@@ -165,7 +174,7 @@
 
       // construct the potential sender
 
-      frame sender(sender_sp, saved_fp, sender_pc);
+      frame sender(sender_sp, sender_unextended_sp, saved_fp, sender_pc);
 
       return sender.is_interpreted_frame_valid(thread);
 
@@ -194,7 +203,6 @@
 
     // Could be the call_stub
     if (StubRoutines::returns_to_call_stub(sender_pc)) {
-      intptr_t *saved_fp = (intptr_t*)*(sender_sp - frame::sender_sp_offset);
       bool saved_fp_safe = ((address)saved_fp < thread->stack_base()) && (saved_fp > sender_sp);
 
       if (!saved_fp_safe) {
@@ -203,7 +211,7 @@
 
       // construct the potential sender
 
-      frame sender(sender_sp, saved_fp, sender_pc);
+      frame sender(sender_sp, sender_unextended_sp, saved_fp, sender_pc);
 
       // Validate the JavaCallWrapper an entry frame must have
       address jcw = (address)sender.entry_frame_call_wrapper();
@@ -573,8 +581,11 @@
   if (!m->is_valid_method()) return false;
 
   // stack frames shouldn't be much larger than max_stack elements
-
-  if (fp() - sp() > 1024 + m->max_stack()*Interpreter::stackElementSize) {
+  // this test requires the use of unextended_sp which is the sp as seen by
+  // the current frame, and not sp which is the "raw" pc which could point
+  // further because of local variables of the callee method inserted after
+  // method arguments
+  if (fp() - unextended_sp() > 1024 + m->max_stack()*Interpreter::stackElementSize) {
     return false;
   }
 
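The bound itself is untouched; only its baseline moved. Measured from the raw sp, a valid frame whose callee locals extend below the sender's view could fail the test, so the accepted condition is now:

    (fp() - unextended_sp()) <= 1024 + m->max_stack() * Interpreter::stackElementSize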
@@ -807,7 +818,7 @@
 		   unsigned long bcx, unsigned long thread) {
   RegisterMap map((JavaThread*)thread, false);
   if (!reg_map) {
-    reg_map = (RegisterMap*)new char[sizeof map];
+    reg_map = (RegisterMap*)os::malloc(sizeof map, mtNone);
   }
   memcpy(reg_map, &map, sizeof map);
   {
@@ -827,3 +838,10 @@
   Method* m = (Method*)p[frame::interpreter_frame_method_offset];
   printbc(m, bcx);
 }
+
+#ifndef PRODUCT
+// This is a generic constructor which is only used by pns() in debug.cpp.
+frame::frame(void* sp, void* fp, void* pc) {
+  init((intptr_t*)sp, (intptr_t*)fp, (address)pc);
+}
+#endif
--- a/src/cpu/aarch64/vm/frame_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/frame_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -188,6 +188,8 @@
 
   frame(intptr_t* sp, intptr_t* fp);
 
+  void init(intptr_t* sp, intptr_t* fp, address pc);
+
   // accessors for the instance variables
   // Note: not necessarily the real 'frame pointer' (see real_fp)
   intptr_t*   fp() const { return _fp; }
--- a/src/cpu/aarch64/vm/frame_aarch64.inline.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/frame_aarch64.inline.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -42,17 +42,11 @@
   _deopt_state = unknown;
 }
 
-static int spin;
+//static int spin;
 
-inline frame::frame(intptr_t* sp, intptr_t* fp, address pc) {
+inline void frame::init(intptr_t* sp, intptr_t* fp, address pc) {
   intptr_t a = intptr_t(sp);
   intptr_t b = intptr_t(fp);
-#ifndef PRODUCT
-  if (fp)
-    if (sp > fp || (fp - sp > 0x100000))
-      for(;;)
-	asm("nop");
-#endif
   _sp = sp;
   _unextended_sp = sp;
   _fp = fp;
@@ -70,15 +64,13 @@
   }
 }
 
+inline frame::frame(intptr_t* sp, intptr_t* fp, address pc) {
+  init(sp, fp, pc);
+}
+
 inline frame::frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc) {
   intptr_t a = intptr_t(sp);
   intptr_t b = intptr_t(fp);
-#ifndef PRODUCT
-  if (fp) 
-    if (sp > fp || (fp - sp > 0x100000))
-      for(;;)
-	asm("nop");
-#endif
   _sp = sp;
   _unextended_sp = unextended_sp;
   _fp = fp;
@@ -100,12 +92,6 @@
 inline frame::frame(intptr_t* sp, intptr_t* fp) {
   intptr_t a = intptr_t(sp);
   intptr_t b = intptr_t(fp);
-#ifndef PRODUCT
-  if (fp)
-    if (sp > fp || (fp - sp > 0x100000))
-      for(;;)
-	asm("nop");
-#endif
   _sp = sp;
   _unextended_sp = sp;
   _fp = fp;
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -65,10 +65,12 @@
 define_pd_global(intx, PreInflateSpin,           10);
 
 define_pd_global(bool, RewriteBytecodes,     true);
-define_pd_global(bool, RewriteFrequentPairs, false);
+define_pd_global(bool, RewriteFrequentPairs, true);
 
 define_pd_global(bool, UseMembar,            true);
 
+define_pd_global(bool, PreserveFramePointer, false);
+
 // GC Ergo Flags
 define_pd_global(uintx, CMSYoungGenPerWorker, 64*M);  // default max size of CMS young gen, per GC worker thread
 
@@ -95,11 +97,13 @@
          "constant pool is close to instructions")                      \
                                                                         \
   notproduct(bool, UseAcqRelForVolatileFields, false,			\
-	     "Use acquire and release insns for volatile fields")
+	     "Use acquire and release insns for volatile fields")       \
+                                                                        \
+  product(bool, UseCRC32, false,                                        \
+          "Use CRC32 instructions for CRC32 computation")               \
 
 // Don't attempt to use Neon on builtin sim until builtin sim supports it
 #define UseNeon false
-#define UseCRC32 false
 
 #else
 #define UseBuiltinSim		false
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -612,6 +612,7 @@
     Label done;
 
     const Register swap_reg = r0;
+    const Register tmp = c_rarg2;
     const Register obj_reg = c_rarg3; // Will contain the oop
 
     const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
@@ -625,7 +626,7 @@
     ldr(obj_reg, Address(lock_reg, obj_offset));
 
     if (UseBiasedLocking) {
-      biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case);
+      biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, done, &slow_case);
     }
 
     // Load (object->mark() | 1) into swap_reg
@@ -644,7 +645,7 @@
       cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail);
       bind(fast);
       atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-		  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
       b(done);
       bind(fail);
     } else {
@@ -672,7 +673,7 @@
     if (PrintBiasedLockingStatistics) {
       br(Assembler::NE, slow_case);
       atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
-		  rscratch2, rscratch1);
+                  rscratch2, rscratch1, tmp);
     }
     br(Assembler::EQ, done);
 
@@ -852,9 +853,10 @@
     // jcc(Assembler::negative, L);
     // addptr(data, (int32_t) DataLayout::counter_increment);
     // so we do this
+    ldr(rscratch1, addr);
     subs(rscratch1, rscratch1, (unsigned)DataLayout::counter_increment);
     Label L;
-    br(Assembler::CS, L); 	// skip store if counter overflow
+    br(Assembler::LO, L); 	// skip store if counter overflow
     str(rscratch1, addr);
     bind(L);
   } else {
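The CS-to-LO flip matches AArch64 subs semantics: subs sets the carry flag when no borrow occurs, so LO (carry clear) fires exactly when the counter is smaller than the decrement. The store is then skipped and the counter pins at its current value instead of wrapping. In integer terms (illustrative):

    uint64_t dec_saturating(uint64_t c, uint64_t step) {
      return (c < step) ? c : c - step;  // LO after subs  <=>  c < step
    }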
@@ -1314,7 +1316,7 @@
     // case_array_offset_in_bytes()
     movw(reg2, in_bytes(MultiBranchData::per_case_size()));
     movw(rscratch1, in_bytes(MultiBranchData::case_array_offset()));
-    maddw(index, index, reg2, rscratch1);
+    Assembler::maddw(index, index, reg2, rscratch1);
 
     // Update the case count
     increment_mdp_data_at(mdp,
--- a/src/cpu/aarch64/vm/interp_masm_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/interp_masm_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -40,7 +40,9 @@
  protected:
 
  protected:
+  using MacroAssembler::call_VM_leaf_base;
+
   // Interpreter specific version of call_VM_base
   virtual void call_VM_leaf_base(address entry_point,
                                  int number_of_arguments);
 
--- a/src/cpu/aarch64/vm/javaFrameAnchor_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/javaFrameAnchor_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -42,25 +42,28 @@
   void clear(void) {
     // clearing _last_Java_sp must be first
     _last_Java_sp = NULL;
-    // fence?
+    OrderAccess::release();
     _last_Java_fp = NULL;
     _last_Java_pc = NULL;
   }
 
   void copy(JavaFrameAnchor* src) {
-    // In order to make sure the transition state is valid for "this"
-    // We must clear _last_Java_sp before copying the rest of the new data
-    //
-    // Hack Alert: Temporary bugfix for 4717480/4721647
-    // To act like previous version (pd_cache_state) don't NULL _last_Java_sp
-    // unless the value is changing
-    //
-    if (_last_Java_sp != src->_last_Java_sp)
-      _last_Java_sp = NULL;
-
+    // n.b. the writes to fp and pc do not require any preceding
+    // release(). when copying into the thread anchor, which only
+    // happens under ~JavaCallWrapper(), sp will have been NULLed by a
+    // call to zap() and the NULL write will have been published by a
+    // fence in the state transition to in_vm. contrariwise, when
+    // copying into the wrapper anchor, which only happens under
+    // JavaCallWrapper(), there is no ordering requirement at all
+    // since that object is thread local until the subsequent entry
+    // into java. JavaCallWrapper() calls clear() after copy(), thus
+    // ensuring that all 3 writes are visible before the wrapper is
+    // accessible to other threads.
     _last_Java_fp = src->_last_Java_fp;
     _last_Java_pc = src->_last_Java_pc;
-    // Must be last so profiler will always see valid frame if has_last_frame() is true
+    // Must be last so profiler will always see valid frame if
+    // has_last_frame() is true
+    OrderAccess::release();
     _last_Java_sp = src->_last_Java_sp;
   }
 
@@ -79,10 +82,14 @@
 
 public:
 
-  void set_last_Java_sp(intptr_t* sp)            { _last_Java_sp = sp; }
+  // n.b. set_last_Java_sp and set_last_Java_fp are never called
+  // (which is good because they would need a preceding or following
+  // call to OrderAccess::release() to make sure the writes are
+  // visible in the correct order).
+  void set_last_Java_sp(intptr_t* sp)                { assert(false, "should not be called"); _last_Java_sp = sp; }
 
   intptr_t*   last_Java_fp(void)                     { return _last_Java_fp; }
   // Assert (last_Java_sp == NULL || fp == NULL)
-  void set_last_Java_fp(intptr_t* fp)                { _last_Java_fp = fp; }
+  void set_last_Java_fp(intptr_t* fp)                { assert(false, "should not be called"); _last_Java_fp = fp; }
 
 #endif // CPU_AARCH64_VM_JAVAFRAMEANCHOR_AARCH64_HPP
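The ordering contract described in the comments above, rendered as a self-contained C++11 sketch (illustrative only; HotSpot expresses the same fences with OrderAccess):

#include <atomic>
#include <cstdint>

struct Anchor {   // stand-in for JavaFrameAnchor
  std::atomic<intptr_t*> sp{nullptr}, fp{nullptr}, pc{nullptr};
};

// copy(): fp and pc may be written in any order, but sp must be
// published last, so a reader that sees a non-NULL sp also sees a
// valid fp/pc pair.
void publish(Anchor& a, intptr_t* sp, intptr_t* fp, intptr_t* pc) {
  a.fp.store(fp, std::memory_order_relaxed);
  a.pc.store(pc, std::memory_order_relaxed);
  a.sp.store(sp, std::memory_order_release);       // must be last
}

// clear(): sp must be seen as NULL before fp and pc become invalid.
void clear(Anchor& a) {
  a.sp.store(nullptr, std::memory_order_relaxed);  // must be first
  std::atomic_thread_fence(std::memory_order_release);
  a.fp.store(nullptr, std::memory_order_relaxed);
  a.pc.store(nullptr, std::memory_order_relaxed);
}

// Reader (e.g. a profiler): acquire-load sp first.
bool has_walkable_frame(Anchor& a) {
  return a.sp.load(std::memory_order_acquire) != nullptr;
}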
--- a/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -127,10 +127,15 @@
     case T_DOUBLE:  slow_case_addr = jni_GetDoubleField_addr();  break;
     default:        ShouldNotReachHere();
   }
-  // tail call
-  __ lea(rscratch1, ExternalAddress(slow_case_addr));
-  __ br(rscratch1);
 
+  {
+    __ enter();
+    __ lea(rscratch1, ExternalAddress(slow_case_addr));
+    __ blr(rscratch1);
+    __ maybe_isb();
+    __ leave();
+    __ ret(lr);
+  }
   __ flush ();
 
   return fast_entry;
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -64,7 +65,10 @@
 
 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 
-void MacroAssembler::pd_patch_instruction(address branch, address target) {
+// Patch any kind of instruction; there may be several instructions.
+// Return the total length (in bytes) of the instructions.
+int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
+  int instructions = 1;
   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
   long offset = (target - branch) >> 2;
   unsigned insn = *(unsigned*)branch;
@@ -118,12 +122,14 @@
 	Instruction_aarch64::patch(branch + sizeof (unsigned),
 				    21, 10, offset_lo >> size);
 	guarantee(((dest >> size) << size) == dest, "misaligned target");
+	instructions = 2;
       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 		Instruction_aarch64::extract(insn, 4, 0) ==
 			Instruction_aarch64::extract(insn2, 4, 0)) {
 	// add (immediate)
 	Instruction_aarch64::patch(branch + sizeof (unsigned),
 				   21, 10, offset_lo);
+	instructions = 2;
       } else {
 	assert((jbyte *)target ==
 		((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
@@ -146,6 +152,7 @@
     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
     assert(pd_call_destination(branch) == target, "should be");
+    instructions = 3;
   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
     // nothing to do
@@ -153,19 +160,33 @@
   } else {
     ShouldNotReachHere();
   }
+  return instructions * NativeInstruction::instruction_size;
 }
 
-void MacroAssembler::patch_oop(address insn_addr, address o) {
+int MacroAssembler::patch_oop(address insn_addr, address o) {
+  int instructions;
   unsigned insn = *(unsigned*)insn_addr;
+  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
+
+  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
+  // narrow OOPs by setting the upper 16 bits in the first
+  // instruction.
   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
-      // Move narrow constant
-      assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
-      narrowOop n = oopDesc::encode_heap_oop((oop)o);
-      Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
-      Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
+    // Move narrow OOP
+    narrowOop n = oopDesc::encode_heap_oop((oop)o);
+    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
+    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
+    instructions = 2;
   } else {
-    pd_patch_instruction(insn_addr, o);
+    // Move wide OOP
+    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
+    uintptr_t dest = (uintptr_t)o;
+    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
+    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
+    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
+    instructions = 3;
   }
+  return instructions * NativeInstruction::instruction_size;
 }
 
 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
@@ -347,11 +368,7 @@
   if (PrintBiasedLockingStatistics && counters == NULL)
     counters = BiasedLocking::counters();
 
-  bool need_tmp_reg = false;
-  if (tmp_reg == noreg) {
-    tmp_reg = rscratch2;
-  }
-  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
+  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
@@ -381,7 +398,7 @@
   if (counters != NULL) {
     Label around;
     cbnz(tmp_reg, around);
-    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
+    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
     b(done);
     bind(around);
   } else {
@@ -434,7 +451,7 @@
     bind(here);
     if (counters != NULL) {
       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
-		  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
     }
   }
   b(done);
@@ -460,7 +477,7 @@
     bind(here);
     if (counters != NULL) {
       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
-		  tmp_reg, rscratch1);
+                  tmp_reg, rscratch1, rscratch2);
     }
   }
   b(done);
@@ -488,7 +505,7 @@
     // removing the bias bit from the object's header.
     if (counters != NULL) {
       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
-		  rscratch1);
+                  rscratch1, rscratch2);
     }
     bind(nope);
   }
@@ -735,15 +752,29 @@
   while (offset() % modulus != 0) nop();
 }
 
-// these are meant to be no-ops overridden by InterpreterMacroAssembler
-
-void MacroAssembler::check_and_handle_earlyret(Register java_thread) { Unimplemented(); }
-
-void MacroAssembler::check_and_handle_popframe(Register java_thread) { Unimplemented(); }
+// these are no-ops overridden by InterpreterMacroAssembler
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
+
 
 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                       Register tmp,
-                                                      int offset) { Unimplemented(); return RegisterOrConstant(r0); }
+                                                      int offset) {
+  intptr_t value = *delayed_value_addr;
+  if (value != 0)
+    return RegisterOrConstant(value + offset);
+
+  // load indirectly to solve generation ordering problem
+  ldr(tmp, ExternalAddress((address) delayed_value_addr));
+
+  if (offset != 0)
+    add(tmp, tmp, offset);
+
+  return RegisterOrConstant(tmp);
+}
+
 
 void MacroAssembler:: notify(int type) {
   if (type == bytecode_start) {
@@ -1046,7 +1077,7 @@
   // We will consult the secondary-super array.
   ldr(r5, secondary_supers_addr);
   // Load the array length.  (Positive movl does right thing on LP64.)
-  ldr(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
+  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
   // Skip to start of data.
   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
 
@@ -1181,6 +1212,7 @@
     bind(*retaddr);
 
   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
+  maybe_isb();
 }
 
 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
@@ -1285,6 +1317,52 @@
   movk(r, imm64 & 0xffff, 32);
 }
 
+// Macro to move a replicated immediate into a vector register.
+//  Vd will get the following values for different arrangements in T
+//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
+//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
+//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
+//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
+//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
+//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
+//   T1D/T2D: invalid
+void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
+  assert(T != T1D && T != T2D, "invalid arrangement");
+  if (T == T8B || T == T16B) {
+    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
+    movi(Vd, T, imm32 & 0xff, 0);
+    return;
+  }
+  u_int32_t nimm32 = ~imm32;
+  if (T == T4H || T == T8H) {
+    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
+    imm32 &= 0xffff;
+    nimm32 &= 0xffff;
+  }
+  u_int32_t x = imm32;
+  int movi_cnt = 0;
+  int movn_cnt = 0;
+  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
+  x = nimm32;
+  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
+  if (movn_cnt < movi_cnt) imm32 = nimm32;
+  unsigned lsl = 0;
+  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
+  if (movn_cnt < movi_cnt)
+    mvni(Vd, T, imm32 & 0xff, lsl);
+  else
+    movi(Vd, T, imm32 & 0xff, lsl);
+  imm32 >>= 8; lsl += 8;
+  while (imm32) {
+    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
+    if (movn_cnt < movi_cnt)
+      bici(Vd, T, imm32 & 0xff, lsl);
+    else
+      orri(Vd, T, imm32 & 0xff, lsl);
+    lsl += 8; imm32 >>= 8;
+  }
+}
+
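A host-side C++ sketch of the selection logic above, specialised to the 2S/4S lane case, handy for checking which sequence a given immediate produces (it prints mnemonics only and assumes nothing beyond the algorithm above):

#include <cstdint>
#include <cstdio>

void simulate_mov_v4s(uint32_t imm32) {
  uint32_t nimm32 = ~imm32;
  int movi_cnt = 0, movn_cnt = 0;
  for (uint32_t x = imm32;  x; x >>= 8) if (x & 0xff) movi_cnt++;
  for (uint32_t x = nimm32; x; x >>= 8) if (x & 0xff) movn_cnt++;
  bool inverted = movn_cnt < movi_cnt;  // fewer inverted bytes: use mvni/bici
  if (inverted) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  printf("%s Vd.4s, #0x%x, lsl #%u\n",
         inverted ? "mvni" : "movi", imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    printf("%s Vd.4s, #0x%x, lsl #%u\n",
           inverted ? "bici" : "orri", imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// e.g. 0xff0000ff -> movi + orri (two insns); 0xffffff00 -> a single mvni.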
 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
 {
 #ifndef PRODUCT
@@ -1466,15 +1544,15 @@
   return Address(Rd);
 }
 
-void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
+void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
   Label retry_load;
   bind(retry_load);
   // flush and load exclusive from the memory location
   ldxrw(tmp, counter_addr);
   addw(tmp, tmp, 1);
-  // if we store+flush with no intervening write tmp wil be zero
+  // if the store-exclusive succeeds with no intervening write, tmp2 will be zero
-  stxrw(tmp, tmp, counter_addr);
-  cbnzw(tmp, retry_load);
+  stxrw(tmp2, tmp, counter_addr);
+  cbnzw(tmp2, retry_load);
 }
 
 
@@ -1502,7 +1580,7 @@
     sdivw(result, ra, rb);
   } else {
     sdivw(scratch, ra, rb);
-    msubw(result, scratch, rb, ra);
+    Assembler::msubw(result, scratch, rb, ra);
   }
 
   return idivl_offset;
@@ -1532,7 +1610,7 @@
     sdiv(result, ra, rb);
   } else {
     sdiv(scratch, ra, rb);
-    msub(result, scratch, rb, ra);
+    Assembler::msub(result, scratch, rb, ra);
   }
 
   return idivq_offset;
@@ -1671,7 +1749,7 @@
   }
 }
 
-void MacroAssembler::increment(Address dst, int value)
+void MacroAssembler::incrementw(Address dst, int value)
 {
   assert(!dst.uses(rscratch1), "invalid dst for address increment");
   ldrw(rscratch1, dst);
@@ -1679,7 +1757,7 @@
   strw(rscratch1, dst);
 }
 
-void MacroAssembler::incrementw(Address dst, int value)
+void MacroAssembler::increment(Address dst, int value)
 {
   assert(!dst.uses(rscratch1), "invalid dst for address increment");
   ldr(rscratch1, dst);
@@ -1839,6 +1917,22 @@
   }
 }
 
+void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
+  if (decrement.is_register()) {
+    sub(Rd, Rn, decrement.as_register());
+  } else {
+    sub(Rd, Rn, decrement.as_constant());
+  }
+}
+
+void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
+  if (decrement.is_register()) {
+    subw(Rd, Rn, decrement.as_register());
+  } else {
+    subw(Rd, Rn, decrement.as_constant());
+  }
+}
+
 void MacroAssembler::reinit_heapbase()
 {
   if (UseCompressedOops) {
@@ -1928,7 +2022,7 @@
     return a != b.as_register() && a != c && b.as_register() != c;
 }
 
-#define ATOMIC_OP(LDXR, OP, STXR)					\
+#define ATOMIC_OP(LDXR, OP, IOP, STXR)                                       \
 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
   Register result = rscratch2;						\
   if (prev->is_valid())							\
@@ -1938,14 +2032,15 @@
   bind(retry_load);							\
   LDXR(result, addr);							\
   OP(rscratch1, result, incr);						\
-  STXR(rscratch1, rscratch1, addr);					\
-  cbnzw(rscratch1, retry_load);						\
-  if (prev->is_valid() && prev != result)				\
-    mov(prev, result);							\
+  STXR(rscratch2, rscratch1, addr);                                     \
+  cbnzw(rscratch2, retry_load);                                         \
+  if (prev->is_valid() && prev != result) {                             \
+    IOP(prev, rscratch1, incr);                                         \
+  }                                                                     \
 }
 
-ATOMIC_OP(ldxr, add, stxr)
-ATOMIC_OP(ldxrw, addw, stxrw)
+ATOMIC_OP(ldxr, add, sub, stxr)
+ATOMIC_OP(ldxrw, addw, subw, stxrw)
 
 #undef ATOMIC_OP
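The point of the extra IOP parameter, sketched in C++: the store-exclusive status now reuses rscratch2, which clobbers the register holding the loaded value, so the previous value is recomputed from the new one with the inverse operation instead of being kept live across the loop (GCC builtin used purely for illustration):

#include <cstdint>

int64_t atomic_add_return_prev(int64_t* addr, int64_t incr) {
  // new value, as the LDXR/OP/STXR loop leaves it in rscratch1
  int64_t updated = __atomic_add_fetch(addr, incr, __ATOMIC_SEQ_CST);
  // IOP(prev, rscratch1, incr): recover the old value
  return updated - incr;
}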
 
@@ -2115,6 +2210,363 @@
 }
 
 /**
+ * Helpers for multiply_to_len().
+ */
+void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
+                                     Register src1, Register src2) {
+  adds(dest_lo, dest_lo, src1);
+  adc(dest_hi, dest_hi, zr);
+  adds(dest_lo, dest_lo, src2);
+  adc(final_dest_hi, dest_hi, zr);
+}
+
+// Generate an address from (r + r1 extend offset).  "size" is the
+// size of the operand.  The result may be in rscratch2.
+Address MacroAssembler::offsetted_address(Register r, Register r1,
+                                          Address::extend ext, int offset, int size) {
+  if (offset || (ext.shift() % size != 0)) {
+    lea(rscratch2, Address(r, r1, ext));
+    return Address(rscratch2, offset);
+  } else {
+    return Address(r, r1, ext);
+  }
+}
+
+Address MacroAssembler::spill_address(int size, int offset, Register tmp)
+{
+  assert(offset >= 0, "spill to negative address?");
+  // Offset reachable ?
+  //   Not aligned - 9 bits signed offset
+  //   Aligned - 12 bits unsigned offset shifted
+  Register base = sp;
+  if ((offset & (size-1)) && offset >= (1<<8)) {
+    add(tmp, base, offset & ((1<<12)-1));
+    base = tmp;
+    offset &= -1<<12;
+  }
+
+  if (offset >= (1<<12) * size) {
+    add(tmp, base, offset & (((1<<12)-1)<<12));
+    base = tmp;
+    offset &= ~(((1<<12)-1)<<12);
+  }
+
+  return Address(base, offset);
+}
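The reachability rules the two fix-ups above work around, as C++ predicates (the two AArch64 load/store immediate forms; size is the access size in bytes):

// ldur/stur: 9-bit signed, unscaled offset
bool reachable_unscaled(int offset) {
  return offset >= -256 && offset < 256;
}

// ldr/str: 12-bit unsigned offset, scaled by the access size
bool reachable_scaled(int offset, int size) {
  return offset >= 0 && offset % size == 0 && offset < (1 << 12) * size;
}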
+
+/**
+ * Multiply 64 bit by 64 bit first loop.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                                           Register y, Register y_idx, Register z,
+                                           Register carry, Register product,
+                                           Register idx, Register kdx) {
+  //
+  //  jlong carry, x[], y[], z[];
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    huge_128 product = y[idx] * x[xstart] + carry;
+  //    z[kdx] = (jlong)product;
+  //    carry  = (jlong)(product >>> 64);
+  //  }
+  //  z[xstart] = carry;
+  //
+
+  Label L_first_loop, L_first_loop_exit;
+  Label L_one_x, L_one_y, L_multiply;
+
+  subsw(xstart, xstart, 1);
+  br(Assembler::MI, L_one_x);
+
+  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
+  ldr(x_xstart, Address(rscratch1));
+  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
+
+  bind(L_first_loop);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_first_loop_exit);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_one_y);
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(y_idx, Address(rscratch1));
+  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
+  bind(L_multiply);
+
+  // AArch64 has a multiply-accumulate instruction that we can't use
+  // here because it has no way to process carries, so we have to use
+  // separate add and adc instructions.  Bah.
+  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
+  mul(product, x_xstart, y_idx);
+  adds(product, product, carry);
+  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
+
+  subw(kdx, kdx, 2);
+  ror(product, product, 32); // back to big-endian
+  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
+
+  b(L_first_loop);
+
+  bind(L_one_y);
+  ldrw(y_idx, Address(y,  0));
+  b(L_multiply);
+
+  bind(L_one_x);
+  ldrw(x_xstart, Address(x,  0));
+  b(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
+
+/**
+ * Multiply 128 bit by 128 bit. Unrolled inner loop.
+ *
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
+                                             Register carry, Register carry2,
+                                             Register idx, Register jdx,
+                                             Register yz_idx1, Register yz_idx2,
+                                             Register tmp, Register tmp3, Register tmp4,
+                                             Register tmp6, Register product_hi) {
+
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
+  //     jlong carry2  = (jlong)(tmp3 >>> 64);
+  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
+  //     carry  = (jlong)(tmp4 >>> 64);
+  //     z[kdx+idx+1] = (jlong)tmp3;
+  //     z[kdx+idx] = (jlong)tmp4;
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)yz_idx1;
+  //     carry  = (jlong)(yz_idx1 >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  lsrw(jdx, idx, 2);
+
+  bind(L_third_loop);
+
+  subsw(jdx, jdx, 1);
+  br(Assembler::MI, L_third_loop_exit);
+  subw(idx, idx, 4);
+
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+
+  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
+
+  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+
+  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+  ror(yz_idx2, yz_idx2, 32);
+
+  ldp(rscratch2, rscratch1, Address(tmp6, 0));
+
+  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
+  umulh(tmp4, product_hi, yz_idx1);
+
+  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
+  ror(rscratch2, rscratch2, 32);
+
+  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
+  umulh(carry2, product_hi, yz_idx2);
+
+  // propagate sum of both multiplications into carry:tmp4:tmp3
+  adds(tmp3, tmp3, carry);
+  adc(tmp4, tmp4, zr);
+  adds(tmp3, tmp3, rscratch1);
+  adcs(tmp4, tmp4, tmp);
+  adc(carry, carry2, zr);
+  adds(tmp4, tmp4, rscratch2);
+  adc(carry, carry, zr);
+
+  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
+  ror(tmp4, tmp4, 32);
+  stp(tmp4, tmp3, Address(tmp6, 0));
+
+  b(L_third_loop);
+  bind (L_third_loop_exit);
+
+  andw (idx, idx, 0x3);
+  cbz(idx, L_post_third_loop_done);
+
+  Label L_check_1;
+  subsw(idx, idx, 2);
+  br(Assembler::MI, L_check_1);
+
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(yz_idx1, Address(rscratch1, 0));
+  ror(yz_idx1, yz_idx1, 32);
+  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
+  umulh(tmp4, product_hi, yz_idx1);
+  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(yz_idx2, Address(rscratch1, 0));
+  ror(yz_idx2, yz_idx2, 32);
+
+  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
+
+  ror(tmp3, tmp3, 32);
+  str(tmp3, Address(rscratch1, 0));
+
+  bind (L_check_1);
+
+  andw (idx, idx, 0x1);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_post_third_loop_done);
+  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
+  umulh(carry2, tmp4, product_hi);
+  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+
+  add2_with_carry(carry2, tmp3, tmp4, carry);
+
+  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+  extr(carry, carry2, tmp3, 32);
+
+  bind(L_post_third_loop_done);
+}
+
+/**
+ * Code for BigInteger::multiplyToLen() intrinsic.
+ *
+ * r0: x
+ * r1: xlen
+ * r2: y
+ * r3: ylen
+ * r4:  z
+ * r5: zlen
+ * r10: tmp1
+ * r11: tmp2
+ * r12: tmp3
+ * r13: tmp4
+ * r14: tmp5
+ * r15: tmp6
+ * r16: tmp7
+ *
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
+                                     Register z, Register zlen,
+                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
+                                     Register tmp5, Register tmp6, Register product_hi) {
+
+  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
+
+  const Register idx = tmp1;
+  const Register kdx = tmp2;
+  const Register xstart = tmp3;
+
+  const Register y_idx = tmp4;
+  const Register carry = tmp5;
+  const Register product  = xlen;
+  const Register x_xstart = zlen;  // reuse register
+
+  // First Loop.
+  //
+  //  final static long LONG_MASK = 0xffffffffL;
+  //  int xstart = xlen - 1;
+  //  int ystart = ylen - 1;
+  //  long carry = 0;
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+  //    z[kdx] = (int)product;
+  //    carry = product >>> 32;
+  //  }
+  //  z[xstart] = (int)carry;
+  //
+
+  movw(idx, ylen);      // idx = ylen;
+  movw(kdx, zlen);      // kdx = xlen+ylen;
+  mov(carry, zr);       // carry = 0;
+
+  Label L_done;
+
+  movw(xstart, xlen);
+  subsw(xstart, xstart, 1);
+  br(Assembler::MI, L_done);
+
+  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+  Label L_second_loop;
+  cbzw(kdx, L_second_loop);
+
+  Label L_carry;
+  subw(kdx, kdx, 1);
+  cbzw(kdx, L_carry);
+
+  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
+  lsr(carry, carry, 32);
+  subw(kdx, kdx, 1);
+
+  bind(L_carry);
+  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
+
+  // Second and third (nested) loops.
+  //
+  // for (int i = xstart-1; i >= 0; i--) { // Second loop
+  //   carry = 0;
+  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+  //                    (z[k] & LONG_MASK) + carry;
+  //     z[k] = (int)product;
+  //     carry = product >>> 32;
+  //   }
+  //   z[i] = (int)carry;
+  // }
+  //
+  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
+
+  const Register jdx = tmp1;
+
+  bind(L_second_loop);
+  mov(carry, zr);                // carry = 0;
+  movw(jdx, ylen);               // j = ystart+1
+
+  subsw(xstart, xstart, 1);      // i = xstart-1;
+  br(Assembler::MI, L_done);
+
+  str(z, Address(pre(sp, -4 * wordSize)));
+
+  Label L_last_x;
+  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
+  subsw(xstart, xstart, 1);       // i = xstart-1;
+  br(Assembler::MI, L_last_x);
+
+  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
+  ldr(product_hi, Address(rscratch1));
+  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
+
+  Label L_third_loop_prologue;
+  bind(L_third_loop_prologue);
+
+  str(ylen, Address(sp, wordSize));
+  stp(x, xstart, Address(sp, 2 * wordSize));
+  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
+                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
+  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
+  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
+
+  addw(tmp3, xlen, 1);
+  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
+  subsw(tmp3, tmp3, 1);
+  br(Assembler::MI, L_done);
+
+  lsr(carry, carry, 32);
+  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
+  b(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+  ldrw(product_hi, Address(x,  0));
+  b(L_third_loop_prologue);
+
+  bind(L_done);
+}
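What add2_with_carry computes, written out with unsigned __int128 (a GCC/Clang extension; this is a reference model, not the generated code):

#include <cstdint>

void add2_with_carry_ref(uint64_t& final_dest_hi, uint64_t dest_hi,
                         uint64_t& dest_lo, uint64_t src1, uint64_t src2) {
  unsigned __int128 acc =
      ((unsigned __int128)dest_hi << 64) | dest_lo;
  acc += src1;                         // adds + adc(dest_hi, dest_hi, zr)
  acc += src2;                         // adds + adc(final_dest_hi, dest_hi, zr)
  dest_lo       = (uint64_t)acc;
  final_dest_hi = (uint64_t)(acc >> 64);
}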
+
+/**
  * Emits code to update CRC-32 with a byte value according to constants in table
  *
  * @param [in,out]crc   Register containing the crc.
@@ -2344,8 +2796,8 @@
       uzp2(v21, v20, v16, T2D);
       eor(v20, T16B, v17, v21);
 
-      shl(v16, v28, T2D, 1);
-      shl(v17, v20, T2D, 1);
+      shl(v16, T2D, v28, 1);
+      shl(v17, T2D, v20, 1);
 
       eor(v0, T16B, v0, v16);
       eor(v1, T16B, v1, v17);
@@ -2482,6 +2934,11 @@
     if (Universe::narrow_klass_base() == NULL) {
       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
       return;
+    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
+	       && Universe::narrow_klass_shift() == 0) {
+      // Only the bottom 32 bits matter
+      cmpw(trial_klass, tmp);
+      return;
     }
     decode_klass_not_null(tmp);
   } else {
@@ -2666,6 +3123,12 @@
     return;
   }
 
+  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
+      && Universe::narrow_klass_shift() == 0) {
+    movw(dst, src);
+    return;
+  }
+
 #ifdef ASSERT
   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
 #endif
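A worked sketch of the special case being added: with shift == 0, a base whose low 32 bits are zero, and all Klass* within 4GB above that base, encoding keeps the low word, decoding re-inserts the base's high half, and equality can be tested with a plain 32-bit compare. The base is below 2^48, so base >> 32 fits movk's 16-bit immediate:

#include <cstdint>

uint32_t encode_klass(uint64_t klass) {            // movw(dst, src)
  return (uint32_t)klass;
}

uint64_t decode_klass(uint32_t narrow, uint64_t base) {
  return narrow | (base & 0xffffffff00000000ULL);  // movw + movk(..., 32)
}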
@@ -2709,6 +3172,14 @@
     return;
   }
 
+  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
+      && Universe::narrow_klass_shift() == 0) {
+    if (dst != src)
+      movw(dst, src);
+    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
+    return;
+  }
+
   // Cannot assert, unverified entry point counts instructions (see .ad file)
   // vtableStubs also counts instructions in pd_code_size_limit.
   // Also do not verify_oop as this is called by verify_oop.
@@ -3214,8 +3685,8 @@
     br(Assembler::HI, slow_case);
 
     // If heap_top hasn't been changed by some other thread, update it.
-    stlxr(rscratch1, end, rscratch1);
-    cbnzw(rscratch1, retry);
+    stlxr(rscratch2, end, rscratch1);
+    cbnzw(rscratch2, retry);
   }
 }
 
@@ -3353,6 +3824,346 @@
   }
 }
 
+// Search for str1 in str2 and return index or -1
+void MacroAssembler::string_indexof(Register str2, Register str1,
+                                    Register cnt2, Register cnt1,
+                                    Register tmp1, Register tmp2,
+                                    Register tmp3, Register tmp4,
+                                    int icnt1, Register result) {
+  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
+
+  Register ch1 = rscratch1;
+  Register ch2 = rscratch2;
+  Register cnt1tmp = tmp1;
+  Register cnt2tmp = tmp2;
+  Register cnt1_neg = cnt1;
+  Register cnt2_neg = cnt2;
+  Register result_tmp = tmp4;
+
+  // Note, inline_string_indexOf() generates checks:
+  // if (substr.count > string.count) return -1;
+  // if (substr.count == 0) return 0;
+
+// We have two strings, a source string in str2, cnt2 and a pattern string
+// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
+
+// For larger pattern and source we use a simplified Boyer Moore algorithm.
+// With a small pattern and source we use linear scan.
+
+  if (icnt1 == -1) {
+    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
+    br(LO, LINEARSEARCH);       // a byte array.
+    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
+    br(HS, LINEARSEARCH);
+  }
+
+// The Boyer Moore algorithm is based on the description here:-
+//
+// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
+//
+// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
+// and the 'Good Suffix' rule.
+//
+// These rules are essentially heuristics for how far we can shift the
+// pattern along the search string.
+//
+// The implementation here uses the 'Bad Character' rule only because of the
+// complexity of initialisation for the 'Good Suffix' rule.
+//
+// This is also known as the Boyer-Moore-Horspool algorithm:-
+//
+// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
+//
+// #define ASIZE 128
+//
+//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
+//       int i, j;
+//       unsigned c;
+//       unsigned char bc[ASIZE];
+//    
+//       /* Preprocessing */
+//       for (i = 0; i < ASIZE; ++i)
+//          bc[i] = 0;
+//       for (i = 0; i < m - 1; ) {
+//          c = x[i];
+//          ++i;
+//          if (c < ASIZE) bc[c] = i;
+//       }
+//    
+//       /* Searching */
+//       j = 0;
+//       while (j <= n - m) {
+//          c = y[j+m-1];
+//          i = m - 1;
+//          if (x[m-1] == c)
+//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
+//          if (i < 0) return j;
+//          if (c < ASIZE)
+//            j = j - bc[c] + m;
+//          else
+//            j += 1; // Advance by 1 only if char >= ASIZE
+//       }
+//    }
+
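The same search as runnable C++ (pattern chars >= ASIZE never enter the table, and window chars >= ASIZE only ever advance the window by one, exactly as restricted above):

#include <cstring>

static const int ASIZE = 128;

// m < 256 is assumed, as in the assembly (the shift table is a byte array)
int horspool(const unsigned char* x, int m, const unsigned char* y, int n) {
  unsigned char bc[ASIZE];
  memset(bc, 0, sizeof bc);                 // 0 means "not in pattern"
  for (int i = 0; i < m - 1; ) {
    unsigned c = x[i];
    ++i;
    if (c < ASIZE) bc[c] = i;               // rightmost occurrence + 1
  }
  for (int j = 0; j + m <= n; ) {
    unsigned c = y[j + m - 1];              // last char of the window
    if (x[m - 1] == c && memcmp(x, y + j, m - 1) == 0)
      return j;                             // match at offset j
    j += (c < ASIZE) ? m - bc[c] : 1;       // bad-character shift
  }
  return -1;
}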
+  if (icnt1 == -1) {
+    BIND(BM);
+
+    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
+    Label BMADV, BMMATCH, BMCHECKEND;
+
+    Register cnt1end = tmp2;
+    Register str2end = cnt2;
+    Register skipch = tmp2;
+
+    // Restrict ASIZE to 128 to reduce stack space/initialisation.
+    // The presence of chars >= ASIZE in the target string does not affect
+    // performance, but we must be careful not to initialise them in the stack
+    // array.
+    // The presence of chars >= ASIZE in the source string may adversely affect
+    // performance since we can only advance by one when we encounter one.
+
+      stp(zr, zr, pre(sp, -128));
+      for (int i = 1; i < 8; i++)
+          stp(zr, zr, Address(sp, i*16));
+
+      mov(cnt1tmp, 0);
+      sub(cnt1end, cnt1, 1);
+    BIND(BCLOOP);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, 128);
+      add(cnt1tmp, cnt1tmp, 1);
+      br(HS, BCSKIP);
+      strb(cnt1tmp, Address(sp, ch1));
+    BIND(BCSKIP);
+      cmp(cnt1tmp, cnt1end);
+      br(LT, BCLOOP);
+
+      mov(result_tmp, str2);
+
+      sub(cnt2, cnt2, cnt1);
+      add(str2end, str2, cnt2, LSL, 1);
+    BIND(BMLOOPSTR2);
+      sub(cnt1tmp, cnt1, 1);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, skipch);
+      br(NE, BMSKIP);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(LT, BMMATCH);
+    BIND(BMLOOPSTR1);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, ch2);
+      br(NE, BMSKIP);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(GE, BMLOOPSTR1);
+    BIND(BMMATCH);
+      sub(result_tmp, str2, result_tmp);
+      lsr(result, result_tmp, 1);
+      add(sp, sp, 128);
+      b(DONE);
+    BIND(BMADV);
+      add(str2, str2, 2);
+      b(BMCHECKEND);
+    BIND(BMSKIP);
+      cmp(skipch, 128);
+      br(HS, BMADV);
+      ldrb(ch2, Address(sp, skipch));
+      add(str2, str2, cnt1, LSL, 1);
+      sub(str2, str2, ch2, LSL, 1);
+    BIND(BMCHECKEND);
+      cmp(str2, str2end);
+      br(LE, BMLOOPSTR2);
+      add(sp, sp, 128);
+      b(NOMATCH);
+  }
+
+  BIND(LINEARSEARCH);
+  {
+    Label DO1, DO2, DO3;
+
+    Register str2tmp = tmp2;
+    Register first = tmp3;
+
+    if (icnt1 == -1)
+    {
+        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
+
+        cmp(cnt1, 4);
+        br(LT, DOSHORT);
+
+        sub(cnt2, cnt2, cnt1);
+        sub(cnt1, cnt1, 4);
+        mov(result_tmp, cnt2);
+
+        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt1_neg, zr, cnt1, LSL, 1);
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+        ldr(first, Address(str1, cnt1_neg));
+
+      BIND(FIRST_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        cmp(first, ch2);
+        br(EQ, STR1_LOOP);
+      BIND(STR2_NEXT);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, FIRST_LOOP);
+        b(NOMATCH);
+
+      BIND(STR1_LOOP);
+        adds(cnt1tmp, cnt1_neg, 8);
+        add(cnt2tmp, cnt2_neg, 8);
+        br(GE, LAST_WORD);
+
+      BIND(STR1_NEXT);
+        ldr(ch1, Address(str1, cnt1tmp));
+        ldr(ch2, Address(str2, cnt2tmp));
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        adds(cnt1tmp, cnt1tmp, 8);
+        add(cnt2tmp, cnt2tmp, 8);
+        br(LT, STR1_NEXT);
+
+      BIND(LAST_WORD);
+        ldr(ch1, Address(str1));
+        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
+        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        b(MATCH);
+
+      BIND(DOSHORT);
+        cmp(cnt1, 2);
+        br(LT, DO1);
+        br(GT, DO3);
+    }
+
+    if (icnt1 == 4) {
+      Label CH1_LOOP;
+
+        ldr(ch1, str1);
+        sub(cnt2, cnt2, 4);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(CH1_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        cmp(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, CH1_LOOP);
+        b(NOMATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 2) {
+      Label CH1_LOOP;
+
+      BIND(DO2);
+        ldrw(ch1, str1);
+        sub(cnt2, cnt2, 2);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(CH1_LOOP);
+        ldrw(ch2, Address(str2, cnt2_neg));
+        cmp(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, CH1_LOOP);
+        b(NOMATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 3) {
+      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
+
+      BIND(DO3);
+        ldrw(first, str1);
+        ldrh(ch1, Address(str1, 4));
+
+        sub(cnt2, cnt2, 3);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(FIRST_LOOP);
+        ldrw(ch2, Address(str2, cnt2_neg));
+        cmpw(first, ch2);
+        br(EQ, STR1_LOOP);
+      BIND(STR2_NEXT);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, FIRST_LOOP);
+        b(NOMATCH);
+
+      BIND(STR1_LOOP);
+        add(cnt2tmp, cnt2_neg, 4);
+        ldrh(ch2, Address(str2, cnt2tmp));
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        b(MATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 1) {
+      Label CH1_LOOP, HAS_ZERO;
+      Label DO1_SHORT, DO1_LOOP;
+
+      BIND(DO1);
+        ldrh(ch1, str1);
+        cmp(cnt2, 4);
+        br(LT, DO1_SHORT);
+
+        orr(ch1, ch1, ch1, LSL, 16);
+        orr(ch1, ch1, ch1, LSL, 32);
+
+        sub(cnt2, cnt2, 4);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+        mov(tmp3, 0x0001000100010001);
+      BIND(CH1_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        eor(ch2, ch1, ch2);
+        sub(tmp1, ch2, tmp3);
+        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
+        bics(tmp1, tmp1, tmp2);
+        br(NE, HAS_ZERO);
+        adds(cnt2_neg, cnt2_neg, 8);
+        br(LT, CH1_LOOP);
+
+        cmp(cnt2_neg, 8);
+        mov(cnt2_neg, 0);
+        br(LT, CH1_LOOP);
+        b(NOMATCH);
+
+      BIND(HAS_ZERO);
+        rev(tmp1, tmp1);
+        clz(tmp1, tmp1);
+        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
+        b(MATCH);
+
+      BIND(DO1_SHORT);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+      BIND(DO1_LOOP);
+        ldrh(ch2, Address(str2, cnt2_neg));
+        cmpw(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LT, DO1_LOOP);
+    }
+  }
+  BIND(NOMATCH);
+    mov(result, -1);
+    b(DONE);
+  BIND(MATCH);
+    add(result, result_tmp, cnt2_neg, ASR, 1);
+  BIND(DONE);
+}
+
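The single-character case above (DO1) relies on a SWAR trick: broadcast the pattern char to all four 16-bit lanes, XOR with four packed source chars, then test whether any lane went to zero. As C++ (illustrative rendering of the orr/eor/sub/orr/bics sequence):

#include <cstdint>

bool has_zero_u16_lane(uint64_t v) {
  uint64_t low  = v - 0x0001000100010001ULL;  // borrows out of zero lanes
  uint64_t high = v | 0x7fff7fff7fff7fffULL;
  return (low & ~high) != 0;                  // bics(tmp1, tmp1, tmp2)
}

bool contains_char(uint64_t four_chars, uint16_t ch) {
  uint64_t pat = ch;
  pat |= pat << 16;                           // orr(ch1, ch1, ch1, LSL, 16)
  pat |= pat << 32;                           // orr(ch1, ch1, ch1, LSL, 32)
  return has_zero_u16_lane(four_chars ^ pat); // eor(ch2, ch1, ch2)
}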
 // Compare strings.
 void MacroAssembler::string_compare(Register str1, Register str2,
                                     Register cnt1, Register cnt2, Register result,
@@ -3512,3 +4323,136 @@
 
   BLOCK_COMMENT("} string_equals");
 }
+
+// Compare char[] arrays aligned to 4 bytes
+void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
+                                        Register result, Register tmp1)
+{
+  Register cnt1 = rscratch1;
+  Register cnt2 = rscratch2;
+  Register tmp2 = rscratch2;
+
+  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
+
+  int length_offset  = arrayOopDesc::length_offset_in_bytes();
+  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+  BLOCK_COMMENT("char_arrays_equals  {");
+
+    // different until proven equal
+    mov(result, false);
+
+    // same array?
+    cmp(ary1, ary2);
+    br(Assembler::EQ, SAME);
+
+    // ne if either null
+    cbz(ary1, DIFFER);
+    cbz(ary2, DIFFER);
+
+    // lengths ne?
+    ldrw(cnt1, Address(ary1, length_offset));
+    ldrw(cnt2, Address(ary2, length_offset));
+    cmp(cnt1, cnt2);
+    br(Assembler::NE, DIFFER);
+
+    lea(ary1, Address(ary1, base_offset));
+    lea(ary2, Address(ary2, base_offset));
+
+    subs(cnt1, cnt1, 4);
+    br(LT, TAIL03);
+
+  BIND(NEXT);
+    ldr(tmp1, Address(post(ary1, 8)));
+    ldr(tmp2, Address(post(ary2, 8)));
+    subs(cnt1, cnt1, 4);
+    eor(tmp1, tmp1, tmp2);
+    cbnz(tmp1, DIFFER);
+    br(GE, NEXT);
+
+  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
+    tst(cnt1, 0b10);
+    br(EQ, TAIL01);
+    ldrw(tmp1, Address(post(ary1, 4)));
+    ldrw(tmp2, Address(post(ary2, 4)));
+    cmp(tmp1, tmp2);
+    br(NE, DIFFER);
+  BIND(TAIL01);  // 0-1 chars left
+    tst(cnt1, 0b01);
+    br(EQ, SAME);
+    ldrh(tmp1, ary1);
+    ldrh(tmp2, ary2);
+    cmp(tmp1, tmp2);
+    br(NE, DIFFER);
+
+  BIND(SAME);
+    mov(result, true);
+  BIND(DIFFER);	// result already set
+  
+  BLOCK_COMMENT("} char_arrays_equals");
+}
+
+// encode char[] to byte[] in ISO_8859_1
+void MacroAssembler::encode_iso_array(Register src, Register dst,
+                                      Register len, Register result,
+                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
+                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
+{
+    Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
+    Register tmp1 = rscratch1;
+
+      mov(result, len);	// Save initial len
+
+#ifndef BUILTIN_SIM
+      subs(len, len, 32);
+      br(LT, LOOP_8);
+
+// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
+// to convert chars to bytes. These set the 'QC' bit in the FPSR if
+// any char could not fit in a byte, so clear the FPSR so we can test it.
+      clear_fpsr();
+
+    BIND(NEXT_32);
+      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
+      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
+      uqxtn(Vtmp2, T8B, Vtmp3, T8H);
+      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
+      get_fpsr(tmp1);
+      cbnzw(tmp1, LOOP_8);
+      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
+      subs(len, len, 32);
+      add(src, src, 64);
+      br(GE, NEXT_32);
+
+    BIND(LOOP_8);
+      adds(len, len, 32-8);
+      br(LT, LOOP_1);
+      clear_fpsr(); // QC may be set from loop above, clear again
+    BIND(NEXT_8);
+      ld1(Vtmp1, T8H, src);
+      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
+      get_fpsr(tmp1);
+      cbnzw(tmp1, LOOP_1);
+      st1(Vtmp1, T8B, post(dst, 8));
+      subs(len, len, 8);
+      add(src, src, 16);
+      br(GE, NEXT_8);
+
+    BIND(LOOP_1);
+      adds(len, len, 8);
+      br(LE, DONE);
+#else
+      cbz(len, DONE);
+#endif
+    BIND(NEXT_1);
+      ldrh(tmp1, Address(post(src, 2)));
+      tst(tmp1, 0xff00);
+      br(NE, DONE);
+      strb(tmp1, Address(post(dst, 1)));
+      subs(len, len, 1);
+      br(GT, NEXT_1);
+
+    BIND(DONE);
+      sub(result, result, len); // Return index where we stopped
+}
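A scalar C++ reference for the intrinsic's contract; the SIMD paths above fall back to this byte-at-a-time behaviour via the QC flag whenever some char won't narrow:

#include <cstdint>

int encode_iso_array_ref(const uint16_t* src, uint8_t* dst, int len) {
  int i = 0;
  for (; i < len; i++) {
    uint16_t c = src[i];
    if (c & 0xff00) break;       // tst(tmp1, 0xff00); br(NE, DONE)
    dst[i] = (uint8_t)c;         // strb
  }
  return i;                      // 'result': number of chars encoded
}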
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -37,7 +37,9 @@
 class MacroAssembler: public Assembler {
   friend class LIR_Assembler;
 
+ public:
   using Assembler::mov;
+  using Assembler::movi;
 
  protected:
 
@@ -106,9 +108,7 @@
   // Biased locking support
   // lock_reg and obj_reg must be loaded up with the appropriate values.
   // swap_reg is killed.
-  // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
-  // be killed; if not supplied, push/pop will be used internally to
-  // allocate a temporary (inefficient, avoid if possible).
+  // tmp_reg must be supplied and must not be rscratch1 or rscratch2
   // Optional slow case is for implementations (interpreter and C1) which branch to
   // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
   // Returns offset of first potentially-faulting instruction for null
@@ -125,10 +125,10 @@
 
   // Helper functions for statistics gathering.
   // Unconditional atomic increment.
-  void atomic_incw(Register counter_addr, Register tmp);
-  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) {
+  void atomic_incw(Register counter_addr, Register tmp, Register tmp2);
+  void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) {
     lea(tmp1, counter_addr);
-    atomic_incw(tmp1, tmp2);
+    atomic_incw(tmp1, tmp2, tmp3);
   }
   // Load Effective Address
   void lea(Register r, const Address &a) {
@@ -168,9 +168,8 @@
 
   // aliases defined in AARCH64 spec
 
-
   template<class T>
-  inline void  cmpw(Register Rd, T imm)  { subsw(zr, Rd, imm); }
+  inline void cmpw(Register Rd, T imm)  { subsw(zr, Rd, imm); }
   inline void cmp(Register Rd, unsigned imm)  { subs(zr, Rd, imm); }
 
   inline void cmnw(Register Rd, unsigned imm) { addsw(zr, Rd, imm); }
@@ -179,11 +178,17 @@
   void cset(Register Rd, Assembler::Condition cond) {
     csinc(Rd, zr, zr, ~cond);
   }
-
   void csetw(Register Rd, Assembler::Condition cond) {
     csincw(Rd, zr, zr, ~cond);
   }
 
+  void cneg(Register Rd, Register Rn, Assembler::Condition cond) {
+    csneg(Rd, Rn, Rn, ~cond);
+  }
+  void cnegw(Register Rd, Register Rn, Assembler::Condition cond) {
+    csnegw(Rd, Rn, Rn, ~cond);
+  }
+
   inline void movw(Register Rd, Register Rn) {
     if (Rd == sp || Rn == sp) {
       addw(Rd, Rn, 0U);
@@ -401,6 +406,16 @@
     umaddl(Rd, Rn, Rm, zr);
   }
 
+#define WRAP(INSN)                                                \
+  void INSN(Register Rd, Register Rn, Register Rm, Register Ra) { \
+    if (Ra != zr) nop();                                          \
+    Assembler::INSN(Rd, Rn, Rm, Ra);                              \
+  }
+
+  WRAP(madd) WRAP(msub) WRAP(maddw) WRAP(msubw)
+  WRAP(smaddl) WRAP(smsubl) WRAP(umaddl) WRAP(umsubl)
+#undef WRAP
+
   // macro assembly operations needed for aarch64
 
   // first two private routines for loading 32 bit or 64 bit constants
@@ -448,6 +463,12 @@
 
   void movptr(Register r, uintptr_t imm64);
 
+  void mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32);
+
+  void mov(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
+    orr(Vd, T, Vn, Vn);
+  }
+
   // macro instructions for accessing and updating floating point
   // status register
   //
@@ -491,7 +512,10 @@
 
   // Required platform-specific helpers for Label::patch_instructions.
   // They _shadow_ the declarations in AbstractAssembler, which are undefined.
-  static void pd_patch_instruction(address branch, address target);
+  static int pd_patch_instruction_size(address branch, address target);
+  static void pd_patch_instruction(address branch, address target) {
+    pd_patch_instruction_size(branch, target);
+  }
   static address pd_call_destination(address branch) {
     unsigned insn = *(unsigned*)branch;
     return target_addr_for_insn(branch, insn);
@@ -500,7 +524,7 @@
   static void pd_print_patched_instruction(address branch);
 #endif
 
-  static void patch_oop(address insn_addr, address o);
+  static int patch_oop(address insn_addr, address o);
 
   // The following 4 methods return the offset of the appropriate move instruction
 
@@ -859,7 +883,7 @@
     // stack grows down, caller passes positive offset
     assert(offset > 0, "must bang with negative offset");
     mov(rscratch2, -offset);
-    ldr(zr, Address(sp, rscratch2));
+    str(zr, Address(sp, rscratch2));
   }
 
   // Writes to stack successive pages until offset reached to check for
@@ -1005,6 +1029,8 @@
 
   void add(Register Rd, Register Rn, RegisterOrConstant increment);
   void addw(Register Rd, Register Rn, RegisterOrConstant increment);
+  void sub(Register Rd, Register Rn, RegisterOrConstant decrement);
+  void subw(Register Rd, Register Rn, RegisterOrConstant decrement);
 
   void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);
 
@@ -1085,6 +1111,85 @@
   void string_equals(Register str1, Register str2,
 		     Register cnt, Register result,
 		     Register tmp1);
+  void char_arrays_equals(Register ary1, Register ary2,
+                          Register result, Register tmp1);
+  void encode_iso_array(Register src, Register dst,
+                        Register len, Register result,
+                        FloatRegister Vtmp1, FloatRegister Vtmp2,
+                        FloatRegister Vtmp3, FloatRegister Vtmp4);
+  void string_indexof(Register str1, Register str2,
+                      Register cnt1, Register cnt2,
+                      Register tmp1, Register tmp2,
+                      Register tmp3, Register tmp4,
+                      int int_cnt1, Register result);
+private:
+  void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
+                       Register src1, Register src2);
+  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+    add2_with_carry(dest_hi, dest_hi, dest_lo, src1, src2);
+  }
+  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                             Register y, Register y_idx, Register z,
+                             Register carry, Register product,
+                             Register idx, Register kdx);
+  void multiply_128_x_128_loop(Register y, Register z,
+                               Register carry, Register carry2,
+                               Register idx, Register jdx,
+                               Register yz_idx1, Register yz_idx2,
+                               Register tmp, Register tmp3, Register tmp4,
+                               Register tmp7, Register product_hi);
+public:
+  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
+                       Register zlen, Register tmp1, Register tmp2, Register tmp3,
+                       Register tmp4, Register tmp5, Register tmp6, Register tmp7);
+  // ISB may be needed because of a safepoint
+  void maybe_isb() { isb(); }
+
+private:
+  // Return the effective address r + (r1 << ext) + offset.
+  // Uses rscratch2.
+  Address offsetted_address(Register r, Register r1, Address::extend ext,
+                            int offset, int size);
+
+private:
+  // Returns an address on the stack which is reachable with a ldr/str of size
+  // Uses rscratch2 if the address is not directly reachable
+  Address spill_address(int size, int offset, Register tmp=rscratch2);
+
+public:
+  void spill(Register Rx, bool is64, int offset) {
+    if (is64) {
+      str(Rx, spill_address(8, offset));
+    } else {
+      strw(Rx, spill_address(4, offset));
+    }
+  }
+  void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) {
+    str(Vx, T, spill_address(1 << (int)T, offset));
+  }
+  void unspill(Register Rx, bool is64, int offset) {
+    if (is64) {
+      ldr(Rx, spill_address(8, offset));
+    } else {
+      ldrw(Rx, spill_address(4, offset));
+    }
+  }
+  void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) {
+    ldr(Vx, T, spill_address(1 << (int)T, offset));
+  }
+  void spill_copy128(int src_offset, int dst_offset,
+                     Register tmp1=rscratch1, Register tmp2=rscratch2) {
+    if (src_offset < 512 && (src_offset & 7) == 0 &&
+        dst_offset < 512 && (dst_offset & 7) == 0) {
+      ldp(tmp1, tmp2, Address(sp, src_offset));
+      stp(tmp1, tmp2, Address(sp, dst_offset));
+    } else {
+      unspill(tmp1, true, src_offset);
+      spill(tmp1, true, dst_offset);
+      unspill(tmp1, true, src_offset+8);
+      spill(tmp1, true, dst_offset+8);
+    }
+  }
 };
 
 // Used by aarch64.ad to control code generation
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -37,11 +37,6 @@
 #include "c1/c1_Runtime1.hpp"
 #endif
 
-void NativeInstruction::wrote(int offset) {
-  // FIXME: Native needs ISB here
-; }
-
-
 void NativeCall::verify() { ; }
 
 address NativeCall::destination() const {
@@ -51,10 +46,13 @@
 // Inserts a native call instruction at a given pc
 void NativeCall::insert(address code_pos, address entry) { Unimplemented(); }
 
+//-------------------------------------------------------------------
+
 void NativeMovConstReg::verify() {
   // make sure code pattern is actually mov reg64, imm64 instructions
 }
 
+
 intptr_t NativeMovConstReg::data() const {
   // das(uint64_t(instruction_address()),2);
   address addr = MacroAssembler::pd_call_destination(instruction_address());
@@ -71,6 +69,7 @@
     *(intptr_t*)addr = x;
   } else {
     MacroAssembler::pd_patch_instruction(instruction_address(), (address)x);
+    ICache::invalidate_range(instruction_address(), instruction_size);
   }
 };
 
@@ -102,6 +101,7 @@
     *(long*)addr = x;
   } else {
     MacroAssembler::pd_patch_instruction(pc, (address)intptr_t(x));
+    ICache::invalidate_range(instruction_address(), instruction_size);
   }
 }
 
@@ -138,8 +138,11 @@
     dest = instruction_address();
 
   MacroAssembler::pd_patch_instruction(instruction_address(), dest);
+  ICache::invalidate_range(instruction_address(), instruction_size);
 };
 
+//-------------------------------------------------------------------
+
 bool NativeInstruction::is_safepoint_poll() {
   // a safepoint_poll is implemented in two steps as either
   //
@@ -189,7 +192,9 @@
   return Instruction_aarch64::extract(int_at(0), 30, 23) == 0b11100101;
 }
 
-// MT safe inserting of a jump over an unknown instruction sequence (used by nmethod::makeZombie)
+//-------------------------------------------------------------------
+
+// MT safe inserting of a jump over a jump or a nop (used by nmethod::makeZombie)
 
 void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) {
   ptrdiff_t disp = dest - verified_entry;
@@ -203,23 +208,22 @@
   ICache::invalidate_range(verified_entry, instruction_size);
 }
 
-
 void NativeGeneralJump::verify() {  }
 
-
 void NativeGeneralJump::insert_unconditional(address code_pos, address entry) {
+  NativeGeneralJump* n_jump = (NativeGeneralJump*)code_pos;
   ptrdiff_t disp = entry - code_pos;
   guarantee(disp < 1 << 27 && disp > - (1 << 27), "branch overflow");
 
   unsigned int insn = (0b000101 << 26) | ((disp >> 2) & 0x3ffffff);
-
   *(unsigned int*)code_pos = insn;
   ICache::invalidate_range(code_pos, instruction_size);
 }
 
 // MT-safe patching of a long jump instruction.
 void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
-  assert(nativeInstruction_at(instr_addr)->is_jump_or_nop(),
+  NativeGeneralJump* n_jump = (NativeGeneralJump*)instr_addr;
+  assert(n_jump->is_jump_or_nop(),
 	 "Aarch64 cannot replace non-jump with jump");
   uint32_t instr = *(uint32_t*)code_buffer;
   *(uint32_t*)instr_addr = instr;
--- a/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/nativeInst_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -80,14 +80,10 @@
   oop  oop_at (int offset) const       { return *(oop*) addr_at(offset); }
 
 
-  void set_char_at(int offset, char c)        { *addr_at(offset) = (u_char)c; wrote(offset); }
-  void set_int_at(int offset, jint  i)        { *(jint*)addr_at(offset) = i;  wrote(offset); }
-  void set_ptr_at (int offset, intptr_t  ptr) { *(intptr_t*) addr_at(offset) = ptr;  wrote(offset); }
-  void set_oop_at (int offset, oop  o)        { *(oop*) addr_at(offset) = o;  wrote(offset); }
-
-  // This doesn't really do anything on AArch64, but it is the place where
-  // cache invalidation belongs, generically:
-  void wrote(int offset);
+  void set_char_at(int offset, char c)        { *addr_at(offset) = (u_char)c; }
+  void set_int_at(int offset, jint  i)        { *(jint*)addr_at(offset) = i; }
+  void set_ptr_at (int offset, intptr_t  ptr) { *(intptr_t*) addr_at(offset) = ptr; }
+  void set_oop_at (int offset, oop  o)        { *(oop*) addr_at(offset) = o; }
 
  public:
 
@@ -142,6 +138,7 @@
     offset &= (1 << 26) - 1; // mask off insn part
     insn |= offset;
     set_int_at(displacement_offset, insn);
+    ICache::invalidate_range(instruction_address(), instruction_size);
   }
 
   // Similar to replace_mt_safe, but just changes the destination.  The
--- a/src/cpu/aarch64/vm/register_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/register_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -62,7 +62,10 @@
   bool  has_byte_register() const                { return 0 <= (intptr_t)this && (intptr_t)this < number_of_byte_registers; }
   const char* name() const;
   int   encoding_nocheck() const                 { return (intptr_t)this; }
-  unsigned long bit(bool yes = true) const       { return yes << encoding(); }
+
+  // Return the bit which represents this register.  This is intended
+  // to be ORed into a bitmask: for usage see class RegSet below.
+  unsigned long bit(bool should_set = true) const { return should_set ? 1 << encoding() : 0; }
 };
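The corrected bit() avoids shifting a bool and makes the intent explicit. A hedged sketch of the bitmask pattern it feeds (the names below are illustrative, not the RegSet class itself):

    #include <stdint.h>

    // Illustrative only: registers are collected into a mask by ORing bit()s.
    typedef uint64_t RegMask;

    static inline RegMask reg_bit(int encoding, bool should_set = true) {
      // 1ull keeps the shift 64-bit safe for encodings up to 31 and beyond.
      return should_set ? (1ull << encoding) : 0;
    }

    // e.g. a callee-saved set of r19..r22:
    //   RegMask saved = reg_bit(19) | reg_bit(20) | reg_bit(21) | reg_bit(22);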
 
 // The integer registers of the aarch64 architecture
@@ -185,7 +188,7 @@
   // it's optoregs.
 
     number_of_registers = (2 * RegisterImpl::number_of_registers +
-                           2 * FloatRegisterImpl::number_of_registers +
+                           4 * FloatRegisterImpl::number_of_registers +
                            1) // flags
   };
 
--- a/src/cpu/aarch64/vm/relocInfo_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/relocInfo_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -33,23 +33,29 @@
 
 
 void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
+  if (verify_only)
+    return;
+
+  int bytes;
+
   switch(type()) {
   case relocInfo::oop_type:
     {
       oop_Relocation *reloc = (oop_Relocation *)this;
       if (NativeInstruction::is_ldr_literal_at(addr())) {
 	address constptr = (address)code()->oop_addr_at(reloc->oop_index());
-	MacroAssembler::pd_patch_instruction(addr(), constptr);
+	bytes = MacroAssembler::pd_patch_instruction_size(addr(), constptr);
 	assert(*(address*)constptr == x, "error in oop relocation");
       } else{
-	MacroAssembler::patch_oop(addr(), x);
+	bytes = MacroAssembler::patch_oop(addr(), x);
       }
     }
     break;
   default:
-    MacroAssembler::pd_patch_instruction(addr(), x);
+    bytes = MacroAssembler::pd_patch_instruction_size(addr(), x);
     break;
   }
+  ICache::invalidate_range(addr(), bytes);
 }
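pd_set_data_value() now records how many bytes each patch touched so the instruction-cache flush covers exactly the patched range. The underlying discipline, with GCC's __builtin___clear_cache standing in for ICache::invalidate_range() (an assumption of this sketch):

    #include <stdint.h>

    // Patch-then-invalidate: write the new instruction(s), then flush the
    // same byte range so other cores refetch from memory.
    static void patch_and_flush(char* addr, uint32_t new_insn) {
      *(uint32_t*)addr = new_insn;
      __builtin___clear_cache(addr, addr + sizeof(uint32_t));
    }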
 
 address Relocation::pd_call_destination(address orig_addr) {
--- a/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -318,6 +318,7 @@
   __ mov(c_rarg1, lr);
   __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
   __ blrt(rscratch1, 2, 0, 0);
+  __ maybe_isb();
 
   __ pop_CPU_state();
   // restore sp
@@ -1171,7 +1172,7 @@
     __ lea(rscratch1, RuntimeAddress(dest));
     __ mov(rscratch2, (gpargs << 6) | (fpargs << 2) | type);
     __ blrt(rscratch1, rscratch2);
-    // __ blrt(rscratch1, gpargs, fpargs, type);
+    __ maybe_isb();
   }
 }
 
@@ -1797,6 +1798,7 @@
   const Register obj_reg  = r19;  // Will contain the oop
   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
   const Register old_hdr  = r13;  // value of old header at unlock time
+  const Register tmp = c_rarg3;
 
   Label slow_path_lock;
   Label lock_done;
@@ -1818,7 +1820,7 @@
     __ ldr(obj_reg, Address(oop_handle_reg, 0));
 
     if (UseBiasedLocking) {
-      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock);
+      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, lock_done, &slow_path_lock);
     }
 
     // Load (object->mark() | 1) into swap_reg %r0
@@ -1868,7 +1870,8 @@
 
   // Now set thread in native
   __ mov(rscratch1, _thread_in_native);
-  __ str(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
 
   {
     int return_type = 0;
@@ -1925,7 +1928,8 @@
   //     Thread A is resumed to finish this native method, but doesn't block here since it
   //     didn't see any synchronization in progress, and escapes.
   __ mov(rscratch1, _thread_in_native_trans);
-  __ str(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
 
   if(os::is_MP()) {
     if (UseMembar) {
@@ -1974,6 +1978,7 @@
       __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
     }
     __ blrt(rscratch1, 1, 0, 1);
+    __ maybe_isb();
     // Restore any method result value
     restore_native_result(masm, ret_type, stack_slots);
 
@@ -1988,7 +1993,8 @@
 
   // change thread state
   __ mov(rscratch1, _thread_in_Java);
-  __ str(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
   __ bind(after_transition);
 
   Label reguard;
@@ -2139,6 +2145,7 @@
       save_native_result(masm, ret_type, stack_slots);
     }
 
+    __ mov(c_rarg2, rthread);
     __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
     __ mov(c_rarg0, obj_reg);
 
@@ -2147,7 +2154,7 @@
     __ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
     __ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 
-    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 2, 0, 1);
+    rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 3, 0, 1);
 
 #ifdef ASSERT
     {
@@ -2813,6 +2820,9 @@
 
   __ reset_last_Java_frame(false, true);
 
+  __ maybe_isb();
+  __ membar(Assembler::LoadLoad | Assembler::LoadStore);
+
   __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
   __ cbz(rscratch1, noException);
 
@@ -2882,6 +2892,8 @@
 
   oop_maps->add_gc_map( __ offset() - start, map);
 
+  __ maybe_isb();
+
   // r0 contains the address we are going to jump to assuming no exception got installed
 
   // clear last_Java_sp
@@ -3004,6 +3016,7 @@
   __ mov(c_rarg0, rthread);
   __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
   __ blrt(rscratch1, 1, 0, MacroAssembler::ret_type_integral);
+  __ maybe_isb();
 
   // Set an oopmap for the call site.  This oopmap will only be used if we
   // are unwinding the stack.  Hence, all locations will be dead.
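A recurring change in this file replaces plain str stores of the thread-state word with lea + stlrw, i.e. a store-release, so a VM thread polling the state cannot observe the new state ahead of the memory accesses it is meant to order. In C++11 terms the transition is roughly:

    #include <atomic>

    // stlrw is AArch64's 32-bit store-release; the C++ analogue is a
    // release store to the thread-state word.
    std::atomic<int> thread_state;

    void set_state(int new_state) {
      thread_state.store(new_state, std::memory_order_release);
    }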
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -305,7 +305,8 @@
 #endif
     // pass parameters if any
     __ mov(esp, sp);
-    __ sub(sp, sp, os::vm_page_size()); // Move SP out of the way
+    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
+    __ andr(sp, rscratch1, -2 * wordSize);
 
     BLOCK_COMMENT("pass parameters if any");
     Label parameters_done;
@@ -1891,7 +1892,7 @@
     address start = __ pc();
       __ enter();
 
-      __ mov(rscratch1, len_reg);
+      __ mov(rscratch2, len_reg);
       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 
       __ ld1(v0, __ T16B, rvec);
@@ -1958,6 +1959,8 @@
 
       __ leave();
       __ ret(lr);
+
+      return start;
   }
 
   // Arguments:
@@ -2066,6 +2069,212 @@
     return start;
   }
 
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - byte[]  source+offset
+  //   c_rarg1   - int[]   SHA.state
+  //   c_rarg2   - int     offset
+  //   c_rarg3   - int     limit
+  //
+  address generate_sha1_implCompress(bool multi_block, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register buf   = c_rarg0;
+    Register state = c_rarg1;
+    Register ofs   = c_rarg2;
+    Register limit = c_rarg3;
+
+    Label keys;
+    Label sha1_loop;
+
+    // load the keys into v0..v3
+    __ adr(rscratch1, keys);
+    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
+    // load 5 words state into v6, v7
+    __ ldrq(v6, Address(state, 0));
+    __ ldrs(v7, Address(state, 16));
+
+    __ BIND(sha1_loop);
+    // load 64 bytes of data into v16..v19
+    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
+    __ rev32(v16, __ T16B, v16);
+    __ rev32(v17, __ T16B, v17);
+    __ rev32(v18, __ T16B, v18);
+    __ rev32(v19, __ T16B, v19);
+
+    // do the sha1
+    __ addv(v4, __ T4S, v16, v0);
+    __ orr(v20, __ T16B, v6, v6);
+
+    FloatRegister d0 = v16;
+    FloatRegister d1 = v17;
+    FloatRegister d2 = v18;
+    FloatRegister d3 = v19;
+
+    for (int round = 0; round < 20; round++) {
+      FloatRegister tmp1 = (round & 1) ? v4 : v5;
+      FloatRegister tmp2 = (round & 1) ? v21 : v22;
+      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
+      FloatRegister tmp4 = (round & 1) ? v5 : v4;
+      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
+
+      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
+      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
+      __ sha1h(tmp2, __ T4S, v20);
+      if (round < 5)
+        __ sha1c(v20, __ T4S, tmp3, tmp4);
+      else if (round < 10 || round >= 15)
+        __ sha1p(v20, __ T4S, tmp3, tmp4);
+      else
+        __ sha1m(v20, __ T4S, tmp3, tmp4);
+      if (round < 16) __ sha1su1(d0, __ T4S, d3);
+
+      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
+    }
+
+    __ addv(v7, __ T2S, v7, v21);
+    __ addv(v6, __ T4S, v6, v20);
+
+    if (multi_block) {
+      __ add(ofs, ofs, 64);
+      __ cmp(ofs, limit);
+      __ br(Assembler::LE, sha1_loop);
+      __ mov(c_rarg0, ofs); // return ofs
+    }
+
+    __ strq(v6, Address(state, 0));
+    __ strs(v7, Address(state, 16));
+
+    __ ret(lr);
+
+    __ bind(keys);
+    __ emit_int32(0x5a827999);
+    __ emit_int32(0x6ed9eba1);
+    __ emit_int32(0x8f1bbcdc);
+    __ emit_int32(0xca62c1d6);
+
+    return start;
+  }
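Per the argument comments, the stub consumes 64-byte blocks, updates the 5-word SHA-1 state in place, and the multi-block variant returns the advanced offset in c_rarg0. A hypothetical view of calling an installed stub (the function-pointer type here is an assumption, not HotSpot's declared signature):

    // Hypothetical invocation of the installed multi-block stub; the real
    // calling convention is fixed by the generator above.
    typedef int (*sha1_compress_mb_fn)(unsigned char* buf, unsigned int state[5],
                                       int ofs, int limit);

    int digest_blocks(sha1_compress_mb_fn stub, unsigned char* buf,
                      unsigned int state[5], int ofs, int limit) {
      return stub(buf, state, ofs, limit);  // returns the updated offset
    }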
+
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - byte[]  source+offset
+  //   c_rarg1   - int[]   SHA.state
+  //   c_rarg2   - int     offset
+  //   c_rarg3   - int     limit
+  //
+  address generate_sha256_implCompress(bool multi_block, const char *name) {
+    static const uint32_t round_consts[64] = {
+      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+    };
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register buf   = c_rarg0;
+    Register state = c_rarg1;
+    Register ofs   = c_rarg2;
+    Register limit = c_rarg3;
+
+    Label sha256_loop;
+
+    __ stpd(v8, v9, __ pre(sp, -32));
+    __ stpd(v10, v11, Address(sp, 16));
+
+// dga == v0
+// dgb == v1
+// dg0 == v2
+// dg1 == v3
+// dg2 == v4
+// t0 == v6
+// t1 == v7
+
+    // load 16 keys to v16..v31
+    __ lea(rscratch1, ExternalAddress((address)round_consts));
+    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
+    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
+    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
+    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
+
+    // load 8 words (256 bits) state
+    __ ldpq(v0, v1, state);
+
+    __ BIND(sha256_loop);
+    // load 64 bytes of data into v8..v11
+    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
+    __ rev32(v8, __ T16B, v8);
+    __ rev32(v9, __ T16B, v9);
+    __ rev32(v10, __ T16B, v10);
+    __ rev32(v11, __ T16B, v11);
+
+    __ addv(v6, __ T4S, v8, v16);
+    __ orr(v2, __ T16B, v0, v0);
+    __ orr(v3, __ T16B, v1, v1);
+
+    FloatRegister d0 = v8;
+    FloatRegister d1 = v9;
+    FloatRegister d2 = v10;
+    FloatRegister d3 = v11;
+
+    for (int round = 0; round < 16; round++) {
+      FloatRegister tmp1 = (round & 1) ? v6 : v7;
+      FloatRegister tmp2 = (round & 1) ? v7 : v6;
+      FloatRegister tmp3 = (round & 1) ? v2 : v4;
+      FloatRegister tmp4 = (round & 1) ? v4 : v2;
+
+      if (round < 12) __ sha256su0(d0, __ T4S, d1);
+      __ orr(v4, __ T16B, v2, v2);
+      if (round < 15)
+        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
+      __ sha256h(v2, __ T4S, v3, tmp2);
+      __ sha256h2(v3, __ T4S, v4, tmp2);
+      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
+
+      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
+    }
+
+    __ addv(v0, __ T4S, v0, v2);
+    __ addv(v1, __ T4S, v1, v3);
+
+    if (multi_block) {
+      __ add(ofs, ofs, 64);
+      __ cmp(ofs, limit);
+      __ br(Assembler::LE, sha256_loop);
+      __ mov(c_rarg0, ofs); // return ofs
+    }
+
+    __ ldpd(v10, v11, Address(sp, 16));
+    __ ldpd(v8, v9, __ post(sp, 32));
+
+    __ stpq(v0, v1, state);
+
+    __ ret(lr);
+
+    return start;
+  }
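The round_consts table is the standard SHA-256 key schedule: the first 32 bits of the fractional parts of the cube roots of the first 64 primes. A quick spot-check of the first entry:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Verify K[0]: frac(cbrt(2)) * 2^32 should truncate to 0x428a2f98.
    int main() {
      double frac = std::cbrt(2.0) - 1.0;
      uint32_t k0 = (uint32_t)(frac * 4294967296.0);
      std::printf("%08x\n", k0);   // prints 428a2f98
      return 0;
    }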
+
 #ifndef BUILTIN_SIM
   // Safefetch stubs.
   void generate_safefetch(const char* name, int size, address* entry,
@@ -2152,8 +2361,45 @@
     return start;
   }
 
-#undef __
-#define __ masm->
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - x address
+   *    c_rarg1   - x length
+   *    c_rarg2   - y address
+   *    c_rarg3   - y length
+   *    c_rarg4   - z address
+   *    c_rarg5   - z length
+   */
+  address generate_multiplyToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ pc();
+    const Register x     = r0;
+    const Register xlen  = r1;
+    const Register y     = r2;
+    const Register ylen  = r3;
+    const Register z     = r4;
+    const Register zlen  = r5;
+
+    const Register tmp1  = r10;
+    const Register tmp2  = r11;
+    const Register tmp3  = r12;
+    const Register tmp4  = r13;
+    const Register tmp5  = r14;
+    const Register tmp6  = r15;
+    const Register tmp7  = r16;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(lr);
+
+    return start;
+  }
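multiply_to_len() is the back end of BigInteger.multiplyToLen: z receives the full product of x and y over 32-bit limbs. A portable reference for the arithmetic only, using least-significant-first limb order for brevity (a simplifying assumption; the intrinsic itself follows BigInteger's layout):

    #include <stdint.h>

    // Schoolbook multiply over 32-bit limbs, least significant limb first.
    // z must have xlen + ylen limbs and be zeroed by the caller.
    void multiply_to_len_ref(const uint32_t* x, int xlen,
                             const uint32_t* y, int ylen, uint32_t* z) {
      for (int i = 0; i < xlen; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < ylen; j++) {
          uint64_t t = (uint64_t)x[i] * y[j] + z[i + j] + carry;
          z[i + j] = (uint32_t)t;
          carry = t >> 32;
        }
        z[i + ylen] = (uint32_t)carry;
      }
    }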
 
   // Continuation point for throwing of implicit exceptions that are
   // not handled in the current activation. Fabricates an exception
@@ -2171,6 +2417,9 @@
   // otherwise assume that stack unwinding will be initiated, so
   // caller saved registers were assumed volatile in the compiler.
 
+#undef __
+#define __ masm->
+
   address generate_throw_exception(const char* name,
                                    address runtime_entry,
                                    Register arg1 = noreg,
@@ -2234,6 +2483,7 @@
     oop_maps->add_gc_map(the_pc - start, map);
 
     __ reset_last_Java_frame(true, true);
+    __ maybe_isb();
 
     __ leave();
 
@@ -2313,6 +2563,10 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
+    if (UseMultiplyToLenIntrinsic) {
+      StubRoutines::_multiplyToLen = generate_multiplyToLen();
+    }
+
 #ifndef BUILTIN_SIM
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
@@ -2321,6 +2575,15 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }
 
+    if (UseSHA1Intrinsics) {
+      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
+      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
+    }
+    if (UseSHA256Intrinsics) {
+      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
+      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
+    }
+
     // Safefetch stubs.
     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
--- a/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -199,8 +199,7 @@
   // Restore machine SP
   __ ldr(rscratch1, Address(rmethod, Method::const_offset()));
   __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset()));
-  __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size()
-	 + (EnableInvokeDynamic ? 2 : 0));
+  __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2);
   __ ldr(rscratch2,
 	 Address(rfp, frame::interpreter_frame_initial_sp_offset * wordSize));
   __ sub(rscratch1, rscratch2, rscratch1, ext::uxtw, 3);
@@ -313,6 +312,7 @@
   address entry = __ pc();
   __ push(state);
   __ call_VM(noreg, runtime_entry);
+  __ membar(Assembler::AnyAny);
   __ dispatch_via(vtos, Interpreter::_normal_table.table_for(vtos));
   return entry;
 }
@@ -670,7 +670,78 @@
 
 // Method entry for java.lang.ref.Reference.get.
 address InterpreterGenerator::generate_Reference_get_entry(void) {
-  return NULL;
+#if INCLUDE_ALL_GCS
+  // Code: _aload_0, _getfield, _areturn
+  // parameter size = 1
+  //
+  // The code that gets generated by this routine is split into 2 parts:
+  //    1. The "intrinsified" code for G1 (or any SATB based GC),
+  //    2. The slow path - which is an expansion of the regular method entry.
+  //
+  // Notes:
+  // * In the G1 code we do not check whether we need to block for
+  //   a safepoint. If G1 is enabled then we must execute the specialized
+  //   code for Reference.get (except when the Reference object is null)
+  //   so that we can log the value in the referent field with an SATB
+  //   update buffer.
+  //   If the code for the getfield template is modified so that the
+  //   G1 pre-barrier code is executed when the current method is
+  //   Reference.get() then going through the normal method entry
+  //   will be fine.
+  // * The G1 code can, however, check the receiver object (the instance
+  //   of java.lang.Reference) and jump to the slow path if null. If the
+  //   Reference object is null then we obviously cannot fetch the referent
+  //   and so we don't need to call the G1 pre-barrier. Thus we can use the
+  //   regular method entry code to generate the NPE.
+  //
+  // This code is based on generate_accessor_entry().
+  //
+  // rmethod: Method*
+  // r13: senderSP must be preserved for slow path, set SP to it on fast path
+
+  address entry = __ pc();
+
+  const int referent_offset = java_lang_ref_Reference::referent_offset;
+  guarantee(referent_offset > 0, "referent offset not initialized");
+
+  if (UseG1GC) {
+    Label slow_path;
+    const Register local_0 = c_rarg0;
+    // Check if local 0 != NULL
+    // If the receiver is null then it is OK to jump to the slow path.
+    __ ldr(local_0, Address(esp, 0));
+    __ cbz(local_0, slow_path);
+
+    // Load the value of the referent field.
+    const Address field_address(local_0, referent_offset);
+    __ load_heap_oop(local_0, field_address);
+
+    // Generate the G1 pre-barrier code to log the value of
+    // the referent field in an SATB buffer.
+    __ enter(); // g1_write may call runtime
+    __ g1_write_barrier_pre(noreg /* obj */,
+                            local_0 /* pre_val */,
+                            rthread /* thread */,
+                            rscratch2 /* tmp */,
+                            true /* tosca_live */,
+                            true /* expand_call */);
+    __ leave();
+    // areturn
+    __ andr(sp, r13, -16);  // done with stack
+    __ ret(lr);
+
+    // generate a vanilla interpreter entry as the slow path
+    __ bind(slow_path);
+    (void) generate_normal_entry(false);
+
+    return entry;
+  }
+#endif // INCLUDE_ALL_GCS
+
+  // If G1 is not enabled then attempt to go through the accessor entry
+  // point; Reference.get is an accessor.
+  return generate_accessor_entry();
 }
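The pre-barrier exists because a concurrent SATB marker must see every reference that was live when marking started; without it, Reference.get could hand out a referent that is never traced. Schematically, g1_write_barrier_pre logs the old value while marking is active (the names below are illustrative, not HotSpot's queue types):

    // Illustrative shape of an SATB pre-barrier; the real one checks a
    // per-thread marking-active flag and calls into the runtime to flush.
    struct SATBQueue { void** buf; size_t index; };

    static void satb_pre_barrier(SATBQueue* q, bool marking_active, void* pre_val) {
      if (!marking_active || pre_val == 0) return;
      if (q->index == 0) return;        // slow path: flush buffer via runtime call
      q->buf[--q->index] = pre_val;     // log the old value for the marker
    }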
 
 /**
@@ -799,7 +870,7 @@
     const int page_size = os::vm_page_size();
     for (int pages = start_page; pages <= StackShadowPages ; pages++) {
       __ sub(rscratch2, sp, pages*page_size);
-      __ ldr(zr, Address(rscratch2));
+      __ str(zr, Address(rscratch2));
     }
   }
 }
@@ -1034,13 +1105,15 @@
 
   // Change state to native
   __ mov(rscratch1, _thread_in_native);
-  __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
 
   // load call format
   __ ldrw(rscratch1, Address(rmethod, Method::call_format_offset()));
 
   // Call the native method.
   __ blrt(r10, rscratch1);
+  __ maybe_isb();
   __ get_method(rmethod);
   // result potentially in r0 or v0
 
@@ -1057,7 +1130,8 @@
 
   // change thread state
   __ mov(rscratch1, _thread_in_native_trans);
-  __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
 
   if (os::is_MP()) {
     if (UseMembar) {
@@ -1098,6 +1172,7 @@
     __ mov(c_rarg0, rthread);
     __ mov(rscratch2, CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans));
     __ blrt(rscratch2, 1, 0, 0);
+    __ maybe_isb();
     __ get_method(rmethod);
     __ reinit_heapbase();
     __ bind(Continue);
@@ -1105,7 +1180,8 @@
 
   // change thread state
   __ mov(rscratch1, _thread_in_Java);
-  __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+  __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset()));
+  __ stlrw(rscratch1, rscratch2);
 
   // reset_last_Java_frame
   __ reset_last_Java_frame(true, true);
@@ -1543,29 +1619,18 @@
   return (overhead_size + method_stack + stub_code);
 }
 
-int AbstractInterpreter::layout_activation(Method* method,
-                                           int tempcount,
-                                           int popframe_extra_args,
-                                           int moncount,
-                                           int caller_actual_parameters,
-                                           int callee_param_count,
-                                           int callee_locals,
-                                           frame* caller,
-                                           frame* interpreter_frame,
-                                           bool is_top_frame,
-                                           bool is_bottom_frame) {
+// asm based interpreter deoptimization helpers
+int AbstractInterpreter::size_activation(int max_stack,
+                                         int temps,
+                                         int extra_args,
+                                         int monitors,
+                                         int callee_params,
+                                         int callee_locals,
+                                         bool is_top_frame) {
   // Note: This calculation must exactly parallel the frame setup
   // in AbstractInterpreterGenerator::generate_method_entry.
-  // If interpreter_frame!=NULL, set up the method, locals, and monitors.
-  // The frame interpreter_frame, if not NULL, is guaranteed to be the
-  // right size, as determined by a previous call to this method.
-  // It is also guaranteed to be walkable even though it is in a skeletal state
 
   // fixed size of an interpreter frame:
-  int max_locals = method->max_locals() * Interpreter::stackElementWords;
-  int extra_locals = (method->max_locals() - method->size_of_parameters()) *
-                     Interpreter::stackElementWords;
-
   int overhead = frame::sender_sp_offset -
                  frame::interpreter_frame_initial_sp_offset;
   // Our locals were accounted for by the caller (or last_frame_adjust
@@ -1573,65 +1638,79 @@
   // for the callee's params we only need to account for the extra
   // locals.
   int size = overhead +
-         (callee_locals - callee_param_count)*Interpreter::stackElementWords +
-         moncount * frame::interpreter_frame_monitor_size() +
-         tempcount* Interpreter::stackElementWords + popframe_extra_args;
+         (callee_locals - callee_params)*Interpreter::stackElementWords +
+         monitors * frame::interpreter_frame_monitor_size() +
+         temps* Interpreter::stackElementWords + extra_args;
 
   // On AArch64 we always keep the stack pointer 16-aligned, so we
   // must round up here.
   size = round_to(size, 2);
 
-  if (interpreter_frame != NULL) {
-#ifdef ASSERT
-    if (!EnableInvokeDynamic)
-      // @@@ FIXME: Should we correct interpreter_frame_sender_sp in the calling sequences?
-      // Probably, since deoptimization doesn't work yet.
-      assert(caller->unextended_sp() == interpreter_frame->interpreter_frame_sender_sp(), "Frame not properly walkable");
-    assert(caller->sp() == interpreter_frame->sender_sp(), "Frame not properly walkable(2)");
-#endif
+  return size;
+}
 
-    interpreter_frame->interpreter_frame_set_method(method);
-    // NOTE the difference in using sender_sp and
-    // interpreter_frame_sender_sp interpreter_frame_sender_sp is
-    // the original sp of the caller (the unextended_sp) and
-    // sender_sp is fp+16 XXX
-    intptr_t* locals = interpreter_frame->sender_sp() + max_locals - 1;
+void AbstractInterpreter::layout_activation(Method* method,
+                                            int tempcount,
+                                            int popframe_extra_args,
+                                            int moncount,
+                                            int caller_actual_parameters,
+                                            int callee_param_count,
+                                            int callee_locals,
+                                            frame* caller,
+                                            frame* interpreter_frame,
+                                            bool is_top_frame,
+                                            bool is_bottom_frame) {
+  // The frame interpreter_frame is guaranteed to be the right size,
+  // as determined by a previous call to the size_activation() method.
+  // It is also guaranteed to be walkable even though it is in a
+  // skeletal state
+
+  int max_locals = method->max_locals() * Interpreter::stackElementWords;
+  int extra_locals = (method->max_locals() - method->size_of_parameters()) *
+    Interpreter::stackElementWords;
 
 #ifdef ASSERT
-    if (caller->is_interpreted_frame()) {
-      assert(locals < caller->fp() + frame::interpreter_frame_initial_sp_offset, "bad placement");
-    }
+  assert(caller->sp() == interpreter_frame->sender_sp(), "Frame not properly walkable");
 #endif
 
-    interpreter_frame->interpreter_frame_set_locals(locals);
-    BasicObjectLock* montop = interpreter_frame->interpreter_frame_monitor_begin();
-    BasicObjectLock* monbot = montop - moncount;
-    interpreter_frame->interpreter_frame_set_monitor_end(monbot);
+  interpreter_frame->interpreter_frame_set_method(method);
+  // NOTE the difference in using sender_sp and
+  // interpreter_frame_sender_sp interpreter_frame_sender_sp is
+  // the original sp of the caller (the unextended_sp) and
+  // sender_sp is fp+8/16 (32bit/64bit) XXX
+  intptr_t* locals = interpreter_frame->sender_sp() + max_locals - 1;
 
-    // Set last_sp
-    intptr_t*  esp = (intptr_t*) monbot -
-                     tempcount*Interpreter::stackElementWords -
-                     popframe_extra_args;
-    interpreter_frame->interpreter_frame_set_last_sp(esp);
+#ifdef ASSERT
+  if (caller->is_interpreted_frame()) {
+    assert(locals < caller->fp() + frame::interpreter_frame_initial_sp_offset, "bad placement");
+  }
+#endif
+
+  interpreter_frame->interpreter_frame_set_locals(locals);
+  BasicObjectLock* montop = interpreter_frame->interpreter_frame_monitor_begin();
+  BasicObjectLock* monbot = montop - moncount;
+  interpreter_frame->interpreter_frame_set_monitor_end(monbot);
 
-    // All frames but the initial (oldest) interpreter frame we fill in have
-    // a value for sender_sp that allows walking the stack but isn't
-    // truly correct. Correct the value here.
-    if (extra_locals != 0 &&
-        interpreter_frame->sender_sp() ==
-        interpreter_frame->interpreter_frame_sender_sp()) {
-      interpreter_frame->set_interpreter_frame_sender_sp(caller->sp() +
-                                                         extra_locals);
-    }
-    *interpreter_frame->interpreter_frame_cache_addr() =
-      method->constants()->cache();
+  // Set last_sp
+  intptr_t*  esp = (intptr_t*) monbot -
+    tempcount*Interpreter::stackElementWords -
+    popframe_extra_args;
+  interpreter_frame->interpreter_frame_set_last_sp(esp);
 
-    // interpreter_frame->obj_at_put(frame::sender_sp_offset,
-    // 				  (oop)interpreter_frame->addr_at(frame::sender_sp_offset));
+  // All frames but the initial (oldest) interpreter frame we fill in have
+  // a value for sender_sp that allows walking the stack but isn't
+  // truly correct. Correct the value here.
+  if (extra_locals != 0 &&
+      interpreter_frame->sender_sp() ==
+      interpreter_frame->interpreter_frame_sender_sp()) {
+    interpreter_frame->set_interpreter_frame_sender_sp(caller->sp() +
+                                                       extra_locals);
   }
-  return size;
+  *interpreter_frame->interpreter_frame_cache_addr() =
+    method->constants()->cache();
 }
 
+
 //-----------------------------------------------------------------------------
 // Exceptions
 
@@ -1947,15 +2026,18 @@
 }
 
 void TemplateInterpreterGenerator::count_bytecode() {
+  Register rscratch3 = r0;
   __ push(rscratch1);
   __ push(rscratch2);
+  __ push(rscratch3);
   Label L;
   __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
   __ bind(L);
   __ ldxr(rscratch1, rscratch2);
   __ add(rscratch1, rscratch1, 1);
-  __ stxr(rscratch1, rscratch1, rscratch2);
-  __ cbnzw(rscratch1, L);
+  __ stxr(rscratch3, rscratch1, rscratch2);
+  __ cbnzw(rscratch3, L);
+  __ pop(rscratch3);
   __ pop(rscratch2);
   __ pop(rscratch1);
 }
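The bug fixed here: stxr writes its success/failure status into a register, and using rscratch1 for both the incremented value and the status clobbered the count; rscratch3 now receives the status. The retry loop is the classic load-exclusive/store-exclusive idiom, roughly this in C:

    #include <stdint.h>

    // CAS-based analogue of the ldxr/add/stxr loop; keeping 'old' (data)
    // separate from the builtin's return (status) mirrors the register fix.
    static void atomic_increment(volatile uint64_t* counter) {
      uint64_t old = __atomic_load_n(counter, __ATOMIC_RELAXED);
      while (!__atomic_compare_exchange_n(counter, &old, old + 1, 1,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
        // 'old' now holds the refreshed value; retry with old + 1.
      }
    }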
--- a/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -508,23 +508,61 @@
 {
   transition(vtos, itos);
   if (RewriteFrequentPairs) {
-    // TODO : check x86 code for what to do here
-    __ call_Unimplemented();
-  } else {
-    locals_index(r1);
-    __ ldr(r0, iaddress(r1));
+    Label rewrite, done;
+    Register bc = r4;
+
+    // get next bytecode
+    __ load_unsigned_byte(r1, at_bcp(Bytecodes::length_for(Bytecodes::_iload)));
+
+    // if _iload, wait to rewrite to iload2.  We only want to rewrite the
+    // last two iloads in a pair.  Comparing against fast_iload means that
+    // the next bytecode is neither an iload nor a caload, and therefore
+    // an iload pair.
+    __ cmpw(r1, Bytecodes::_iload);
+    __ br(Assembler::EQ, done);
+
+    // if _fast_iload rewrite to _fast_iload2
+    __ cmpw(r1, Bytecodes::_fast_iload);
+    __ movw(bc, Bytecodes::_fast_iload2);
+    __ br(Assembler::EQ, rewrite);
+
+    // if _caload rewrite to _fast_icaload
+    __ cmpw(r1, Bytecodes::_caload);
+    __ movw(bc, Bytecodes::_fast_icaload);
+    __ br(Assembler::EQ, rewrite);
+
+    // else rewrite to _fast_iload
+    __ movw(bc, Bytecodes::_fast_iload);
+
+    // rewrite
+    // bc: new bytecode
+    __ bind(rewrite);
+    patch_bytecode(Bytecodes::_iload, bc, r1, false);
+    __ bind(done);
+
   }
 
+  // do iload, get the local value into tos
+  locals_index(r1);
+  __ ldr(r0, iaddress(r1));
+
 }
 
 void TemplateTable::fast_iload2()
 {
-  __ call_Unimplemented();
+  transition(vtos, itos);
+  locals_index(r1);
+  __ ldr(r0, iaddress(r1));
+  __ push(itos);
+  locals_index(r1, 3);
+  __ ldr(r0, iaddress(r1));
 }
 
 void TemplateTable::fast_iload()
 {
-  __ call_Unimplemented();
+  transition(vtos, itos);
+  locals_index(r1);
+  __ ldr(r0, iaddress(r1));
 }
 
 void TemplateTable::lload()
@@ -716,7 +754,18 @@
 // iload followed by caload frequent pair
 void TemplateTable::fast_icaload()
 {
-  __ call_Unimplemented();
+  transition(vtos, itos);
+  // load index out of locals
+  locals_index(r2);
+  __ ldr(r1, iaddress(r2));
+
+  __ pop_ptr(r0);
+
+  // r0: array
+  // r1: index
+  index_check(r0, r1); // leaves index in r1, kills rscratch1
+  __ lea(r1,  Address(r0, r1, Address::uxtw(1)));
+  __ load_unsigned_short(r0, Address(r1,  arrayOopDesc::base_offset_in_bytes(T_CHAR)));
 }
 
 void TemplateTable::saload()
@@ -785,7 +834,47 @@
   // These bytecodes with a small amount of code are most profitable
   // to rewrite
   if (RewriteFrequentPairs) {
-    __ call_Unimplemented();
+    Label rewrite, done;
+    const Register bc = r4;
+
+    // get next bytecode
+    __ load_unsigned_byte(r1, at_bcp(Bytecodes::length_for(Bytecodes::_aload_0)));
+
+    // do actual aload_0
+    aload(0);
+
+    // if _getfield then wait with rewrite
+    __ cmpw(r1, Bytecodes::_getfield);
+    __ br(Assembler::EQ, done);
+
+    // if _igetfield then rewrite to _fast_iaccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_iaccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmpw(r1, Bytecodes::_fast_igetfield);
+    __ movw(bc, Bytecodes::_fast_iaccess_0);
+    __ br(Assembler::EQ, rewrite);
+
+    // if _agetfield then rewrite to _fast_aaccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_aaccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmpw(r1, Bytecodes::_fast_agetfield);
+    __ movw(bc, Bytecodes::_fast_aaccess_0);
+    __ br(Assembler::EQ, rewrite);
+
+    // if _fgetfield then rewrite to _fast_faccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_faccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmpw(r1, Bytecodes::_fast_fgetfield);
+    __ movw(bc, Bytecodes::_fast_faccess_0);
+    __ br(Assembler::EQ, rewrite);
+
+    // else rewrite to _fast_aload0
+    assert(Bytecodes::java_code(Bytecodes::_fast_aload_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ movw(bc, Bytecodes::_fast_aload_0);
+
+    // rewrite
+    // bc: new bytecode
+    __ bind(rewrite);
+    patch_bytecode(Bytecodes::_aload_0, bc, r1, false);
+
+    __ bind(done);
   } else {
     aload(0);
   }
@@ -1568,6 +1657,12 @@
 
 void TemplateTable::branch(bool is_jsr, bool is_wide)
 {
+  // We might be moving to a safepoint.  The thread which calls
+  // Interpreter::notice_safepoints() will effectively flush its cache
+  // when it makes a system call, but we need to do something to
+  // ensure that we see the changed dispatch table.
+  __ membar(MacroAssembler::LoadLoad);
+
   __ profile_taken_branch(r0, r1);
   const ByteSize be_offset = MethodCounters::backedge_counter_offset() +
                              InvocationCounter::counter_offset();
@@ -1849,6 +1944,12 @@
 
 void TemplateTable::ret() {
   transition(vtos, vtos);
+  // We might be moving to a safepoint.  The thread which calls
+  // Interpreter::notice_safepoints() will effectively flush its cache
+  // when it makes a system call, but we need to do something to
+  // ensure that we see the changed dispatch table.
+  __ membar(MacroAssembler::LoadLoad);
+
   locals_index(r1);
   __ ldr(r1, aaddress(r1)); // get return bci, compute return bcp
   __ profile_ret(r1, r2);
@@ -3327,6 +3428,8 @@
 
   // continue
   __ bind(done);
+  // Must prevent reordering of stores for object initialization with stores that publish the new object.
+  __ membar(Assembler::StoreStore);
 }
 
 void TemplateTable::newarray() {
@@ -3335,6 +3438,8 @@
   __ mov(c_rarg2, r0);
   call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::newarray),
           c_rarg1, c_rarg2);
+  // Must prevent reordering of stores for object initialization with stores that publish the new object.
+  __ membar(Assembler::StoreStore);
 }
 
 void TemplateTable::anewarray() {
@@ -3344,6 +3449,8 @@
   __ mov(c_rarg3, r0);
   call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::anewarray),
           c_rarg1, c_rarg2, c_rarg3);
+  // Must prevent reordering of stores for object initialization with stores that publish the new object.
+  __ membar(Assembler::StoreStore);
 }
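All three allocation bytecodes now end with a StoreStore barrier so the initializing stores (header setup, zeroing) cannot drift past the store that publishes the new reference to another thread. The C++11 analogue (a release fence is slightly stronger than StoreStore, but it shows the ordering):

    #include <atomic>

    struct Obj { int header; };
    std::atomic<Obj*> published;

    void allocate_and_publish() {
      Obj* o = new Obj;
      o->header = 42;                                       // initializing store
      std::atomic_thread_fence(std::memory_order_release);  // ~ membar(StoreStore)
      published.store(o, std::memory_order_relaxed);        // publishing store
    }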
 
 void TemplateTable::arraylength() {
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -38,19 +38,30 @@
 #ifndef BUILTIN_SIM
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
+#else
+#define getauxval(hwcap) 0
+#endif
 
 #ifndef HWCAP_AES
 #define HWCAP_AES   (1<<3)
 #endif
 
+#ifndef HWCAP_SHA1
+#define HWCAP_SHA1  (1<<5)
+#endif
+
+#ifndef HWCAP_SHA2
+#define HWCAP_SHA2  (1<<6)
+#endif
+
 #ifndef HWCAP_CRC32
 #define HWCAP_CRC32 (1<<7)
 #endif
 
-#endif
-
 int VM_Version::_cpu;
 int VM_Version::_model;
+int VM_Version::_variant;
+int VM_Version::_revision;
 int VM_Version::_stepping;
 int VM_Version::_cpuFeatures;
 const char*           VM_Version::_features_str = "";
@@ -101,13 +112,51 @@
   _supports_atomic_getset8 = true;
   _supports_atomic_getadd8 = true;
 
-  FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+  if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
+    FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+  if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
+    FLAG_SET_DEFAULT(AllocatePrefetchStepSize, 64);
   FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
   FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
   FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
+  FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
 
-#ifndef BUILTIN_SIM
   unsigned long auxv = getauxval(AT_HWCAP);
+
+  char buf[512];
+
+  strcpy(buf, "simd");
+  if (auxv & HWCAP_CRC32) strcat(buf, ", crc");
+  if (auxv & HWCAP_AES)   strcat(buf, ", aes");
+  if (auxv & HWCAP_SHA1)  strcat(buf, ", sha1");
+  if (auxv & HWCAP_SHA2)  strcat(buf, ", sha256");
+
+  _features_str = strdup(buf);
+  _cpuFeatures = auxv;
+
+  if (FILE *f = fopen("/proc/cpuinfo", "r")) {
+    char buf[128], *p;
+    while (fgets(buf, sizeof (buf), f) != NULL) {
+      if ((p = strchr(buf, ':')) != NULL) {
+        long v = strtol(p+1, NULL, 0);
+        if (strncmp(buf, "CPU implementer", sizeof "CPU implementer" - 1) == 0) {
+          _cpu = v;
+        } else if (strncmp(buf, "CPU variant", sizeof "CPU variant" - 1) == 0) {
+          _variant = v;
+        } else if (strncmp(buf, "CPU part", sizeof "CPU part" - 1) == 0) {
+          _model = v;
+        } else if (strncmp(buf, "CPU revision", sizeof "CPU revision" - 1) == 0) {
+          _revision = v;
+        }
+      }
+    }
+    fclose(f);
+  }
+
+  // Enable vendor specific features
+  if (_cpu == CPU_CAVIUM) _cpuFeatures |= CPU_DMB_ATOMICS;
+  if (_cpu == CPU_ARM) _cpuFeatures |= CPU_A53MAC;
+
   if (FLAG_IS_DEFAULT(UseCRC32)) {
     UseCRC32 = (auxv & HWCAP_CRC32) != 0;
   }
@@ -130,11 +179,60 @@
       warning("UseAESIntrinsics specified, but not supported on this CPU");
     }
   }
-#endif
 
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
     UseCRC32Intrinsics = true;
   }
+
+  if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
+    if (FLAG_IS_DEFAULT(UseSHA)) {
+      FLAG_SET_DEFAULT(UseSHA, true);
+    }
+  } else if (UseSHA) {
+    warning("SHA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseSHA, false);
+  }
+
+  if (!UseSHA) {
+    FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+    FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+    FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+  } else {
+    if (auxv & HWCAP_SHA1) {
+      if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
+        FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
+      }
+    } else if (UseSHA1Intrinsics) {
+      warning("SHA1 instruction is not available on this CPU.");
+      FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+    }
+    if (auxv & HWCAP_SHA2) {
+      if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+        FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+      }
+    } else if (UseSHA256Intrinsics) {
+      warning("SHA256 instruction (for SHA-224 and SHA-256) is not available on this CPU.");
+      FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+    }
+    if (UseSHA512Intrinsics) {
+      warning("SHA512 instruction (for SHA-384 and SHA-512) is not available on this CPU.");
+      FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+    }
+  }
+
+  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+    UseMultiplyToLenIntrinsic = true;
+  }
+
+  if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
+    UsePopCountInstruction = true;
+  }
+
+#ifdef COMPILER2
+  if (FLAG_IS_DEFAULT(OptoScheduling)) {
+    OptoScheduling = true;
+  }
+#endif
 }
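Feature detection now combines the kernel's HWCAP auxiliary vector with /proc/cpuinfo for the implementer/variant/part/revision fields. A minimal standalone probe using the same bit positions as the fallback #defines above:

    #include <stdio.h>
    #include <sys/auxv.h>

    int main() {
      unsigned long caps = getauxval(AT_HWCAP);
      printf("aes:%lu sha1:%lu sha2:%lu crc32:%lu\n",
             (caps >> 3) & 1, (caps >> 5) & 1,
             (caps >> 6) & 1, (caps >> 7) & 1);
      return 0;
    }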
 
 void VM_Version::initialize() {
--- a/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -35,6 +35,8 @@
 protected:
   static int _cpu;
   static int _model;
+  static int _variant;
+  static int _revision;
   static int _stepping;
   static int _cpuFeatures;     // features returned by the "cpuid" instruction
                                // 0 if this instruction is not available
@@ -50,7 +52,39 @@
   static void assert_is_initialized() {
   }
 
+  enum {
+    CPU_ARM       = 'A',
+    CPU_BROADCOM  = 'B',
+    CPU_CAVIUM    = 'C',
+    CPU_DEC       = 'D',
+    CPU_INFINEON  = 'I',
+    CPU_MOTOROLA  = 'M',
+    CPU_NVIDIA    = 'N',
+    CPU_AMCC      = 'P',
+    CPU_QUALCOM   = 'Q',
+    CPU_MARVELL   = 'V',
+    CPU_INTEL     = 'i',
+  } cpuFamily;
+
+  enum {
+    CPU_FP           = (1<<0),
+    CPU_ASIMD        = (1<<1),
+    CPU_EVTSTRM      = (1<<2),
+    CPU_AES          = (1<<3),
+    CPU_PMULL        = (1<<4),
+    CPU_SHA1         = (1<<5),
+    CPU_SHA2         = (1<<6),
+    CPU_CRC32        = (1<<7),
+    CPU_A53MAC       = (1 << 30),
+    CPU_DMB_ATOMICS  = (1 << 31),
+  } cpuFeatureFlags;
+
   static const char* cpu_features()           { return _features_str; }
+  static int cpu_family()                     { return _cpu; }
+  static int cpu_model()                      { return _model; }
+  static int cpu_variant()                    { return _variant; }
+  static int cpu_revision()                   { return _revision; }
+  static int cpu_cpuFeatures()                { return _cpuFeatures; }
 
 };
 
--- a/src/cpu/aarch64/vm/vtableStubs_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/aarch64/vm/vtableStubs_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -58,7 +58,8 @@
 
 #ifndef PRODUCT
   if (CountCompiledCalls) {
-    __ increment(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ lea(r19, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ incrementw(Address(r19));
   }
 #endif
 
@@ -73,12 +74,14 @@
   if (DebugVtables) {
     Label L;
     // check offset vs vtable length
-    __ ldrw(rscratch1, Address(r0, InstanceKlass::vtable_length_offset() * wordSize));
+    __ ldrw(rscratch1, Address(r19, InstanceKlass::vtable_length_offset() * wordSize));
     __ cmpw(rscratch1, vtable_index * vtableEntry::size());
     __ br(Assembler::GT, L);
+    __ enter();
     __ mov(r2, vtable_index);
     __ call_VM(noreg,
                CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), j_rarg0, r2);
+    __ leave();
     __ bind(L);
   }
 #endif // PRODUCT
@@ -109,9 +112,6 @@
                   (int)(s->code_end() - __ pc()));
   }
   guarantee(__ pc() <= s->code_end(), "overflowed buffer");
-  // shut the door on sizing bugs
-  int slop = 3;  // 32-bit offset is this much larger than an 8-bit one
-  assert(vtable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
 
   s->set_exception_points(npe_addr, ame_addr);
   return s;
@@ -130,7 +130,8 @@
 
 #ifndef PRODUCT
   if (CountCompiledCalls) {
-    __ increment(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ lea(r10, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+    __ incrementw(Address(r10));
   }
 #endif
 
@@ -190,9 +191,6 @@
                   (int)(s->code_end() - __ pc()));
   }
   guarantee(__ pc() <= s->code_end(), "overflowed buffer");
-  // shut the door on sizing bugs
-  int slop = 3;  // 32-bit offset is this much larger than an 8-bit one
-  assert(itable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
 
   s->set_exception_points(npe_addr, ame_addr);
   return s;
@@ -200,8 +198,49 @@
 
 
 int VtableStub::pd_code_size_limit(bool is_vtable_stub) {
+  int size = DebugVtables ? 216 : 0;
+  if (CountCompiledCalls)
+    size += 6 * 4;
   // FIXME
-  return 200;
+  if (is_vtable_stub)
+    size += 52;
+  else
+    size += 104;
+  return size;
+
+  // In order to tune these parameters, run the JVM with VM options
+  // -XX:+PrintMiscellaneous and -XX:+WizardMode to see information about
+  // actual itable stubs.  Run it with -Xmx31G -XX:+UseCompressedOops.
+  //
+  // If Universe::narrow_klass_base is nonzero, decoding a compressed
+  // class can take several instructions.
+  //
+  // The JVM98 app. _202_jess has a megamorphic interface call.
+  // The itable code looks like this:
+  // Decoding VtableStub itbl[1]@12
+  //     ldr     w10, [x1,#8]
+  //     lsl     x10, x10, #3
+  //     ldr     w11, [x10,#280]
+  //     add     x11, x10, x11, uxtx #3
+  //     add     x11, x11, #0x1b8
+  //     ldr     x12, [x11]
+  //     cmp     x9, x12
+  //     b.eq    success
+  // loop:
+  //     cbz     x12, throw_icce
+  //     add     x11, x11, #0x10
+  //     ldr     x12, [x11]
+  //     cmp     x9, x12
+  //     b.ne    loop
+  // success:
+  //     ldr     x11, [x11,#8]
+  //     ldr     x12, [x10,x11]
+  //     ldr     x8, [x12,#72]
+  //     br      x8
+  // throw_icce:
+  //     b	throw_ICCE_entry
+
 }
 
 int VtableStub::pd_code_alignment() { return 4; }
--- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -2470,7 +2470,8 @@
 
     // Slow case of monitor enter.
     // Inline a special case of call_VM that disallows any pending_exception.
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box);
+    // Arguments are (oop obj, BasicLock* lock, JavaThread* thread).
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box, R16_thread);
 
     __ asm_assert_mem8_is_zero(thread_(pending_exception),
        "no pending exception allowed on exit from SharedRuntime::complete_monitor_unlocking_C", 0);
--- a/src/cpu/x86/vm/x86_64.ad	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/cpu/x86/vm/x86_64.ad	Fri Oct 02 04:37:30 2015 +0100
@@ -3732,6 +3732,23 @@
   %}
 %}
 
+// Indirect Memory Plus Positive Index Register Plus Offset Operand
+operand indPosIndexOffset(any_RegP reg, immL32 off, rRegI idx)
+%{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  predicate(n->in(2)->in(3)->as_Type()->type()->is_long()->_lo >= 0);
+  match(AddP (AddP reg (ConvI2L idx)) off);
+
+  op_cost(10);
+  format %{"[$reg + $off + $idx]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($idx);
+    scale(0x0);
+    disp($off);
+  %}
+%}
+
 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 operand indPosIndexScaleOffset(any_RegP reg, immL32 off, rRegI idx, immI2 scale)
 %{
@@ -3883,6 +3900,23 @@
   %}
 %}
 
+// Indirect Memory Plus Positive Index Register Plus Offset Operand
+operand indPosIndexOffsetNarrow(rRegN reg, immL32 off, rRegI idx)
+%{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  predicate(Universe::narrow_oop_shift() == 0 && n->in(2)->in(3)->as_Type()->type()->is_long()->_lo >= 0);
+  match(AddP (AddP (DecodeN reg) (ConvI2L idx)) off);
+
+  op_cost(10);
+  format %{"[$reg + $off + $idx]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($idx);
+    scale(0x0);
+    disp($off);
+  %}
+%}
+
 // Indirect Memory Times Scale Plus Positive Index Register Plus Offset Operand
 operand indPosIndexScaleOffsetNarrow(rRegN reg, immL32 off, rRegI idx, immI2 scale)
 %{
@@ -4074,11 +4108,11 @@
 // case of this is memory operands.
 
 opclass memory(indirect, indOffset8, indOffset32, indIndexOffset, indIndex,
-               indIndexScale, indIndexScaleOffset, indPosIndexScaleOffset,
+               indIndexScale, indIndexScaleOffset, indPosIndexOffset, indPosIndexScaleOffset,
                indCompressedOopOffset,
                indirectNarrow, indOffset8Narrow, indOffset32Narrow,
                indIndexOffsetNarrow, indIndexNarrow, indIndexScaleNarrow,
-               indIndexScaleOffsetNarrow, indPosIndexScaleOffsetNarrow);
+               indIndexScaleOffsetNarrow, indPosIndexOffsetNarrow, indPosIndexScaleOffsetNarrow);
 
 //----------PIPELINE-----------------------------------------------------------
 // Rules which define the behavior of the target architectures pipeline.
@@ -5112,6 +5146,17 @@
   ins_pipe(ialu_reg_reg_fat);
 %}
 
+instruct leaPPosIdxOff(rRegP dst, indPosIndexOffset mem)
+%{
+  match(Set dst mem);
+
+  ins_cost(110);
+  format %{ "leaq    $dst, $mem\t# ptr posidxoff" %}
+  opcode(0x8D);
+  ins_encode(REX_reg_mem_wide(dst, mem), OpcP, reg_mem(dst, mem));
+  ins_pipe(ialu_reg_reg_fat);
+%}
+
 instruct leaPPosIdxScaleOff(rRegP dst, indPosIndexScaleOffset mem)
 %{
   match(Set dst mem);
@@ -5196,6 +5241,18 @@
   ins_pipe(ialu_reg_reg_fat);
 %}
 
+instruct leaPPosIdxOffNarrow(rRegP dst, indPosIndexOffsetNarrow mem)
+%{
+  predicate(Universe::narrow_oop_shift() == 0);
+  match(Set dst mem);
+
+  ins_cost(110);
+  format %{ "leaq    $dst, $mem\t# ptr posidxoffnarrow" %}
+  opcode(0x8D);
+  ins_encode(REX_reg_mem_wide(dst, mem), OpcP, reg_mem(dst, mem));
+  ins_pipe(ialu_reg_reg_fat);
+%}
+
 instruct leaPPosIdxScaleOffNarrow(rRegP dst, indPosIndexScaleOffsetNarrow mem)
 %{
   predicate(Universe::narrow_oop_shift() == 0);
--- a/src/os/linux/vm/os_linux.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os/linux/vm/os_linux.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -5953,22 +5953,6 @@
 
 extern char** environ;
 
-#ifndef __NR_fork
-#ifdef BUILTIN_SIM
-#define __NR_fork 57
-#else
-#define __NR_fork IA32_ONLY(2) IA64_ONLY(not defined) AMD64_ONLY(57) AARCH64_ONLY(1079)
-#endif
-#endif
-
-#ifndef __NR_execve
-#ifdef BUILTIN_SIM
-#define __NR_execve 59
-#else
-#define __NR_execve IA32_ONLY(11) IA64_ONLY(1033) AMD64_ONLY(59) AARCH64_ONLY(221)
-#endif
-#endif
-
 // Run the specified command in a separate process. Return its exit value,
 // or -1 on failure (e.g. can't fork a new process).
 // Unlike system(), this function can be called from signal handler. It
--- a/src/os_cpu/linux_aarch64/vm/atomic_linux_aarch64.inline.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os_cpu/linux_aarch64/vm/atomic_linux_aarch64.inline.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -31,6 +31,10 @@
 
 // Implementation of class atomic
 
+#define FULL_MEM_BARRIER  __sync_synchronize()
+#define READ_MEM_BARRIER  __atomic_thread_fence(__ATOMIC_ACQUIRE);
+#define WRITE_MEM_BARRIER __atomic_thread_fence(__ATOMIC_RELEASE);
+
 inline void Atomic::store    (jbyte    store_value, jbyte*    dest) { *dest = store_value; }
 inline void Atomic::store    (jshort   store_value, jshort*   dest) { *dest = store_value; }
 inline void Atomic::store    (jint     store_value, jint*     dest) { *dest = store_value; }
@@ -71,7 +75,9 @@
 
 inline jint Atomic::xchg (jint exchange_value, volatile jint* dest)
 {
- return __sync_lock_test_and_set (dest, exchange_value);
+  jint res = __sync_lock_test_and_set (dest, exchange_value);
+  FULL_MEM_BARRIER;
+  return res;
 }
 
 inline void* Atomic::xchg_ptr(void* exchange_value, volatile void* dest)
@@ -111,7 +117,9 @@
 
 inline intptr_t Atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest)
 {
- return __sync_lock_test_and_set (dest, exchange_value);
+  intptr_t res = __sync_lock_test_and_set (dest, exchange_value);
+  FULL_MEM_BARRIER;
+  return res;
 }
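The added FULL_MEM_BARRIER matters because GCC documents __sync_lock_test_and_set as an acquire barrier only, while Atomic::xchg is expected to act as a full two-way fence:

    // Acquire-only exchange upgraded to a full fence, as in the hunks above.
    static inline int full_fence_xchg(volatile int* dest, int value) {
      int res = __sync_lock_test_and_set(dest, value);  // acquire semantics only
      __sync_synchronize();                             // full barrier
      return res;
    }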
 
 inline jlong Atomic::cmpxchg (jlong exchange_value, volatile jlong* dest, jlong compare_value)
--- a/src/os_cpu/linux_aarch64/vm/globals_linux_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os_cpu/linux_aarch64/vm/globals_linux_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -39,4 +39,6 @@
 // Used on 64 bit platforms for UseCompressedOops base address
 define_pd_global(uintx,HeapBaseMinAddress,       2*G);
 
+extern __thread Thread *aarch64_currentThread;
+
 #endif // OS_CPU_LINUX_AARCH64_VM_GLOBALS_LINUX_AARCH64_HPP
--- a/src/os_cpu/linux_aarch64/vm/orderAccess_linux_aarch64.inline.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os_cpu/linux_aarch64/vm/orderAccess_linux_aarch64.inline.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -31,10 +31,6 @@
 #include "runtime/os.hpp"
 #include "vm_version_aarch64.hpp"
 
-#define FULL_MEM_BARRIER  __sync_synchronize()
-#define READ_MEM_BARRIER  __atomic_thread_fence(__ATOMIC_ACQUIRE);
-#define WRITE_MEM_BARRIER __atomic_thread_fence(__ATOMIC_RELEASE);
-
 // Implementation of class OrderAccess.
 
 inline void OrderAccess::loadload()   { acquire(); }
--- a/src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os_cpu/linux_aarch64/vm/os_linux_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -172,10 +172,14 @@
   return frame(sp, fp, epc.pc());
 }
 
-// By default, gcc always save frame pointer (%ebp/%rbp) on stack. It may get
-// turned off by -fomit-frame-pointer,
+// By default, gcc always saves the frame pointer rfp on the stack. This
+// may get turned off by -fomit-frame-pointer.
 frame os::get_sender_for_C_frame(frame* fr) {
+#ifdef BUILTIN_SIM
   return frame(fr->sender_sp(), fr->link(), fr->sender_pc());
+#else
+  return frame(fr->link(), fr->link(), fr->sender_pc());
+#endif
 }
 
 intptr_t* _get_previous_fp() {
--- a/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -26,32 +26,6 @@
 #include "runtime/threadLocalStorage.hpp"
 #include "runtime/thread.inline.hpp"
 
-// Map stack pointer (%esp) to thread pointer for faster TLS access
-//
-// Here we use a flat table for better performance. Getting current thread
-// is down to one memory access (read _sp_map[%esp>>12]) in generated code
-// and two in runtime code (-fPIC code needs an extra load for _sp_map).
-//
-// This code assumes stack page is not shared by different threads. It works
-// in 32-bit VM when page size is 4K (or a multiple of 4K, if that matters).
-//
-// Notice that _sp_map is allocated in the bss segment, which is ZFOD
-// (zero-fill-on-demand). While it reserves 4M address space upfront,
-// actual memory pages are committed on demand.
-//
-// If an application creates and destroys a lot of threads, usually the
-// stack space freed by a thread will soon get reused by new thread
-// (this is especially true in NPTL or LinuxThreads in fixed-stack mode).
-// No memory page in _sp_map is wasted.
-//
-// However, it's still possible that we might end up populating &
-// committing a large fraction of the 4M table over time, but the actual
-// amount of live data in the table could be quite small. The max wastage
-// is less than 4M bytes. If it becomes an issue, we could use madvise()
-// with MADV_DONTNEED to reclaim unused (i.e. all-zero) pages in _sp_map.
-// MADV_DONTNEED on Linux keeps the virtual memory mapping, but zaps the
-// physical memory page (i.e. similar to MADV_FREE on Solaris).
-
 void ThreadLocalStorage::generate_code_for_get_thread() {
     // nothing we can do here for user-level thread
 }
@@ -59,6 +33,9 @@
 void ThreadLocalStorage::pd_init() {
 }
 
+__thread Thread *aarch64_currentThread;
+
 void ThreadLocalStorage::pd_set_thread(Thread* thread) {
   os::thread_local_storage_at_put(ThreadLocalStorage::thread_index(), thread);
+  aarch64_currentThread = thread;
 }
--- a/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/os_cpu/linux_aarch64/vm/threadLS_linux_aarch64.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -29,8 +29,8 @@
 
 public:
 
-   static Thread* thread() {
-     return (Thread*) os::thread_local_storage_at(thread_index());
-   }
+  static Thread *thread() {
+    return aarch64_currentThread;
+  }
 
 #endif // OS_CPU_LINUX_AARCH64_VM_THREADLS_LINUX_AARCH64_HPP
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -2102,13 +2102,6 @@
     addr = new LIR_Address(base_op, index_op->as_jint(), dst_type);
   } else {
 #if defined(X86) || defined(AARCH64)
-#ifdef _LP64
-    if (!index_op->is_illegal() && index_op->type() == T_INT) {
-      LIR_Opr tmp = new_pointer_register();
-      __ convert(Bytecodes::_i2l, index_op, tmp);
-      index_op = tmp;
-    }
-#endif
     addr = new LIR_Address(base_op, index_op, LIR_Address::Scale(log2_scale), 0, dst_type);
 #elif defined(GENERATE_ADDRESS_IS_PREFERRED)
     addr = generate_address(base_op, index_op, log2_scale, 0, dst_type);
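
The _LP64 block deleted above widened a T_INT index to a 64-bit register
before forming the LIR_Address. Presumably the AArch64 and x86_64 backends
can absorb that widening into the addressing mode itself; AArch64 loads, for
instance, accept a 32-bit index register with a sign-extend-and-scale
modifier. A C++ analogue of the shape involved (illustrative, not LIR):

    // Compilers targeting AArch64 typically emit a single instruction here:
    //   ldr x0, [x0, w1, sxtw #3]
    // i.e. the i2l conversion happens inside the load's addressing mode.
    long load_scaled(const long* base, int index) {
      return base[index];
    }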
--- a/src/share/vm/code/nmethod.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/code/nmethod.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -47,12 +47,12 @@
 #include "shark/sharkCompiler.hpp"
 #endif
 
+PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
+
 #ifdef BUILTIN_SIM
 #include "../../../../../simulator/simulator.hpp"
 #endif
 
-PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
-
 unsigned char nmethod::_global_unloading_clock = 0;
 
 #ifdef DTRACE_ENABLED
--- a/src/share/vm/memory/metaspace.cpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/memory/metaspace.cpp	Fri Oct 02 04:37:30 2015 +0100
@@ -3018,10 +3018,50 @@
   // Don't use large pages for the class space.
   bool large_pages = false;
 
+#ifndef AARCH64
   ReservedSpace metaspace_rs = ReservedSpace(compressed_class_space_size(),
                                              _reserve_alignment,
                                              large_pages,
                                              requested_addr, 0);
+#else // AARCH64
+  ReservedSpace metaspace_rs;
+
+  // Our compressed klass pointers may fit nicely into the lower 32
+  // bits.
+  if ((uint64_t)requested_addr + compressed_class_space_size() < 4*G)
+    metaspace_rs = ReservedSpace(compressed_class_space_size(),
+                                 _reserve_alignment,
+                                 large_pages,
+                                 requested_addr, 0);
+
+  if (!metaspace_rs.is_reserved()) {
+    // Try to align metaspace so that we can decode a compressed klass
+    // with a single MOVK instruction.  We can do this iff the
+    // compressed class base is a multiple of 4G.
+    for (char *a = (char*)align_ptr_up(requested_addr, 4*G);
+         a < (char*)(1024*G);
+         a += 4*G) {
+      if (UseSharedSpaces
+          && !can_use_cds_with_metaspace_addr(a, cds_base)) {
+        // We failed to find an aligned base that will reach.  Fall
+        // back to using our requested addr.
+        metaspace_rs = ReservedSpace(compressed_class_space_size(),
+                                     _reserve_alignment,
+                                     large_pages,
+                                     requested_addr, 0);
+        break;
+      }
+      metaspace_rs = ReservedSpace(compressed_class_space_size(),
+                                   _reserve_alignment,
+                                   large_pages,
+                                   a, 0);
+      if (metaspace_rs.is_reserved())
+        break;
+    }
+  }
+
+#endif // AARCH64
+
   if (!metaspace_rs.is_reserved()) {
 #if INCLUDE_CDS
     if (UseSharedSpaces) {
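
The point of the 4G-aligned search above: when the compressed class base is a
multiple of 4G (and below 2^48), its low 32 bits are zero, so decoding a
narrow klass is a plain OR that AArch64 can express as one MOVK into bits
32..47 of the register already holding the narrow value. A sketch of the
arithmetic; decode_klass and its parameters are illustrative names:

    #include <cassert>
    #include <cstdint>

    static inline uintptr_t decode_klass(uint32_t narrow_klass, uintptr_t base) {
      assert((base & 0xffffffffUL) == 0);  // base is a multiple of 4G
      // In AArch64 assembly, with w0 holding narrow_klass:
      //   movk x0, #(base >> 32), lsl #32
      return base | (uintptr_t)narrow_klass;
    }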
--- a/src/share/vm/memory/metaspaceShared.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/memory/metaspaceShared.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -126,10 +126,6 @@
 
   static void print_shared_spaces();
 
-#if defined(BUILTIN_SIM)
-  static void relocate_vtbl_list(char **buffer);
-#endif
-
   static bool try_link_class(InstanceKlass* ik, TRAPS);
   static void link_one_shared_class(Klass* obj, TRAPS);
   static void check_one_shared_class(Klass* obj);
@@ -137,5 +133,9 @@
 
   static int count_class(const char* classlist_file);
   static void estimate_regions_size() NOT_CDS_RETURN;
+
+#if defined(BUILTIN_SIM)
+  static void relocate_vtbl_list(char **buffer);
+#endif
 };
 #endif // SHARE_VM_MEMORY_METASPACE_SHARED_HPP
--- a/src/share/vm/opto/c2_globals.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/opto/c2_globals.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -659,9 +659,6 @@
   product(bool, UseMathExactIntrinsics, true,                               \
           "Enables intrinsification of various java.lang.Math functions")   \
                                                                             \
-  product(bool, UseMultiplyToLenIntrinsic, false,                           \
-          "Enables intrinsification of BigInteger.multiplyToLen()")         \
-                                                                            \
   product(bool, UseTypeSpeculation, true,                                   \
           "Speculatively propagate types from profiles")                    \
                                                                             \
--- a/src/share/vm/runtime/globals.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/runtime/globals.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -732,6 +732,9 @@
   product(bool, UseCRC32Intrinsics, false,                                  \
           "use intrinsics for java.util.zip.CRC32")                         \
                                                                             \
+  product(bool, UseMultiplyToLenIntrinsic, false,                           \
+          "Enables intrinsification of BigInteger.multiplyToLen()")         \
+                                                                            \
   develop(bool, TraceCallFixup, false,                                      \
           "Trace all call fixups")                                          \
                                                                             \
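
A note on the relocation of UseMultiplyToLenIntrinsic from c2_globals.hpp:
flags declared there exist only in builds that include the C2 compiler, so
moving the flag into the shared globals.hpp presumably lets CPU-feature setup
code that runs in every build variant reference it. A hedged sketch of such a
guard (the actual setup site is outside this changeset; FLAG_IS_DEFAULT and
FLAG_SET_DEFAULT are the standard HotSpot macros):

    // Enable the intrinsic by default on this CPU unless the user already
    // set it explicitly on the command line.
    if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
      FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, true);
    }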
--- a/src/share/vm/runtime/orderAccess.inline.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/runtime/orderAccess.inline.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -41,6 +41,9 @@
 #ifdef TARGET_OS_ARCH_linux_arm
 # include "orderAccess_linux_arm.inline.hpp"
 #endif
+#ifdef TARGET_OS_ARCH_linux_aarch64
+# include "orderAccess_linux_aarch64.inline.hpp"
+#endif
 #ifdef TARGET_OS_ARCH_linux_ppc
 # include "orderAccess_linux_ppc.inline.hpp"
 #endif
--- a/src/share/vm/runtime/prefetch.inline.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/runtime/prefetch.inline.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -40,6 +40,9 @@
 #ifdef TARGET_OS_ARCH_linux_arm
 # include "prefetch_linux_arm.inline.hpp"
 #endif
+#ifdef TARGET_OS_ARCH_linux_aarch64
+# include "prefetch_linux_aarch64.inline.hpp"
+#endif
 #ifdef TARGET_OS_ARCH_linux_ppc
 # include "prefetch_linux_ppc.inline.hpp"
 #endif
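
The include added above pulls in the port's Prefetch implementation. As a
hedged stand-in for what such a header provides, a sketch using the GCC
builtin, which on AArch64 lowers to prfm instructions (names illustrative):

    static inline void prefetch_read(const void* loc, long interval) {
      // __builtin_prefetch(addr, rw, locality): rw = 0 for read,
      // locality = 3 to keep the line in L1 (typically prfm pldl1keep).
      __builtin_prefetch((const char*)loc + interval, 0, 3);
    }
    static inline void prefetch_write(void* loc, long interval) {
      // rw = 1 for write (typically prfm pstl1keep).
      __builtin_prefetch((char*)loc + interval, 1, 3);
    }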
--- a/src/share/vm/runtime/thread.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/runtime/thread.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -1050,7 +1050,7 @@
   address last_Java_pc(void)                     { return _anchor.last_Java_pc(); }
 
   // Safepoint support
-#ifndef PPC64
+#if !(defined(PPC64) || defined(AARCH64))
   JavaThreadState thread_state() const           { return _thread_state; }
   void set_thread_state(JavaThreadState s)       { _thread_state = s;    }
 #else
--- a/src/share/vm/runtime/thread.inline.hpp	Wed Sep 30 16:43:15 2015 +0100
+++ b/src/share/vm/runtime/thread.inline.hpp	Fri Oct 02 04:37:30 2015 +0100
@@ -59,7 +59,7 @@
   return allocated_bytes;
 }
 
-#ifdef PPC64
+#if defined(PPC64) || defined (AARCH64)
 inline JavaThreadState JavaThread::thread_state() const    {
   return (JavaThreadState) OrderAccess::load_acquire((volatile jint*)&_thread_state);
 }
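
Context for the hunk above: on weakly ordered CPUs such as PPC64 and AArch64
the safepoint machinery and the thread itself race on _thread_state, so the
getter reads with load_acquire and the corresponding setter (not shown in
this hunk) publishes with a release store. The same pairing in C++11 atomics,
as a minimal standalone sketch:

    #include <atomic>

    std::atomic<int> thread_state;

    void set_state(int s) {
      thread_state.store(s, std::memory_order_release);    // AArch64: stlr
    }
    int get_state() {
      return thread_state.load(std::memory_order_acquire); // AArch64: ldar
    }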
--- a/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java	Fri Oct 02 04:37:30 2015 +0100
@@ -34,6 +34,7 @@
  *      -XX:CompileCommand=inline,java.math.BigInteger::multiply TestMultiplyToLen
  */
 
+import java.util.Arrays;
 import java.util.Random;
 import java.math.*;
 
@@ -97,12 +98,36 @@
         newsum = newsum.add(newres);
 
         if (!bytecompare(oldres,newres)) {
+          System.out.println(b1);
+          System.out.println(b2);
+          System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+          throw new Exception("Failed");
+        }
+      }
+
+      // Test carry propagation.  Multiple carries during bignum
+      // multiplication are rare (especially when using 64-bit
+      // arithmetic) so we have to provoke them deliberately.
+      for (int j = 4; j <= 396; j += 4) {
+        byte[] bytes = new byte[j];
+        Arrays.fill(bytes, (byte)255);
+        b1 = new BigInteger(1, bytes); // signum 1: magnitude 2^(8*j) - 1, all-ones limbs
+        b2 = new BigInteger(1, bytes);
+
+        oldres = base_multiply(b1,b2);
+        newres = new_multiply(b1,b2);
+
+        oldsum = oldsum.add(oldres);
+        newsum = newsum.add(newres);
+
+        if (!bytecompare(oldres,newres)) {
           System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
           System.out.println(b1);
           System.out.println(b2);
           throw new Exception("Failed");
         }
       }
+
       if (!bytecompare(oldsum,newsum))  {
         System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
         throw new Exception("Failed");
--- a/test/compiler/intrinsics/sha/cli/SHAOptionsBase.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/SHAOptionsBase.java	Fri Oct 02 04:37:30 2015 +0100
@@ -95,6 +95,19 @@
                 default:
                     throw new Error("Unexpected option " + optionName);
             }
+        } else if (Platform.isAArch64()) {
+            switch (optionName) {
+                case SHAOptionsBase.USE_SHA_OPTION:
+                    return SHAOptionsBase.SHA_INSTRUCTIONS_ARE_NOT_AVAILABLE;
+                case SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION:
+                    return SHAOptionsBase.SHA1_INSTRUCTION_IS_NOT_AVAILABLE;
+                case SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION:
+                    return SHAOptionsBase.SHA256_INSTRUCTION_IS_NOT_AVAILABLE;
+                case SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION:
+                    return SHAOptionsBase.SHA512_INSTRUCTION_IS_NOT_AVAILABLE;
+                default:
+                    throw new Error("Unexpected option " + optionName);
+            }
         } else {
             throw new Error("Support for CPUs other then X86 or SPARC is not "
                     + "implemented.");
--- a/test/compiler/intrinsics/sha/cli/TestUseSHA1IntrinsicsOptionOnSupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHA1IntrinsicsOptionOnSupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -34,7 +34,10 @@
  */
 public class TestUseSHA1IntrinsicsOptionOnSupportedCPU {
     public static void main(String args[]) throws Throwable {
-        new SHAOptionsBase(new GenericTestCaseForSupportedSparcCPU(
-                SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION)).test();
+        new SHAOptionsBase(
+                new GenericTestCaseForSupportedSparcCPU(
+                        SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION),
+                new GenericTestCaseForSupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION)).test();
     }
 }
--- a/test/compiler/intrinsics/sha/cli/TestUseSHA1IntrinsicsOptionOnUnsupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHA1IntrinsicsOptionOnUnsupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -40,6 +40,8 @@
                         SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION),
                 new UseSHAIntrinsicsSpecificTestCaseForUnsupportedSparcCPU(
                         SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION),
+                new GenericTestCaseForUnsupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION),
                 new GenericTestCaseForUnsupportedX86CPU(
                         SHAOptionsBase.USE_SHA1_INTRINSICS_OPTION),
                 new GenericTestCaseForOtherCPU(
--- a/test/compiler/intrinsics/sha/cli/TestUseSHA256IntrinsicsOptionOnSupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHA256IntrinsicsOptionOnSupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -35,7 +35,10 @@
  */
 public class TestUseSHA256IntrinsicsOptionOnSupportedCPU {
     public static void main(String args[]) throws Throwable {
-        new SHAOptionsBase(new GenericTestCaseForSupportedSparcCPU(
-                SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION)).test();
+        new SHAOptionsBase(
+                new GenericTestCaseForSupportedSparcCPU(
+                        SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION),
+                new GenericTestCaseForSupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION)).test();
     }
 }
--- a/test/compiler/intrinsics/sha/cli/TestUseSHA256IntrinsicsOptionOnUnsupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHA256IntrinsicsOptionOnUnsupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -40,6 +40,8 @@
                         SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION),
                 new UseSHAIntrinsicsSpecificTestCaseForUnsupportedSparcCPU(
                         SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION),
+                new GenericTestCaseForUnsupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION),
                 new GenericTestCaseForUnsupportedX86CPU(
                         SHAOptionsBase.USE_SHA256_INTRINSICS_OPTION),
                 new GenericTestCaseForOtherCPU(
--- a/test/compiler/intrinsics/sha/cli/TestUseSHA512IntrinsicsOptionOnSupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHA512IntrinsicsOptionOnSupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -35,7 +35,10 @@
  */
 public class TestUseSHA512IntrinsicsOptionOnSupportedCPU {
     public static void main(String args[]) throws Throwable {
-        new SHAOptionsBase(new GenericTestCaseForSupportedSparcCPU(
-                SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION)).test();
+        new SHAOptionsBase(
+                new GenericTestCaseForSupportedSparcCPU(
+                        SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION),
+                new GenericTestCaseForSupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION)).test();
     }
 }
--- a/test/compiler/intrinsics/sha/cli/TestUseSHA512IntrinsicsOptionOnUnsupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHA512IntrinsicsOptionOnUnsupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -40,6 +40,8 @@
                         SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION),
                 new UseSHAIntrinsicsSpecificTestCaseForUnsupportedSparcCPU(
                         SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION),
+                new GenericTestCaseForUnsupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION),
                 new GenericTestCaseForUnsupportedX86CPU(
                         SHAOptionsBase.USE_SHA512_INTRINSICS_OPTION),
                 new GenericTestCaseForOtherCPU(
--- a/test/compiler/intrinsics/sha/cli/TestUseSHAOptionOnSupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHAOptionOnSupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -38,6 +38,8 @@
                 new GenericTestCaseForSupportedSparcCPU(
                         SHAOptionsBase.USE_SHA_OPTION),
                 new UseSHASpecificTestCaseForSupportedSparcCPU(
+                        SHAOptionsBase.USE_SHA_OPTION),
+                new GenericTestCaseForSupportedAArch64CPU(
                         SHAOptionsBase.USE_SHA_OPTION)).test();
     }
 }
--- a/test/compiler/intrinsics/sha/cli/TestUseSHAOptionOnUnsupportedCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/TestUseSHAOptionOnUnsupportedCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -39,6 +39,8 @@
                         SHAOptionsBase.USE_SHA_OPTION),
                 new UseSHASpecificTestCaseForUnsupportedSparcCPU(
                         SHAOptionsBase.USE_SHA_OPTION),
+                new GenericTestCaseForUnsupportedAArch64CPU(
+                        SHAOptionsBase.USE_SHA_OPTION),
                 new GenericTestCaseForUnsupportedX86CPU(
                         SHAOptionsBase.USE_SHA_OPTION),
                 new GenericTestCaseForOtherCPU(
--- a/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForOtherCPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -36,7 +36,8 @@
     public GenericTestCaseForOtherCPU(String optionName) {
-        // Execute the test case on any CPU except SPARC and X86
+        // Execute the test case on any CPU except AArch64, SPARC and X86
         super(optionName, new NotPredicate(new OrPredicate(Platform::isSparc,
-                new OrPredicate(Platform::isX64, Platform::isX86))));
+                new OrPredicate(Platform::isAArch64,
+                        new OrPredicate(Platform::isX64, Platform::isX86)))));
     }
 
     @Override
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForSupportedAArch64CPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import com.oracle.java.testlibrary.ExitCode;
+import com.oracle.java.testlibrary.Platform;
+import com.oracle.java.testlibrary.cli.CommandLineOptionTest;
+import com.oracle.java.testlibrary.cli.predicate.AndPredicate;
+
+/**
+ * Generic test case for SHA-related options targeted to AArch64 CPUs which
+ * support the instructions required by the tested option.
+ */
+public class GenericTestCaseForSupportedAArch64CPU extends
+        SHAOptionsBase.TestCase {
+    public GenericTestCaseForSupportedAArch64CPU(String optionName) {
+        super(optionName, new AndPredicate(Platform::isAArch64,
+                SHAOptionsBase.getPredicateForOption(optionName)));
+    }
+
+    @Override
+    protected void verifyWarnings() throws Throwable {
+        // Verify that there are no warnings when the option is explicitly enabled.
+        CommandLineOptionTest.verifySameJVMStartup(null, new String[] {
+                        SHAOptionsBase.getWarningForUnsupportedCPU(optionName)
+                }, ExitCode.OK,
+                CommandLineOptionTest.prepareBooleanFlag(optionName, true));
+
+        // Verify that the option can be disabled even if +UseSHA was
+        // passed to the JVM.
+        CommandLineOptionTest.verifySameJVMStartup(null, new String[] {
+                        SHAOptionsBase.getWarningForUnsupportedCPU(optionName)
+                }, ExitCode.OK,
+                CommandLineOptionTest.prepareBooleanFlag(
+                        SHAOptionsBase.USE_SHA_OPTION, true),
+                CommandLineOptionTest.prepareBooleanFlag(optionName, false));
+
+        // Verify that it is possible to enable the tested option and disable
+        // all SHA intrinsics via -UseSHA without any warnings.
+        CommandLineOptionTest.verifySameJVMStartup(null, new String[] {
+                        SHAOptionsBase.getWarningForUnsupportedCPU(optionName)
+                }, ExitCode.OK,
+                CommandLineOptionTest.prepareBooleanFlag(
+                        SHAOptionsBase.USE_SHA_OPTION, false),
+                CommandLineOptionTest.prepareBooleanFlag(optionName, true));
+    }
+
+    @Override
+    protected void verifyOptionValues() throws Throwable {
+        // Verify that on a supported CPU the option is enabled by default.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "true");
+
+        // Verify that it is possible to explicitly enable the option.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "true",
+                CommandLineOptionTest.prepareBooleanFlag(optionName, true));
+
+        // Verify that it is possible to explicitly disable the option.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "false",
+                CommandLineOptionTest.prepareBooleanFlag(optionName, false));
+
+        // Verify that the option is disabled when -UseSHA is passed, even
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "false",
+                CommandLineOptionTest.prepareBooleanFlag(optionName, true),
+                CommandLineOptionTest.prepareBooleanFlag(
+                        SHAOptionsBase.USE_SHA_OPTION, false));
+
+        // Verify that it is possible to explicitly disable the tested option
+        // even if +UseSHA was passed to the JVM.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "false",
+                CommandLineOptionTest.prepareBooleanFlag(
+                        SHAOptionsBase.USE_SHA_OPTION, true),
+                CommandLineOptionTest.prepareBooleanFlag(optionName, false));
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/intrinsics/sha/cli/testcases/GenericTestCaseForUnsupportedAArch64CPU.java	Fri Oct 02 04:37:30 2015 +0100
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import com.oracle.java.testlibrary.ExitCode;
+import com.oracle.java.testlibrary.Platform;
+import com.oracle.java.testlibrary.cli.CommandLineOptionTest;
+import com.oracle.java.testlibrary.cli.predicate.AndPredicate;
+import com.oracle.java.testlibrary.cli.predicate.NotPredicate;
+
+/**
+ * Generic test case for SHA-related options targeted to AArch64 CPUs which
+ * don't support the instructions required by the tested option.
+ */
+public class GenericTestCaseForUnsupportedAArch64CPU extends
+        SHAOptionsBase.TestCase {
+    public GenericTestCaseForUnsupportedAArch64CPU(String optionName) {
+        super(optionName, new AndPredicate(Platform::isAArch64,
+                new NotPredicate(SHAOptionsBase.getPredicateForOption(
+                        optionName))));
+    }
+
+    @Override
+    protected void verifyWarnings() throws Throwable {
+        // Verify that the option can be disabled without any warnings.
+        CommandLineOptionTest.verifySameJVMStartup(null, new String[] {
+                        SHAOptionsBase.getWarningForUnsupportedCPU(optionName)
+                }, ExitCode.OK,
+                CommandLineOptionTest.prepareBooleanFlag(optionName, false));
+    }
+
+    @Override
+    protected void verifyOptionValues() throws Throwable {
+        // Verify that the option is disabled by default.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "false");
+
+        // Verify that the option is disabled even if it was explicitly enabled
+        // using CLI options.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "false",
+                CommandLineOptionTest.prepareBooleanFlag(optionName, true));
+
+        // Verify that the option is disabled when +UseSHA was passed to the JVM.
+        CommandLineOptionTest.verifyOptionValueForSameVM(optionName, "false",
+                CommandLineOptionTest.prepareBooleanFlag(
+                        SHAOptionsBase.USE_SHA_OPTION, true));
+    }
+}
--- a/test/compiler/stable/StableConfiguration.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/stable/StableConfiguration.java	Fri Oct 02 04:37:30 2015 +0100
@@ -41,10 +41,30 @@
         System.out.println("Server Compiler: " + get());
     }
 
+    // The method 'get' below returns true if the method is server compiled
+    // and is used by the Stable tests to determine whether methods in
+    // general are being server compiled or not, as the -XX:+FoldStableValues
+    // option is only applicable to -server.
+    //
+    // On aarch64 we deoptimize when patching. This means that a method
+    // compiled as a result of -Xcomp deoptimizes immediately, so
+    // getMethodCompilationLevel returns 0 and get() falls back to
+    // answering based on java.vm.name.
+    //
+    // However, when the tests are run with -XX:+TieredCompilation and
+    // -XX:TieredStopAtLevel=1 this fails, because methods will always
+    // be client compiled.
+    //
+    // The solution is to add a simple method 'get1' which should never
+    // be deoptimized, and to use it to determine the compilation level.
+    static void get1() {
+    }
+
     // ::get() is among immediately compiled methods.
     static boolean get() {
         try {
-            Method m = StableConfiguration.class.getDeclaredMethod("get");
+            get1();
+            Method m = StableConfiguration.class.getDeclaredMethod("get1");
             int level = WB.getMethodCompilationLevel(m);
             if (level > 0) {
               return (level == 4);
--- a/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java	Wed Sep 30 16:43:15 2015 +0100
+++ b/test/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java	Fri Oct 02 04:37:30 2015 +0100
@@ -59,16 +59,25 @@
     };
 
     public static final BooleanSupplier SHA1_INSTRUCTION_AVAILABLE
-            = new CPUSpecificPredicate("sparc.*", new String[] { "sha1" },
-                    null);
+            = new OrPredicate(
+                    new CPUSpecificPredicate("sparc.*", new String[] { "sha1" },
+                            null),
+                    new CPUSpecificPredicate("aarch64", new String[] { "sha1" },
+                            null));
 
     public static final BooleanSupplier SHA256_INSTRUCTION_AVAILABLE
-            = new CPUSpecificPredicate("sparc.*", new String[] { "sha256" },
-                    null);
+            = new OrPredicate(
+                    new CPUSpecificPredicate("sparc.*", new String[] { "sha256" },
+                            null),
+                    new CPUSpecificPredicate("aarch64", new String[] { "sha256" },
+                            null));
 
     public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE
-            = new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },
-                    null);
+            = new OrPredicate(
+                    new CPUSpecificPredicate("sparc.*", new String[] { "sha512" },
+                            null),
+                    new CPUSpecificPredicate("aarch64", new String[] { "sha512" },
+                            null));
 
     public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
             = new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,