changeset 9786:c036c7f17e09

Merge
author aph
date Tue, 15 Dec 2015 13:42:13 +0000
parents 3455d28791c8 (current diff) 89d360cf87ac (diff)
children 682b031faa67
diffstat 40 files changed, 848 insertions(+), 148 deletions(-)
--- a/make/test/JtregNative.gmk	Mon Dec 14 15:53:48 2015 +0000
+++ b/make/test/JtregNative.gmk	Tue Dec 15 13:42:13 2015 +0000
@@ -46,6 +46,7 @@
     $(HOTSPOT_TOPDIR)/test/runtime/jni/8033445 \
     $(HOTSPOT_TOPDIR)/test/runtime/jni/ToStringInInterfaceTest \
     $(HOTSPOT_TOPDIR)/test/runtime/SameObject \
+    $(HOTSPOT_TOPDIR)/test/compiler/floatingpoint/ \
     #
 
 # Add conditional directories here when needed.
--- a/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -28,6 +28,10 @@
 
 const int StackAlignmentInBytes  = 16;
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values be extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #define SUPPORTS_NATIVE_CX8
 
 // The maximum B/BL offset range on AArch64 is 128MB.
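For context, a minimal sketch of what this new constant governs, with hypothetical helper names (not from this change): platform-independent calling-convention code can consult it when an int argument is placed in a native C argument slot.

    // Hypothetical illustration only (helper names are not from this change):
    static void pass_int_in_c_arg_register(MacroAssembler* masm, Register reg) {
      if (CCallingConventionRequiresIntsAsLongs) {
        // e.g. PPC, where the constant is true: the callee reads the full
        // 64-bit register, so the 32-bit value must be widened first
        masm->sign_extend_int(reg);   // hypothetical helper
      }
      // otherwise (AArch64, x86, SPARC in this change) the callee reads only
      // the low 32 bits and no extension is required
    }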
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -182,6 +182,11 @@
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
   }
 
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   if (auxv & HWCAP_AES) {
     UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
     UseAESIntrinsics =
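Note that this warning can only fire when the flag was enabled explicitly, since UseVectorizedMismatchIntrinsic stays off by default on platforms without the stub; for example

    java -XX:+UseVectorizedMismatchIntrinsic -version

prints the warning once, and FLAG_SET_DEFAULT then clears the flag so the unsupported intrinsic is never generated. The same guard is added to the PPC and SPARC version checks below.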
--- a/src/cpu/ppc/vm/globalDefinitions_ppc.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/ppc/vm/globalDefinitions_ppc.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -31,6 +31,10 @@
 
 const int StackAlignmentInBytes = 16;
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values be extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = true;
+
 #define SUPPORTS_NATIVE_CX8
 
 // The PPC CPUs are NOT multiple-copy-atomic.
--- a/src/cpu/ppc/vm/ppc.ad	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/ppc/vm/ppc.ad	Tue Dec 15 13:42:13 2015 +0000
@@ -3486,6 +3486,7 @@
     call->_jvmadj            = _jvmadj;
     call->_in_rms            = _in_rms;
     call->_nesting           = _nesting;
+    call->_override_symbolic_info = _override_symbolic_info;
 
     // New call needs all inputs of old call.
     // Req...
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -223,6 +223,11 @@
     UseMultiplyToLenIntrinsic = true;
   }
 
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags.
   if (!has_tcheck() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/src/cpu/sparc/vm/globalDefinitions_sparc.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/sparc/vm/globalDefinitions_sparc.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -30,6 +30,10 @@
 
 const int StackAlignmentInBytes = (2*wordSize);
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values be extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #define SUPPORTS_NATIVE_CX8
 
 // The expected size in bytes of a cache line, used to pad data structures.
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -356,6 +356,11 @@
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }
 
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
     (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -2152,33 +2152,64 @@
   emit_int8(0xC0 | encode);
 }
 
-void Assembler::kmovwl(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+void Assembler::kmovbl(KRegister dst, Register src) {
+  assert(VM_Version::supports_avx512dq(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovbl(Register dst, KRegister src) {
+  assert(VM_Version::supports_avx512dq(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovwl(KRegister dst, Register src) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::kmovwl(Register dst, KRegister src) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::kmovdl(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::kmovdl(Register dst, KRegister src) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::kmovql(KRegister dst, KRegister src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x90);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::kmovql(KRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
@@ -2187,7 +2218,7 @@
 }
 
 void Assembler::kmovql(Address dst, KRegister src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
@@ -2196,46 +2227,53 @@
 }
 
 void Assembler::kmovql(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_bw, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::kmovql(Register dst, KRegister src) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
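These register-to-mask and mask-to-register moves are the plumbing for the new AVX-512 string code: a stub compares with an opmask-producing instruction, then moves the mask into a GPR to post-process it with ordinary integer arithmetic. The string_compare changes further down use exactly this sequence to locate the first mismatching byte:

    kmovql(cnt1, k7);   // copy the 64-bit equality mask (1 = bytes equal) into a GPR
    notq(cnt1);         // set bits now mark the positions that differ
    bsfq(cnt2, cnt1);   // bit-scan-forward: offset of the first differing byte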
+
 // This instruction produces ZF or CF flags
 void Assembler::kortestbl(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_avx512dq(), ""));
+  assert(VM_Version::supports_avx512dq(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // This instruction produces ZF or CF flags
 void Assembler::kortestwl(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // This instruction produces ZF or CF flags
 void Assembler::kortestdl(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_avx512bw(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // This instruction produces ZF or CF flags
 void Assembler::kortestql(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_avx512bw(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }
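As the comments note, kortest ORs two masks and sets ZF when the result is all zeroes and CF when it is all ones. Testing a mask against itself therefore checks a whole 64-byte compare with a single flags test, which is how the AVX-512 loop in string_compare below uses it:

    kortestql(k7, k7);    // CF == 1 iff every one of the 64 bytes compared equal
    jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);  // aboveEqual == (CF == 0)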
@@ -2375,7 +2413,7 @@
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
 void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2395,7 +2433,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
@@ -2404,7 +2442,7 @@
 
 void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2424,7 +2462,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
@@ -3069,7 +3107,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
@@ -3078,7 +3116,7 @@
 
 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3086,7 +3124,7 @@
 
 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
@@ -3128,7 +3166,7 @@
 
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
@@ -3148,16 +3186,28 @@
 // In this context, kdst is written the mask used to process the equal components
 void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_operand(as_Register(dst_enc), src);
+}
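The new memory-operand form folds the load of the second comparand into the compare itself, so the 64-byte loop added to string_compare below issues only one explicit vector load per iteration:

    evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
    evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit);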
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
@@ -3177,16 +3227,28 @@
 // In this context, kdst is written the mask used to process the equal components
 void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x75);
+  emit_operand(as_Register(dst_enc), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
@@ -3213,9 +3275,21 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x76);
+  emit_operand(as_Register(dst_enc), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqq(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse4_1(), ""));
+  assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
@@ -3328,7 +3402,7 @@
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3337,7 +3411,7 @@
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3347,7 +3421,7 @@
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3452,7 +3526,7 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3461,7 +3535,7 @@
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
@@ -3495,7 +3569,7 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3507,7 +3581,7 @@
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
@@ -4723,7 +4797,7 @@
 
 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4731,7 +4805,7 @@
 
 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4771,7 +4845,7 @@
 
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
@@ -4780,7 +4854,7 @@
 
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
@@ -4808,7 +4882,7 @@
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4819,7 +4893,7 @@
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4851,7 +4925,7 @@
 
 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4859,7 +4933,7 @@
 
 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4882,7 +4956,7 @@
 
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
@@ -4891,7 +4965,7 @@
 
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
@@ -4919,7 +4993,7 @@
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4930,7 +5004,7 @@
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4962,7 +5036,7 @@
 
 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4978,7 +5052,7 @@
 
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
@@ -5006,7 +5080,7 @@
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -5039,7 +5113,7 @@
 // Shift packed integers left by specified number of bits.
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5069,7 +5143,7 @@
 
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5093,7 +5167,7 @@
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
   int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5124,7 +5198,7 @@
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5149,7 +5223,7 @@
 // Shift packed integers logically right by specified number of bits.
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5181,7 +5255,7 @@
 
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5205,7 +5279,7 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
   int encode = vex_prefix_and_encode(xmm2->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5235,7 +5309,7 @@
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5260,7 +5334,7 @@
 // Shift packed integers arithmetically right by specified number of bits.
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5280,7 +5354,7 @@
 
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xE1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5296,7 +5370,7 @@
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   int encode = vex_prefix_and_encode(xmm4->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5316,7 +5390,7 @@
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xE1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5706,7 +5780,7 @@
 // duplicate 2-bytes integer data from src into 16 locations in dest
 void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6573,18 +6647,6 @@
   }
 }
 
-int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
-                                      VexOpcode opc, InstructionAttr *attributes) {
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes);
-}
-
-int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
-                                      VexOpcode opc, InstructionAttr *attributes) {
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes);
-}
-
 void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
--- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -655,12 +655,6 @@
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
                              VexOpcode opc, InstructionAttr *attributes);
 
-  int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
-                             VexOpcode opc, InstructionAttr *attributes);
-
-  int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
-                             VexOpcode opc, InstructionAttr *attributes);
-
   // Helper functions for groups of instructions
   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 
@@ -1331,12 +1325,17 @@
 
   void movddup(XMMRegister dst, XMMRegister src);
 
+  void kmovbl(KRegister dst, Register src);
+  void kmovbl(Register dst, KRegister src);
   void kmovwl(KRegister dst, Register src);
+  void kmovwl(Register dst, KRegister src);
   void kmovdl(KRegister dst, Register src);
+  void kmovdl(Register dst, KRegister src);
   void kmovql(KRegister dst, KRegister src);
-  void kmovql(KRegister dst, Register src);
   void kmovql(Address dst, KRegister src);
   void kmovql(KRegister dst, Address src);
+  void kmovql(KRegister dst, Register src);
+  void kmovql(Register dst, KRegister src);
 
   void kortestbl(KRegister dst, KRegister src);
   void kortestwl(KRegister dst, KRegister src);
@@ -1521,14 +1520,17 @@
   void pcmpeqb(XMMRegister dst, XMMRegister src);
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqd(XMMRegister dst, XMMRegister src);
   void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqq(XMMRegister dst, XMMRegister src);
   void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
--- a/src/cpu/x86/vm/globalDefinitions_x86.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/globalDefinitions_x86.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -27,6 +27,10 @@
 
 const int StackAlignmentInBytes  = 16;
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values be extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #define SUPPORTS_NATIVE_CX8
 
 // The expected size in bytes of a cache line, used to pad data structures.
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -7999,9 +7999,15 @@
                                     XMMRegister vec1, int ae) {
   ShortBranchVerifier sbv(this);
   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
+  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only for _LP64 && AVX3
   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
+  int stride2x2 = 0x40;
   Address::ScaleFactor scale, scale1, scale2;
 
+  if (ae != StrIntrinsicNode::LL) {
+    stride2x2 = 0x20;
+  }
+
   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
     shrl(cnt2, 1);
   }
@@ -8011,15 +8017,15 @@
   movl(result, cnt1);
   subl(cnt1, cnt2);
   push(cnt1);
-  cmov32(Assembler::lessEqual, cnt2, result);
+  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
 
   // Is the minimum length zero?
   testl(cnt2, cnt2);
   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
   if (ae == StrIntrinsicNode::LL) {
     // Load first bytes
-    load_unsigned_byte(result, Address(str1, 0));
-    load_unsigned_byte(cnt1, Address(str2, 0));
+    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
+    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
   } else if (ae == StrIntrinsicNode::UU) {
     // Load first characters
     load_unsigned_short(result, Address(str1, 0));
@@ -8060,7 +8066,10 @@
     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
+    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
     Label COMPARE_TAIL_LONG;
+    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only for _LP64 && AVX3
+
     int pcmpmask = 0x19;
     if (ae == StrIntrinsicNode::LL) {
       pcmpmask &= ~0x01;
@@ -8123,11 +8132,40 @@
     }
     subl(result, stride2);
     subl(cnt2, stride2);
-    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
+    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
     negptr(result);
 
     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
     bind(COMPARE_WIDE_VECTORS_LOOP);
+
+#ifdef _LP64
+    if (VM_Version::supports_avx512vlbw()) { // try the 64-bytes-at-a-time fast loop
+      cmpl(cnt2, stride2x2);
+      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+      testl(cnt2, stride2x2-1);   // is cnt2 a multiple of the wide stride?
+      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // if not, the 64-byte loop cannot be used
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
+      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
+        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
+        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal; otherwise k7 has some 0 bits
+      } else {
+        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
+        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if the operands are equal; otherwise k7 has some 0 bits
+      }
+      kortestql(k7, k7);
+      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
+      addptr(result, stride2x2);  // update since we already compared at this addr
+      subl(cnt2, stride2x2);      // and sub the size too
+      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
+
+      vpxor(vec1, vec1);
+      jmpb(COMPARE_WIDE_TAIL);
+    }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
+    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
       vmovdqu(vec1, Address(str1, result, scale));
       vpxor(vec1, Address(str2, result, scale));
@@ -8136,7 +8174,7 @@
       vpxor(vec1, Address(str2, result, scale2));
     }
     vptest(vec1, vec1);
-    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
     addptr(result, stride2);
     subl(cnt2, stride2);
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
@@ -8151,7 +8189,7 @@
     movl(result, stride2);
     movl(cnt2, result);
     negptr(result);
-    jmpb(COMPARE_WIDE_VECTORS_LOOP);
+    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 
     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
     bind(VECTOR_NOT_EQUAL);
@@ -8295,6 +8333,34 @@
   }
   jmpb(DONE_LABEL);
 
+#ifdef _LP64
+  if (VM_Version::supports_avx512vlbw()) {
+
+    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
+
+    kmovql(cnt1, k7);
+    notq(cnt1);
+    bsfq(cnt2, cnt1);
+    if (ae != StrIntrinsicNode::LL) {
+      // Divide diff by 2 to get number of chars
+      sarl(cnt2, 1);
+    }
+    addq(result, cnt2);
+    if (ae == StrIntrinsicNode::LL) {
+      load_unsigned_byte(cnt1, Address(str2, result));
+      load_unsigned_byte(result, Address(str1, result));
+    } else if (ae == StrIntrinsicNode::UU) {
+      load_unsigned_short(cnt1, Address(str2, result, scale));
+      load_unsigned_short(result, Address(str1, result, scale));
+    } else {
+      load_unsigned_short(cnt1, Address(str2, result, scale2));
+      load_unsigned_byte(result, Address(str1, result, scale1));
+    }
+    subl(result, cnt1);
+    jmpb(POP_LABEL);
+  }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
   // Discard the stored length difference
   bind(POP_LABEL);
   pop(cnt1);
@@ -8304,6 +8370,7 @@
   if(ae == StrIntrinsicNode::UL) {
     negl(result);
   }
 }
 
 // Search for Non-ASCII character (Negative byte value) in a byte array,
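One detail worth spelling out from the failure path above: bsfq recovers a byte offset even when the loop compared 16-bit chars (the opmask has one bit per byte, not per char), which is why the offset is halved via sarl for the non-LL encodings before the per-element reload. For example, a first mismatch in char 5 of a UTF-16 string:

    // bytes 10 and 11 differ  ->  notq/bsfq yields 10
    // sarl(cnt2, 1)           ->  10 >> 1 == 5, the mismatching char index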
@@ -9439,13 +9506,184 @@
   pop(tmp1);
 }
 
+void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2) {
+  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
+  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
+  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
+  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
+  Label SAME_TILL_END, DONE;
+  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
+
+  // scale is in rcx on both Win64 and Unix
+  ShortBranchVerifier sbv(this);
+
+  shlq(length);
+  xorq(result, result);
+
+  cmpq(length, 8);
+  jcc(Assembler::equal, VECTOR8_LOOP);
+  jcc(Assembler::less, VECTOR4_TAIL);
+
+  if (UseAVX >= 2) {
+
+    cmpq(length, 16);
+    jcc(Assembler::equal, VECTOR16_LOOP);
+    jcc(Assembler::less, VECTOR8_LOOP);
+
+    cmpq(length, 32);
+    jccb(Assembler::less, VECTOR16_TAIL);
+
+    subq(length, 32);
+    bind(VECTOR32_LOOP);
+    vmovdqu(rymm0, Address(obja, result));
+    vmovdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
+    vptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
+    addq(result, 32);
+    subq(length, 32);
+    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
+    addq(length, 32);
+    jcc(Assembler::equal, SAME_TILL_END);
+    // falling through if less than 32 bytes left
+
+    bind(VECTOR16_TAIL);
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
+    ptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL); // mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    // falling through if less than 16 bytes left
+  } else { // regular intrinsics
+
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+
+    subq(length, 16);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    pxor(rymm0, rymm1);
+    ptest(rymm0, rymm0);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL); // mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
+    addq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    // falling through if less than 16 bytes left
+  }
+
+  bind(VECTOR8_TAIL);
+  cmpq(length, 8);
+  jccb(Assembler::less, VECTOR4_TAIL);
+  bind(VECTOR8_LOOP);
+  movq(tmp1, Address(obja, result));
+  movq(tmp2, Address(objb, result));
+  xorq(tmp1, tmp2);
+  testq(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL); // mismatch found
+  addq(result, 8);
+  subq(length, 8);
+  jcc(Assembler::equal, SAME_TILL_END);
+  // falling through if less than 8 bytes left
+
+  bind(VECTOR4_TAIL);
+  cmpq(length, 4);
+  jccb(Assembler::less, BYTES_TAIL);
+  bind(VECTOR4_LOOP);
+  movl(tmp1, Address(obja, result));
+  xorl(tmp1, Address(objb, result));
+  testl(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL); // mismatch found
+  addq(result, 4);
+  subq(length, 4);
+  jcc(Assembler::equal, SAME_TILL_END);
+  // falling through if less than 4 bytes left
+
+  bind(BYTES_TAIL);
+  bind(BYTES_LOOP);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL); // mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL); // mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL); // mismatch found
+  jmpb(SAME_TILL_END);
+
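+  // Mismatch handlers: compute the byte offset of the first differing byte,
+  // add it to result, then shift right by cl to convert back to an element index.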
+  if (UseAVX >= 2) {
+    bind(VECTOR32_NOT_EQUAL);
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
+    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
+    vpmovmskb(tmp1, rymm0);
+    bsfq(tmp1, tmp1);
+    addq(result, tmp1);
+    shrq(result);
+    jmpb(DONE);
+  }
+
+  bind(VECTOR16_NOT_EQUAL);
+  if (UseAVX >= 2) {
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
+    pxor(rymm0, rymm2);
+  } else {
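+    // rymm0 still holds a ^ b from the loop: xor with b recovers a, comparing
+    // it with b gives an equality mask, which the all-ones in rymm2 then inverts.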
+    pcmpeqb(rymm2, rymm2);
+    pxor(rymm0, rymm1);
+    pcmpeqb(rymm0, rymm1);
+    pxor(rymm0, rymm2);
+  }
+  pmovmskb(tmp1, rymm0);
+  bsfq(tmp1, tmp1);
+  addq(result, tmp1);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(VECTOR8_NOT_EQUAL);
+  bind(VECTOR4_NOT_EQUAL);
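+  // tmp1 holds the xor of the mismatching words; bsf finds the first differing
+  // bit and the shift by 3 converts the bit index to a byte offset.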
+  bsfq(tmp1, tmp1);
+  shrq(tmp1, 3);
+  addq(result, tmp1);
+  bind(BYTES_NOT_EQUAL);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(SAME_TILL_END);
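+  // All bytes compared equal: return -1 (no mismatch).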
+  mov64(result, -1);
+
+  bind(DONE);
+}
+
+
 // Helper functions for square_to_len()
 
 /**
  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
  * Preserves x and z and modifies rest of the registers.
  */
-
 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   // Perform square and right shift by 1
   // Handle odd xlen case first, then for even xlen do the following
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -1346,7 +1346,6 @@
                                Register carry2);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
-
   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
@@ -1365,6 +1364,9 @@
   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
                Register raxReg);
+  void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+                           Register result, Register tmp1, Register tmp2,
+                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
 #endif
 
   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -189,7 +189,7 @@
       }
      // Save full ZMM registers (16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
-      int off = 0;
+      off = 0;
       int vector_len = Assembler::AVX_512bit;
       for (int n = 16; n < num_xmm_regs; n++) {
         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
@@ -199,7 +199,7 @@
     if (VM_Version::supports_evex()) {
      // Save upper bank of ZMM registers (16..31) for double/float usage
       int base_addr = XSAVE_AREA_UPPERBANK;
-      int off = 0;
+      off = 0;
       for (int n = 16; n < num_xmm_regs; n++) {
         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
       }
@@ -325,7 +325,7 @@
     assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
   }
 #else
-  assert(!save_vectors, "vectors are generated only by C2");
+  assert(!restore_vectors, "vectors are generated only by C2");
 #endif
 
   // On EVEX enabled targets everything is handled in pop fpu state
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -170,7 +170,7 @@
     // provide initial value for required masks
     if (UseAVX > 2) {
       __ movl(rbx, 0xffff);
-      __ kmovdl(k1, rbx);
+      __ kmovwl(k1, rbx);
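+      // 0xffff is only 16 bits wide: kmovwl needs just AVX512F, while
+      // kmovdl/kmovql would require AVX512BW.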
     }
 
     // save and initialize %mxcsr
@@ -798,7 +798,7 @@
     if (UseAVX > 2) {
       __ push(rbx);
       __ movl(rbx, 0xffff);
-      __ kmovdl(k1, rbx);
+      __ kmovwl(k1, rbx);
       __ pop(rbx);
     }
     // Copy 64-byte chunks
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -266,7 +266,7 @@
     __ movptr(r15_save, r15);
     if (UseAVX > 2) {
       __ movl(rbx, 0xffff);
-      __ kmovql(k1, rbx);
+      __ kmovwl(k1, rbx);
     }
 #ifdef _WIN64
     int last_reg = 15;
@@ -1350,7 +1350,7 @@
       Label L_end;
       if (UseAVX > 2) {
         __ movl(to, 0xffff);
-        __ kmovql(k1, to);
+        __ kmovwl(k1, to);
       }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
@@ -1434,7 +1434,7 @@
       Label L_end;
       if (UseAVX > 2) {
         __ movl(to, 0xffff);
-        __ kmovql(k1, to);
+        __ kmovwl(k1, to);
       }
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
@@ -4054,6 +4054,54 @@
     return start;
   }
 
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - obja     address
+   *    c_rarg1   - objb     address
+   *    c_rarg2   - length   length
+   *    c_rarg3   - scale    log2_array_indxscale
+   */
+  address generate_vectorizedMismatch() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+
+#ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register scale = c_rarg0;  // rcx, will exchange with r9
+    const Register objb = c_rarg1;   // rdx
+    const Register length = c_rarg2; // r8
+    const Register obja = c_rarg3;   // r9
+    __ xchgq(obja, scale);  // now obja and scale contain the correct contents
+
+    const Register tmp1 = r10;
+    const Register tmp2 = r11;
+#endif
+#ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register obja = c_rarg0;   // Unix: rdi
+    const Register objb = c_rarg1;   // Unix: rsi
+    const Register length = c_rarg2; // Unix: rdx
+    const Register scale = c_rarg3;  // Unix: rcx
+    const Register tmp1 = r8;
+    const Register tmp2 = r9;
+#endif
+    const Register result = rax; // return value
+    const XMMRegister vec0 = xmm0;
+    const XMMRegister vec1 = xmm1;
+    const XMMRegister vec2 = xmm2;
+
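+    // The stub returns the element index of the first mismatch in rax,
+    // or -1 if the ranges are equal.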
+    __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
+
+    __ leave();
+    __ ret(0);
+
+    return start;
+  }
+
 /**
    *  Arguments:
    *
@@ -4505,7 +4553,9 @@
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
-
+    if (UseVectorizedMismatchIntrinsic) {
+      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
+    }
 #ifndef _WINDOWS
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -1039,6 +1039,25 @@
     }
   }
 
+#ifdef _LP64
+  if (UseSSE42Intrinsics) {
+    if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      UseVectorizedMismatchIntrinsic = true;
+    }
+  } else if (UseVectorizedMismatchIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      warning("vectorizedMismatch intrinsics are not available on this CPU");
+    }
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+#else
+  if (UseVectorizedMismatchIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      warning("vectorizedMismatch intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+#endif // _LP64
+
   // Use count leading zeros count instruction if available.
   if (supports_lzcnt()) {
     if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
--- a/src/cpu/zero/vm/globalDefinitions_zero.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/cpu/zero/vm/globalDefinitions_zero.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -28,4 +28,8 @@
 
 #include <ffi.h>
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #endif // CPU_ZERO_VM_GLOBALDEFINITIONS_ZERO_HPP
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -3055,13 +3055,16 @@
   __ cmove(lir_cond(x->cond()), t_val.result(), f_val.result(), reg, as_BasicType(x->x()->type()));
 }
 
-void LIRGenerator::do_RuntimeCall(address routine, int expected_arguments, Intrinsic* x) {
-    assert(x->number_of_arguments() == expected_arguments, "wrong type");
-    LIR_Opr reg = result_register_for(x->type());
-    __ call_runtime_leaf(routine, getThreadTemp(),
-                         reg, new LIR_OprList());
-    LIR_Opr result = rlock_result(x);
-    __ move(reg, result);
+void LIRGenerator::do_RuntimeCall(address routine, Intrinsic* x) {
+  assert(x->number_of_arguments() == 0, "wrong type");
+  // Enforce computation of _reserved_argument_area_size which is required on some platforms.
+  BasicTypeList signature;
+  CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+  LIR_Opr reg = result_register_for(x->type());
+  __ call_runtime_leaf(routine, getThreadTemp(),
+                       reg, new LIR_OprList());
+  LIR_Opr result = rlock_result(x);
+  __ move(reg, result);
 }
 
 #ifdef TRACE_HAVE_INTRINSICS
@@ -3115,16 +3118,16 @@
   case vmIntrinsics::_threadID: do_ThreadIDIntrinsic(x); break;
   case vmIntrinsics::_classID: do_ClassIDIntrinsic(x); break;
   case vmIntrinsics::_counterTime:
-    do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), 0, x);
+    do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), x);
     break;
 #endif
 
   case vmIntrinsics::_currentTimeMillis:
-    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeMillis), 0, x);
+    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeMillis), x);
     break;
 
   case vmIntrinsics::_nanoTime:
-    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeNanos), 0, x);
+    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeNanos), x);
     break;
 
   case vmIntrinsics::_Object_init:    do_RegisterFinalizer(x); break;
--- a/src/share/vm/c1/c1_LIRGenerator.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/c1/c1_LIRGenerator.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -157,8 +157,8 @@
  private:
   void* operator new(size_t size) throw();
   void* operator new[](size_t size) throw();
-  void operator delete(void* p);
-  void operator delete[](void* p);
+  void operator delete(void* p) { ShouldNotReachHere(); }
+  void operator delete[](void* p) { ShouldNotReachHere(); }
 
   Compilation*  _compilation;
   ciMethod*     _method;    // method that we are compiling
@@ -439,7 +439,7 @@
   SwitchRangeArray* create_lookup_ranges(LookupSwitch* x);
   void do_SwitchRanges(SwitchRangeArray* x, LIR_Opr value, BlockBegin* default_sux);
 
-  void do_RuntimeCall(address routine, int expected_arguments, Intrinsic* x);
+  void do_RuntimeCall(address routine, Intrinsic* x);
 #ifdef TRACE_HAVE_INTRINSICS
   void do_ThreadIDIntrinsic(Intrinsic* x);
   void do_ClassIDIntrinsic(Intrinsic* x);
--- a/src/share/vm/c1/c1_RangeCheckElimination.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/c1/c1_RangeCheckElimination.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -50,8 +50,8 @@
   private:
     void* operator new(size_t size) throw();
     void* operator new[](size_t size) throw();
-    void operator delete(void* p);
-    void operator delete[](void* p);
+    void operator delete(void* p) { ShouldNotReachHere(); }
+    void operator delete[](void* p) { ShouldNotReachHere(); }
 
     IR *_ir;
     boolArray _used;
--- a/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/classfile/vmSymbols.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -681,6 +681,9 @@
   case vmIntrinsics::_montgomerySquare:
     if (!UseMontgomerySquareIntrinsic) return true;
     break;
+  case vmIntrinsics::_vectorizedMismatch:
+    if (!UseVectorizedMismatchIntrinsic) return true;
+    break;
   case vmIntrinsics::_addExactI:
   case vmIntrinsics::_addExactL:
   case vmIntrinsics::_decrementExactI:
--- a/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/classfile/vmSymbols.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -957,6 +957,11 @@
    do_name(     montgomerySquare_name,                             "implMontgomerySquare")                              \
    do_signature(montgomerySquare_signature,                        "([I[IIJ[I)[I")                                      \
                                                                                                                         \
+  do_class(java_util_ArraysSupport, "java/util/ArraysSupport")                                                          \
+  do_intrinsic(_vectorizedMismatch, java_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S)\
+   do_name(vectorizedMismatch_name, "vectorizedMismatch")                                                               \
+   do_signature(vectorizedMismatch_signature, "(Ljava/lang/Object;JLjava/lang/Object;JII)I")                            \
+                                                                                                                        \
   /* java/lang/ref/Reference */                                                                                         \
   do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                         \
--- a/src/share/vm/code/dependencies.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/code/dependencies.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -346,7 +346,6 @@
       }
     }
   } else {
-    assert(dep_implicit_context_arg(dept) == 0, "sanity");
     if (note_dep_seen(dept, x0) && note_dep_seen(dept, x1)) {
       // look in this bucket for redundant assertions
       const int stride = 2;
--- a/src/share/vm/compiler/compileBroker.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/compiler/compileBroker.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -56,6 +56,7 @@
 #if INCLUDE_JVMCI
 #include "jvmci/jvmciCompiler.hpp"
 #include "jvmci/jvmciRuntime.hpp"
+#include "jvmci/jvmciJavaClasses.hpp"
 #include "runtime/vframe.hpp"
 #endif
 #ifdef COMPILER2
@@ -498,7 +499,7 @@
 // CompileBroker::compilation_init
 //
 // Initialize the Compilation object
-void CompileBroker::compilation_init() {
+void CompileBroker::compilation_init(TRAPS) {
   _last_method_compiled[0] = '\0';
 
   // No need to initialize compilation system if we do not use it.
@@ -529,6 +530,17 @@
       } else {
         c1_count = JVMCIHostThreads;
       }
+
+      if (!UseInterpreter) {
+        // Force initialization of JVMCI compiler otherwise JVMCI
+        // compilations will not block until JVMCI is initialized
+        ResourceMark rm;
+        TempNewSymbol getCompiler = SymbolTable::new_symbol("getCompiler", CHECK);
+        TempNewSymbol sig = SymbolTable::new_symbol("()Ljdk/vm/ci/runtime/JVMCICompiler;", CHECK);
+        Handle jvmciRuntime = JVMCIRuntime::get_HotSpotJVMCIRuntime(CHECK);
+        JavaValue result(T_OBJECT);
+        JavaCalls::call_virtual(&result, jvmciRuntime, HotSpotJVMCIRuntime::klass(), getCompiler, sig, CHECK);
+      }
     }
   }
 #endif // INCLUDE_JVMCI
--- a/src/share/vm/compiler/compileBroker.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/compiler/compileBroker.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -276,7 +276,7 @@
     CompileQueue *q = compile_queue(comp_level);
     return q != NULL ? q->size() : 0;
   }
-  static void compilation_init();
+  static void compilation_init(TRAPS);
   static void init_compiler_thread_log();
   static nmethod* compile_method(const methodHandle& method,
                                  int osr_bci,
--- a/src/share/vm/opto/c2compiler.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/opto/c2compiler.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -441,6 +441,7 @@
   case vmIntrinsics::_mulAdd:
   case vmIntrinsics::_montgomeryMultiply:
   case vmIntrinsics::_montgomerySquare:
+  case vmIntrinsics::_vectorizedMismatch:
   case vmIntrinsics::_ghash_processBlocks:
   case vmIntrinsics::_updateCRC32:
   case vmIntrinsics::_updateBytesCRC32:
--- a/src/share/vm/opto/escape.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/opto/escape.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -987,7 +987,8 @@
                   strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0)
                  ))) {
             call->dump();
             fatal("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name);
--- a/src/share/vm/opto/generateOptoStub.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/opto/generateOptoStub.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -72,16 +72,18 @@
 
   // Make up the parameters
   uint i;
-  for( i = 0; i < parm_cnt; i++ )
+  for (i = 0; i < parm_cnt; i++) {
     map()->init_req(i, _gvn.transform(new ParmNode(start, i)));
-  for( ; i<map()->req(); i++ )
+  }
+  for ( ; i < map()->req(); i++) {
     map()->init_req(i, top());      // For nicer debugging
+  }
 
   // GraphKit requires memory to be a MergeMemNode:
   set_all_memory(map()->memory());
 
   // Get base of thread-local storage area
-  Node* thread = _gvn.transform( new ThreadLocalNode() );
+  Node* thread = _gvn.transform(new ThreadLocalNode());
 
   const int NoAlias = Compile::AliasIdxBot;
 
@@ -113,21 +115,27 @@
 
   //-----------------------------
   // Compute signature for C call.  Varies from the Java signature!
+
   const Type **fields = TypeTuple::fields(2*parm_cnt+2);
   uint cnt = TypeFunc::Parms;
   // The C routines gets the base of thread-local storage passed in as an
-  // extra argument.  Not all calls need it, but its cheap to add here.
+  // extra argument. Not all calls need it, but it is cheap to add here.
   for (uint pcnt = cnt; pcnt < parm_cnt; pcnt++, cnt++) {
-    fields[cnt] = jdomain->field_at(pcnt);
+    const Type *f = jdomain->field_at(pcnt);
+    if (CCallingConventionRequiresIntsAsLongs && f->isa_int()) {
+      fields[cnt++] = TypeLong::LONG;
+      fields[cnt] = Type::HALF; // Must add an additional half for a long.
+    } else {
+      fields[cnt] = f;
+    }
   }
-
   fields[cnt++] = TypeRawPtr::BOTTOM; // Thread-local storage
   // Also pass in the caller's PC, if asked for.
   if (return_pc) {
     fields[cnt++] = TypeRawPtr::BOTTOM; // Return PC
   }
+  const TypeTuple* domain = TypeTuple::make(cnt, fields);
 
-  const TypeTuple* domain = TypeTuple::make(cnt,fields);
   // The C routine we are about to call cannot return an oop; it can block on
   // exit and a GC will trash the oop while it sits in C-land.  Instead, we
   // return the oop through TLS for runtime calls.
@@ -155,37 +163,44 @@
       rfields[TypeFunc::Parms+1] = jrange->field_at(TypeFunc::Parms+1);
     }
   }
-  const TypeTuple* range = TypeTuple::make(jrange->cnt(),rfields);
+  const TypeTuple* range = TypeTuple::make(jrange->cnt(), rfields);
 
   // Final C signature
-  const TypeFunc *c_sig = TypeFunc::make(domain,range);
+  const TypeFunc *c_sig = TypeFunc::make(domain, range);
 
   //-----------------------------
-  // Make the call node
+  // Make the call node.
   CallRuntimeNode *call = new CallRuntimeNode(c_sig, C_function, name, TypePtr::BOTTOM);
   //-----------------------------
 
-  // Fix-up the debug info for the call
-  call->set_jvms( new (C) JVMState(0) );
+  // Fix-up the debug info for the call.
+  call->set_jvms(new (C) JVMState(0));
   call->jvms()->set_bci(0);
   call->jvms()->set_offsets(cnt);
 
-  // Set fixed predefined input arguments
+  // Set fixed predefined input arguments.
   cnt = 0;
-  for (i = 0; i < TypeFunc::Parms; i++)
-    call->init_req(cnt++, map()->in(i));
-  // A little too aggressive on the parm copy; return address is not an input
-  call->set_req(TypeFunc::ReturnAdr, top());
-  for (; i < parm_cnt; i++) { // Regular input arguments
+  for (i = 0; i < TypeFunc::Parms; i++) {
     call->init_req(cnt++, map()->in(i));
   }
+  // A little too aggressive on the parm copy; return address is not an input.
+  call->set_req(TypeFunc::ReturnAdr, top());
+  for (; i < parm_cnt; i++) { // Regular input arguments.
+    const Type *f = jdomain->field_at(i);
+    if (CCallingConventionRequiresIntsAsLongs && f->isa_int()) {
+      call->init_req(cnt++, _gvn.transform(new ConvI2LNode(map()->in(i))));
+      call->init_req(cnt++, top());
+    } else {
+      call->init_req(cnt++, map()->in(i));
+    }
+  }
+  call->init_req(cnt++, thread);
+  if (return_pc) {             // Return PC, if asked for.
+    call->init_req(cnt++, returnadr());
+  }
 
-  call->init_req( cnt++, thread );
-  if( return_pc )             // Return PC, if asked for
-    call->init_req( cnt++, returnadr() );
   _gvn.transform_no_reclaim(call);
 
-
   //-----------------------------
   // Now set up the return results
   set_control( _gvn.transform( new ProjNode(call,TypeFunc::Control)) );
--- a/src/share/vm/opto/library_call.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/opto/library_call.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -312,6 +312,7 @@
   bool inline_mulAdd();
   bool inline_montgomeryMultiply();
   bool inline_montgomerySquare();
+  bool inline_vectorizedMismatch();
 
   bool inline_profileBoolean();
   bool inline_isCompileConstant();
@@ -720,6 +721,9 @@
   case vmIntrinsics::_montgomerySquare:
     return inline_montgomerySquare();
 
+  case vmIntrinsics::_vectorizedMismatch:
+    return inline_vectorizedMismatch();
+
   case vmIntrinsics::_ghash_processBlocks:
     return inline_ghash_processBlocks();
 
@@ -5581,6 +5585,50 @@
   return true;
 }
 
+//-------------inline_vectorizedMismatch------------------------------
+bool LibraryCallKit::inline_vectorizedMismatch() {
+  assert(UseVectorizedMismatchIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::vectorizedMismatch();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "vectorizedMismatch";
+  assert(callee()->signature()->size() == 8, "vectorizedMismatch has 6 parameters (each long takes two slots)");
+
+  Node* obja = argument(0);
+  Node* aoffset = argument(1);
+  Node* objb = argument(3);
+  Node* boffset = argument(4);
+  Node* length = argument(6);
+  Node* scale = argument(7);
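+  // The long offsets occupy two argument slots each, hence indices 0,1,3,4,6,7.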
+
+  const Type* a_type = obja->Value(&_gvn);
+  const Type* b_type = objb->Value(&_gvn);
+  const TypeAryPtr* top_a = a_type->isa_aryptr();
+  const TypeAryPtr* top_b = b_type->isa_aryptr();
+  if (top_a == NULL || top_a->klass() == NULL ||
+      top_b == NULL || top_b->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  Node* call;
+  jvms()->set_should_reexecute(true);
+
+  Node* obja_adr = make_unsafe_address(obja, aoffset);
+  Node* objb_adr = make_unsafe_address(objb, boffset);
+
+  call = make_runtime_call(RC_LEAF,
+    OptoRuntime::vectorizedMismatch_Type(),
+    stubAddr, stubName, TypePtr::BOTTOM,
+    obja_adr, objb_adr, length, scale);
+
+  Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
+  set_result(result);
+  return true;
+}
 
 /**
  * Calculate CRC32 for byte.
--- a/src/share/vm/opto/runtime.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/opto/runtime.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -1103,6 +1103,26 @@
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::vectorizedMismatch_Type() {
+  // create input type (domain)
+  int num_args = 4;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // obja
+  fields[argp++] = TypePtr::NOTNULL;    // objb
+  fields[argp++] = TypeInt::INT;        // length, number of elements
+  fields[argp++] = TypeInt::INT;        // log2scale, log2 of element size
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+
+  // return mismatch index (int)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = TypeInt::INT;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 // GHASH block processing
 const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
     int argcnt = 4;
--- a/src/share/vm/opto/runtime.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/opto/runtime.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -299,6 +299,8 @@
 
   static const TypeFunc* mulAdd_Type();
 
+  static const TypeFunc* vectorizedMismatch_Type();
+
   static const TypeFunc* ghash_processBlocks_Type();
 
   static const TypeFunc* updateBytesCRC32_Type();
--- a/src/share/vm/runtime/globals.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/runtime/globals.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -855,6 +855,9 @@
   product(bool, UseAdler32Intrinsics, false,                                \
           "use intrinsics for java.util.zip.Adler32")                       \
                                                                             \
+  product(bool, UseVectorizedMismatchIntrinsic, false,                      \
+          "Enables intrinsification of ArraysSupport.vectorizedMismatch()") \
+                                                                            \
   diagnostic(ccstrlist, DisableIntrinsic, "",                               \
          "do not expand intrinsics whose (internal) names appear here")     \
                                                                             \
--- a/src/share/vm/runtime/stubRoutines.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/runtime/stubRoutines.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -148,6 +148,8 @@
 address StubRoutines::_montgomeryMultiply = NULL;
 address StubRoutines::_montgomerySquare = NULL;
 
+address StubRoutines::_vectorizedMismatch = NULL;
+
 address StubRoutines::_dexp = NULL;
 address StubRoutines::_dlog = NULL;
 
--- a/src/share/vm/runtime/stubRoutines.hpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/runtime/stubRoutines.hpp	Tue Dec 15 13:42:13 2015 +0000
@@ -207,6 +207,8 @@
   static address _montgomeryMultiply;
   static address _montgomerySquare;
 
+  static address _vectorizedMismatch;
+
   static address _dexp;
   static address _dlog;
 
@@ -376,6 +378,8 @@
   static address montgomeryMultiply()  { return _montgomeryMultiply; }
   static address montgomerySquare()    { return _montgomerySquare; }
 
+  static address vectorizedMismatch()  { return _vectorizedMismatch; }
+
   static address dexp()                { return _dexp; }
   static address dlog()                { return _dlog; }
 
--- a/src/share/vm/runtime/thread.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/runtime/thread.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -3628,7 +3628,7 @@
 
   // initialize compiler(s)
 #if defined(COMPILER1) || defined(COMPILER2) || defined(SHARK) || INCLUDE_JVMCI
-  CompileBroker::compilation_init();
+  CompileBroker::compilation_init(CHECK_JNI_ERR);
 #endif
 
   // Pre-initialize some JSR292 core classes to avoid deadlock during class loading.
--- a/src/share/vm/runtime/vmStructs.cpp	Mon Dec 14 15:53:48 2015 +0000
+++ b/src/share/vm/runtime/vmStructs.cpp	Tue Dec 15 13:42:13 2015 +0000
@@ -860,6 +860,7 @@
      static_field(StubRoutines,                _mulAdd,                                       address)                               \
      static_field(StubRoutines,                _dexp,                                         address)                               \
      static_field(StubRoutines,                _dlog,                                         address)                               \
+     static_field(StubRoutines,                _vectorizedMismatch,                           address)                               \
      static_field(StubRoutines,                _jbyte_arraycopy,                              address)                               \
      static_field(StubRoutines,                _jshort_arraycopy,                             address)                               \
      static_field(StubRoutines,                _jint_arraycopy,                               address)                               \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/arraycopy/TestArrayCopyOverflowArguments.java	Tue Dec 15 13:42:13 2015 +0000
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 SAP SE. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @summary Test that overflowed integers passed to arraycopy don't do any harm. This might
+ *          be the case on platforms where C code expects that ints passed to a call
+ *          are properly sign extended to 64 bit (e.g., PPC64, s390x). This can fail
+ *          if slow_arraycopy_C() is compiled by the C compiler without any implicit
+ *          casts (as spill stores to the stack are done with 4-byte instructions).
+ * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestArrayCopyOverflowArguments
+ *
+ */
+
+public class TestArrayCopyOverflowArguments {
+
+    // Without volatile the overflowing computation was moved up and then
+    // spilled to the stack. The 32-bit spill store caused proper rounding.
+    static volatile int mod = Integer.MAX_VALUE;
+
+    public static int[] m1(Object src) {
+        if (src == null) return null;
+        int[] dest = new int[10];
+        try {
+            // PPC C calling conventions require that ints are properly expanded
+            // to longs when passed to a function.
+            int pos   =  8 + mod + mod; // = 0x1_0000_0006.
+            int start =  2 + mod + mod; // = 0x1_0000_0000.
+            int len   = 12 + mod + mod; // = 0x1_0000_000A.
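+            // After Java's 32-bit wrap-around these are pos == 6, start == 0
+            // and len == 10; a callee that expects properly extended 32-bit
+            // ints would see the overflowed 33-bit values instead.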
+            // This is supposed to call SharedRuntime::slow_arraycopy_C().
+            System.arraycopy(src, pos, dest, 0, 10);
+        } catch (ArrayStoreException e) {
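+            // Only an ArrayStoreException is tolerated here; an
+            // ArrayIndexOutOfBoundsException caused by improperly extended
+            // ints must propagate and fail the test.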
+        }
+        return dest;
+    }
+
+    public static void main(String[] args) throws Exception {
+        int[] src = new int[20];
+
+        for (int i  = 0; i < 20; ++i) {
+            src[i] = i * (i-1);
+        }
+
+        for (int i = 0; i < 20000; i++) {
+            m1(src);
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/floatingpoint/Test15FloatJNIArgs.java	Tue Dec 15 13:42:13 2015 +0000
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2015 SAP SE. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/* @test
+ * @bug 8139258
+ * @summary Regression test for 8139258, which failed to properly pass float
+ *          arguments to a JNI function on ppc64le.
+ * @run main/othervm -Xint Test15FloatJNIArgs
+ * @run main/othervm -XX:+TieredCompilation -Xcomp Test15FloatJNIArgs
+ * @run main/othervm -XX:-TieredCompilation -Xcomp Test15FloatJNIArgs
+ */
+
+public class Test15FloatJNIArgs {
+    static {
+        try {
+            System.loadLibrary("Test15FloatJNIArgs");
+        } catch (UnsatisfiedLinkError e) {
+            System.out.println("could not load native lib: " + e);
+        }
+    }
+
+    public static native float add15floats(
+        float f1, float f2, float f3, float f4,
+        float f5, float f6, float f7, float f8,
+        float f9, float f10, float f11, float f12,
+        float f13, float f14, float f15);
+
+    static void test() throws Exception {
+        float sum = Test15FloatJNIArgs.add15floats(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                                   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f);
+        if (sum != 15.0f) {
+            throw new Error("Passed 1.0f fifteen times to a JNI function which didn't add them properly: " + sum);
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        for (int i = 0; i < 200; ++i) {
+            test();
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/floatingpoint/libTest15FloatJNIArgs.c	Tue Dec 15 13:42:13 2015 +0000
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2015 SAP SE. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+JNIEXPORT jfloat JNICALL Java_Test15FloatJNIArgs_add15floats
+  (JNIEnv *env, jclass cls,
+   jfloat  f1, jfloat  f2, jfloat  f3, jfloat  f4,
+   jfloat  f5, jfloat  f6, jfloat  f7, jfloat  f8,
+   jfloat  f9, jfloat f10, jfloat f11, jfloat f12,
+   jfloat f13, jfloat f14, jfloat f15) {
+  return f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 + f12 + f13 + f14 + f15;
+}
+
+#ifdef __cplusplus
+}
+#endif