changeset 9208:e480e0df8eea icedtea-3.1.0

Merge jdk8u101-b13
author andrew
date Mon, 25 Jul 2016 06:38:24 +0100
parents 03cf8606fc7f (current diff) 77df35b662ed (diff)
children ab3e0bde3c15
files .hgtags src/cpu/x86/vm/assembler_x86.cpp src/cpu/x86/vm/assembler_x86.hpp src/cpu/x86/vm/macroAssembler_x86.cpp src/cpu/x86/vm/macroAssembler_x86.hpp src/cpu/x86/vm/vm_version_x86.cpp src/share/vm/classfile/dictionary.cpp src/share/vm/classfile/dictionary.hpp src/share/vm/classfile/systemDictionary.cpp src/share/vm/classfile/verifier.cpp src/share/vm/classfile/vmSymbols.hpp src/share/vm/interpreter/bytecodeStream.hpp src/share/vm/memory/metaspaceShared.cpp src/share/vm/memory/metaspaceShared.hpp src/share/vm/oops/instanceKlass.cpp src/share/vm/oops/klassVtable.cpp src/share/vm/oops/klassVtable.hpp src/share/vm/oops/method.cpp src/share/vm/oops/method.hpp src/share/vm/opto/c2_globals.hpp src/share/vm/opto/compile.cpp src/share/vm/opto/compile.hpp src/share/vm/opto/escape.cpp src/share/vm/opto/graphKit.cpp src/share/vm/opto/graphKit.hpp src/share/vm/opto/library_call.cpp src/share/vm/opto/loopTransform.cpp src/share/vm/opto/loopopts.cpp src/share/vm/opto/node.hpp src/share/vm/opto/parse2.cpp src/share/vm/opto/phaseX.cpp src/share/vm/opto/runtime.cpp src/share/vm/opto/runtime.hpp src/share/vm/prims/jni.cpp src/share/vm/prims/whitebox.cpp src/share/vm/runtime/stubRoutines.cpp src/share/vm/runtime/stubRoutines.hpp src/share/vm/runtime/vmStructs.cpp test/testlibrary/whitebox/sun/hotspot/WhiteBox.java
diffstat 53 files changed, 2477 insertions(+), 107 deletions(-)
--- a/.hgtags	Mon Jul 25 06:02:58 2016 +0100
+++ b/.hgtags	Mon Jul 25 06:38:24 2016 +0100
@@ -617,6 +617,7 @@
 6824e2475e0432e27f9cc51838bc34ea5fbf5113 jdk8u40-b27
 8220f68a195f6eeed2f5fb6e8a303726b512e899 jdk8u40-b31
 850a290eb1088a61178d1910c500e170ef4f4386 jdk8u40-b32
+e6aa4a8c1b46a05b6c493b4ffe5c2555013f5c99 jdk8u40-b33
 1b3abbeee961dee49780c0e4af5337feb918c555 jdk8u40-b10
 f10fe402dfb1543723b4b117a7cba3ea3d4159f1 hs25.40-b15
 99372b2fee0eb8b3452f47230e84aa6e97003184 jdk8u40-b11
@@ -826,17 +827,28 @@
 dc2fdd4e0b8105268b8231040f761f27ab4523f2 jdk8u72-b14
 dbf85d44da89a2428b3bd678be791c314e148845 icedtea-3.0.0pre09
 d6670c5d49ba381405ec9f69a78ccc5b8b0c8473 jdk8u72-b15
+da43260704c28b9f19cb652090ae65c258220fd6 jdk8u72-b31
 26b99cd20661a1fa05939d1856a9389311e01c4f jdk8u73-b00
 931c31db01ae873525a1b2c306b01129eb431960 jdk8u73-b01
 67566d815a66d958c1f817d65f1621ba1d2e5f33 jdk8u73-b02
 451dda77f6c29bd3260e87f847a9eadae122a759 jdk8u74-b00
 c1031a924f2c910fad078838b88a2f0146f2de98 jdk8u74-b01
 ca9cae9aa9e989bbe6713c91d55c913edeaecce4 jdk8u74-b02
+a5b78b56841e97ce00463874f1b7f63c54d84934 jdk8u74-b31
+94ec11846b18111e73929b6caa9fbe7262e142c1 jdk8u74-b32
+1b6d4fd2730e58f17820930f797938dc182117c4 jdk8u77-b00
+ddd297e340b1170d3cec011ee64e729f8b493c86 jdk8u77-b01
+1b4072e4bb3ad54c4e894998486a8b33f0689160 jdk8u77-b02
+223b64a19e94222dd97b92bb40abcfbc0bf6ef1f jdk8u77-b03
+dd8507f51d786572dae18af8ffdc5a1ea34c755e jdk8u77-b31
+
+94ec11846b18111e73929b6caa9fbe7262e142c1 jdk8u74-b32
 da43260704c28b9f19cb652090ae65c258220fd6 jdk8u72-b31
 c0242ea4bde19d72be5149feda112a39e8c89b0a jdk8u75-b00
 ca3b8c8e390ab0540b0cc2e5def869b38e460d86 jdk8u75-b01
 9aef5b5e0a68f20059cfa9e2806b4ff0e11a3d31 jdk8u75-b02
 2df9fe896819362b9075a670b78106b249e50d6d jdk8u75-b03
+b374548dcb4834eb8731a06b52faddd0f10bd45d jdk8u101-b00
 32b682649973231b54740c09b10889660f6ebde5 jdk8u75-b04
 1f43bd4fab06d2ca5d1964611df14d8506d6b36e jdk8u75-b05
 916712f178c39d0acbc590f38802133fc86a7346 jdk8u75-b06
@@ -855,6 +867,7 @@
 bbbb05e91c629f8d9eef2ba43933767f68a898b0 jdk8u91-b00
 e36b6ade0499eadfd8673fe62ef0a613af2e6d67 jdk8u91-b13
 fa8991ccf6e5b74890a0b5672440b3c09d8d8732 jdk8u91-b14
+e1ea97ad19af4d1e0bda449aa43be7e1b118ffe9 jdk8u91-b15
 d7b01fb81aa8a5437cb03bc36afe15cf0e55fb89 jdk8u76-b00
 c1679cc87ba045219169cabb6b9b378c2b5cc578 jdk8u76-b01
 218483967e52b419d885d34af4488a81c5133804 jdk8u76-b02
@@ -874,3 +887,18 @@
 76eca5cf31500ecb1d1807685729a7ea5c3780e7 icedtea-3.1.0pre02
 ea6933324a7a52379d7f1e18e7525fd619079865 icedtea-3.1.0pre03
 0f47eef348e2ec17dd6492886c57e23023fbc435 icedtea-3.1.0pre04
+24a09407d71bb2cc4848bfa21660c890b4d722b1 jdk8u92-b14
+445941ba41c0e3829fe02140690b144281ac2141 jdk8u92-b31
+b374548dcb4834eb8731a06b52faddd0f10bd45d jdk8u81-b00
+ead07188d11107e877e8e4ad215ff6cb238a8a92 jdk8u101-b01
+34429bad9986677f4991c80aeb22665842881cba jdk8u101-b02
+b41d5faaf1d32ed1bf9592f65f2f94ddd4c60fc4 jdk8u101-b03
+ceecf88e5c2c09bfabf5926581e6d0b0f65f5148 jdk8u101-b04
+19e74265fc8def6a7fc96c836d8ebe38ad1cf199 jdk8u101-b05
+7c60503b0888ac16eac80a6cd074195973f8dedb jdk8u101-b06
+cb4af293fe70549b51039bb9197f373e6750fafb jdk8u101-b07
+8ed377d2cec94435d1617a37999960a24be73ad9 jdk8u101-b08
+9be452c4e7161e60d623d55bb72ad013386aefd1 jdk8u101-b09
+218a44a163fa8c2532fd5f2e8ea9bc3c9c2ca8cf jdk8u101-b10
+0095e54dcaa1acfe1614feff9600734c26af7ae8 jdk8u101-b11
+286fe17d81c3d153611a28e50926083ae934cc56 jdk8u101-b12
--- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -2318,6 +2318,13 @@
   emit_arith(0x0B, 0xC0, dst, src);
 }
 
+void Assembler::orl(Address dst, Register src) {
+  InstructionMark im(this);
+  prefix(dst, src);
+  emit_int8(0x09);
+  emit_operand(src, dst);
+}
+
 void Assembler::packuswb(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
@@ -5613,6 +5620,19 @@
   }
 }
 
+void Assembler::rcrq(Register dst, int imm8) {
+  assert(isShiftCount(imm8 >> 1), "illegal shift count");
+  int encode = prefixq_and_encode(dst->encoding());
+  if (imm8 == 1) {
+    emit_int8((unsigned char)0xD1);
+    emit_int8((unsigned char)(0xD8 | encode));
+  } else {
+    emit_int8((unsigned char)0xC1);
+    emit_int8((unsigned char)(0xD8 | encode));
+    emit_int8(imm8);
+  }
+}
+
 void Assembler::rorq(Register dst, int imm8) {
   assert(isShiftCount(imm8 >> 1), "illegal shift count");
   int encode = prefixq_and_encode(dst->encoding());
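
The new rcrq() follows the x86 rotate-group encoding: RCR r/m64, 1 is REX.W + D1 /3 and RCR r/m64, imm8 is REX.W + C1 /3 ib, so the 0xD8 | encode byte is the ModRM byte carrying the /3 extension. A trivial standalone check of that byte (the register encoding value is only an illustration):

    #include <cassert>
    #include <cstdint>

    int main() {
      // ModRM for the register form of RCR: mod = 11, reg = /3 (RCR), rm = register.
      const int rax_encoding = 0;                 // low 3 bits of RAX's encoding
      uint8_t modrm = (uint8_t)(0xD8 | rax_encoding);
      assert(modrm == 0xD8);                      // so rcrq(rax, 1) emits 48 D1 D8
      return 0;
    }
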
--- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1455,6 +1455,7 @@
   void orl(Register dst, int32_t imm32);
   void orl(Register dst, Address src);
   void orl(Register dst, Register src);
+  void orl(Address dst, Register src);
 
   void orq(Address dst, int32_t imm32);
   void orq(Register dst, int32_t imm32);
@@ -1555,6 +1556,8 @@
 
   void rclq(Register dst, int imm8);
 
+  void rcrq(Register dst, int imm8);
+
   void rdtsc();
 
   void ret(int imm16);
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -7769,6 +7769,503 @@
   pop(tmp2);
   pop(tmp1);
 }
+
+// Helper functions for square_to_len()
+
+/**
+ * Store the squares of x[], right shifted one bit (divided by 2) into z[]
+ * Preserves x and z and modifies rest of the registers.
+ */
+
+void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+  // Perform square and right shift by 1
+  // Handle odd xlen case first, then for even xlen do the following
+  // jlong carry = 0;
+  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
+  //     huge_128 product = x[j:j+1] * x[j:j+1];
+  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
+  //     z[i+2:i+3] = (jlong)(product >>> 1);
+  //     carry = (jlong)product;
+  // }
+
+  xorq(tmp5, tmp5);     // carry
+  xorq(rdxReg, rdxReg);
+  xorl(tmp1, tmp1);     // index for x
+  xorl(tmp4, tmp4);     // index for z
+
+  Label L_first_loop, L_first_loop_exit;
+
+  testl(xlen, 1);
+  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
+
+  // Square and right shift by 1 the odd element using 32 bit multiply
+  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
+  imulq(raxReg, raxReg);
+  shrq(raxReg, 1);
+  adcq(tmp5, 0);
+  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
+  incrementl(tmp1);
+  addl(tmp4, 2);
+
+  // Square and  right shift by 1 the rest using 64 bit multiply
+  bind(L_first_loop);
+  cmpptr(tmp1, xlen);
+  jccb(Assembler::equal, L_first_loop_exit);
+
+  // Square
+  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
+  rorq(raxReg, 32);    // convert big-endian to little-endian
+  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
+
+  // Right shift by 1 and save carry
+  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
+  rcrq(rdxReg, 1);
+  rcrq(raxReg, 1);
+  adcq(tmp5, 0);
+
+  // Store result in z
+  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
+  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
+
+  // Update indices for x and z
+  addl(tmp1, 2);
+  addl(tmp4, 4);
+  jmp(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
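
The shrq/rcrq/rcrq/adcq sequence above shifts the 129-bit quantity carry:rdx:rax right by one bit, and the bit that drops out of rax is saved back into the carry register for the next pair of words. A minimal standalone C++ model of one such step (register names mapped to plain variables; an illustration, not HotSpot code):

    #include <cassert>
    #include <cstdint>

    // One step of the shift: carry:hi:lo >>= 1, keeping the dropped bit in carry.
    static void rshift_step(uint64_t& carry, uint64_t& hi, uint64_t& lo) {
      unsigned cf = (unsigned)(carry & 1);        // shrq(tmp5, 1): CF = old carry bit
      carry >>= 1;
      unsigned next = (unsigned)(hi & 1);
      hi = (hi >> 1) | ((uint64_t)cf << 63);      // rcrq(rdxReg, 1)
      cf = next;
      next = (unsigned)(lo & 1);
      lo = (lo >> 1) | ((uint64_t)cf << 63);      // rcrq(raxReg, 1)
      cf = next;
      carry += cf;                                // adcq(tmp5, 0)
    }

    int main() {
      uint64_t carry = 1, hi = 0x2, lo = 0x5;
      rshift_step(carry, hi, lo);
      assert(carry == 1 && hi == 0x8000000000000001ULL && lo == 0x2ULL);
      return 0;
    }
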
+
+
+/**
+ * Perform the following multiply add operation using BMI2 instructions
+ * carry:sum = sum + op1*op2 + carry
+ * op2 should be in rdx
+ * op2 is preserved, all other registers are modified
+ */
+void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
+  // assert op2 is rdx
+  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
+  addq(sum, carry);
+  adcq(tmp2, 0);
+  addq(sum, op1);
+  adcq(tmp2, 0);
+  movq(carry, tmp2);
+}
+
+/**
+ * Perform the following multiply add operation:
+ * carry:sum = sum + op1*op2 + carry
+ * Preserves op1, op2 and modifies rest of registers
+ */
+void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
+  // rdx:rax = op1 * op2
+  movq(raxReg, op2);
+  mulq(op1);
+
+  //  rdx:rax = sum + carry + rdx:rax
+  addq(sum, carry);
+  adcq(rdxReg, 0);
+  addq(sum, raxReg);
+  adcq(rdxReg, 0);
+
+  // carry:sum = rdx:sum
+  movq(carry, rdxReg);
+}
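
Both multiply-add helpers above compute the same thing, carry:sum = sum + op1*op2 + carry, i.e. a 64x64->128-bit multiply folded into a running accumulator that never overflows 128 bits. A compiler-level sketch of that identity, assuming a GCC/Clang-style unsigned __int128:

    #include <cassert>
    #include <cstdint>

    // carry:sum = sum + op1*op2 + carry; the result always fits in 128 bits.
    static void multiply_add_64_model(uint64_t& sum, uint64_t op1, uint64_t op2,
                                      uint64_t& carry) {
      unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
      sum   = (uint64_t)t;           // low 64 bits, written back to z[]
      carry = (uint64_t)(t >> 64);   // high 64 bits, fed into the next step
    }

    int main() {
      uint64_t sum = ~0ULL, carry = ~0ULL;
      multiply_add_64_model(sum, ~0ULL, ~0ULL, carry);   // worst case: 2^128 - 1
      assert(sum == ~0ULL && carry == ~0ULL);
      return 0;
    }
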
+
+/**
+ * Add 64 bit long carry into z[] with carry propagation.
+ * Preserves z and carry register values and modifies rest of registers.
+ *
+ */
+void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
+  Label L_fourth_loop, L_fourth_loop_exit;
+
+  movl(tmp1, 1);
+  subl(zlen, 2);
+  addq(Address(z, zlen, Address::times_4, 0), carry);
+
+  bind(L_fourth_loop);
+  jccb(Assembler::carryClear, L_fourth_loop_exit);
+  subl(zlen, 2);
+  jccb(Assembler::negative, L_fourth_loop_exit);
+  addq(Address(z, zlen, Address::times_4, 0), tmp1);
+  jmp(L_fourth_loop);
+  bind(L_fourth_loop_exit);
+}
+
+/**
+ * Shift z[] left by 1 bit.
+ * Preserves x, len, z and zlen registers and modifies rest of the registers.
+ *
+ */
+void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
+
+  Label L_fifth_loop, L_fifth_loop_exit;
+
+  // Fifth loop
+  // Perform primitiveLeftShift(z, zlen, 1)
+
+  const Register prev_carry = tmp1;
+  const Register new_carry = tmp4;
+  const Register value = tmp2;
+  const Register zidx = tmp3;
+
+  // int zidx, carry;
+  // long value;
+  // carry = 0;
+  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
+  //    (carry:value)  = (z[i] << 1) | carry ;
+  //    z[i] = value;
+  // }
+
+  movl(zidx, zlen);
+  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
+
+  bind(L_fifth_loop);
+  decl(zidx);  // Use decl to preserve carry flag
+  decl(zidx);
+  jccb(Assembler::negative, L_fifth_loop_exit);
+
+  if (UseBMI2Instructions) {
+     movq(value, Address(z, zidx, Address::times_4, 0));
+     rclq(value, 1);
+     rorxq(value, value, 32);
+     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
+  }
+  else {
+    // clear new_carry
+    xorl(new_carry, new_carry);
+
+    // Shift z[i] by 1, or in previous carry and save new carry
+    movq(value, Address(z, zidx, Address::times_4, 0));
+    shlq(value, 1);
+    adcl(new_carry, 0);
+
+    orq(value, prev_carry);
+    rorq(value, 0x20);
+    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
+
+    // Set previous carry = new carry
+    movl(prev_carry, new_carry);
+  }
+  jmp(L_fifth_loop);
+
+  bind(L_fifth_loop_exit);
+}
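
lshift_by_1 is the final doubling pass of the squaring algorithm: each limb is shifted left one bit and the bit shifted out of one limb is carried into the next. A minimal model with the limbs stored least-significant first for readability (the stub walks the big-endian int[] layout and byte-swaps each 64-bit word, which is omitted here):

    #include <cassert>
    #include <cstdint>

    // Shift an n-limb number (least-significant limb first) left by one bit.
    static void lshift1(uint64_t z[], int n) {
      uint64_t carry = 0;
      for (int i = 0; i < n; i++) {
        uint64_t next_carry = z[i] >> 63;   // bit that moves into the next limb
        z[i] = (z[i] << 1) | carry;         // rclq(value, 1) in the BMI2 path
        carry = next_carry;
      }
    }

    int main() {
      uint64_t z[2] = { 0x8000000000000001ULL, 0x1ULL };   // 2^64 + 2^63 + 1
      lshift1(z, 2);
      assert(z[0] == 0x2ULL && z[1] == 0x3ULL);            // doubled: 2^65 + 2^64 + 2
      return 0;
    }
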
+
+
+/**
+ * Code for BigInteger::squareToLen() intrinsic
+ *
+ * rdi: x
+ * rsi: len
+ * r8:  z
+ * rcx: zlen
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ *
+ */
+void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+
+  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
+  push(tmp1);
+  push(tmp2);
+  push(tmp3);
+  push(tmp4);
+  push(tmp5);
+
+  // First loop
+  // Store the squares, right shifted one bit (i.e., divided by 2).
+  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
+
+  // Add in off-diagonal sums.
+  //
+  // Second, third (nested) and fourth loops.
+  // zlen +=2;
+  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
+  //    carry = 0;
+  //    long op2 = x[xidx:xidx+1];
+  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
+  //       k -= 2;
+  //       long op1 = x[j:j+1];
+  //       long sum = z[k:k+1];
+  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
+  //       z[k:k+1] = sum;
+  //    }
+  //    add_one_64(z, k, carry, tmp_regs);
+  // }
+
+  const Register carry = tmp5;
+  const Register sum = tmp3;
+  const Register op1 = tmp4;
+  Register op2 = tmp2;
+
+  push(zlen);
+  push(len);
+  addl(zlen,2);
+  bind(L_second_loop);
+  xorq(carry, carry);
+  subl(zlen, 4);
+  subl(len, 2);
+  push(zlen);
+  push(len);
+  cmpl(len, 0);
+  jccb(Assembler::lessEqual, L_second_loop_exit);
+
+  // Multiply an array by one 64 bit long.
+  if (UseBMI2Instructions) {
+    op2 = rdxReg;
+    movq(op2, Address(x, len, Address::times_4,  0));
+    rorxq(op2, op2, 32);
+  }
+  else {
+    movq(op2, Address(x, len, Address::times_4,  0));
+    rorq(op2, 32);
+  }
+
+  bind(L_third_loop);
+  decrementl(len);
+  jccb(Assembler::negative, L_third_loop_exit);
+  decrementl(len);
+  jccb(Assembler::negative, L_last_x);
+
+  movq(op1, Address(x, len, Address::times_4,  0));
+  rorq(op1, 32);
+
+  bind(L_multiply);
+  subl(zlen, 2);
+  movq(sum, Address(z, zlen, Address::times_4,  0));
+
+  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
+  if (UseBMI2Instructions) {
+    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
+  }
+  else {
+    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+  }
+
+  movq(Address(z, zlen, Address::times_4, 0), sum);
+
+  jmp(L_third_loop);
+  bind(L_third_loop_exit);
+
+  // Fourth loop
+  // Add 64 bit long carry into z with carry propagation.
+  // Uses the offset-adjusted zlen.
+  add_one_64(z, zlen, carry, tmp1);
+
+  pop(len);
+  pop(zlen);
+  jmp(L_second_loop);
+
+  // The following infrequently executed code is moved outside the loops.
+  bind(L_last_x);
+  movl(op1, Address(x, 0));
+  jmp(L_multiply);
+
+  bind(L_second_loop_exit);
+  pop(len);
+  pop(zlen);
+  pop(len);
+  pop(zlen);
+
+  // Fifth loop
+  // Shift z left 1 bit.
+  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
+
+  // z[zlen-1] |= x[len-1] & 1;
+  movl(tmp3, Address(x, len, Address::times_4, -4));
+  andl(tmp3, 1);
+  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
+
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
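
Taken together, the three phases exploit x^2 = (concatenated squares of the limbs) + 2*(off-diagonal products): everything is computed on right-shifted (halved) values and doubled at the end, which is why the low bit has to be restored separately. A self-contained two-limb (base 2^32) check of that decomposition, assuming GCC/Clang unsigned __int128:

    #include <cassert>
    #include <cstdint>

    // Square x1:x0 (x1 is the more significant base-2^32 limb) via the three phases.
    static unsigned __int128 square_two_limbs(uint32_t x1, uint32_t x0) {
      // Phase 1: concatenated squares of the limbs, right-shifted one bit.
      unsigned __int128 acc = (((unsigned __int128)((uint64_t)x1 * x1) << 64)
                               | (uint64_t)x0 * x0) >> 1;
      // Phase 2: add the (already halved) off-diagonal term x1*x0 at its position.
      acc += (unsigned __int128)((uint64_t)x1 * x0) << 32;
      // Phase 3: shift left one bit and restore the low bit (z[zlen-1] |= x[len-1] & 1).
      return (acc << 1) | (x0 & 1);
    }

    int main() {
      uint64_t x = 0xFEDCBA9876543210ULL;
      assert(square_two_limbs((uint32_t)(x >> 32), (uint32_t)x)
             == (unsigned __int128)x * x);
      return 0;
    }
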
+
+/**
+ * Helper function for mul_add()
+ * Multiply the in[] by int k and add to out[] starting at offset offs using
+ * 128 bit by 32 bit multiply and return the carry in tmp5.
+ * Only the quad-int-aligned portion of in[] is operated on in this function.
+ * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
+ * This function preserves the out, in and k registers.
+ * len and offset point to the appropriate index in "in" and "out" respectively.
+ * tmp5 holds the carry.
+ * other registers are temporary and are modified.
+ *
+ */
+void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
+  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
+  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+
+  Label L_first_loop, L_first_loop_exit;
+
+  movl(tmp1, len);
+  shrl(tmp1, 2);
+
+  bind(L_first_loop);
+  subl(tmp1, 1);
+  jccb(Assembler::negative, L_first_loop_exit);
+
+  subl(len, 4);
+  subl(offset, 4);
+
+  Register op2 = tmp2;
+  const Register sum = tmp3;
+  const Register op1 = tmp4;
+  const Register carry = tmp5;
+
+  if (UseBMI2Instructions) {
+    op2 = rdxReg;
+  }
+
+  movq(op1, Address(in, len, Address::times_4,  8));
+  rorq(op1, 32);
+  movq(sum, Address(out, offset, Address::times_4,  8));
+  rorq(sum, 32);
+  if (UseBMI2Instructions) {
+    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+  }
+  else {
+    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+  }
+  // Store back in big endian from little endian
+  rorq(sum, 0x20);
+  movq(Address(out, offset, Address::times_4,  8), sum);
+
+  movq(op1, Address(in, len, Address::times_4,  0));
+  rorq(op1, 32);
+  movq(sum, Address(out, offset, Address::times_4,  0));
+  rorq(sum, 32);
+  if (UseBMI2Instructions) {
+    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+  }
+  else {
+    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+  }
+  // Store back in big endian from little endian
+  rorq(sum, 0x20);
+  movq(Address(out, offset, Address::times_4,  0), sum);
+
+  jmp(L_first_loop);
+  bind(L_first_loop_exit);
+}
+
+/**
+ * Code for BigInteger::mulAdd() intrinsic
+ *
+ * rdi: out
+ * rsi: in
+ * r11: offs (out.length - offset)
+ * rcx: len
+ * r8:  k
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ * Multiply the in[] by word k and add to out[], return the carry in rax
+ */
+void MacroAssembler::mul_add(Register out, Register in, Register offs,
+   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
+   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+
+  Label L_carry, L_last_in, L_done;
+
+// carry = 0;
+// for (int j=len-1; j >= 0; j--) {
+//    long product = (in[j] & LONG_MASK) * kLong +
+//                   (out[offs] & LONG_MASK) + carry;
+//    out[offs--] = (int)product;
+//    carry = product >>> 32;
+// }
+//
+  push(tmp1);
+  push(tmp2);
+  push(tmp3);
+  push(tmp4);
+  push(tmp5);
+
+  Register op2 = tmp2;
+  const Register sum = tmp3;
+  const Register op1 = tmp4;
+  const Register carry =  tmp5;
+
+  if (UseBMI2Instructions) {
+    op2 = rdxReg;
+    movl(op2, k);
+  }
+  else {
+    movl(op2, k);
+  }
+
+  xorq(carry, carry);
+
+  // First loop
+
+  // Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
+  // The carry is in tmp5
+  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
+
+  // Multiply the trailing in[] entry using 64 bit by 32 bit, if any
+  decrementl(len);
+  jccb(Assembler::negative, L_carry);
+  decrementl(len);
+  jccb(Assembler::negative, L_last_in);
+
+  movq(op1, Address(in, len, Address::times_4,  0));
+  rorq(op1, 32);
+
+  subl(offs, 2);
+  movq(sum, Address(out, offs, Address::times_4,  0));
+  rorq(sum, 32);
+
+  if (UseBMI2Instructions) {
+    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+  }
+  else {
+    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+  }
+
+  // Store back in big endian from little endian
+  rorq(sum, 0x20);
+  movq(Address(out, offs, Address::times_4,  0), sum);
+
+  testl(len, len);
+  jccb(Assembler::zero, L_carry);
+
+  // Multiply the last in[] entry, if any
+  bind(L_last_in);
+  movl(op1, Address(in, 0));
+  movl(sum, Address(out, offs, Address::times_4,  -4));
+
+  movl(raxReg, k);
+  mull(op1); //tmp4 * eax -> edx:eax
+  addl(sum, carry);
+  adcl(rdxReg, 0);
+  addl(sum, raxReg);
+  adcl(rdxReg, 0);
+  movl(carry, rdxReg);
+
+  movl(Address(out, offs, Address::times_4,  -4), sum);
+
+  bind(L_carry);
+  // Return tmp5/carry as the carry in rax
+  movl(rax, carry);
+
+  bind(L_done);
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
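
The Java-level contract implemented here is the one spelled out in the comment at the top of mul_add: out[offs], out[offs-1], ... accumulate in[]*k and the final carry word is returned in rax. A plain C++ reference model of that contract, with the 4-way unrolling and the BMI2 path left out:

    #include <cassert>
    #include <cstdint>

    // out[offs-len+1 .. offs] += in[0 .. len-1] * k; limbs are big-endian 32-bit words.
    static uint32_t mul_add_model(uint32_t out[], const uint32_t in[],
                                  int offs, int len, uint32_t k) {
      uint64_t carry = 0;
      for (int j = len - 1; j >= 0; j--) {
        uint64_t product = (uint64_t)in[j] * k + out[offs] + carry;
        out[offs--] = (uint32_t)product;
        carry = product >> 32;
      }
      return (uint32_t)carry;   // the stub returns this in rax
    }

    int main() {
      uint32_t out[3] = { 0, 0xFFFFFFFFu, 0xFFFFFFFFu };   // low two limbs are all ones
      uint32_t in[2]  = { 0, 1 };                          // the value 1
      uint32_t c = mul_add_model(out, in, 2, 2, 5);        // add 1 * 5
      assert(c == 1 && out[2] == 4 && out[1] == 0 && out[0] == 0);
      return 0;
    }
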
 #endif
 
 /**
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1241,6 +1241,25 @@
                                Register carry2);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+
+  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
+                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
+                            Register tmp2);
+  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
+                       Register rdxReg, Register raxReg);
+  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
+  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+                       Register tmp3, Register tmp4);
+  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+
+  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
+               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+               Register raxReg);
+  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
+               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+               Register raxReg);
 #endif
 
  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
--- a/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -23,6 +23,9 @@
  */
 
 #include "precompiled.hpp"
+#ifndef _WINDOWS
+#include "alloca.h"
+#endif
 #include "asm/macroAssembler.hpp"
 #include "asm/macroAssembler.inline.hpp"
 #include "code/debugInfoRec.hpp"
@@ -3966,6 +3969,256 @@
 }
 
 
+//------------------------------Montgomery multiplication------------------------
+//
+
+#ifndef _WINDOWS
+
+#define ASM_SUBTRACT
+
+#ifdef ASM_SUBTRACT
+// Subtract 0:b from carry:a.  Return carry.
+static unsigned long
+sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
+  long i = 0, cnt = len;
+  unsigned long tmp;
+  asm volatile("clc; "
+               "0: ; "
+               "mov (%[b], %[i], 8), %[tmp]; "
+               "sbb %[tmp], (%[a], %[i], 8); "
+               "inc %[i]; dec %[cnt]; "
+               "jne 0b; "
+               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
+               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
+               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
+               : "memory");
+  return tmp;
+}
+#else // ASM_SUBTRACT
+typedef int __attribute__((mode(TI))) int128;
+
+// Subtract 0:b from carry:a.  Return carry.
+static unsigned long
+sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
+  int128 tmp = 0;
+  int i;
+  for (i = 0; i < len; i++) {
+    tmp += a[i];
+    tmp -= b[i];
+    a[i] = tmp;
+    tmp >>= 64;
+    assert(-1 <= tmp && tmp <= 0, "invariant");
+  }
+  return tmp + carry;
+}
+#endif // ! ASM_SUBTRACT
+
+// Multiply (unsigned) Long A by Long B, accumulating the double-
+// length result into the accumulator formed of T0, T1, and T2.
+#define MACC(A, B, T0, T1, T2)                                      \
+do {                                                                \
+  unsigned long hi, lo;                                             \
+  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
+           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
+           : "r"(A), "a"(B) : "cc");                                \
+ } while(0)
+
+// As above, but add twice the double-length result into the
+// accumulator.
+#define MACC2(A, B, T0, T1, T2)                                     \
+do {                                                                \
+  unsigned long hi, lo;                                             \
+  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;"  \
+           "add %%rax, %2; adc %%rdx, %3; adc $0, %4"               \
+           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
+           : "r"(A), "a"(B) : "cc");                                \
+ } while(0)
+
+// Fast Montgomery multiplication.  The derivation of the algorithm is
+// in  A Cryptographic Library for the Motorola DSP56000,
+// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
+
+static void __attribute__((noinline))
+montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
+                    unsigned long m[], unsigned long inv, int len) {
+  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+  int i;
+
+  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+  for (i = 0; i < len; i++) {
+    int j;
+    for (j = 0; j < i; j++) {
+      MACC(a[j], b[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    MACC(a[i], b[0], t0, t1, t2);
+    m[i] = t0 * inv;
+    MACC(m[i], n[0], t0, t1, t2);
+
+    assert(t0 == 0, "broken Montgomery multiply");
+
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  for (i = len; i < 2*len; i++) {
+    int j;
+    for (j = i-len+1; j < len; j++) {
+      MACC(a[j], b[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i-len] = t0;
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  while (t0)
+    t0 = sub(m, n, t0, len);
+}
+
+// Fast Montgomery squaring.  This uses asymptotically 25% fewer
+// multiplies so it should be up to 25% faster than Montgomery
+// multiplication.  However, its loop control is more complex and it
+// may actually run slower on some machines.
+
+static void __attribute__((noinline))
+montgomery_square(unsigned long a[], unsigned long n[],
+                  unsigned long m[], unsigned long inv, int len) {
+  unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+  int i;
+
+  assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+
+  for (i = 0; i < len; i++) {
+    int j;
+    int end = (i+1)/2;
+    for (j = 0; j < end; j++) {
+      MACC2(a[j], a[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    if ((i & 1) == 0) {
+      MACC(a[j], a[j], t0, t1, t2);
+    }
+    for (; j < i; j++) {
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i] = t0 * inv;
+    MACC(m[i], n[0], t0, t1, t2);
+
+    assert(t0 == 0, "broken Montgomery square");
+
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  for (i = len; i < 2*len; i++) {
+    int start = i-len+1;
+    int end = start + (len - start)/2;
+    int j;
+    for (j = start; j < end; j++) {
+      MACC2(a[j], a[i-j], t0, t1, t2);
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    if ((i & 1) == 0) {
+      MACC(a[j], a[j], t0, t1, t2);
+    }
+    for (; j < len; j++) {
+      MACC(m[j], n[i-j], t0, t1, t2);
+    }
+    m[i-len] = t0;
+    t0 = t1; t1 = t2; t2 = 0;
+  }
+
+  while (t0)
+    t0 = sub(m, n, t0, len);
+}
+
+// Swap words in a longword.
+static unsigned long swap(unsigned long x) {
+  return (x << 32) | (x >> 32);
+}
+
+// Copy len longwords from s to d, word-swapping as we go.  The
+// destination array is reversed.
+static void reverse_words(unsigned long *s, unsigned long *d, int len) {
+  d += len;
+  while(len-- > 0) {
+    d--;
+    *d = swap(*s);
+    s++;
+  }
+}
+
+// The threshold at which squaring is advantageous was determined
+// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
+#define MONTGOMERY_SQUARING_THRESHOLD 64
+
+void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
+                                        jint len, jlong inv,
+                                        jint *m_ints) {
+  assert(len % 2 == 0, "array length in montgomery_multiply must be even");
+  int longwords = len/2;
+
+  // Make very sure we don't use so much space that the stack might
+  // overflow.  512 jints correspond to a 16384-bit integer and
+  // will use a total of 8K bytes of stack space here.
+  int total_allocation = longwords * sizeof (unsigned long) * 4;
+  guarantee(total_allocation <= 8192, "must be");
+  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+
+  // Local scratch arrays
+  unsigned long
+    *a = scratch + 0 * longwords,
+    *b = scratch + 1 * longwords,
+    *n = scratch + 2 * longwords,
+    *m = scratch + 3 * longwords;
+
+  reverse_words((unsigned long *)a_ints, a, longwords);
+  reverse_words((unsigned long *)b_ints, b, longwords);
+  reverse_words((unsigned long *)n_ints, n, longwords);
+
+  ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
+
+  reverse_words(m, (unsigned long *)m_ints, longwords);
+}
+
+void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
+                                      jint len, jlong inv,
+                                      jint *m_ints) {
+  assert(len % 2 == 0, "array length in montgomery_square must be even");
+  int longwords = len/2;
+
+  // Make very sure we don't use so much space that the stack might
+  // overflow.  512 jints correspond to a 16384-bit integer and
+  // will use a total of 6K bytes of stack space here.
+  int total_allocation = longwords * sizeof (unsigned long) * 3;
+  guarantee(total_allocation <= 8192, "must be");
+  unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+
+  // Local scratch arrays
+  unsigned long
+    *a = scratch + 0 * longwords,
+    *n = scratch + 1 * longwords,
+    *m = scratch + 2 * longwords;
+
+  reverse_words((unsigned long *)a_ints, a, longwords);
+  reverse_words((unsigned long *)n_ints, n, longwords);
+
+  // montgomery_square fails to pass BigIntegerTest on Solaris amd64
+  // on JDK 7 and JDK 8.
+#ifndef SOLARIS
+  if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
+#else
+  if (0) {
+#endif
+    ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
+  } else {
+    ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
+  }
+
+  reverse_words(m, (unsigned long *)m_ints, longwords);
+}
+
+#endif // WINDOWS
+
 #ifdef COMPILER2
 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
 //
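
The Montgomery invariant relied on above is that inv is the negated inverse of n[0] modulo 2^64 (hence the assert inv * n[0] == -1UL): adding m[i]*n makes the lowest accumulator word zero, so the running product can be divided by 2^64 at every step. A minimal single-word REDC sketch of the same idea at 32-bit scale, assuming GCC/Clang unsigned __int128:

    #include <cassert>
    #include <cstdint>

    // REDC with R = 2^32: returns t * R^-1 mod n for t < n*R, given inv = -n^-1 mod R.
    static uint32_t redc(uint64_t t, uint32_t n, uint32_t inv) {
      uint32_t m = (uint32_t)t * inv;   // chosen so that t + m*n is divisible by R
      uint64_t u = (uint64_t)(((unsigned __int128)m * n + t) >> 32);
      return (uint32_t)(u >= n ? u - n : u);
    }

    int main() {
      const uint32_t n = 0xFFFFFFFBu;                  // an odd modulus
      uint32_t ninv = n;                               // Newton iteration for n^-1 mod 2^32
      for (int i = 0; i < 5; i++) ninv *= 2 - n * ninv;
      const uint32_t inv = 0u - ninv;                  // -n^-1 mod 2^32
      const uint64_t R = 1ULL << 32;
      uint32_t a = 123456789u, b = 987654321u;
      uint64_t aR = (uint64_t)a * R % n, bR = (uint64_t)b * R % n;   // Montgomery form
      uint32_t abR = redc(aR * bR, n, inv);            // = a*b*R mod n
      assert(redc(abR, n, inv) == (uint64_t)a * b % n);
      return 0;
    }
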
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -3743,6 +3743,107 @@
     return start;
   }
 
+/**
+   *  Arguments:
+   *
+  //  Input:
+  //    c_rarg0   - x address
+  //    c_rarg1   - x length
+  //    c_rarg2   - z address
+  //    c_rarg3   - z length
+   *
+   */
+  address generate_squareToLen() {
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+    address start = __ pc();
+    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
+    const Register x      = rdi;
+    const Register len    = rsi;
+    const Register z      = r8;
+    const Register zlen   = rcx;
+
+   const Register tmp1      = r12;
+   const Register tmp2      = r13;
+   const Register tmp3      = r14;
+   const Register tmp4      = r15;
+   const Register tmp5      = rbx;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+       setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
+                          // zlen => rcx
+                          // r9 and r10 may be used to save non-volatile registers
+    __ movptr(r8, rdx);
+    __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
+
+    restore_arg_regs();
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+   /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - out address
+   *    c_rarg1   - in address
+   *    c_rarg2   - offset
+   *    c_rarg3   - len
+   * not Win64
+   *    c_rarg4   - k
+   * Win64
+   *    rsp+40    - k
+   */
+  address generate_mulAdd() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+    address start = __ pc();
+    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register out     = rdi;
+    const Register in      = rsi;
+    const Register offset  = r11;
+    const Register len     = rcx;
+    const Register k       = r8;
+
+    // The following registers will be saved on the stack in mul_add().
+    const Register tmp1  = r12;
+    const Register tmp2  = r13;
+    const Register tmp3  = r14;
+    const Register tmp4  = r15;
+    const Register tmp5  = rbx;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
+                       // len => rcx, k => r8
+                       // r9 and r10 may be used to save non-volatile registers
+#ifdef _WIN64
+    // last argument is on stack on Win64
+    __ movl(k, Address(rsp, 6 * wordSize));
+#endif
+    __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
+    __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
+
+    restore_arg_regs();
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
+
 #undef __
 #define __ masm->
 
@@ -3987,7 +4088,24 @@
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
-#endif
+    if (UseSquareToLenIntrinsic) {
+      StubRoutines::_squareToLen = generate_squareToLen();
+    }
+    if (UseMulAddIntrinsic) {
+      StubRoutines::_mulAdd = generate_mulAdd();
+    }
+
+#ifndef _WINDOWS
+    if (UseMontgomeryMultiplyIntrinsic) {
+      StubRoutines::_montgomeryMultiply
+        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
+    }
+    if (UseMontgomerySquareIntrinsic) {
+      StubRoutines::_montgomerySquare
+        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
+    }
+#endif // WINDOWS
+#endif // COMPILER2
   }
 
  public:
--- a/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 23000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -703,6 +703,18 @@
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }
+  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+    UseSquareToLenIntrinsic = false;
+  }
+  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+    UseMulAddIntrinsic = false;
+  }
+  if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
+    UseMontgomeryMultiplyIntrinsic = false;
+  }
+  if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
+    UseMontgomerySquareIntrinsic = false;
+  }
 #else
   if (UseMultiplyToLenIntrinsic) {
     if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
@@ -710,6 +722,30 @@
     }
     FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
   }
+  if (UseSquareToLenIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+      warning("squareToLen intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false);
+  }
+  if (UseMulAddIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+      warning("mulAdd intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseMulAddIntrinsic, false);
+  }
+  if (UseMontgomeryMultiplyIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
+      warning("montgomeryMultiply intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, false);
+  }
+  if (UseMontgomerySquareIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
+      warning("montgomerySquare intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, false);
+  }
 #endif
 #endif // COMPILER2
 
--- a/src/share/vm/classfile/dictionary.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/dictionary.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "classfile/dictionary.hpp"
 #include "classfile/systemDictionary.hpp"
+#include "classfile/systemDictionaryShared.hpp"
 #include "memory/iterator.hpp"
 #include "oops/oop.inline.hpp"
 #include "prims/jvmtiRedefineClassesTrace.hpp"
@@ -36,9 +37,16 @@
 DictionaryEntry*  Dictionary::_current_class_entry = NULL;
 int               Dictionary::_current_class_index =    0;
 
+size_t Dictionary::entry_size() {
+  if (DumpSharedSpaces) {
+    return SystemDictionaryShared::dictionary_entry_size();
+  } else {
+    return sizeof(DictionaryEntry);
+  }
+}
 
 Dictionary::Dictionary(int table_size)
-  : TwoOopHashtable<Klass*, mtClass>(table_size, sizeof(DictionaryEntry)) {
+  : TwoOopHashtable<Klass*, mtClass>(table_size, (int)entry_size()) {
   _current_class_index = 0;
   _current_class_entry = NULL;
   _pd_cache_table = new ProtectionDomainCacheTable(defaultProtectionDomainCacheSize);
@@ -47,7 +55,7 @@
 
 Dictionary::Dictionary(int table_size, HashtableBucket<mtClass>* t,
                        int number_of_entries)
-  : TwoOopHashtable<Klass*, mtClass>(table_size, sizeof(DictionaryEntry), t, number_of_entries) {
+  : TwoOopHashtable<Klass*, mtClass>(table_size, (int)entry_size(), t, number_of_entries) {
   _current_class_index = 0;
   _current_class_entry = NULL;
   _pd_cache_table = new ProtectionDomainCacheTable(defaultProtectionDomainCacheSize);
@@ -63,6 +71,9 @@
   entry->set_loader_data(loader_data);
   entry->set_pd_set(NULL);
   assert(klass->oop_is_instance(), "Must be");
+  if (DumpSharedSpaces) {
+    SystemDictionaryShared::init_shared_dictionary_entry(klass, entry);
+  }
   return entry;
 }
 
--- a/src/share/vm/classfile/dictionary.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/dictionary.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,6 +53,7 @@
   DictionaryEntry* get_entry(int index, unsigned int hash,
                              Symbol* name, ClassLoaderData* loader_data);
 
+protected:
   DictionaryEntry* bucket(int i) {
     return (DictionaryEntry*)Hashtable<Klass*, mtClass>::bucket(i);
   }
@@ -66,6 +67,8 @@
     Hashtable<Klass*, mtClass>::add_entry(index, (HashtableEntry<Klass*, mtClass>*)new_entry);
   }
 
+  static size_t entry_size();
+
 public:
   Dictionary(int table_size);
   Dictionary(int table_size, HashtableBucket<mtClass>* t, int number_of_entries);
--- a/src/share/vm/classfile/systemDictionary.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/systemDictionary.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1198,8 +1198,13 @@
 
     if (ik->super() != NULL) {
       Symbol*  cn = ik->super()->name();
-      resolve_super_or_fail(class_name, cn,
-                            class_loader, protection_domain, true, CHECK_(nh));
+      Klass *s = resolve_super_or_fail(class_name, cn,
+                                       class_loader, protection_domain, true, CHECK_(nh));
+      if (s != ik->super()) {
+        // The dynamically resolved super class is not the same as the one we used during dump time,
+        // so we cannot use ik.
+        return nh;
+      }
     }
 
     Array<Klass*>* interfaces = ik->local_interfaces();
@@ -1212,7 +1217,12 @@
       // reinitialized yet (they will be once the interface classes
       // are loaded)
       Symbol*  name  = k->name();
-      resolve_super_or_fail(class_name, name, class_loader, protection_domain, false, CHECK_(nh));
+      Klass* i = resolve_super_or_fail(class_name, name, class_loader, protection_domain, false, CHECK_(nh));
+      if (k != i) {
+        // The dynamically resolved interface class is not the same as the one we used during dump time,
+        // so we cannot use ik.
+        return nh;
+      }
     }
 
     // Adjust methods to recover missing data.  They need addresses for
--- a/src/share/vm/classfile/systemDictionaryShared.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/systemDictionaryShared.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,6 +26,7 @@
 #ifndef SHARE_VM_CLASSFILE_SYSTEMDICTIONARYSHARED_HPP
 #define SHARE_VM_CLASSFILE_SYSTEMDICTIONARYSHARED_HPP
 
+#include "classfile/dictionary.hpp"
 #include "classfile/systemDictionary.hpp"
 
 class SystemDictionaryShared: public SystemDictionary {
@@ -42,6 +43,22 @@
     oop class_loader = loader_data->class_loader();
     return (class_loader == NULL);
   }
+
+  static size_t dictionary_entry_size() {
+    return sizeof(DictionaryEntry);
+  }
+  static void init_shared_dictionary_entry(Klass* k, DictionaryEntry* entry) {}
+
+  // The (non-application) CDS implementation supports only classes in the boot
+  // class loader, which ensures that the verification dependencies are the same
+  // during archive creation time and runtime. Thus we can do the dependency checks
+  // entirely during archive creation time.
+  static void add_verification_dependency(Klass* k, Symbol* accessor_clsname,
+                                          Symbol* target_clsname) {}
+  static void finalize_verification_dependencies() {}
+  static bool check_verification_dependencies(Klass* k, Handle class_loader,
+                                              Handle protection_domain,
+                                              char** message_buffer, TRAPS) {return true;}
 };
 
 #endif // SHARE_VM_CLASSFILE_SYSTEMDICTIONARYSHARED_HPP
--- a/src/share/vm/classfile/verificationType.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/verificationType.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -24,6 +24,7 @@
 
 #include "precompiled.hpp"
 #include "classfile/symbolTable.hpp"
+#include "classfile/systemDictionaryShared.hpp"
 #include "classfile/verificationType.hpp"
 #include "classfile/verifier.hpp"
 
@@ -73,7 +74,23 @@
       Klass* from_class = SystemDictionary::resolve_or_fail(
           from.name(), Handle(THREAD, klass->class_loader()),
           Handle(THREAD, klass->protection_domain()), true, CHECK_false);
-      return InstanceKlass::cast(from_class)->is_subclass_of(this_class());
+      bool result = InstanceKlass::cast(from_class)->is_subclass_of(this_class());
+      if (result && DumpSharedSpaces) {
+        if (klass()->is_subclass_of(from_class) && klass()->is_subclass_of(this_class())) {
+          // No need to save verification dependency. At run time, <klass> will be
+          // loaded from the archive only if <from_class> and <this_class> are
+          // also loaded from the archive. I.e., all 3 classes are exactly the same
+          // as we saw at archive creation time.
+        } else {
+          // Save the dependency. At run time, we need to check that the condition
+          // from_class->is_subclass_of(this_class()) is still true.
+          Symbol* accessor_clsname = from.name();
+          Symbol* target_clsname = this_class()->name();
+          SystemDictionaryShared::add_verification_dependency(klass(),
+                       accessor_clsname, target_clsname);
+        }
+      }
+      return result;
     }
   } else if (is_array() && from.is_array()) {
     VerificationType comp_this = get_component(context, CHECK_false);
--- a/src/share/vm/classfile/verifier.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/verifier.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -2326,9 +2326,17 @@
       case Bytecodes::_ifnonnull:
         target = bcs.dest();
         if (visited_branches->contains(bci)) {
-          if (bci_stack->is_empty()) return true;
-          // Pop a bytecode starting offset and scan from there.
-          bcs.set_start(bci_stack->pop());
+          if (bci_stack->is_empty()) {
+            if (handler_stack->is_empty()) {
+              return true;
+            } else {
+              // Parse the catch handlers for try blocks containing athrow.
+              bcs.set_start(handler_stack->pop());
+            }
+          } else {
+            // Pop a bytecode starting offset and scan from there.
+            bcs.set_start(bci_stack->pop());
+          }
         } else {
           if (target > bci) { // forward branch
             if (target >= code_length) return false;
@@ -2351,9 +2359,17 @@
       case Bytecodes::_goto_w:
         target = (opcode == Bytecodes::_goto ? bcs.dest() : bcs.dest_w());
         if (visited_branches->contains(bci)) {
-          if (bci_stack->is_empty()) return true;
-          // Been here before, pop new starting offset from stack.
-          bcs.set_start(bci_stack->pop());
+          if (bci_stack->is_empty()) {
+            if (handler_stack->is_empty()) {
+              return true;
+            } else {
+              // Parse the catch handlers for try blocks containing athrow.
+              bcs.set_start(handler_stack->pop());
+            }
+          } else {
+            // Been here before, pop new starting offset from stack.
+            bcs.set_start(bci_stack->pop());
+          }
         } else {
           if (target >= code_length) return false;
           // Continue scanning from the target onward.
--- a/src/share/vm/classfile/vmSymbols.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/classfile/vmSymbols.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -793,10 +793,26 @@
    do_signature(encodeISOArray_signature,                        "([CI[BII)I")                                          \
                                                                                                                         \
   do_class(java_math_BigInteger,                      "java/math/BigInteger")                                           \
-  do_intrinsic(_multiplyToLen,      java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_R)             \
+  do_intrinsic(_multiplyToLen,      java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_S)             \
    do_name(     multiplyToLen_name,                             "multiplyToLen")                                        \
    do_signature(multiplyToLen_signature,                        "([II[II[I)[I")                                         \
                                                                                                                         \
+  do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S)                        \
+   do_name(     squareToLen_name,                             "implSquareToLen")                                        \
+   do_signature(squareToLen_signature,                        "([II[II)[I")                                             \
+                                                                                                                        \
+  do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S)                                       \
+   do_name(     mulAdd_name,                                  "implMulAdd")                                             \
+   do_signature(mulAdd_signature,                             "([I[IIII)I")                                             \
+                                                                                                                        \
+  do_intrinsic(_montgomeryMultiply,      java_math_BigInteger, montgomeryMultiply_name, montgomeryMultiply_signature, F_S) \
+   do_name(     montgomeryMultiply_name,                             "implMontgomeryMultiply")                          \
+   do_signature(montgomeryMultiply_signature,                        "([I[I[IIJ[I)[I")                                  \
+                                                                                                                        \
+  do_intrinsic(_montgomerySquare,      java_math_BigInteger, montgomerySquare_name, montgomerySquare_signature, F_S)    \
+   do_name(     montgomerySquare_name,                             "implMontgomerySquare")                              \
+   do_signature(montgomerySquare_signature,                        "([I[IIJ[I)[I")                                      \
+                                                                                                                        \
   /* java/lang/ref/Reference */                                                                                         \
   do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                         \
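
Decoded, the new method descriptors correspond to the private static helpers in java.math.BigInteger that these intrinsics replace ([I is int[], I is int, J is long; F_S marks a static method). The Java parameter names below are illustrative; only the method names and types come from the table above:

    ([II[II)[I      ->  int[] implSquareToLen(int[] x, int len, int[] z, int zlen)
    ([I[IIII)I      ->  int   implMulAdd(int[] out, int[] in, int offset, int len, int k)
    ([I[I[IIJ[I)[I  ->  int[] implMontgomeryMultiply(int[] a, int[] b, int[] n, int len, long inv, int[] product)
    ([I[IIJ[I)[I    ->  int[] implMontgomerySquare(int[] a, int[] n, int len, long inv, int[] product)
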
--- a/src/share/vm/interpreter/bytecodeStream.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/interpreter/bytecodeStream.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,12 +31,12 @@
   // set next bytecode position
   address bcp = RawBytecodeStream::bcp();
   address end = method()->code_base() + end_bci();
-  int l = Bytecodes::raw_special_length_at(bcp, end);
-  if (l <= 0 || (_bci + l) > _end_bci) {
+  int len = Bytecodes::raw_special_length_at(bcp, end);
+  // Very large tableswitch or lookupswitch size can cause _next_bci to overflow.
+  if (len <= 0 || (_bci > _end_bci - len) || (_bci - len >= _next_bci)) {
     code = Bytecodes::_illegal;
   } else {
-    _next_bci += l;
-    assert(_bci < _next_bci, "length must be > 0");
+    _next_bci += len;
     // set attributes
     _is_wide = false;
     // check for special (uncommon) cases
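
The reason for rewriting the bounds check is that a malformed tableswitch/lookupswitch can report a length close to INT_MAX, so the old form (_bci + l) > _end_bci could overflow, wrap negative and accept the bytecode; the new form never computes _bci + len. A small standalone illustration of the essential part of the new guard (the values are made up):

    #include <cassert>
    #include <climits>

    int main() {
      // A malformed tableswitch can yield a huge length from raw_special_length_at().
      int bci = 100, end_bci = 200, len = INT_MAX;
      // Old check: (bci + len) > end_bci -- the sum overflows and can come out negative.
      // New check stays in range and rejects the bytecode:
      bool illegal = (len <= 0) || (bci > end_bci - len);
      assert(illegal);
      return 0;
    }

The same overflow-safe comparison is used in the bytecodeStream.hpp changes below.
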
--- a/src/share/vm/interpreter/bytecodeStream.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/interpreter/bytecodeStream.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -152,12 +152,15 @@
     code        = Bytecodes::code_or_bp_at(bcp);
 
     // set next bytecode position
-    int l = Bytecodes::length_for(code);
-    if (l > 0 && (_bci + l) <= _end_bci) {
+    int len = Bytecodes::length_for(code);
+    if (len > 0 && (_bci <= _end_bci - len)) {
       assert(code != Bytecodes::_wide && code != Bytecodes::_tableswitch
              && code != Bytecodes::_lookupswitch, "can't be special bytecode");
       _is_wide = false;
-      _next_bci += l;
+      _next_bci += len;
+      if (_next_bci <= _bci) { // Check for integer overflow
+        code = Bytecodes::_illegal;
+      }
       _raw_code = code;
       return code;
     } else {
@@ -206,19 +209,23 @@
       // note that we cannot advance before having the
       // tty bytecode otherwise the stepping is wrong!
      // (careful: length_for(...) must be used first!)
-      int l = Bytecodes::length_for(code);
-      if (l == 0) l = Bytecodes::length_at(_method(), bcp);
-      _next_bci  += l;
-      assert(_bci < _next_bci, "length must be > 0");
-      // set attributes
-      _is_wide      = false;
-      // check for special (uncommon) cases
-      if (code == Bytecodes::_wide) {
-        raw_code = (Bytecodes::Code)bcp[1];
-        code = raw_code;  // wide BCs are always Java-normal
-        _is_wide = true;
+      int len = Bytecodes::length_for(code);
+      if (len == 0) len = Bytecodes::length_at(_method(), bcp);
+      if (len <= 0 || (_bci > _end_bci - len) || (_bci - len >= _next_bci)) {
+        raw_code = code = Bytecodes::_illegal;
+      } else {
+        _next_bci  += len;
+        assert(_bci < _next_bci, "length must be > 0");
+        // set attributes
+        _is_wide      = false;
+        // check for special (uncommon) cases
+        if (code == Bytecodes::_wide) {
+          raw_code = (Bytecodes::Code)bcp[1];
+          code = raw_code;  // wide BCs are always Java-normal
+          _is_wide = true;
+        }
+        assert(Bytecodes::is_java_code(code), "sanity check");
       }
-      assert(Bytecodes::is_java_code(code), "sanity check");
     }
     _raw_code = raw_code;
     _code = code;
--- a/src/share/vm/memory/metaspaceShared.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/memory/metaspaceShared.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -30,6 +30,7 @@
 #include "classfile/sharedClassUtil.hpp"
 #include "classfile/symbolTable.hpp"
 #include "classfile/systemDictionary.hpp"
+#include "classfile/systemDictionaryShared.hpp"
 #include "code/codeCache.hpp"
 #include "memory/filemap.hpp"
 #include "memory/gcLocker.hpp"
@@ -53,6 +54,7 @@
 bool MetaspaceShared::_check_classes_made_progress;
 bool MetaspaceShared::_has_error_classes;
 bool MetaspaceShared::_archive_loading_failed = false;
+bool MetaspaceShared::_remapped_readwrite = false;
 // Read/write a data stream for restoring/preserving metadata pointers and
 // miscellaneous data from/to the shared archive file.
 
@@ -684,6 +686,10 @@
       exit(1);
     }
   }
+
+  // Copy the dependencies from C_HEAP-alloced GrowableArrays to RO-alloced
+  // Arrays
+  SystemDictionaryShared::finalize_verification_dependencies();
 }
 
 void MetaspaceShared::prepare_for_dumping() {
@@ -1100,6 +1106,7 @@
     if (!mapinfo->remap_shared_readonly_as_readwrite()) {
       return false;
     }
+    _remapped_readwrite = true;
   }
   return true;
 }
--- a/src/share/vm/memory/metaspaceShared.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/memory/metaspaceShared.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -55,6 +55,7 @@
   static bool _check_classes_made_progress;
   static bool _has_error_classes;
   static bool _archive_loading_failed;
+  static bool _remapped_readwrite;
  public:
   enum {
     vtbl_list_size         = 17,   // number of entries in the shared space vtable list.
@@ -123,6 +124,10 @@
   // sharing is enabled. Simply returns true if sharing is not enabled
   // or if the remapping has already been done by a prior call.
   static bool remap_shared_readonly_as_readwrite() NOT_CDS_RETURN_(true);
+  static bool remapped_readwrite() {
+    CDS_ONLY(return _remapped_readwrite);
+    NOT_CDS(return false);
+  }
 
   static void print_shared_spaces();
 
--- a/src/share/vm/oops/instanceKlass.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/oops/instanceKlass.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "classfile/javaClasses.hpp"
 #include "classfile/systemDictionary.hpp"
+#include "classfile/systemDictionaryShared.hpp"
 #include "classfile/verifier.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "compiler/compileBroker.hpp"
@@ -706,6 +707,16 @@
 
         // also sets rewritten
         this_oop->rewrite_class(CHECK_false);
+      } else if (this_oop()->is_shared()) {
+        ResourceMark rm(THREAD);
+        char* message_buffer; // resource-allocated by check_verification_dependencies
+        Handle loader = this_oop()->class_loader();
+        Handle pd     = this_oop()->protection_domain();
+        bool verified = SystemDictionaryShared::check_verification_dependencies(this_oop(),
+                        loader, pd, &message_buffer, THREAD);
+        if (!verified) {
+          THROW_MSG_(vmSymbols::java_lang_VerifyError(), message_buffer, false);
+        }
       }
 
       // relocate jsrs and link methods after they are all rewritten
@@ -715,7 +726,12 @@
       // methods have been rewritten since rewrite may
       // fabricate new Method*s.
       // also does loader constraint checking
-      if (!this_oop()->is_shared()) {
+      //
+      // Initialize_vtable and initialize_itable need to be rerun for
+      // a shared class if the class is not loaded by the NULL classloader.
+      ClassLoaderData * loader_data = this_oop->class_loader_data();
+      if (!(this_oop()->is_shared() &&
+            loader_data->is_the_null_class_loader_data())) {
         ResourceMark rm(THREAD);
         this_oop->vtable()->initialize_vtable(true, CHECK_false);
         this_oop->itable()->initialize_itable(true, CHECK_false);
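
The new condition above reruns initialize_vtable/initialize_itable for every class except an archived class loaded by the boot (NULL) class loader; for shared classes loaded by other loaders the rerun is what re-checks loader constraints in the runtime context. A standalone sketch of that decision (not HotSpot code; the helper name and boolean parameters are invented stand-ins for is_shared() and is_the_null_class_loader_data()):

#include <cstdio>

static bool needs_vtable_itable_init(bool is_shared, bool boot_loader) {
  // Mirrors: !(is_shared() && loader_data->is_the_null_class_loader_data())
  return !(is_shared && boot_loader);
}

int main() {
  std::printf("%d\n", needs_vtable_itable_init(false, true));  // 1: normal class
  std::printf("%d\n", needs_vtable_itable_init(true,  true));  // 0: archived, boot loader
  std::printf("%d\n", needs_vtable_itable_init(true,  false)); // 1: archived, other loader
  return 0;
}
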
--- a/src/share/vm/oops/klassVtable.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/oops/klassVtable.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -27,6 +27,7 @@
 #include "classfile/vmSymbols.hpp"
 #include "gc_implementation/shared/markSweep.inline.hpp"
 #include "memory/gcLocker.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "memory/resourceArea.hpp"
 #include "memory/universe.inline.hpp"
 #include "oops/instanceKlass.hpp"
@@ -47,6 +48,10 @@
   return (InstanceKlass*)k;
 }
 
+bool klassVtable::is_preinitialized_vtable() {
+  return _klass->is_shared() && !MetaspaceShared::remapped_readwrite();
+}
+
 
 // this function computes the vtable size (including the size needed for miranda
 // methods) and the number of miranda methods in this class.
@@ -128,6 +133,12 @@
 int klassVtable::initialize_from_super(KlassHandle super) {
   if (super.is_null()) {
     return 0;
+  } else if (is_preinitialized_vtable()) {
+    // A shared class's vtable is preinitialized at dump time. There is no need
+    // to copy methods from the super class here, as that was already done when
+    // the archive was created. If JVMTI has redefined any class, the archive is
+    // remapped read-write and the copying branch below is taken instead.
+    return super->vtable()->length();
   } else {
     // copy methods from superKlass
     // can't inherit from array class, so must be InstanceKlass
@@ -157,6 +168,8 @@
   KlassHandle super (THREAD, klass()->java_super());
   int nofNewEntries = 0;
 
+  bool is_shared = _klass->is_shared();
+
   if (PrintVtables && !klass()->oop_is_array()) {
     ResourceMark rm(THREAD);
     tty->print_cr("Initializing: %s", _klass->name()->as_C_string());
@@ -169,6 +182,7 @@
 #endif
 
   if (Universe::is_bootstrapping()) {
+    assert(!is_shared, "sanity");
     // just clear everything
     for (int i = 0; i < _length; i++) table()[i].clear();
     return;
@@ -208,6 +222,7 @@
       if (len > 0) {
         Array<int>* def_vtable_indices = NULL;
         if ((def_vtable_indices = ik()->default_vtable_indices()) == NULL) {
+          assert(!is_shared, "shared class def_vtable_indices does not exist");
           def_vtable_indices = ik()->create_new_default_vtable_indices(len, CHECK);
         } else {
           assert(def_vtable_indices->length() == len, "reinit vtable len?");
@@ -222,7 +237,15 @@
           // needs new entry
           if (needs_new_entry) {
             put_method_at(mh(), initialized);
-            def_vtable_indices->at_put(i, initialized); //set vtable index
+            if (is_preinitialized_vtable()) {
+              // At runtime initialize_vtable is rerun for a shared class
+              // (loaded by the non-boot loader) as part of link_class_impl().
+              // The dumptime vtable index should be the same as the runtime index.
+              assert(def_vtable_indices->at(i) == initialized,
+                     "dump time vtable index is different from runtime index");
+            } else {
+              def_vtable_indices->at_put(i, initialized); //set vtable index
+            }
             initialized++;
           }
         }
@@ -365,7 +388,8 @@
   }
 
   // we need a new entry if there is no superclass
-  if (klass->super() == NULL) {
+  Klass* super = klass->super();
+  if (super == NULL) {
     return allocate_new;
   }
 
@@ -394,7 +418,15 @@
 
   Symbol* target_classname = target_klass->name();
   for(int i = 0; i < super_vtable_len; i++) {
-    Method* super_method = method_at(i);
+    Method* super_method;
+    if (is_preinitialized_vtable()) {
+      // If this is a shared class, the vtable is already in the final state (fully
+      // initialized). Need to look at the super's vtable.
+      klassVtable* superVtable = super->vtable();
+      super_method = superVtable->method_at(i);
+    } else {
+      super_method = method_at(i);
+    }
     // Check if method name matches
     if (super_method->name() == name && super_method->signature() == signature) {
 
@@ -458,7 +490,15 @@
          target_method()->set_vtable_index(i);
        } else {
          if (def_vtable_indices != NULL) {
-           def_vtable_indices->at_put(default_index, i);
+           if (is_preinitialized_vtable()) {
+             // At runtime initialize_vtable is rerun as part of link_class_impl()
+             // for a shared class loaded by the non-boot loader.
+             // The dumptime vtable index should be the same as the runtime index.
+             assert(def_vtable_indices->at(default_index) == i,
+                    "dump time vtable index is different from runtime index");
+           } else {
+             def_vtable_indices->at_put(default_index, i);
+           }
          }
          assert(super_method->is_default_method() || super_method->is_overpass()
                 || super_method->is_abstract(), "default override error");
@@ -523,24 +563,33 @@
 }
 
 void klassVtable::put_method_at(Method* m, int index) {
+  if (is_preinitialized_vtable()) {
+    // At runtime initialize_vtable is rerun as part of link_class_impl()
+    // for shared class loaded by the non-boot loader to obtain the loader
+    // constraints based on the runtime classloaders' context. The dumptime
+    // method at the vtable index should be the same as the runtime method.
+    assert(table()[index].method() == m,
+           "archived method is different from the runtime method");
+  } else {
 #ifndef PRODUCT
-  if (PrintVtables && Verbose) {
-    ResourceMark rm;
-    const char* sig = (m != NULL) ? m->name_and_sig_as_C_string() : "<NULL>";
-    tty->print("adding %s at index %d, flags: ", sig, index);
-    if (m != NULL) {
-      m->access_flags().print_on(tty);
-      if (m->is_default_method()) {
-        tty->print("default ");
+    if (PrintVtables && Verbose) {
+      ResourceMark rm;
+      const char* sig = (m != NULL) ? m->name_and_sig_as_C_string() : "<NULL>";
+      tty->print("adding %s at index %d, flags: ", sig, index);
+      if (m != NULL) {
+        m->access_flags().print_on(tty);
+        if (m->is_default_method()) {
+          tty->print("default ");
+        }
+        if (m->is_overpass()) {
+          tty->print("overpass");
+        }
       }
-      if (m->is_overpass()) {
-        tty->print("overpass");
-      }
+      tty->cr();
     }
-    tty->cr();
+#endif
+    table()[index].set(m);
   }
-#endif
-  table()[index].set(m);
 }
 
 // Find out if a method "m" with superclass "super", loader "classloader" and
@@ -971,7 +1020,15 @@
 void itableMethodEntry::initialize(Method* m) {
   if (m == NULL) return;
 
-  _method = m;
+  if (MetaspaceShared::is_in_shared_space((void*)&_method) &&
+     !MetaspaceShared::remapped_readwrite()) {
+    // At runtime initialize_itable is rerun as part of link_class_impl()
+    // for a shared class loaded by the non-boot loader.
+    // The dumptime itable method entry should be the same as the runtime entry.
+    assert(_method == m, "sanity");
+  } else {
+    _method = m;
+  }
 }
 
 klassItable::klassItable(instanceKlassHandle klass) {
@@ -1081,7 +1138,11 @@
         tty->cr();
       }
       if (!m->has_vtable_index()) {
-        assert(m->vtable_index() == Method::pending_itable_index, "set by initialize_vtable");
+        // A shared method could have an initialized itable_index that
+        // is < 0.
+        assert(m->vtable_index() == Method::pending_itable_index ||
+               m->is_shared(),
+               "set by initialize_vtable");
         m->set_itable_index(ime_num);
         // Progress to next itable entry
         ime_num++;
@@ -1277,7 +1338,6 @@
 }
 #endif // INCLUDE_JVMTI
 
-
 // Setup
 class InterfaceVisiterClosure : public StackObj {
  public:
--- a/src/share/vm/oops/klassVtable.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/oops/klassVtable.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -142,6 +142,19 @@
       Array<Klass*>* local_interfaces);
   void verify_against(outputStream* st, klassVtable* vt, int index);
   inline InstanceKlass* ik() const;
+  // When loading a class from the CDS archive at run time, and no class redefinition
+  // has happened, it is expected that the class's itable/vtables are
+  // laid out exactly the same way as they had been at dump time.
+  // Therefore, in klassVtable::initialize_[iv]table, we do not lay out the
+  // tables again. Instead, we only rerun the process to create/check
+  // the class loader constraints. In non-product builds, we add asserts to
+  // guarantee that the table's layout is the same as at dump time.
+  //
+  // If JVMTI redefines any class, the read-only shared memory is remapped
+  // as read-write. A shared class's vtable/itable are then re-initialized and
+  // might have a different layout due to class redefinition of the shared class
+  // or its super types.
+  bool is_preinitialized_vtable();
 };
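
A rough standalone model of the pattern the klassVtable changes above introduce (not HotSpot code; the type and field names are invented for illustration): while the archived table is still mapped read-only (no JVMTI redefinition, hence no remap to read-write), rerunning initialization only verifies each slot against its dump-time value instead of storing into it.

#include <cassert>
#include <vector>

struct ToyVtable {
  std::vector<int> slots;
  bool preinitialized;   // models is_shared() && !MetaspaceShared::remapped_readwrite()

  void put(int index, int value) {
    if (preinitialized) {
      // Re-running initialization must reproduce the dump-time layout.
      assert(slots[index] == value && "archived entry differs from runtime value");
    } else {
      slots[index] = value;
    }
  }
};

int main() {
  ToyVtable vt{{7, 8, 9}, /*preinitialized=*/true};
  vt.put(1, 8);    // ok: matches the archived value
  // vt.put(1, 5); // would assert: layout changed since dump time
  return 0;
}
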
 
 
--- a/src/share/vm/oops/method.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/oops/method.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -36,6 +36,7 @@
 #include "memory/generation.hpp"
 #include "memory/heapInspection.hpp"
 #include "memory/metadataFactory.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "memory/oopFactory.hpp"
 #include "oops/constMethod.hpp"
 #include "oops/methodData.hpp"
@@ -310,6 +311,33 @@
   unlink_method();
 }
 
+void Method::set_vtable_index(int index) {
+  if (is_shared() && !MetaspaceShared::remapped_readwrite()) {
+    // At runtime initialize_vtable is rerun as part of link_class_impl()
+    // for a shared class loaded by the non-boot loader to obtain the loader
+    // constraints based on the runtime classloaders' context.
+    return; // don't write into the shared class
+  } else {
+    _vtable_index = index;
+  }
+}
+
+void Method::set_itable_index(int index) {
+  if (is_shared() && !MetaspaceShared::remapped_readwrite()) {
+    // At runtime initialize_itable is rerun as part of link_class_impl()
+    // for a shared class loaded by the non-boot loader to obtain the loader
+    // constraints based on the runtime classloaders' context. The dumptime
+    // itable index should be the same as the runtime index.
+    assert(_vtable_index == itable_index_max - index,
+           "archived itable index is different from runtime index");
+    return; // don't write into the shared class
+  } else {
+    _vtable_index = itable_index_max - index;
+  }
+  assert(valid_itable_index(), "");
+}
+
+
 
 bool Method::was_executed_more_than(int n) {
   // Invocation counter is reset when the Method* is compiled.
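
Method::set_itable_index above folds the itable index into the same _vtable_index field used for vtable indexes (see the accessors in method.hpp below): valid vtable indexes are >= 0, while an itable index i is stored as itable_index_max - i, growing downward. A minimal standalone model of that encoding and of the read-only guard for archived methods (not HotSpot code; the exact sentinel value is not shown in this changeset, so -10 is an assumption made for illustration):

#include <cassert>
#include <cstdio>

static const int itable_index_max = -10;   // assumed sentinel, itable indexes grow downward

struct ToyMethod {
  int _vtable_index;
  bool shared_and_readonly;                // models is_shared() && !remapped_readwrite()

  void set_itable_index(int index) {
    if (shared_and_readonly) {
      // Archived method: verify instead of writing into read-only memory.
      assert(_vtable_index == itable_index_max - index);
      return;
    }
    _vtable_index = itable_index_max - index;
  }
  int itable_index() const { return itable_index_max - _vtable_index; }
};

int main() {
  ToyMethod m = { 0, false };
  m.set_itable_index(3);                   // stored as -13
  std::printf("%d\n", m.itable_index());   // prints 3
  return 0;
}
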
--- a/src/share/vm/oops/method.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/oops/method.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -470,12 +470,12 @@
   DEBUG_ONLY(bool valid_vtable_index() const     { return _vtable_index >= nonvirtual_vtable_index; })
   bool has_vtable_index() const                  { return _vtable_index >= 0; }
   int  vtable_index() const                      { return _vtable_index; }
-  void set_vtable_index(int index)               { _vtable_index = index; }
+  void set_vtable_index(int index);
   DEBUG_ONLY(bool valid_itable_index() const     { return _vtable_index <= pending_itable_index; })
   bool has_itable_index() const                  { return _vtable_index <= itable_index_max; }
   int  itable_index() const                      { assert(valid_itable_index(), "");
                                                    return itable_index_max - _vtable_index; }
-  void set_itable_index(int index)               { _vtable_index = itable_index_max - index; assert(valid_itable_index(), ""); }
+  void set_itable_index(int index);
 
   // interpreter entry
   address interpreter_entry() const              { return _i2i_entry; }
--- a/src/share/vm/opto/c2_globals.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/c2_globals.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -665,6 +665,18 @@
   product(bool, UseMultiplyToLenIntrinsic, false,                           \
           "Enables intrinsification of BigInteger.multiplyToLen()")         \
                                                                             \
+  product(bool, UseSquareToLenIntrinsic, false,                             \
+          "Enables intrinsification of BigInteger.squareToLen()")           \
+                                                                            \
+  product(bool, UseMulAddIntrinsic, false,                                  \
+          "Enables intrinsification of BigInteger.mulAdd()")                \
+                                                                            \
+  product(bool, UseMontgomeryMultiplyIntrinsic, false,                      \
+          "Enables intrinsification of BigInteger.montgomeryMultiply()")    \
+                                                                            \
+  product(bool, UseMontgomerySquareIntrinsic, false,                        \
+          "Enables intrinsification of BigInteger.montgomerySquare()")      \
+                                                                            \
   product(bool, UseTypeSpeculation, true,                                   \
           "Speculatively propagate types from profiles")                    \
                                                                             \
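
The four new flags parallel the existing UseMultiplyToLenIntrinsic switch: they default to false, and the corresponding inline_* routines added to library_call.cpp below also bail out when the matching StubRoutines entry point is NULL, so presumably they are only meant to be enabled by platform initialization code that installs those stubs. For experiments they can be forced on the command line (for example -XX:+UseMontgomeryMultiplyIntrinsic -XX:+UseMontgomerySquareIntrinsic); with no stub installed the intrinsic is simply skipped and the Java implementation is used.
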
--- a/src/share/vm/opto/compile.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/compile.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -417,6 +417,13 @@
       remove_macro_node(n);
     }
   }
+  // Remove useless CastII nodes with range check dependency
+  for (int i = range_check_cast_count() - 1; i >= 0; i--) {
+    Node* cast = range_check_cast_node(i);
+    if (!useful.member(cast)) {
+      remove_range_check_cast(cast);
+    }
+  }
   // Remove useless expensive node
   for (int i = C->expensive_count()-1; i >= 0; i--) {
     Node* n = C->expensive_node(i);
@@ -1184,6 +1191,7 @@
   _macro_nodes = new(comp_arena()) GrowableArray<Node*>(comp_arena(), 8,  0, NULL);
   _predicate_opaqs = new(comp_arena()) GrowableArray<Node*>(comp_arena(), 8,  0, NULL);
   _expensive_nodes = new(comp_arena()) GrowableArray<Node*>(comp_arena(), 8,  0, NULL);
+  _range_check_casts = new(comp_arena()) GrowableArray<Node*>(comp_arena(), 8,  0, NULL);
   register_library_intrinsics();
 }
 
@@ -1912,6 +1920,22 @@
   assert(predicate_count()==0, "should be clean!");
 }
 
+void Compile::add_range_check_cast(Node* n) {
+  assert(n->isa_CastII()->has_range_check(), "CastII should have range check dependency");
+  assert(!_range_check_casts->contains(n), "duplicate entry in range check casts");
+  _range_check_casts->append(n);
+}
+
+// Remove all range check dependent CastIINodes.
+void Compile::remove_range_check_casts(PhaseIterGVN &igvn) {
+  for (int i = range_check_cast_count(); i > 0; i--) {
+    Node* cast = range_check_cast_node(i-1);
+    assert(cast->isa_CastII()->has_range_check(), "CastII should have range check dependency");
+    igvn.replace_node(cast, cast->in(1));
+  }
+  assert(range_check_cast_count() == 0, "should be empty");
+}
+
 // StringOpts and late inlining of string methods
 void Compile::inline_string_calls(bool parse_time) {
   {
@@ -2254,6 +2278,12 @@
     PhaseIdealLoop::verify(igvn);
   }
 
+  if (range_check_cast_count() > 0) {
+    // No more loop optimizations. Remove all range check dependent CastIINodes.
+    C->remove_range_check_casts(igvn);
+    igvn.optimize();
+  }
+
   {
     NOT_PRODUCT( TracePhase t2("macroExpand", &_t_macroExpand, TimeCompiler); )
     PhaseMacroExpand  mex(igvn);
@@ -3023,6 +3053,16 @@
 
 #endif
 
+#ifdef ASSERT
+  case Op_CastII:
+    // Verify that all range check dependent CastII nodes were removed.
+    if (n->isa_CastII()->has_range_check()) {
+      n->dump(3);
+      assert(false, "Range check dependent CastII node was not removed");
+    }
+    break;
+#endif
+
   case Op_ModI:
     if (UseDivMod) {
       // Check if a%b and a/b both exist
@@ -4060,6 +4100,24 @@
   }
 }
 
+// Convert integer value to a narrowed long type dependent on ctrl (for example, a range check)
+Node* Compile::constrained_convI2L(PhaseGVN* phase, Node* value, const TypeInt* itype, Node* ctrl) {
+  if (ctrl != NULL) {
+    // Express control dependency by a CastII node with a narrow type.
+    value = new (phase->C) CastIINode(value, itype, false, true /* range check dependency */);
+    // Make the CastII node dependent on the control input to prevent the narrowed ConvI2L
+    // node from floating above the range check during loop optimizations. Otherwise, the
+    // ConvI2L node may be eliminated independently of the range check, causing the data path
+    // to become TOP while the control path is still there (although it's unreachable).
+    value->set_req(0, ctrl);
+    // Save CastII node to remove it after loop optimizations.
+    phase->C->add_range_check_cast(value);
+    value = phase->transform(value);
+  }
+  const TypeLong* ltype = TypeLong::make(itype->_lo, itype->_hi, itype->_widen);
+  return phase->transform(new (phase->C) ConvI2LNode(value, ltype));
+}
+
 // Auxiliary method to support randomized stressing/fuzzing.
 //
 // This method can be called the arbitrary number of times, with current count
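
The bookkeeping added above keeps every range-check dependent CastII in a side list while loop optimizations run; remove_range_check_casts() then replaces each cast with its input, and the debug-only Op_CastII case in final graph reshaping verifies that none survived. A toy standalone model of the replacement step (not HotSpot code; the node and graph types are invented for illustration):

#include <cassert>
#include <vector>

struct ToyNode {
  ToyNode* input;          // models in(1); null for a plain value
  bool range_check_cast;   // models a CastII carrying the range-check flag
};

// Models Compile::remove_range_check_casts(): each flagged cast is replaced
// by the value it wraps, so only unconstrained values remain.
static void remove_range_check_casts(std::vector<ToyNode*>& graph) {
  for (ToyNode*& n : graph) {
    if (n->range_check_cast) {
      n = n->input;        // corresponds to igvn.replace_node(cast, cast->in(1))
    }
  }
}

int main() {
  ToyNode value = { nullptr, false };   // the original, unconstrained value
  ToyNode cast  = { &value, true };     // range-check dependent cast around it
  std::vector<ToyNode*> graph;
  graph.push_back(&cast);
  remove_range_check_casts(graph);
  assert(graph[0] == &value);           // nothing range-check dependent is left
  return 0;
}
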
--- a/src/share/vm/opto/compile.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/compile.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -75,6 +75,7 @@
 class JVMState;
 class Type;
 class TypeData;
+class TypeInt;
 class TypePtr;
 class TypeOopPtr;
 class TypeFunc;
@@ -334,6 +335,7 @@
   GrowableArray<Node*>* _macro_nodes;           // List of nodes which need to be expanded before matching.
   GrowableArray<Node*>* _predicate_opaqs;       // List of Opaque1 nodes for the loop predicates.
   GrowableArray<Node*>* _expensive_nodes;       // List of nodes that are expensive to compute and that we'd better not let the GVN freely common
+  GrowableArray<Node*>* _range_check_casts;     // List of CastII nodes with a range check dependency
   ConnectionGraph*      _congraph;
 #ifndef PRODUCT
   IdealGraphPrinter*    _printer;
@@ -669,7 +671,7 @@
   void set_congraph(ConnectionGraph* congraph)  { _congraph = congraph;}
   void add_macro_node(Node * n) {
     //assert(n->is_macro(), "must be a macro node");
-    assert(!_macro_nodes->contains(n), " duplicate entry in expand list");
+    assert(!_macro_nodes->contains(n), "duplicate entry in expand list");
     _macro_nodes->append(n);
   }
   void remove_macro_node(Node * n) {
@@ -689,10 +691,23 @@
     }
   }
   void add_predicate_opaq(Node * n) {
-    assert(!_predicate_opaqs->contains(n), " duplicate entry in predicate opaque1");
+    assert(!_predicate_opaqs->contains(n), "duplicate entry in predicate opaque1");
     assert(_macro_nodes->contains(n), "should have already been in macro list");
     _predicate_opaqs->append(n);
   }
+
+  // Range check dependent CastII nodes that can be removed after loop optimizations
+  void add_range_check_cast(Node* n);
+  void remove_range_check_cast(Node* n) {
+    if (_range_check_casts->contains(n)) {
+      _range_check_casts->remove(n);
+    }
+  }
+  Node* range_check_cast_node(int idx) const { return _range_check_casts->at(idx);  }
+  int   range_check_cast_count()       const { return _range_check_casts->length(); }
+  // Remove all range check dependent CastIINodes.
+  void  remove_range_check_casts(PhaseIterGVN &igvn);
+
   // remove the opaque nodes that protect the predicates so that the unused checks and
   // uncommon traps will be eliminated from the graph.
   void cleanup_loop_predicates(PhaseIterGVN &igvn);
@@ -1201,6 +1216,9 @@
   // Definitions of pd methods
   static void pd_compiler2_init();
 
+  // Convert integer value to a narrowed long type dependent on ctrl (for example, a range check)
+  static Node* constrained_convI2L(PhaseGVN* phase, Node* value, const TypeInt* itype, Node* ctrl);
+
   // Auxiliary method for randomized fuzzing/stressing
   static bool randomized_select(int count);
 };
--- a/src/share/vm/opto/connode.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/connode.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -535,6 +535,9 @@
   if (_carry_dependency) {
     st->print(" carry dependency");
   }
+  if (_range_check_dependency) {
+    st->print(" range check dependency");
+  }
 }
 #endif
 
@@ -994,7 +997,8 @@
   }
 
 #ifdef _LP64
-  // Convert ConvI2L(AddI(x, y)) to AddL(ConvI2L(x), ConvI2L(y)) ,
+  // Convert ConvI2L(AddI(x, y)) to AddL(ConvI2L(x), ConvI2L(y)) or
+  // ConvI2L(CastII(AddI(x, y))) to AddL(ConvI2L(CastII(x)), ConvI2L(CastII(y))),
   // but only if x and y have subranges that cannot cause 32-bit overflow,
   // under the assumption that x+y is in my own subrange this->type().
 
@@ -1018,6 +1022,13 @@
 
   Node* z = in(1);
   int op = z->Opcode();
+  Node* ctrl = NULL;
+  if (op == Op_CastII && z->as_CastII()->has_range_check()) {
+    // Skip CastII node but save control dependency
+    ctrl = z->in(0);
+    z = z->in(1);
+    op = z->Opcode();
+  }
   if (op == Op_AddI || op == Op_SubI) {
     Node* x = z->in(1);
     Node* y = z->in(2);
@@ -1075,9 +1086,10 @@
       rylo = -ryhi;
       ryhi = -rylo0;
     }
-
-    Node* cx = phase->transform( new (phase->C) ConvI2LNode(x, TypeLong::make(rxlo, rxhi, widen)) );
-    Node* cy = phase->transform( new (phase->C) ConvI2LNode(y, TypeLong::make(rylo, ryhi, widen)) );
+    assert(rxlo == (int)rxlo && rxhi == (int)rxhi, "x should not overflow");
+    assert(rylo == (int)rylo && ryhi == (int)ryhi, "y should not overflow");
+    Node* cx = phase->C->constrained_convI2L(phase, x, TypeInt::make(rxlo, rxhi, widen), ctrl);
+    Node* cy = phase->C->constrained_convI2L(phase, y, TypeInt::make(rylo, ryhi, widen), ctrl);
     switch (op) {
     case Op_AddI:  return new (phase->C) AddLNode(cx, cy);
     case Op_SubI:  return new (phase->C) SubLNode(cx, cy);
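
The transformation above (now also looking through a range-check dependent CastII) rewrites ConvI2L(AddI(x, y)) into AddL(ConvI2L(x), ConvI2L(y)) only when the subranges of x and y rule out 32-bit overflow, which is what the new asserts re-check. A small standalone program showing why that precondition matters (plain C++; the wrap of the 32-bit add is made explicit with unsigned arithmetic, and the concrete values are arbitrary):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t x = 2000000000, y = 2000000000;
  // Add in 32 bits (wrapping), then widen to 64 bits:
  int64_t wrap_then_widen = (int64_t)(int32_t)((uint32_t)x + (uint32_t)y);
  // Widen each operand first, then add in 64 bits:
  int64_t widen_then_add  = (int64_t)x + (int64_t)y;
  // On a two's-complement platform this prints "-294967296 vs 4000000000":
  std::printf("%lld vs %lld\n", (long long)wrap_then_widen, (long long)widen_then_add);
  // The two results differ exactly when x + y overflows 32 bits, so the rewrite
  // is only legal when the operand ranges exclude that overflow.
  return 0;
}
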
--- a/src/share/vm/opto/connode.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/connode.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -244,19 +244,31 @@
   private:
   // Can this node be removed post CCP or does it carry a required dependency?
   const bool _carry_dependency;
+  // Is this node dependent on a range check?
+  const bool _range_check_dependency;
 
   protected:
   virtual uint cmp( const Node &n ) const;
   virtual uint size_of() const;
 
 public:
-  CastIINode(Node *n, const Type *t, bool carry_dependency = false)
-    : ConstraintCastNode(n,t), _carry_dependency(carry_dependency) {}
+  CastIINode(Node *n, const Type *t, bool carry_dependency = false, bool range_check_dependency = false)
+    : ConstraintCastNode(n,t), _carry_dependency(carry_dependency), _range_check_dependency(range_check_dependency) {
+    init_class_id(Class_CastII);
+  }
   virtual int Opcode() const;
   virtual uint ideal_reg() const { return Op_RegI; }
   virtual Node *Identity( PhaseTransform *phase );
   virtual const Type *Value( PhaseTransform *phase ) const;
   virtual Node *Ideal_DU_postCCP( PhaseCCP * );
+  const bool has_range_check() {
+#ifdef _LP64
+    return _range_check_dependency;
+#else
+    assert(!_range_check_dependency, "Should not have range check dependency");
+    return false;
+#endif
+  }
 #ifndef PRODUCT
   virtual void dump_spec(outputStream *st) const;
 #endif
--- a/src/share/vm/opto/escape.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/escape.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -958,8 +958,12 @@
                   strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
-                  ))) {
+                  strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
+                 ))) {
             call->dump();
             fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
           }
--- a/src/share/vm/opto/graphKit.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/graphKit.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1645,7 +1645,7 @@
 
 //-------------------------array_element_address-------------------------
 Node* GraphKit::array_element_address(Node* ary, Node* idx, BasicType elembt,
-                                      const TypeInt* sizetype) {
+                                      const TypeInt* sizetype, Node* ctrl) {
   uint shift  = exact_log2(type2aelembytes(elembt));
   uint header = arrayOopDesc::base_offset_in_bytes(elembt);
 
@@ -1670,9 +1670,9 @@
   // number.  (The prior range check has ensured this.)
   // This assertion is used by ConvI2LNode::Ideal.
   int index_max = max_jint - 1;  // array size is max_jint, index is one less
-  if (sizetype != NULL)  index_max = sizetype->_hi - 1;
-  const TypeLong* lidxtype = TypeLong::make(CONST64(0), index_max, Type::WidenMax);
-  idx = _gvn.transform( new (C) ConvI2LNode(idx, lidxtype) );
+  if (sizetype != NULL) index_max = sizetype->_hi - 1;
+  const TypeInt* iidxtype = TypeInt::make(0, index_max, Type::WidenMax);
+  idx = C->constrained_convI2L(&_gvn, idx, iidxtype, ctrl);
 #endif
   Node* scale = _gvn.transform( new (C) LShiftXNode(idx, intcon(shift)) );
   return basic_plus_adr(ary, base, scale);
@@ -3491,10 +3491,6 @@
 
   Node* initial_slow_cmp  = _gvn.transform( new (C) CmpUNode( length, intcon( fast_size_limit ) ) );
   Node* initial_slow_test = _gvn.transform( new (C) BoolNode( initial_slow_cmp, BoolTest::gt ) );
-  if (initial_slow_test->is_Bool()) {
-    // Hide it behind a CMoveI, or else PhaseIdealLoop::split_up will get sick.
-    initial_slow_test = initial_slow_test->as_Bool()->as_int_value(&_gvn);
-  }
 
   // --- Size Computation ---
   // array_size = round_to_heap(array_header + (length << elem_shift));
@@ -3540,13 +3536,35 @@
   Node* lengthx = ConvI2X(length);
   Node* headerx = ConvI2X(header_size);
 #ifdef _LP64
-  { const TypeLong* tllen = _gvn.find_long_type(lengthx);
-    if (tllen != NULL && tllen->_lo < 0) {
+  { const TypeInt* tilen = _gvn.find_int_type(length);
+    if (tilen != NULL && tilen->_lo < 0) {
       // Add a manual constraint to a positive range.  Cf. array_element_address.
-      jlong size_max = arrayOopDesc::max_array_length(T_BYTE);
-      if (size_max > tllen->_hi)  size_max = tllen->_hi;
-      const TypeLong* tlcon = TypeLong::make(CONST64(0), size_max, Type::WidenMin);
-      lengthx = _gvn.transform( new (C) ConvI2LNode(length, tlcon));
+      jlong size_max = fast_size_limit;
+      if (size_max > tilen->_hi)  size_max = tilen->_hi;
+      const TypeInt* tlcon = TypeInt::make(0, size_max, Type::WidenMin);
+
+      // Only do a narrow I2L conversion if the range check passed.
+      IfNode* iff = new (C) IfNode(control(), initial_slow_test, PROB_MIN, COUNT_UNKNOWN);
+      _gvn.transform(iff);
+      RegionNode* region = new (C) RegionNode(3);
+      _gvn.set_type(region, Type::CONTROL);
+      lengthx = new (C) PhiNode(region, TypeLong::LONG);
+      _gvn.set_type(lengthx, TypeLong::LONG);
+
+      // Range check passed. Use ConvI2L node with narrow type.
+      Node* passed = IfFalse(iff);
+      region->init_req(1, passed);
+      // Make I2L conversion control dependent to prevent it from
+      // floating above the range check during loop optimizations.
+      lengthx->init_req(1, C->constrained_convI2L(&_gvn, length, tlcon, passed));
+
+      // Range check failed. Use ConvI2L with wide type because length may be invalid.
+      region->init_req(2, IfTrue(iff));
+      lengthx->init_req(2, ConvI2X(length));
+
+      set_control(region);
+      record_for_igvn(region);
+      record_for_igvn(lengthx);
     }
   }
 #endif
@@ -3577,6 +3595,11 @@
   Node *mem = reset_memory();
   set_all_memory(mem); // Create new memory state
 
+  if (initial_slow_test->is_Bool()) {
+    // Hide it behind a CMoveI, or else PhaseIdealLoop::split_up will get sick.
+    initial_slow_test = initial_slow_test->as_Bool()->as_int_value(&_gvn);
+  }
+
   // Create the AllocateArrayNode and its result projections
   AllocateArrayNode* alloc
     = new (C) AllocateArrayNode(C, AllocateArrayNode::alloc_type(TypeInt::INT),
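
The region/phi split above feeds the narrowed, control-dependent conversion from constrained_convI2L only into the arm where initial_slow_test did not fire; the other arm keeps the wide ConvI2X because length may be invalid there. The fast arm may assume a non-negative range because initial_slow_test is an unsigned compare (CmpU against fast_size_limit, BoolTest::gt), so a negative length looks huge and is routed to the slow path. A standalone sketch of that comparison (not HotSpot code; the limit value is arbitrary):

#include <cstdint>
#include <cstdio>

// Models "range check passed": CmpU(length, fast_size_limit) with the gt
// outcome taking the slow path, i.e. passed means length <=u fast_size_limit.
static bool range_check_passed(int32_t length, uint32_t fast_size_limit) {
  return (uint32_t)length <= fast_size_limit;
}

int main() {
  std::printf("%d\n", range_check_passed(100, 1u << 16));  // 1: small positive length
  std::printf("%d\n", range_check_passed(-1,  1u << 16));  // 0: -1 reinterprets as 4294967295
  return 0;
}
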
--- a/src/share/vm/opto/graphKit.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/graphKit.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -626,7 +626,9 @@
   // Return addressing for an array element.
   Node* array_element_address(Node* ary, Node* idx, BasicType elembt,
                               // Optional constraint on the array size:
-                              const TypeInt* sizetype = NULL);
+                              const TypeInt* sizetype = NULL,
+                              // Optional control dependency (for example, on range check)
+                              Node* ctrl = NULL);
 
   // Return a load of array element at idx.
   Node* load_array_element(Node* ctl, Node* ary, Node* idx, const TypeAryPtr* arytype);
--- a/src/share/vm/opto/library_call.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/library_call.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -324,6 +324,10 @@
   bool inline_updateBytesCRC32();
   bool inline_updateByteBufferCRC32();
   bool inline_multiplyToLen();
+  bool inline_squareToLen();
+  bool inline_mulAdd();
+  bool inline_montgomeryMultiply();
+  bool inline_montgomerySquare();
 
   bool inline_profileBoolean();
 };
@@ -527,6 +531,21 @@
     if (!UseMultiplyToLenIntrinsic) return NULL;
     break;
 
+  case vmIntrinsics::_squareToLen:
+    if (!UseSquareToLenIntrinsic) return NULL;
+    break;
+
+  case vmIntrinsics::_mulAdd:
+    if (!UseMulAddIntrinsic) return NULL;
+    break;
+
+  case vmIntrinsics::_montgomeryMultiply:
+     if (!UseMontgomeryMultiplyIntrinsic) return NULL;
+    break;
+  case vmIntrinsics::_montgomerySquare:
+     if (!UseMontgomerySquareIntrinsic) return NULL;
+    break;
+
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return NULL;
@@ -927,6 +946,17 @@
   case vmIntrinsics::_multiplyToLen:
     return inline_multiplyToLen();
 
+  case vmIntrinsics::_squareToLen:
+    return inline_squareToLen();
+
+  case vmIntrinsics::_mulAdd:
+    return inline_mulAdd();
+
+  case vmIntrinsics::_montgomeryMultiply:
+    return inline_montgomeryMultiply();
+  case vmIntrinsics::_montgomerySquare:
+    return inline_montgomerySquare();
+
   case vmIntrinsics::_encodeISOArray:
     return inline_encodeISOArray();
 
@@ -5767,11 +5797,12 @@
 
   assert(callee()->signature()->size() == 5, "multiplyToLen has 5 parameters");
 
-  Node* x    = argument(1);
-  Node* xlen = argument(2);
-  Node* y    = argument(3);
-  Node* ylen = argument(4);
-  Node* z    = argument(5);
+  // no receiver because it is a static method
+  Node* x    = argument(0);
+  Node* xlen = argument(1);
+  Node* y    = argument(2);
+  Node* ylen = argument(3);
+  Node* z    = argument(4);
 
   const Type* x_type = x->Value(&_gvn);
   const Type* y_type = y->Value(&_gvn);
@@ -5856,6 +5887,215 @@
   return true;
 }
 
+//-------------inline_squareToLen------------------------------------
+bool LibraryCallKit::inline_squareToLen() {
+  assert(UseSquareToLenIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::squareToLen();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "squareToLen";
+
+  assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
+
+  Node* x    = argument(0);
+  Node* len  = argument(1);
+  Node* z    = argument(2);
+  Node* zlen = argument(3);
+
+  const Type* x_type = x->Value(&_gvn);
+  const Type* z_type = z->Value(&_gvn);
+  const TypeAryPtr* top_x = x_type->isa_aryptr();
+  const TypeAryPtr* top_z = z_type->isa_aryptr();
+  if (top_x  == NULL || top_x->klass()  == NULL ||
+      top_z  == NULL || top_z->klass()  == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (x_elem != T_INT || z_elem != T_INT) {
+    return false;
+  }
+
+
+  Node* x_start = array_element_address(x, intcon(0), x_elem);
+  Node* z_start = array_element_address(z, intcon(0), z_elem);
+
+  Node*  call = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                  OptoRuntime::squareToLen_Type(),
+                                  stubAddr, stubName, TypePtr::BOTTOM,
+                                  x_start, len, z_start, zlen);
+
+  set_result(z);
+  return true;
+}
+
+//-------------inline_mulAdd------------------------------------------
+bool LibraryCallKit::inline_mulAdd() {
+  assert(UseMulAddIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::mulAdd();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "mulAdd";
+
+  assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
+
+  Node* out      = argument(0);
+  Node* in       = argument(1);
+  Node* offset   = argument(2);
+  Node* len      = argument(3);
+  Node* k        = argument(4);
+
+  const Type* out_type = out->Value(&_gvn);
+  const Type* in_type = in->Value(&_gvn);
+  const TypeAryPtr* top_out = out_type->isa_aryptr();
+  const TypeAryPtr* top_in = in_type->isa_aryptr();
+  if (top_out  == NULL || top_out->klass()  == NULL ||
+      top_in == NULL || top_in->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (out_elem != T_INT || in_elem != T_INT) {
+    return false;
+  }
+
+  Node* outlen = load_array_length(out);
+  Node* new_offset = _gvn.transform(new (C) SubINode(outlen, offset));
+  Node* out_start = array_element_address(out, intcon(0), out_elem);
+  Node* in_start = array_element_address(in, intcon(0), in_elem);
+
+  Node*  call = make_runtime_call(RC_LEAF|RC_NO_FP,
+                                  OptoRuntime::mulAdd_Type(),
+                                  stubAddr, stubName, TypePtr::BOTTOM,
+                                  out_start,in_start, new_offset, len, k);
+  Node* result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms));
+  set_result(result);
+  return true;
+}
+
+//-------------inline_montgomeryMultiply-----------------------------------
+bool LibraryCallKit::inline_montgomeryMultiply() {
+  address stubAddr = StubRoutines::montgomeryMultiply();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+
+  assert(UseMontgomeryMultiplyIntrinsic, "not implemented on this platform");
+  const char* stubName = "montgomery_multiply";
+
+  assert(callee()->signature()->size() == 7, "montgomeryMultiply has 7 parameters");
+
+  Node* a    = argument(0);
+  Node* b    = argument(1);
+  Node* n    = argument(2);
+  Node* len  = argument(3);
+  Node* inv  = argument(4);
+  Node* m    = argument(6);
+
+  const Type* a_type = a->Value(&_gvn);
+  const TypeAryPtr* top_a = a_type->isa_aryptr();
+  const Type* b_type = b->Value(&_gvn);
+  const TypeAryPtr* top_b = b_type->isa_aryptr();
+  const Type* n_type = n->Value(&_gvn);
+  const TypeAryPtr* top_n = n_type->isa_aryptr();
+  const Type* m_type = m->Value(&_gvn);
+  const TypeAryPtr* top_m = m_type->isa_aryptr();
+  if (top_a  == NULL || top_a->klass()  == NULL ||
+      top_b == NULL || top_b->klass()  == NULL ||
+      top_n == NULL || top_n->klass()  == NULL ||
+      top_m == NULL || top_m->klass()  == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType b_elem = b_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType m_elem = m_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (a_elem != T_INT || b_elem != T_INT || n_elem != T_INT || m_elem != T_INT) {
+    return false;
+  }
+
+  // Make the call
+  {
+    Node* a_start = array_element_address(a, intcon(0), a_elem);
+    Node* b_start = array_element_address(b, intcon(0), b_elem);
+    Node* n_start = array_element_address(n, intcon(0), n_elem);
+    Node* m_start = array_element_address(m, intcon(0), m_elem);
+
+    Node* call = make_runtime_call(RC_LEAF,
+                                   OptoRuntime::montgomeryMultiply_Type(),
+                                   stubAddr, stubName, TypePtr::BOTTOM,
+                                   a_start, b_start, n_start, len, inv, top(),
+                                   m_start);
+    set_result(m);
+  }
+
+  return true;
+}
+
+bool LibraryCallKit::inline_montgomerySquare() {
+  address stubAddr = StubRoutines::montgomerySquare();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+
+  assert(UseMontgomerySquareIntrinsic, "not implemented on this platform");
+  const char* stubName = "montgomery_square";
+
+  assert(callee()->signature()->size() == 6, "montgomerySquare has 6 parameters");
+
+  Node* a    = argument(0);
+  Node* n    = argument(1);
+  Node* len  = argument(2);
+  Node* inv  = argument(3);
+  Node* m    = argument(5);
+
+  const Type* a_type = a->Value(&_gvn);
+  const TypeAryPtr* top_a = a_type->isa_aryptr();
+  const Type* n_type = n->Value(&_gvn);
+  const TypeAryPtr* top_n = n_type->isa_aryptr();
+  const Type* m_type = m->Value(&_gvn);
+  const TypeAryPtr* top_m = m_type->isa_aryptr();
+  if (top_a  == NULL || top_a->klass()  == NULL ||
+      top_n == NULL || top_n->klass()  == NULL ||
+      top_m == NULL || top_m->klass()  == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType a_elem = a_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType n_elem = n_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType m_elem = m_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (a_elem != T_INT || n_elem != T_INT || m_elem != T_INT) {
+    return false;
+  }
+
+  // Make the call
+  {
+    Node* a_start = array_element_address(a, intcon(0), a_elem);
+    Node* n_start = array_element_address(n, intcon(0), n_elem);
+    Node* m_start = array_element_address(m, intcon(0), m_elem);
+
+    Node* call = make_runtime_call(RC_LEAF,
+                                   OptoRuntime::montgomerySquare_Type(),
+                                   stubAddr, stubName, TypePtr::BOTTOM,
+                                   a_start, n_start, len, inv, top(),
+                                   m_start);
+    set_result(m);
+  }
+
+  return true;
+}
+
 
 /**
  * Calculate CRC32 for byte.
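
A detail worth noting in the new intrinsics above: the asserts check signature()->size() == 7 for montgomeryMultiply and == 6 for montgomerySquare even though the underlying static BigInteger helpers take six and five parameters respectively, and the result array is read from argument(6) and argument(5). The reason is that a long occupies two JVM argument slots, which is also why top() is passed after inv in the runtime calls. A standalone slot-counting sketch of that layout (not HotSpot code; parameter kinds are taken from the argument extraction above):

#include <cstdio>

enum ArgKind { kIntArray, kInt, kLong };

// Longs (and doubles) take two argument slots, everything else one.
static int slot_count(const ArgKind* args, int n) {
  int slots = 0;
  for (int i = 0; i < n; i++) {
    slots += (args[i] == kLong) ? 2 : 1;
  }
  return slots;
}

int main() {
  // multiply helper: (int[] a, int[] b, int[] n, int len, long inv, int[] m)
  const ArgKind multiply[] = { kIntArray, kIntArray, kIntArray, kInt, kLong, kIntArray };
  // square helper:   (int[] a, int[] n, int len, long inv, int[] m)
  const ArgKind square[]   = { kIntArray, kIntArray, kInt, kLong, kIntArray };
  std::printf("%d %d\n", slot_count(multiply, 6), slot_count(square, 5));  // prints "7 6"
  return 0;
}
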
--- a/src/share/vm/opto/loopTransform.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/loopTransform.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -2438,7 +2438,7 @@
 
 //=============================================================================
 // Process all the loops in the loop tree and replace any fill
-// patterns with an intrisc version.
+// patterns with an intrinsic version.
 bool PhaseIdealLoop::do_intrinsify_fill() {
   bool changed = false;
   for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
@@ -2536,8 +2536,9 @@
   }
 
   // Make sure the address expression can be handled.  It should be
-  // head->phi * elsize + con.  head->phi might have a ConvI2L.
+  // head->phi * elsize + con.  head->phi might have a ConvI2L(CastII()).
   Node* elements[4];
+  Node* cast = NULL;
   Node* conv = NULL;
   bool found_index = false;
   int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
@@ -2552,6 +2553,12 @@
         conv = value;
         value = value->in(1);
       }
+      if (value->Opcode() == Op_CastII &&
+          value->as_CastII()->has_range_check()) {
+        // Skip range check dependent CastII nodes
+        cast = value;
+        value = value->in(1);
+      }
 #endif
       if (value != head->phi()) {
         msg = "unhandled shift in address";
@@ -2564,9 +2571,16 @@
         }
       }
     } else if (n->Opcode() == Op_ConvI2L && conv == NULL) {
-      if (n->in(1) == head->phi()) {
+      conv = n;
+      n = n->in(1);
+      if (n->Opcode() == Op_CastII &&
+          n->as_CastII()->has_range_check()) {
+        // Skip range check dependent CastII nodes
+        cast = n;
+        n = n->in(1);
+      }
+      if (n == head->phi()) {
         found_index = true;
-        conv = n;
       } else {
         msg = "unhandled input to ConvI2L";
       }
@@ -2625,6 +2639,7 @@
   // Address elements are ok
   if (con)   ok.set(con->_idx);
   if (shift) ok.set(shift->_idx);
+  if (cast)  ok.set(cast->_idx);
   if (conv)  ok.set(conv->_idx);
 
   for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
--- a/src/share/vm/opto/loopopts.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/loopopts.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -772,6 +772,9 @@
 #ifdef _LP64
         if (m->Opcode() == Op_ConvI2L)
           return false;
+        if (m->is_CastII() && m->isa_CastII()->has_range_check()) {
+          return false;
+        }
 #endif
       }
     }
--- a/src/share/vm/opto/node.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/node.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -521,6 +521,11 @@
     C->add_macro_node(n);
   if (is_expensive())
     C->add_expensive_node(n);
+  // If the cloned node is a range check dependent CastII, add it to the list.
+  CastIINode* cast = n->isa_CastII();
+  if (cast != NULL && cast->has_range_check()) {
+    C->add_range_check_cast(cast);
+  }
 
   n->set_idx(C->next_unique()); // Get new unique index as well
   debug_only( n->verify_construction() );
@@ -649,6 +654,11 @@
   if (is_expensive()) {
     compile->remove_expensive_node(this);
   }
+  CastIINode* cast = isa_CastII();
+  if (cast != NULL && cast->has_range_check()) {
+    compile->remove_range_check_cast(cast);
+  }
+
   if (is_SafePoint()) {
     as_SafePoint()->delete_replaced_nodes();
   }
@@ -1344,6 +1354,10 @@
       if (dead->is_expensive()) {
         igvn->C->remove_expensive_node(dead);
       }
+      CastIINode* cast = dead->isa_CastII();
+      if (cast != NULL && cast->has_range_check()) {
+        igvn->C->remove_range_check_cast(cast);
+      }
       igvn->C->record_dead_node(dead->_idx);
       // Kill all inputs to the dead guy
       for (uint i=0; i < dead->req(); i++) {
--- a/src/share/vm/opto/node.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/node.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -54,6 +54,7 @@
 class CatchNode;
 class CatchProjNode;
 class CheckCastPPNode;
+class CastIINode;
 class ClearArrayNode;
 class CmpNode;
 class CodeBuffer;
@@ -603,6 +604,7 @@
     DEFINE_CLASS_ID(Type,  Node, 2)
       DEFINE_CLASS_ID(Phi,   Type, 0)
       DEFINE_CLASS_ID(ConstraintCast, Type, 1)
+        DEFINE_CLASS_ID(CastII, ConstraintCast, 0)
       DEFINE_CLASS_ID(CheckCastPP, Type, 2)
       DEFINE_CLASS_ID(CMove, Type, 3)
       DEFINE_CLASS_ID(SafePointScalarObject, Type, 4)
@@ -727,6 +729,7 @@
   DEFINE_CLASS_QUERY(Catch)
   DEFINE_CLASS_QUERY(CatchProj)
   DEFINE_CLASS_QUERY(CheckCastPP)
+  DEFINE_CLASS_QUERY(CastII)
   DEFINE_CLASS_QUERY(ConstraintCast)
   DEFINE_CLASS_QUERY(ClearArray)
   DEFINE_CLASS_QUERY(CMove)
--- a/src/share/vm/opto/parse2.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/parse2.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -158,7 +158,9 @@
   // Check for always knowing you are throwing a range-check exception
   if (stopped())  return top();
 
-  Node* ptr = array_element_address(ary, idx, type, sizetype);
+  // Make array address computation control dependent to prevent it
+  // from floating above the range check during loop optimizations.
+  Node* ptr = array_element_address(ary, idx, type, sizetype, control());
 
   if (result2 != NULL)  *result2 = elemtype;
 
@@ -461,9 +463,12 @@
 #ifdef _LP64
   // Clean the 32-bit int into a real 64-bit offset.
   // Otherwise, the jint value 0 might turn into an offset of 0x0800000000.
-  const TypeLong* lkeytype = TypeLong::make(CONST64(0), num_cases-1, Type::WidenMin);
-  key_val       = _gvn.transform( new (C) ConvI2LNode(key_val, lkeytype) );
+  const TypeInt* ikeytype = TypeInt::make(0, num_cases-1, Type::WidenMin);
+  // Make I2L conversion control dependent to prevent it from
+  // floating above the range check during loop optimizations.
+  key_val = C->constrained_convI2L(&_gvn, key_val, ikeytype, control());
 #endif
+
   // Shift the value by wordsize so we have an index into the table, rather
   // than a switch value
   Node *shiftWord = _gvn.MakeConX(wordSize);
--- a/src/share/vm/opto/phaseX.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/phaseX.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1339,6 +1339,10 @@
       if (dead->is_expensive()) {
         C->remove_expensive_node(dead);
       }
+      CastIINode* cast = dead->isa_CastII();
+      if (cast != NULL && cast->has_range_check()) {
+        C->remove_range_check_cast(cast);
+      }
     }
   } // while (_stack.is_nonempty())
 }
--- a/src/share/vm/opto/runtime.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/runtime.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -958,6 +958,94 @@
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::squareToLen_Type() {
+  // create input type (domain)
+  int num_args      = 4;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // x
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypePtr::NOTNULL;    // z
+  fields[argp++] = TypeInt::INT;        // zlen
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // no result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = NULL;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
+// for mulAdd calls, 2 pointers and 3 ints, returning int
+const TypeFunc* OptoRuntime::mulAdd_Type() {
+  // create input type (domain)
+  int num_args      = 5;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // out
+  fields[argp++] = TypePtr::NOTNULL;    // in
+  fields[argp++] = TypeInt::INT;        // offset
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypeInt::INT;        // k
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // returning carry (int)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypeInt::INT;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields);
+  return TypeFunc::make(domain, range);
+}
+
+const TypeFunc* OptoRuntime::montgomeryMultiply_Type() {
+  // create input type (domain)
+  int num_args      = 7;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // a
+  fields[argp++] = TypePtr::NOTNULL;    // b
+  fields[argp++] = TypePtr::NOTNULL;    // n
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypeLong::LONG;      // inv
+  fields[argp++] = Type::HALF;
+  fields[argp++] = TypePtr::NOTNULL;    // result
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
+
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
+const TypeFunc* OptoRuntime::montgomerySquare_Type() {
+  // create input type (domain)
+  int num_args      = 6;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // a
+  fields[argp++] = TypePtr::NOTNULL;    // n
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypeLong::LONG;      // inv
+  fields[argp++] = Type::HALF;
+  fields[argp++] = TypePtr::NOTNULL;    // result
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypePtr::NOTNULL;
+
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
 
 
 //------------- Interpreter state access for on stack replacement
--- a/src/share/vm/opto/runtime.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/runtime.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -305,6 +305,12 @@
 
   static const TypeFunc* multiplyToLen_Type();
 
+  static const TypeFunc* squareToLen_Type();
+
+  static const TypeFunc* mulAdd_Type();
+  static const TypeFunc* montgomeryMultiply_Type();
+  static const TypeFunc* montgomerySquare_Type();
+
   static const TypeFunc* updateBytesCRC32_Type();
 
   // leaf on stack replacement interpreter accessor types
--- a/src/share/vm/opto/superword.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/opto/superword.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -2388,6 +2388,11 @@
       return true;
     }
   } else if (opc == Op_ConvI2L) {
+    if (n->in(1)->Opcode() == Op_CastII &&
+        n->in(1)->as_CastII()->has_range_check()) {
+      // Skip range check dependent CastII nodes
+      n = n->in(1);
+    }
     if (scaled_iv_plus_offset(n->in(1))) {
       return true;
     }
--- a/src/share/vm/prims/whitebox.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/prims/whitebox.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 
 #include "memory/metadataFactory.hpp"
+#include "memory/metaspaceShared.hpp"
 #include "memory/universe.hpp"
 #include "oops/oop.inline.hpp"
 
@@ -914,6 +915,10 @@
   return (jlong) MetaspaceGC::capacity_until_GC();
 WB_END
 
+WB_ENTRY(jboolean, WB_IsSharedClass(JNIEnv* env, jobject wb, jclass clazz))
+  return (jboolean)MetaspaceShared::is_in_shared_space(java_lang_Class::as_Klass(JNIHandles::resolve_non_null(clazz)));
+WB_END
+
 WB_ENTRY(jboolean, WB_IsMonitorInflated(JNIEnv* env, jobject wb, jobject obj))
   oop obj_oop = JNIHandles::resolve(obj);
   return (jboolean) obj_oop->mark()->has_monitor();
@@ -1034,6 +1039,7 @@
   {CC"runMemoryUnitTests", CC"()V",                   (void*)&WB_RunMemoryUnitTests},
   {CC"readFromNoaccessArea",CC"()V",                  (void*)&WB_ReadFromNoaccessArea},
   {CC"stressVirtualSpaceResize",CC"(JJJ)I",           (void*)&WB_StressVirtualSpaceResize},
+  {CC"isSharedClass", CC"(Ljava/lang/Class;)Z",       (void*)&WB_IsSharedClass },
 #if INCLUDE_ALL_GCS
   {CC"g1InConcurrentMark", CC"()Z",                   (void*)&WB_G1InConcurrentMark},
   {CC"g1IsHumongous",      CC"(Ljava/lang/Object;)Z", (void*)&WB_G1IsHumongous     },
--- a/src/share/vm/runtime/sharedRuntime.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/runtime/sharedRuntime.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -145,6 +145,12 @@
   static double dsqrt(double f);
 #endif
 
+  // Montgomery multiplication
+  static void montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
+                                  jint len, jlong inv, jint *m_ints);
+  static void montgomery_square(jint *a_ints, jint *n_ints,
+                                jint len, jlong inv, jint *m_ints);
+
 #ifdef __SOFTFP__
   // C++ compiler generates soft float instructions as well as passing
   // float and double in registers.
--- a/src/share/vm/runtime/stubRoutines.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/runtime/stubRoutines.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -136,6 +136,10 @@
 address StubRoutines::_crc_table_adr = NULL;
 
 address StubRoutines::_multiplyToLen = NULL;
+address StubRoutines::_squareToLen = NULL;
+address StubRoutines::_mulAdd = NULL;
+address StubRoutines::_montgomeryMultiply = NULL;
+address StubRoutines::_montgomerySquare = NULL;
 
 double (* StubRoutines::_intrinsic_log   )(double) = NULL;
 double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
--- a/src/share/vm/runtime/stubRoutines.hpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/runtime/stubRoutines.hpp	Mon Jul 25 06:38:24 2016 +0100
@@ -214,6 +214,10 @@
   static address _crc_table_adr;
 
   static address _multiplyToLen;
+  static address _squareToLen;
+  static address _mulAdd;
+  static address _montgomeryMultiply;
+  static address _montgomerySquare;
 
   // These are versions of the java.lang.Math methods which perform
   // the same operations as the intrinsic version.  They are used for
@@ -374,6 +378,10 @@
   static address crc_table_addr()      { return _crc_table_adr; }
 
   static address multiplyToLen()       {return _multiplyToLen; }
+  static address squareToLen()         {return _squareToLen; }
+  static address mulAdd()              {return _mulAdd; }
+  static address montgomeryMultiply()  { return _montgomeryMultiply; }
+  static address montgomerySquare()    { return _montgomerySquare; }
 
   static address select_fill_function(BasicType t, bool aligned, const char* &name);
 
--- a/src/share/vm/runtime/vmStructs.cpp	Mon Jul 25 06:02:58 2016 +0100
+++ b/src/share/vm/runtime/vmStructs.cpp	Mon Jul 25 06:38:24 2016 +0100
@@ -821,6 +821,8 @@
      static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
      static_field(StubRoutines,                _crc_table_adr,                                address)                               \
      static_field(StubRoutines,                _multiplyToLen,                                address)                               \
+     static_field(StubRoutines,                _squareToLen,                                  address)                               \
+     static_field(StubRoutines,                _mulAdd,                                       address)                               \
                                                                                                                                      \
   /*****************/                                                                                                                \
   /* SharedRuntime */                                                                                                                \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/intrinsics/montgomerymultiply/MontgomeryMultiplyTest.java	Mon Jul 25 06:38:24 2016 +0100
@@ -0,0 +1,284 @@
+//
+// Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2015, Red Hat Inc. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+//
+
+import java.lang.invoke.MethodHandle;
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.MethodType;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.math.BigInteger;
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * @test
+ * @bug 8130150
+ * @library /testlibrary
+ * @requires (os.simpleArch == "x64") & (os.family != "windows")
+ * @summary Verify that the Montgomery multiply intrinsic works and correctly checks its arguments.
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UseMontgomerySquareIntrinsic
+ *      -XX:+UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UseMontgomerySquareIntrinsic
+ *      -XX:-UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-UseMontgomerySquareIntrinsic
+ *      -XX:+UseMontgomeryMultiplyIntrinsic MontgomeryMultiplyTest
+ */
+
+public class MontgomeryMultiplyTest {
+
+    static final MethodHandles.Lookup lookup = MethodHandles.lookup();
+
+    static final MethodHandle montgomeryMultiplyHandle, montgomerySquareHandle;
+    static final MethodHandle bigIntegerConstructorHandle;
+    static final Field bigIntegerMagField;
+
+    static {
+        // Use reflection to gain access to the methods we want to test.
+        try {
+            Method m = BigInteger.class.getDeclaredMethod("montgomeryMultiply",
+                /*a*/int[].class, /*b*/int[].class, /*n*/int[].class, /*len*/int.class,
+                /*inv*/long.class, /*product*/int[].class);
+            m.setAccessible(true);
+            montgomeryMultiplyHandle = lookup.unreflect(m);
+
+            m = BigInteger.class.getDeclaredMethod("montgomerySquare",
+                /*a*/int[].class, /*n*/int[].class, /*len*/int.class,
+                /*inv*/long.class, /*product*/int[].class);
+            m.setAccessible(true);
+            montgomerySquareHandle = lookup.unreflect(m);
+
+            Constructor c
+                = BigInteger.class.getDeclaredConstructor(int.class, int[].class);
+            c.setAccessible(true);
+            bigIntegerConstructorHandle = lookup.unreflectConstructor(c);
+
+            bigIntegerMagField = BigInteger.class.getDeclaredField("mag");
+            bigIntegerMagField.setAccessible(true);
+
+        } catch (Throwable ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    // Invoke either BigInteger.montgomeryMultiply or BigInteger.montgomerySquare.
+    int[] montgomeryMultiply(int[] a, int[] b, int[] n, int len, long inv,
+                             int[] product) throws Throwable {
+        int[] result =
+            (a == b) ? (int[]) montgomerySquareHandle.invokeExact(a, n, len, inv, product)
+                     : (int[]) montgomeryMultiplyHandle.invokeExact(a, b, n, len, inv, product);
+        return Arrays.copyOf(result, len);
+    }
+
+    // Invoke the private constructor BigInteger(int, int[]).
+    BigInteger newBigInteger(int[] val) throws Throwable {
+        return (BigInteger) bigIntegerConstructorHandle.invokeExact(1, val);
+    }
+
+    // Get the private field BigInteger.mag
+    int[] mag(BigInteger n) {
+        try {
+            return (int[]) bigIntegerMagField.get(n);
+        } catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    // Montgomery multiplication
+    // Calculate a * b * R^-1 mod N
+    //
+    // R is a power of the word size
+    // N' = -N^-1 mod R
+    //
+    // T := ab
+    // m := (T mod R)N' mod R [so 0 <= m < R]
+    // t := (T + mN)/R
+    // if t >= N then return t - N else return t
+    //
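+    // Worked example with tiny numbers (illustration only):
+    //   N = 13, R = 16, N' = 11  (13 * 11 = 143 = -1 mod 16)
+    //   a = 7, b = 9  =>  T = 63
+    //   m = (63 mod 16) * 11 mod 16 = 15 * 11 mod 16 = 5
+    //   t = (63 + 5*13) / 16 = 128 / 16 = 8, and 8 < 13, so the result is 8
+    //   check: 63 * 16^-1 mod 13 = 11 * 9 mod 13 = 8
+    //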
+    BigInteger montgomeryMultiply(BigInteger a, BigInteger b, BigInteger N,
+            int len, BigInteger n_prime)
+            throws Throwable {
+        BigInteger T = a.multiply(b);
+        BigInteger R = BigInteger.ONE.shiftLeft(len*32);
+        BigInteger mask = R.subtract(BigInteger.ONE);
+        BigInteger m = (T.and(mask)).multiply(n_prime);
+        m = m.and(mask); // i.e. m.mod(R)
+        T = T.add(m.multiply(N));
+        T = T.shiftRight(len*32); // i.e. T.divide(R)
+        if (T.compareTo(N) > 0) {
+            T = T.subtract(N);
+        }
+        return T;
+    }
+
+    // Compute the reference result using the longhand BigInteger calculation above.
+    BigInteger montgomeryMultiply(int[] a_words, int[] b_words, int[] n_words,
+            int len, BigInteger inv)
+            throws Throwable {
+        BigInteger t = montgomeryMultiply(
+                newBigInteger(a_words),
+                newBigInteger(b_words),
+                newBigInteger(n_words),
+                len, inv);
+        return t;
+    }
+
+    // Check that the Montgomery multiply intrinsic returns the same
+    // result as the longhand calculation.
+    void check(int[] a_words, int[] b_words, int[] n_words, int len, BigInteger inv)
+            throws Throwable {
+        BigInteger n = newBigInteger(n_words);
+        BigInteger slow = montgomeryMultiply(a_words, b_words, n_words, len, inv);
+        BigInteger fast
+            = newBigInteger(montgomeryMultiply
+                            (a_words, b_words, n_words, len, inv.longValue(), null));
+        // The intrinsic may not return the same value as the longhand
+        // calculation but they must have the same residue mod N.
+        if (!slow.mod(n).equals(fast.mod(n))) {
+            throw new RuntimeException();
+        }
+    }
+
+    Random rnd = new Random(0);
+
+    // Return a random value of length <= bits in an array of even length
+    int[] random_val(int bits) {
+        int len = (bits+63)/64;  // i.e. length in longs
+        int[] val = new int[len*2];
+        for (int i = 0; i < val.length; i++)
+            val[i] = rnd.nextInt();
+        int leadingZeros = 64 - (bits & 64);
+        if (leadingZeros >= 32) {
+            val[0] = 0;
+            val[1] &= ~(-1l << (leadingZeros & 31));
+        } else {
+            val[0] &= ~(-1l << leadingZeros);
+        }
+        return val;
+    }
+
+    void testOneLength(int lenInBits, int lenInInts) throws Throwable {
+        BigInteger mod = new BigInteger(lenInBits, 2, rnd);
+        BigInteger r = BigInteger.ONE.shiftLeft(lenInInts * 32);
+        BigInteger n_prime = mod.modInverse(r).negate();
+
+        // Make n.length even, padding with a zero if necessary
+        int[] n = mag(mod);
+        if (n.length < lenInInts) {
+            int[] x = new int[lenInInts];
+            System.arraycopy(n, 0, x, lenInInts-n.length, n.length);
+            n = x;
+        }
+
+        for (int i = 0; i < 10000; i++) {
+            // multiply
+            check(random_val(lenInBits), random_val(lenInBits), n, lenInInts, n_prime);
+            // square
+            int[] tmp = random_val(lenInBits);
+            check(tmp, tmp, n, lenInInts, n_prime);
+        }
+    }
+
+    // Test the Montgomery multiply intrinsic with a bunch of random
+    // values of varying lengths.  Do this for long enough that the
+    // caller of the intrinsic is C2-compiled.
+    void testResultValues() throws Throwable {
+        // Test a couple of interesting edge cases.
+        testOneLength(1024, 32);
+        testOneLength(1025, 34);
+        for (int j = 10; j > 0; j--) {
+            // Construct a random prime whose length in words is even
+            int lenInBits = rnd.nextInt(2048) + 64;
+            int lenInInts = (lenInBits + 63)/64*2;
+            testOneLength(lenInBits, lenInInts);
+        }
+    }
+
+    // Range checks
+    void testOneMontgomeryMultiplyCheck(int[] a, int[] b, int[] n, int len, long inv,
+                                        int[] product, Class klass) {
+        try {
+            montgomeryMultiply(a, b, n, len, inv, product);
+        } catch (Throwable ex) {
+            if (klass.isAssignableFrom(ex.getClass()))
+                return;
+            throw new RuntimeException(klass + " expected, " + ex + " was thrown");
+        }
+        throw new RuntimeException(klass + " expected, was not thrown");
+    }
+
+    void testOneMontgomeryMultiplyCheck(int[] a, int[] b, BigInteger n, int len, BigInteger inv,
+            Class klass) {
+        testOneMontgomeryMultiplyCheck(a, b, mag(n), len, inv.longValue(), null, klass);
+    }
+
+    void testOneMontgomeryMultiplyCheck(int[] a, int[] b, BigInteger n, int len, BigInteger inv,
+            int[] product, Class klass) {
+        testOneMontgomeryMultiplyCheck(a, b, mag(n), len, inv.longValue(), product, klass);
+    }
+
+    void testMontgomeryMultiplyChecks() {
+        int[] blah = random_val(40);
+        int[] small = random_val(39);
+        BigInteger mod = new BigInteger(40*32, 2, rnd);
+        BigInteger r = BigInteger.ONE.shiftLeft(40*32);
+        BigInteger n_prime = mod.modInverse(r).negate();
+
+        // Length out of range: square
+        testOneMontgomeryMultiplyCheck(blah, blah, mod, 41, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, blah, mod, 0, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, blah, mod, -1, n_prime, IllegalArgumentException.class);
+        // As above, but for multiply
+        testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 41, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 0, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, -1, n_prime, IllegalArgumentException.class);
+
+        // Length odd
+        testOneMontgomeryMultiplyCheck(small, small, mod, 39, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(small, small, mod, 0, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(small, small, mod, -1, n_prime, IllegalArgumentException.class);
+        // As above, but for multiply
+        testOneMontgomeryMultiplyCheck(small, small.clone(), mod, 39, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(small, small.clone(), mod, 0, n_prime, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(small, small.clone(), mod, -1, n_prime, IllegalArgumentException.class);
+
+        // Array too small
+        testOneMontgomeryMultiplyCheck(blah, blah, mod, 40, n_prime, small, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, blah.clone(), mod, 40, n_prime, small, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(small, blah, mod, 40, n_prime, blah, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, small, mod, 40, n_prime, blah, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(blah, blah, mod, 40, n_prime, small, IllegalArgumentException.class);
+        testOneMontgomeryMultiplyCheck(small, small, mod, 40, n_prime, blah, IllegalArgumentException.class);
+    }
+
+    public static void main(String args[]) {
+        try {
+            new MontgomeryMultiplyTest().testMontgomeryMultiplyChecks();
+            new MontgomeryMultiplyTest().testResultValues();
+        } catch (Throwable ex) {
+            throw new RuntimeException(ex);
+        }
+     }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/intrinsics/muladd/TestMulAdd.java	Mon Jul 25 06:38:24 2016 +0100
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8081778
+ * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ *      -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
+ *      -XX:+UseMulAddIntrinsic
+ *      -XX:CompileCommand=dontinline,TestMulAdd::main
+ *      -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd
+ *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd
+ *      -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd
+ *      -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd
+ *      -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd
+ *      -XX:CompileCommand=inline,java.math.BigInteger::multiply
+ *      -XX:CompileCommand=inline,java.math.BigInteger::square
+ *      -XX:CompileCommand=inline,java.math.BigInteger::squareToLen
+ *      -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd
+ */
+
+import java.util.Random;
+import java.math.*;
+
+public class TestMulAdd {
+
+    // Avoid the intrinsic: _mulAdd is disabled for this method via CompileCommand.
+    public static BigInteger base_multiply(BigInteger op1) {
+      return op1.multiply(op1);
+    }
+
+    // Generate mulAdd() intrinsic by inlining multiply().
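+    // (the CompileCommand options above force-inline multiply(), and _mulAdd is
+    //  not disabled for this method, so the intrinsic can be used here)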
+    public static BigInteger new_multiply(BigInteger op1) {
+      return op1.multiply(op1);
+    }
+
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+      byte[] data1 = b1.toByteArray();
+      byte[] data2 = b2.toByteArray();
+      if (data1.length != data2.length)
+        return false;
+      for (int i = 0; i < data1.length; i++) {
+        if (data1[i] != data2[i])
+          return false;
+      }
+      return true;
+    }
+
+    public static String stringify(BigInteger b) {
+      String strout= "";
+      byte [] data = b.toByteArray();
+      for (int i = 0; i < data.length; i++) {
+        strout += (String.format("%02x",data[i]) + " ");
+      }
+      return strout;
+    }
+
+    public static void main(String args[]) throws Exception {
+
+      BigInteger oldsum = new BigInteger("0");
+      BigInteger newsum = new BigInteger("0");
+
+      BigInteger b1, b2, oldres, newres;
+
+      Random rand = new Random();
+      long seed = System.nanoTime();
+      Random rand1 = new Random();
+      long seed1 = System.nanoTime();
+      rand.setSeed(seed);
+      rand1.setSeed(seed1);
+
+      for (int j = 0; j < 100000; j++) {
+        int rand_int = rand1.nextInt(3136)+32;
+        b1 = new BigInteger(rand_int, rand);
+
+        oldres = base_multiply(b1);
+        newres = new_multiply(b1);
+
+        oldsum = oldsum.add(oldres);
+        newsum = newsum.add(newres);
+
+        if (!bytecompare(oldres,newres)) {
+          System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+          System.out.println(b1);
+          throw new Exception("Failed");
+        }
+      }
+      if (!bytecompare(oldsum,newsum))  {
+        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
+        throw new Exception("Failed");
+      } else {
+        System.out.println("Success");
+      }
+   }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/intrinsics/squaretolen/TestSquareToLen.java	Mon Jul 25 06:38:24 2016 +0100
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8081778
+ * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ *      -XX:+IgnoreUnrecognizedVMOptions
+ *      -XX:+UseSquareToLenIntrinsic
+ *      -XX:CompileCommand=exclude,TestSquareToLen::main
+ *      -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen
+ *      -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen
+ *      -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen
+ *      -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen
+ *      -XX:CompileCommand=inline,java.math.BigInteger::multiply
+ *      -XX:CompileCommand=inline,java.math.BigInteger::square
+ *      -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen
+ */
+
+import java.util.Random;
+import java.math.*;
+
+public class TestSquareToLen {
+
+    // Avoid the intrinsic: _squareToLen is disabled for this method via CompileCommand.
+    public static BigInteger base_multiply(BigInteger op1) {
+      return op1.multiply(op1);
+    }
+
+    // Generate squareToLen() intrinsic by inlining multiply().
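+    // (the CompileCommand options above force-inline multiply()/square(), and
+    //  _squareToLen is not disabled for this method, so the intrinsic can be used here)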
+    public static BigInteger new_multiply(BigInteger op1) {
+      return op1.multiply(op1);
+    }
+
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+      byte[] data1 = b1.toByteArray();
+      byte[] data2 = b2.toByteArray();
+      if (data1.length != data2.length)
+        return false;
+      for (int i = 0; i < data1.length; i++) {
+        if (data1[i] != data2[i])
+          return false;
+      }
+      return true;
+    }
+
+    public static String stringify(BigInteger b) {
+      String strout= "";
+      byte [] data = b.toByteArray();
+      for (int i = 0; i < data.length; i++) {
+        strout += (String.format("%02x",data[i]) + " ");
+      }
+      return strout;
+    }
+
+    public static void main(String args[]) throws Exception {
+
+      BigInteger oldsum = new BigInteger("0");
+      BigInteger newsum = new BigInteger("0");
+
+      BigInteger b1, b2, oldres, newres;
+
+      Random rand = new Random();
+      long seed = System.nanoTime();
+      Random rand1 = new Random();
+      long seed1 = System.nanoTime();
+      rand.setSeed(seed);
+      rand1.setSeed(seed1);
+
+      for (int j = 0; j < 100000; j++) {
+        int rand_int = rand1.nextInt(3136)+32;
+        b1 = new BigInteger(rand_int, rand);
+
+        oldres = base_multiply(b1);
+        newres = new_multiply(b1);
+
+        oldsum = oldsum.add(oldres);
+        newsum = newsum.add(newres);
+
+        if (!bytecompare(oldres,newres)) {
+          System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+          System.out.println(b1);
+          throw new Exception("Failed");
+        }
+      }
+      if (!bytecompare(oldsum,newsum))  {
+        System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
+        throw new Exception("Failed");
+      } else {
+        System.out.println("Success");
+      }
+   }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/loopopts/TestLoopPeeling.java	Mon Jul 25 06:38:24 2016 +0100
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8078262
+ * @summary Tests correct dominator information after loop peeling.
+ * @run main/othervm -Xcomp -XX:CompileCommand=compileonly,TestLoopPeeling::test* TestLoopPeeling
+ */
+public class TestLoopPeeling {
+
+    public int[] array = new int[100];
+
+    public static void main(String args[]) {
+        TestLoopPeeling test = new TestLoopPeeling();
+        try {
+            test.testArrayAccess(0, 1);
+            test.testArrayAllocation(0, 1);
+        } catch (Exception e) {
+            // Ignore exceptions
+        }
+    }
+
+    public void testArrayAccess(int index, int inc) {
+        int storeIndex = -1;
+
+        for (; index < 10; index += inc) {
+            // This loop invariant check triggers loop peeling because it can
+            // be moved out of the loop (see 'IdealLoopTree::policy_peeling').
+            if (inc == 42) return;
+
+            // This loop variant usage of LShiftL( ConvI2L( Phi(storeIndex) ) )
+            // prevents the split if optimization that would otherwise clone the
+            // LShiftL and ConvI2L nodes and assign them to their corresponding array
+            // address computation (see 'PhaseIdealLoop::split_if_with_blocks_post').
+            if (storeIndex > 0 && array[storeIndex] == 42) return;
+
+            if (index == 42) {
+                // This store and the corresponding range check are moved out of the
+                // loop and both used after the old loop and the peeled iteration exit.
+                // For the peeled iteration, storeIndex is always -1 and the ConvI2L
+                // is replaced by TOP. However, the range check is not folded because
+                // we don't do the split if optimization in PhaseIdealLoop2.
+                // As a result, we have a (dead) control path from the peeled iteration
+                // to the StoreI but the data path is removed.
+                array[storeIndex] = 1;
+                return;
+            }
+
+            storeIndex++;
+        }
+    }
+
+    public byte[] testArrayAllocation(int index, int inc) {
+        int allocationCount = -1;
+        byte[] result;
+
+        for (; index < 10; index += inc) {
+            // This loop invariant check triggers loop peeling because it can
+            // be moved out of the loop (see 'IdealLoopTree::policy_peeling').
+            if (inc == 42) return null;
+
+            if (index == 42) {
+                // This allocation and the corresponding size check are moved out of the
+                // loop and both used after the old loop and the peeled iteration exit.
+                // For the peeled iteration, allocationCount is always -1 and the ConvI2L
+                // is replaced by TOP. However, the size check is not folded because
+                // we don't do the split if optimization in PhaseIdealLoop2.
+                // As a result, we have a (dead) control path from the peeled iteration
+                // to the allocation but the data path is removed.
+                result = new byte[allocationCount];
+                return result;
+            }
+
+            allocationCount++;
+        }
+        return null;
+    }
+}
+
--- a/test/testlibrary/whitebox/sun/hotspot/WhiteBox.java	Mon Jul 25 06:02:58 2016 +0100
+++ b/test/testlibrary/whitebox/sun/hotspot/WhiteBox.java	Mon Jul 25 06:38:24 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -233,4 +233,6 @@
     return offset;
   }
 
+  // Class Data Sharing
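+  // Returns true if the metadata of the given class is in the shared (CDS)
+  // archive space (see WB_IsSharedClass in whitebox.cpp).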
+  public native boolean isSharedClass(Class<?> c);
 }