Mercurial > hg > icedtea7-forest-aarch64 > hotspot

--- a/make/hotspot_version	Wed Jan 09 20:33:26 2013 -0800
+++ b/make/hotspot_version	Fri Jan 11 10:38:38 2013 -0800
@@ -35,7 +35,7 @@

 HS_MAJOR_VER=24
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=28
+HS_BUILD_NUMBER=29

 JDK_MAJOR_VER=1
 JDK_MINOR_VER=7
--- a/src/cpu/sparc/vm/sparc.ad	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/sparc/vm/sparc.ad	Fri Jan 11 10:38:38 2013 -0800
@@ -10177,7 +10177,7 @@

 //---------- Zeros Count Instructions ------------------------------------------

-instruct countLeadingZerosI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
+instruct countLeadingZerosI(iRegIsafe dst, iRegI src, iRegI tmp, flagsReg cr) %{
   predicate(UsePopCountInstruction);  // See Matcher::match_rule_supported
   match(Set dst (CountLeadingZerosI src));
   effect(TEMP dst, TEMP tmp, KILL cr);
@@ -10274,7 +10274,7 @@
   ins_pipe(ialu_reg);
 %}

-instruct countTrailingZerosI(iRegI dst, iRegI src, flagsReg cr) %{
+instruct countTrailingZerosI(iRegIsafe dst, iRegI src, flagsReg cr) %{
   predicate(UsePopCountInstruction);  // See Matcher::match_rule_supported
   match(Set dst (CountTrailingZerosI src));
   effect(TEMP dst, KILL cr);
@@ -10317,19 +10317,21 @@

 //---------- Population Count Instructions -------------------------------------

-instruct popCountI(iRegI dst, iRegI src) %{
+instruct popCountI(iRegIsafe dst, iRegI src) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountI src));

-  format %{ "POPC   $src, $dst" %}
-  ins_encode %{
-    __ popc($src$$Register, $dst$$Register);
+  format %{ "SRL    $src, G0, $dst\t! clear upper word for 64 bit POPC\n\t"
+            "POPC   $dst, $dst" %}
+  ins_encode %{
+    __ srl($src$$Register, G0, $dst$$Register);
+    __ popc($dst$$Register, $dst$$Register);
   %}
   ins_pipe(ialu_reg);
 %}

 // Note: Long.bitCount(long) returns an int.
-instruct popCountL(iRegI dst, iRegL src) %{
+instruct popCountL(iRegIsafe dst, iRegL src) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountL src));
--- a/src/cpu/x86/vm/assembler_x86.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/assembler_x86.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -2407,7 +2407,6 @@

 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
-  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x00);
@@ -2476,6 +2475,26 @@
   emit_byte(0xC0 | encode);
 }

+void Assembler::vptest(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  bool vector256 = true;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
+  emit_byte(0x17);
+  emit_operand(dst, src);
+}
+
+void Assembler::vptest(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  bool vector256 = true;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_byte(0x17);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::punpcklbw(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
@@ -2552,12 +2571,18 @@
   emit_byte(0xA5);
 }

+// sets rcx bytes with rax, value at [edi]
+void Assembler::rep_stosb() {
+  emit_byte(0xF3); // REP
+  LP64_ONLY(prefix(REX_W));
+  emit_byte(0xAA); // STOSB
+}
+
 // sets rcx pointer sized words with rax, value at [edi]
 // generic
-void Assembler::rep_set() { // rep_set
-  emit_byte(0xF3);
-  // STOSQ
-  LP64_ONLY(prefix(REX_W));
+void Assembler::rep_stos() {
+  emit_byte(0xF3); // REP
+  LP64_ONLY(prefix(REX_W));       // LP64:STOSQ, LP32:STOSD
   emit_byte(0xAB);
 }

@@ -3648,6 +3673,15 @@
   emit_byte(0x01);
 }

+// duplicate 4-bytes integer data from src into 8 locations in dest
+void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  bool vector256 = true;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_byte(0x58);
+  emit_byte((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::vzeroupper() {
   assert(VM_Version::supports_avx(), "");
   (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
@@ -8427,7 +8461,8 @@

 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
   // Used in sign-bit flipping with aligned address.
-  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
+  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
   if (reachable(src)) {
     Assembler::pshufb(dst, as_Address(src));
   } else {
@@ -10483,6 +10518,22 @@

 }

+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
+  // cnt - number of qwords (8-byte words).
+  // base - start address, qword aligned.
+  assert(base==rdi, "base register must be edi for rep stos");
+  assert(tmp==rax,   "tmp register must be eax for rep stos");
+  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
+
+  xorptr(tmp, tmp);
+  if (UseFastStosb) {
+    shlptr(cnt,3); // convert to number of bytes
+    rep_stosb();
+  } else {
+    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
+    rep_stos();
+  }
+}

 // IndexOf for constant substrings with size >= 8 chars
 // which don't need to be loaded through stack.
@@ -10918,42 +10969,114 @@
   testl(cnt2, cnt2);
   jcc(Assembler::zero, LENGTH_DIFF_LABEL);

-  // Load first characters
+  // Compare first characters
   load_unsigned_short(result, Address(str1, 0));
   load_unsigned_short(cnt1, Address(str2, 0));
-
-  // Compare first characters
   subl(result, cnt1);
   jcc(Assembler::notZero,  POP_LABEL);
-  decrementl(cnt2);
-  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
-
-  {
-    // Check after comparing first character to see if strings are equivalent
-    Label LSkip2;
-    // Check if the strings start at same location
-    cmpptr(str1, str2);
-    jccb(Assembler::notEqual, LSkip2);
-
-    // Check if the length difference is zero (from stack)
-    cmpl(Address(rsp, 0), 0x0);
-    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);
-
-    // Strings might not be equivalent
-    bind(LSkip2);
-  }
+  cmpl(cnt2, 1);
+  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
+
+  // Check if the strings start at the same location.
+  cmpptr(str1, str2);
+  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

   Address::ScaleFactor scale = Address::times_2;
   int stride = 8;

-  // Advance to next element
-  addptr(str1, 16/stride);
-  addptr(str2, 16/stride);
-
-  if (UseSSE42Intrinsics) {
+  if (UseAVX >= 2) {
+    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
+    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
+    Label COMPARE_TAIL_LONG;
+    int pcmpmask = 0x19;
+
+    // Setup to compare 16-chars (32-bytes) vectors,
+    // start from first character again because it has aligned address.
+    int stride2 = 16;
+    int adr_stride  = stride  << scale;
+    int adr_stride2 = stride2 << scale;
+
+    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
+    // rax and rdx are used by pcmpestri as elements counters
+    movl(result, cnt2);
+    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
+    jcc(Assembler::zero, COMPARE_TAIL_LONG);
+
+    // fast path : compare first 2 8-char vectors.
+    bind(COMPARE_16_CHARS);
+    movdqu(vec1, Address(str1, 0));
+    pcmpestri(vec1, Address(str2, 0), pcmpmask);
+    jccb(Assembler::below, COMPARE_INDEX_CHAR);
+
+    movdqu(vec1, Address(str1, adr_stride));
+    pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
+    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
+    addl(cnt1, stride);
+
+    // Compare the characters at index in cnt1
+    bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
+    load_unsigned_short(result, Address(str1, cnt1, scale));
+    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
+    subl(result, cnt2);
+    jmp(POP_LABEL);
+
+    // Setup the registers to start vector comparison loop
+    bind(COMPARE_WIDE_VECTORS);
+    lea(str1, Address(str1, result, scale));
+    lea(str2, Address(str2, result, scale));
+    subl(result, stride2);
+    subl(cnt2, stride2);
+    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
+    negptr(result);
+
+    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
+    bind(COMPARE_WIDE_VECTORS_LOOP);
+    vmovdqu(vec1, Address(str1, result, scale));
+    vpxor(vec1, Address(str2, result, scale));
+    vptest(vec1, vec1);
+    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+    addptr(result, stride2);
+    subl(cnt2, stride2);
+    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
+
+    // compare wide vectors tail
+    bind(COMPARE_WIDE_TAIL);
+    testptr(result, result);
+    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
+
+    movl(result, stride2);
+    movl(cnt2, result);
+    negptr(result);
+    jmpb(COMPARE_WIDE_VECTORS_LOOP);
+
+    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
+    bind(VECTOR_NOT_EQUAL);
+    lea(str1, Address(str1, result, scale));
+    lea(str2, Address(str2, result, scale));
+    jmp(COMPARE_16_CHARS);
+
+    // Compare tail chars, length between 1 to 15 chars
+    bind(COMPARE_TAIL_LONG);
+    movl(cnt2, result);
+    cmpl(cnt2, stride);
+    jccb(Assembler::less, COMPARE_SMALL_STR);
+
+    movdqu(vec1, Address(str1, 0));
+    pcmpestri(vec1, Address(str2, 0), pcmpmask);
+    jcc(Assembler::below, COMPARE_INDEX_CHAR);
+    subptr(cnt2, stride);
+    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
+    lea(str1, Address(str1, result, scale));
+    lea(str2, Address(str2, result, scale));
+    negptr(cnt2);
+    jmpb(WHILE_HEAD_LABEL);
+
+    bind(COMPARE_SMALL_STR);
+  } else if (UseSSE42Intrinsics) {
     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
     int pcmpmask = 0x19;
-    // Setup to compare 16-byte vectors
+    // Setup to compare 8-char (16-byte) vectors,
+    // start from first character again because it has aligned address.
     movl(result, cnt2);
     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
     jccb(Assembler::zero, COMPARE_TAIL);
@@ -10985,7 +11108,7 @@
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

     // compare wide vectors tail
-    testl(result, result);
+    testptr(result, result);
     jccb(Assembler::zero, LENGTH_DIFF_LABEL);

     movl(cnt2, stride);
@@ -10997,21 +11120,20 @@

     // Mismatched characters in the vectors
     bind(VECTOR_NOT_EQUAL);
-    addptr(result, cnt1);
-    movptr(cnt2, result);
-    load_unsigned_short(result, Address(str1, cnt2, scale));
-    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
-    subl(result, cnt1);
+    addptr(cnt1, result);
+    load_unsigned_short(result, Address(str1, cnt1, scale));
+    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
+    subl(result, cnt2);
     jmpb(POP_LABEL);

     bind(COMPARE_TAIL); // limit is zero
     movl(cnt2, result);
     // Fallthru to tail compare
   }
-
   // Shift str2 and str1 to the end of the arrays, negate min
-  lea(str1, Address(str1, cnt2, scale, 0));
-  lea(str2, Address(str2, cnt2, scale, 0));
+  lea(str1, Address(str1, cnt2, scale));
+  lea(str2, Address(str2, cnt2, scale));
+  decrementl(cnt2);  // first character was compared already
   negptr(cnt2);

   // Compare the rest of the elements
@@ -11076,7 +11198,44 @@
   shll(limit, 1);      // byte count != 0
   movl(result, limit); // copy

-  if (UseSSE42Intrinsics) {
+  if (UseAVX >= 2) {
+    // With AVX2, use 32-byte vector compare
+    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+
+    // Compare 32-byte vectors
+    andl(result, 0x0000001e);  //   tail count (in bytes)
+    andl(limit, 0xffffffe0);   // vector count (in bytes)
+    jccb(Assembler::zero, COMPARE_TAIL);
+
+    lea(ary1, Address(ary1, limit, Address::times_1));
+    lea(ary2, Address(ary2, limit, Address::times_1));
+    negptr(limit);
+
+    bind(COMPARE_WIDE_VECTORS);
+    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
+    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
+    vpxor(vec1, vec2);
+
+    vptest(vec1, vec1);
+    jccb(Assembler::notZero, FALSE_LABEL);
+    addptr(limit, 32);
+    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+
+    testl(result, result);
+    jccb(Assembler::zero, TRUE_LABEL);
+
+    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
+    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
+    vpxor(vec1, vec2);
+
+    vptest(vec1, vec1);
+    jccb(Assembler::notZero, FALSE_LABEL);
+    jmpb(TRUE_LABEL);
+
+    bind(COMPARE_TAIL); // limit is zero
+    movl(limit, result);
+    // Fallthru to tail compare
+  } else if (UseSSE42Intrinsics) {
     // With SSE4.2, use double quad vector compare
     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

@@ -11254,29 +11413,53 @@
     {
       assert( UseSSE >= 2, "supported cpu only" );
       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
-      // Fill 32-byte chunks
       movdl(xtmp, value);
-      pshufd(xtmp, xtmp, 0);
-
-      subl(count, 8 << shift);
-      jcc(Assembler::less, L_check_fill_8_bytes);
-      align(16);
-
-      BIND(L_fill_32_bytes_loop);
-
-      if (UseUnalignedLoadStores) {
-        movdqu(Address(to, 0), xtmp);
-        movdqu(Address(to, 16), xtmp);
+      if (UseAVX >= 2 && UseUnalignedLoadStores) {
+        // Fill 64-byte chunks
+        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
+        vpbroadcastd(xtmp, xtmp);
+
+        subl(count, 16 << shift);
+        jcc(Assembler::less, L_check_fill_32_bytes);
+        align(16);
+
+        BIND(L_fill_64_bytes_loop);
+        vmovdqu(Address(to, 0), xtmp);
+        vmovdqu(Address(to, 32), xtmp);
+        addptr(to, 64);
+        subl(count, 16 << shift);
+        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
+
+        BIND(L_check_fill_32_bytes);
+        addl(count, 8 << shift);
+        jccb(Assembler::less, L_check_fill_8_bytes);
+        vmovdqu(Address(to, 0), xtmp);
+        addptr(to, 32);
+        subl(count, 8 << shift);
       } else {
-        movq(Address(to, 0), xtmp);
-        movq(Address(to, 8), xtmp);
-        movq(Address(to, 16), xtmp);
-        movq(Address(to, 24), xtmp);
+        // Fill 32-byte chunks
+        pshufd(xtmp, xtmp, 0);
+
+        subl(count, 8 << shift);
+        jcc(Assembler::less, L_check_fill_8_bytes);
+        align(16);
+
+        BIND(L_fill_32_bytes_loop);
+
+        if (UseUnalignedLoadStores) {
+          movdqu(Address(to, 0), xtmp);
+          movdqu(Address(to, 16), xtmp);
+        } else {
+          movq(Address(to, 0), xtmp);
+          movq(Address(to, 8), xtmp);
+          movq(Address(to, 16), xtmp);
+          movq(Address(to, 24), xtmp);
+        }
+
+        addptr(to, 32);
+        subl(count, 8 << shift);
+        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
       }
-
-      addptr(to, 32);
-      subl(count, 8 << shift);
-      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
       BIND(L_check_fill_8_bytes);
       addl(count, 8 << shift);
       jccb(Assembler::zero, L_exit);
--- a/src/cpu/x86/vm/assembler_x86.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/assembler_x86.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -842,7 +842,8 @@

   // These do register sized moves/scans
   void rep_mov();
-  void rep_set();
+  void rep_stos();
+  void rep_stosb();
   void repne_scan();
 #ifdef _LP64
   void repne_scanl();
@@ -1460,9 +1461,12 @@
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);

-  // Logical Compare Double Quadword
+  // Logical Compare 128bit
   void ptest(XMMRegister dst, XMMRegister src);
   void ptest(XMMRegister dst, Address src);
+  // Logical Compare 256bit
+  void vptest(XMMRegister dst, XMMRegister src);
+  void vptest(XMMRegister dst, Address src);

   // Interleave Low Bytes
   void punpcklbw(XMMRegister dst, XMMRegister src);
@@ -1774,6 +1778,9 @@
   void vextractf128h(Address dst, XMMRegister src);
   void vextracti128h(Address dst, XMMRegister src);

+  // duplicate 4-bytes integer data from src into 8 locations in dest
+  void vpbroadcastd(XMMRegister dst, XMMRegister src);
+
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
   // to avoid transaction penalty between AVX and SSE states. There is no
   // penalty if legacy SSE instructions are encoded using VEX prefix because
@@ -2733,6 +2740,10 @@
       Assembler::vxorpd(dst, nds, src, vector256);
   }

+  // Simple version for AVX2 256bit vectors
+  void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
+  void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
+
   // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
     if (UseAVX > 1) // vinserti128h is available only in AVX2
@@ -2814,6 +2825,9 @@
   // C2 compiled method's prolog code.
   void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b);

+  // clear memory of size 'cnt' qwords, starting at 'base'.
+  void clear_mem(Register base, Register cnt, Register rtmp);
+
   // IndexOf strings.
   // Small strings are loaded through stack if they cross page boundary.
   void string_indexof(Register str1, Register str2,
--- a/src/cpu/x86/vm/globals_x86.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/globals_x86.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -120,6 +120,9 @@
   product(bool, UseUnalignedLoadStores, false,                              \
           "Use SSE2 MOVDQU instruction for Arraycopy")                      \
                                                                             \
+  product(bool, UseFastStosb, false,                                        \
+          "Use fast-string operation for zeroing: rep stosb")               \
+                                                                            \
   /* assembler */                                                           \
   product(bool, Use486InstrsOnly, false,                                    \
           "Use 80486 Compliant instruction subset")                         \
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -825,16 +825,22 @@
     __ align(OptoLoopAlignment);
   __ BIND(L_copy_64_bytes_loop);

-    if(UseUnalignedLoadStores) {
-      __ movdqu(xmm0, Address(from, 0));
-      __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
-      __ movdqu(xmm1, Address(from, 16));
-      __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
-      __ movdqu(xmm2, Address(from, 32));
-      __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
-      __ movdqu(xmm3, Address(from, 48));
-      __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
-
+    if (UseUnalignedLoadStores) {
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(from,  0));
+        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
+        __ vmovdqu(xmm1, Address(from, 32));
+        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
+      } else {
+        __ movdqu(xmm0, Address(from, 0));
+        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+        __ movdqu(xmm1, Address(from, 16));
+        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+        __ movdqu(xmm2, Address(from, 32));
+        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+        __ movdqu(xmm3, Address(from, 48));
+        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+      }
     } else {
       __ movq(xmm0, Address(from, 0));
       __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
@@ -2203,13 +2209,13 @@
   //   c_rarg2   - K (key) in little endian int array
   //
   address generate_aescrypt_encryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
     Label L_doLast;
     address start = __ pc();

-    const Register from        = rsi;      // source array address
+    const Register from        = rdx;      // source array address
     const Register to          = rdx;      // destination array address
     const Register key         = rcx;      // key array address
     const Register keylen      = rax;
@@ -2218,47 +2224,74 @@
     const Address  key_param (rbp, 8+8);

     const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
-
-    __ enter(); // required for proper stackwalking of RuntimeStub frame
-    __ push(rsi);
-    __ movptr(from , from_param);
-    __ movptr(to   , to_param);
-    __ movptr(key  , key_param);
-
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;
+
+    __ enter();   // required for proper stackwalking of RuntimeStub frame
+    __ movptr(from, from_param);
+    __ movptr(key, key_param);
+
+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
+    __ movptr(to, to_param);

     // For encryption, the java expanded key ordering is just what we need

-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ pxor(xmm_result, xmm_temp);
-    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
-      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp1);
+
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

     __ BIND(L_doLast);
-    __ aesenclast(xmm_result, xmm_temp);
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenclast(xmm_result, xmm_temp2);
     __ movdqu(Address(to, 0), xmm_result);        // store the result
     __ xorptr(rax, rax); // return 0
-    __ pop(rsi);
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

@@ -2274,13 +2307,13 @@
   //   c_rarg2   - K (key) in little endian int array
   //
   address generate_aescrypt_decryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
     Label L_doLast;
     address start = __ pc();

-    const Register from        = rsi;      // source array address
+    const Register from        = rdx;      // source array address
     const Register to          = rdx;      // destination array address
     const Register key         = rcx;      // key array address
     const Register keylen      = rax;
@@ -2289,51 +2322,76 @@
     const Address  key_param (rbp, 8+8);

     const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

     __ enter(); // required for proper stackwalking of RuntimeStub frame
-    __ push(rsi);
-    __ movptr(from , from_param);
-    __ movptr(to   , to_param);
-    __ movptr(key  , key_param);
-
+    __ movptr(from, from_param);
+    __ movptr(key, key_param);
+
+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     __ movdqu(xmm_result, Address(from, 0));
+    __ movptr(to, to_param);

     // for decryption java expanded key ordering is rotated one position from what we want
     // so we start from 0x10 here and hit 0x00 last
     // we don't know if the key is aligned, hence not using load-execute form
-    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
-    __ pxor  (xmm_result, xmm_temp);
-    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
-      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 192 and 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ pxor  (xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

     __ BIND(L_doLast);
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
     // for decryption the aesdeclast operation is always on key+0x00
-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ aesdeclast(xmm_result, xmm_temp);
-
+    __ aesdeclast(xmm_result, xmm_temp3);
     __ movdqu(Address(to, 0), xmm_result);  // store the result
-
     __ xorptr(rax, rax); // return 0
-    __ pop(rsi);
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

@@ -2369,7 +2427,7 @@
   //   c_rarg4   - input length
   //
   address generate_cipherBlockChaining_encryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
     address start = __ pc();
@@ -2422,7 +2480,7 @@
     __ jcc(Assembler::notEqual, L_key_192_256);

     // 128 bit code follows here
-    __ movptr(pos, 0);
+    __ movl(pos, 0);
     __ align(OptoLoopAlignment);
     __ BIND(L_loopTop_128);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
@@ -2452,15 +2510,15 @@
     __ leave();                                  // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-  __ BIND(L_key_192_256);
-  // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    __ BIND(L_key_192_256);
+    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
     __ cmpl(rax, 52);
     __ jcc(Assembler::notEqual, L_key_256);

     // 192-bit code follows here (could be changed to use more xmm registers)
-    __ movptr(pos, 0);
-  __ align(OptoLoopAlignment);
-  __ BIND(L_loopTop_192);
+    __ movl(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_192);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

@@ -2481,11 +2539,11 @@
     __ jcc(Assembler::notEqual, L_loopTop_192);
     __ jmp(L_exit);

-  __ BIND(L_key_256);
+    __ BIND(L_key_256);
     // 256-bit code follows here (could be changed to use more xmm registers)
-    __ movptr(pos, 0);
-  __ align(OptoLoopAlignment);
-  __ BIND(L_loopTop_256);
+    __ movl(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_loopTop_256);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

@@ -2524,7 +2582,7 @@
   //

   address generate_cipherBlockChaining_decryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     address start = __ pc();
@@ -2585,9 +2643,9 @@


     // 128-bit code follows here, parallelized
-    __ movptr(pos, 0);
-  __ align(OptoLoopAlignment);
-  __ BIND(L_singleBlock_loopTop_128);
+    __ movl(pos, 0);
+    __ align(OptoLoopAlignment);
+    __ BIND(L_singleBlock_loopTop_128);
     __ cmpptr(len_reg, 0);           // any blocks left??
     __ jcc(Assembler::equal, L_exit);
     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
@@ -2626,7 +2684,7 @@
     __ jcc(Assembler::notEqual, L_key_256);

     // 192-bit code follows here (could be optimized to use parallelism)
-    __ movptr(pos, 0);
+    __ movl(pos, 0);
     __ align(OptoLoopAlignment);
     __ BIND(L_singleBlock_loopTop_192);
     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
@@ -2651,7 +2709,7 @@

     __ BIND(L_key_256);
     // 256-bit code follows here (could be optimized to use parallelism)
-    __ movptr(pos, 0);
+    __ movl(pos, 0);
     __ align(OptoLoopAlignment);
     __ BIND(L_singleBlock_loopTop_256);
     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1314,23 +1314,54 @@
   //   end_to       - destination array end address
   //   qword_count  - 64-bits element count, negative
   //   to           - scratch
-  //   L_copy_32_bytes - entry label
+  //   L_copy_bytes - entry label
   //   L_copy_8_bytes  - exit  label
   //
-  void copy_32_bytes_forward(Register end_from, Register end_to,
+  void copy_bytes_forward(Register end_from, Register end_to,
                              Register qword_count, Register to,
-                             Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
+                             Label& L_copy_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
     __ align(OptoLoopAlignment);
-  __ BIND(L_loop);
-    if(UseUnalignedLoadStores) {
-      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
-      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
-      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
-      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
-
+    if (UseUnalignedLoadStores) {
+      Label L_end;
+      // Copy 64-bytes per iteration
+      __ BIND(L_loop);
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+      } else {
+        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
+        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
+        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+      }
+      __ BIND(L_copy_bytes);
+      __ addptr(qword_count, 8);
+      __ jcc(Assembler::lessEqual, L_loop);
+      __ subptr(qword_count, 4);  // sub(8) and add(4)
+      __ jccb(Assembler::greater, L_end);
+      // Copy trailing 32 bytes
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+      } else {
+        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+      }
+      __ addptr(qword_count, 4);
+      __ BIND(L_end);
     } else {
+      // Copy 32-bytes per iteration
+      __ BIND(L_loop);
       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
@@ -1339,15 +1370,15 @@
       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+
+      __ BIND(L_copy_bytes);
+      __ addptr(qword_count, 4);
+      __ jcc(Assembler::lessEqual, L_loop);
     }
-  __ BIND(L_copy_32_bytes);
-    __ addptr(qword_count, 4);
-    __ jcc(Assembler::lessEqual, L_loop);
     __ subptr(qword_count, 4);
     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
   }

-
   // Copy big chunks backward
   //
   // Inputs:
@@ -1355,23 +1386,55 @@
   //   dest         - destination array address
   //   qword_count  - 64-bits element count
   //   to           - scratch
-  //   L_copy_32_bytes - entry label
+  //   L_copy_bytes - entry label
   //   L_copy_8_bytes  - exit  label
   //
-  void copy_32_bytes_backward(Register from, Register dest,
+  void copy_bytes_backward(Register from, Register dest,
                               Register qword_count, Register to,
-                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
+                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
     __ align(OptoLoopAlignment);
-  __ BIND(L_loop);
-    if(UseUnalignedLoadStores) {
-      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
-      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
-      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
-      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
-
+    if (UseUnalignedLoadStores) {
+      Label L_end;
+      // Copy 64-bytes per iteration
+      __ BIND(L_loop);
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
+        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
+        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+        __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+      } else {
+        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
+        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
+        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
+        __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
+        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
+      }
+      __ BIND(L_copy_bytes);
+      __ subptr(qword_count, 8);
+      __ jcc(Assembler::greaterEqual, L_loop);
+
+      __ addptr(qword_count, 4);  // add(8) and sub(4)
+      __ jccb(Assembler::less, L_end);
+      // Copy trailing 32 bytes
+      if (UseAVX >= 2) {
+        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
+        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
+      } else {
+        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+        __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+      }
+      __ subptr(qword_count, 4);
+      __ BIND(L_end);
     } else {
+      // Copy 32-bytes per iteration
+      __ BIND(L_loop);
       __ movq(to, Address(from, qword_count, Address::times_8, 24));
       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
       __ movq(to, Address(from, qword_count, Address::times_8, 16));
@@ -1380,10 +1443,11 @@
       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
       __ movq(to, Address(from, qword_count, Address::times_8,  0));
       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+
+      __ BIND(L_copy_bytes);
+      __ subptr(qword_count, 4);
+      __ jcc(Assembler::greaterEqual, L_loop);
     }
-  __ BIND(L_copy_32_bytes);
-    __ subptr(qword_count, 4);
-    __ jcc(Assembler::greaterEqual, L_loop);
     __ addptr(qword_count, 4);
     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
   }
@@ -1413,7 +1477,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
     Label L_copy_byte, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
@@ -1445,7 +1509,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count); // make the count negative
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1488,8 +1552,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-    // Copy in 32-bytes chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
     __ jmp(L_copy_4_bytes);

     return start;
@@ -1516,7 +1580,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1559,10 +1623,10 @@
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
     __ testl(byte_count, 4);
-    __ jcc(Assembler::zero, L_copy_32_bytes);
+    __ jcc(Assembler::zero, L_copy_bytes);
     __ movl(rax, Address(from, qword_count, Address::times_8));
     __ movl(Address(to, qword_count, Address::times_8), rax);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1577,8 +1641,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);

     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
@@ -1613,7 +1677,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1644,7 +1708,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1680,8 +1744,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-    // Copy in 32-bytes chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
     __ jmp(L_copy_4_bytes);

     return start;
@@ -1728,7 +1792,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1763,10 +1827,10 @@
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
     __ testl(word_count, 2);
-    __ jcc(Assembler::zero, L_copy_32_bytes);
+    __ jcc(Assembler::zero, L_copy_bytes);
     __ movl(rax, Address(from, qword_count, Address::times_8));
     __ movl(Address(to, qword_count, Address::times_8), rax);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1781,8 +1845,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);

     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
@@ -1818,7 +1882,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1854,7 +1918,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1881,8 +1945,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-    // Copy 32-bytes chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
     __ jmp(L_copy_4_bytes);

     return start;
@@ -1910,7 +1974,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
@@ -1944,10 +2008,10 @@

     // Check for and copy trailing dword
     __ testl(dword_count, 1);
-    __ jcc(Assembler::zero, L_copy_32_bytes);
+    __ jcc(Assembler::zero, L_copy_bytes);
     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -1965,8 +2029,8 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);

-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);

    __ bind(L_exit);
      if (is_oop) {
@@ -2004,7 +2068,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register qword_count = rdx;  // elements count
@@ -2036,7 +2100,7 @@
     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
     __ negptr(qword_count);
-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -2055,8 +2119,8 @@
       __ ret(0);
     }

-    // Copy 64-byte chunks
-    copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);

     if (is_oop) {
     __ BIND(L_exit);
@@ -2093,7 +2157,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();

-    Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
+    Label L_copy_bytes, L_copy_8_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register qword_count = rdx;  // elements count
@@ -2119,7 +2183,7 @@
       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
     }

-    __ jmp(L_copy_32_bytes);
+    __ jmp(L_copy_bytes);

     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
@@ -2138,8 +2202,8 @@
       __ ret(0);
     }

-    // Copy in 32-bytes chunks
-    copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+    // Copy in multi-bytes chunks
+    copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);

     if (is_oop) {
     __ BIND(L_exit);
@@ -2981,21 +3045,6 @@
     }
   }

-  // aesenc using specified key+offset
-  // can optionally specify that the shuffle mask is already in an xmmregister
-  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
-    load_key(xmmtmp, key, offset, xmm_shuf_mask);
-    __ aesenc(xmmdst, xmmtmp);
-  }
-
-  // aesdec using specified key+offset
-  // can optionally specify that the shuffle mask is already in an xmmregister
-  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
-    load_key(xmmtmp, key, offset, xmm_shuf_mask);
-    __ aesdec(xmmdst, xmmtmp);
-  }
-
-
   // Arguments:
   //
   // Inputs:
@@ -3004,7 +3053,7 @@
   //   c_rarg2   - K (key) in little endian int array
   //
   address generate_aescrypt_encryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
     Label L_doLast;
@@ -3016,15 +3065,17 @@
     const Register keylen      = rax;

     const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    // On win64 xmm6-xmm15 must be preserved so don't use them.
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

     __ enter(); // required for proper stackwalking of RuntimeStub frame

+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
@@ -3032,25 +3083,53 @@
     // For encryption, the java expanded key ordering is just what we need
     // we don't know if the key is aligned, hence not using load-execute form

-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ pxor(xmm_result, xmm_temp);
-    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
-      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp1);
+
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

     __ BIND(L_doLast);
-    __ aesenclast(xmm_result, xmm_temp);
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenclast(xmm_result, xmm_temp2);
     __ movdqu(Address(to, 0), xmm_result);        // store the result
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
@@ -3068,7 +3147,7 @@
   //   c_rarg2   - K (key) in little endian int array
   //
   address generate_aescrypt_decryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
     Label L_doLast;
@@ -3080,15 +3159,17 @@
     const Register keylen      = rax;

     const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    // On win64 xmm6-xmm15 must be preserved so don't use them.
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;

     __ enter(); // required for proper stackwalking of RuntimeStub frame

+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     __ movdqu(xmm_result, Address(from, 0));
@@ -3096,29 +3177,55 @@
     // for decryption java expanded key ordering is rotated one position from what we want
     // so we start from 0x10 here and hit 0x00 last
     // we don't know if the key is aligned, hence not using load-execute form
-    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
-    __ pxor  (xmm_result, xmm_temp);
-    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
-      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 192 and 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ pxor  (xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

     __ BIND(L_doLast);
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
     // for decryption the aesdeclast operation is always on key+0x00
-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ aesdeclast(xmm_result, xmm_temp);
-
+    __ aesdeclast(xmm_result, xmm_temp3);
     __ movdqu(Address(to, 0), xmm_result);  // store the result
-
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
@@ -3137,7 +3244,7 @@
   //   c_rarg4   - input length
   //
   address generate_cipherBlockChaining_encryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
     address start = __ pc();
@@ -3161,16 +3268,19 @@
     const XMMRegister xmm_temp   = xmm1;
     // keys 0-10 preloaded into xmm2-xmm12
     const int XMM_REG_NUM_KEY_FIRST = 2;
-    const int XMM_REG_NUM_KEY_LAST  = 12;
+    const int XMM_REG_NUM_KEY_LAST  = 15;
     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
-    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
+    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
+    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
+    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

     __ enter(); // required for proper stackwalking of RuntimeStub frame

 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
-    // save the xmm registers which must be preserved 6-12
+    // save the xmm registers which must be preserved 6-15
     __ subptr(rsp, -rsp_after_call_off * wordSize);
     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
       __ movdqu(xmm_save(i), as_XMMRegister(i));
@@ -3179,12 +3289,11 @@

     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
-    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
       offset += 0x10;
     }
-
     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
@@ -3195,16 +3304,15 @@
     // 128 bit code follows here
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_loopTop_128);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
       __ aesenc(xmm_result, as_XMMRegister(rnum));
     }
     __ aesenclast(xmm_result, xmm_key10);
-
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ addptr(pos, AESBlockSize);
@@ -3226,24 +3334,23 @@

     __ BIND(L_key_192_256);
     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
     __ cmpl(rax, 52);
     __ jcc(Assembler::notEqual, L_key_256);

     // 192-bit code follows here (could be changed to use more xmm registers)
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_loopTop_192);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
       __ aesenc(xmm_result, as_XMMRegister(rnum));
     }
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
-    load_key(xmm_temp, key, 0xc0);
-    __ aesenclast(xmm_result, xmm_temp);
-
+    __ aesenclast(xmm_result, xmm_key12);
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ addptr(pos, AESBlockSize);
@@ -3253,22 +3360,19 @@

     __ BIND(L_key_256);
     // 256-bit code follows here (could be changed to use more xmm registers)
+    load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_loopTop_256);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
       __ aesenc(xmm_result, as_XMMRegister(rnum));
     }
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
-    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
     load_key(xmm_temp, key, 0xe0);
     __ aesenclast(xmm_result, xmm_temp);
-
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ addptr(pos, AESBlockSize);
@@ -3295,7 +3399,7 @@
   //

   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     address start = __ pc();
@@ -3316,12 +3420,10 @@
 #endif
     const Register pos         = rax;

-    // xmm register assignments for the loops below
-    const XMMRegister xmm_result = xmm0;
     // keys 0-10 preloaded into xmm2-xmm12
     const int XMM_REG_NUM_KEY_FIRST = 5;
     const int XMM_REG_NUM_KEY_LAST  = 15;
-    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

     __ enter(); // required for proper stackwalking of RuntimeStub frame
@@ -3340,13 +3442,14 @@
     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
       offset += 0x10;
     }
+    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);

     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
+
     // registers holding the four results in the parallelized loop
     const XMMRegister xmm_result0 = xmm0;
     const XMMRegister xmm_result1 = xmm2;
@@ -3404,8 +3507,12 @@
     __ jmp(L_multiBlock_loopTop_128);

     // registers used in the non-parallelized loops
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
     const XMMRegister xmm_prev_block_cipher_save = xmm2;
-    const XMMRegister xmm_temp   = xmm3;
+    const XMMRegister xmm_key11 = xmm3;
+    const XMMRegister xmm_key12 = xmm4;
+    const XMMRegister xmm_temp  = xmm4;

     __ align(OptoLoopAlignment);
     __ BIND(L_singleBlock_loopTop_128);
@@ -3443,12 +3550,15 @@

     __ BIND(L_key_192_256);
     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    load_key(xmm_key11, key, 0xb0);
     __ cmpl(rax, 52);
     __ jcc(Assembler::notEqual, L_key_256);

     // 192-bit code follows here (could be optimized to use parallelism)
+    load_key(xmm_key12, key, 0xc0);     // 192-bit key goes up to c0
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_singleBlock_loopTop_192);
     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
     __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
@@ -3456,14 +3566,13 @@
     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
       __ aesdec(xmm_result, as_XMMRegister(rnum));
     }
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    __ aesdec(xmm_result, xmm_key11);
+    __ aesdec(xmm_result, xmm_key12);
     __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
     // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
-
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
@@ -3473,23 +3582,26 @@
     // 256-bit code follows here (could be optimized to use parallelism)
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_singleBlock_loopTop_256);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
+    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
     __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
     __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
       __ aesdec(xmm_result, as_XMMRegister(rnum));
     }
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
-    __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
+    __ aesdec(xmm_result, xmm_key11);
+    load_key(xmm_temp, key, 0xc0);
+    __ aesdec(xmm_result, xmm_temp);
+    load_key(xmm_temp, key, 0xd0);
+    __ aesdec(xmm_result, xmm_temp);
+    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
+    __ aesdec(xmm_result, xmm_temp);
+    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
+    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
     // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
-
+    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
--- a/src/cpu/x86/vm/vm_version_x86.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/vm_version_x86.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -428,7 +428,7 @@
   }

   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -445,6 +445,7 @@
                (supports_avx()    ? ", avx" : ""),
                (supports_avx2()   ? ", avx2" : ""),
                (supports_aes()    ? ", aes" : ""),
+               (supports_erms()   ? ", erms" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
                (supports_lzcnt()   ? ", lzcnt": ""),
@@ -488,8 +489,8 @@
   }

   // The AES intrinsic stubs require AES instruction support (of course)
-  // but also require AVX and sse3 modes for instructions it use.
-  if (UseAES && (UseAVX > 0) && (UseSSE > 2)) {
+  // but also require sse3 mode for instructions it use.
+  if (UseAES && (UseSSE > 2)) {
     if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
       UseAESIntrinsics = true;
     }
@@ -670,6 +671,16 @@
     FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }

+  // Use fast-string operations if available.
+  if (supports_erms()) {
+    if (FLAG_IS_DEFAULT(UseFastStosb)) {
+      UseFastStosb = true;
+    }
+  } else if (UseFastStosb) {
+    warning("fast-string operations are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFastStosb, false);
+  }
+
 #ifdef COMPILER2
   if (FLAG_IS_DEFAULT(AlignVector)) {
     // Modern processors allow misaligned memory operations for vectors.
--- a/src/cpu/x86/vm/vm_version_x86.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/vm_version_x86.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -204,7 +204,8 @@
                    avx2 : 1,
                         : 2,
                    bmi2 : 1,
-                        : 23;
+                   erms : 1,
+                        : 22;
     } bits;
   };

@@ -247,7 +248,8 @@
     CPU_TSCINV = (1 << 16),
     CPU_AVX    = (1 << 17),
     CPU_AVX2   = (1 << 18),
-    CPU_AES    = (1 << 19)
+    CPU_AES    = (1 << 19),
+    CPU_ERMS   = (1 << 20) // enhanced 'rep movsb/stosb' instructions
   } cpuFeatureFlags;

   enum {
@@ -425,6 +427,8 @@
       result |= CPU_TSCINV;
     if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
       result |= CPU_AES;
+    if (_cpuid_info.sef_cpuid7_ebx.bits.erms != 0)
+      result |= CPU_ERMS;

     // AMD features.
     if (is_amd()) {
@@ -489,7 +493,7 @@
     return (_cpuid_info.std_max_function >= 0xB) &&
            // eax[4:0] | ebx[0:15] == 0 indicates invalid topology level.
            // Some cpus have max cpuid >= 0xB but do not support processor topology.
-           ((_cpuid_info.tpl_cpuidB0_eax & 0x1f | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
+           (((_cpuid_info.tpl_cpuidB0_eax & 0x1f) | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
   }

   static uint cores_per_cpu()  {
@@ -550,6 +554,7 @@
   static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }
   static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }
   static bool supports_aes()      { return (_cpuFeatures & CPU_AES) != 0; }
+  static bool supports_erms()     { return (_cpuFeatures & CPU_ERMS) != 0; }

   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- a/src/cpu/x86/vm/x86_32.ad	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/x86_32.ad	Fri Jan 11 10:38:38 2013 -0800
@@ -11578,15 +11578,28 @@
 // =======================================================================
 // fast clearing of an array
 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+  predicate(!UseFastStosb);
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "SHL    ECX,1\t# Convert doublewords to words\n\t"
-            "XOR    EAX,EAX\n\t"
+  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
+            "SHL    ECX,1\t# Convert doublewords to words\n\t"
             "REP STOS\t# store EAX into [EDI++] while ECX--" %}
-  opcode(0,0x4);
-  ins_encode( Opcode(0xD1), RegOpc(ECX),
-              OpcRegReg(0x33,EAX,EAX),
-              Opcode(0xF3), Opcode(0xAB) );
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+  predicate(UseFastStosb);
+  match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
+            "SHL    ECX,3\t# Convert doublewords to bytes\n\t"
+            "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
--- a/src/cpu/x86/vm/x86_64.ad	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/cpu/x86/vm/x86_64.ad	Fri Jan 11 10:38:38 2013 -0800
@@ -10170,16 +10170,33 @@
 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
                   rFlagsReg cr)
 %{
+  predicate(!UseFastStosb);
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);

-  format %{ "xorl    rax, rax\t# ClearArray:\n\t"
-            "rep stosq\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode(opc_reg_reg(0x33, RAX, RAX), // xorl %eax, %eax
-             Opcode(0xF3), Opcode(0x48), Opcode(0xAB)); // rep REX_W stos
+  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
+            "rep     stosq\t# Store rax to *rdi++ while rcx--" %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
   ins_pipe(pipe_slow);
 %}

+instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
+                        rFlagsReg cr)
+%{
+  predicate(UseFastStosb);
+  match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
+            "shlq    rcx,3\t# Convert doublewords to bytes\n\t"
+            "rep     stosb\t# Store rax to *rdi++ while rcx--" %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct string_compare(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
                         rax_RegI result, regD tmp1, rFlagsReg cr)
 %{
--- a/src/share/tools/LogCompilation/README	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/tools/LogCompilation/README	Fri Jan 11 10:38:38 2013 -0800
@@ -13,6 +13,6 @@

 More information about the LogCompilation output can be found at

-http://wikis.sun.com/display/HotSpotInternals/LogCompilation+overview
-http://wikis.sun.com/display/HotSpotInternals/PrintCompilation
-http://wikis.sun.com/display/HotSpotInternals/LogCompilation+tool
+https://wikis.oracle.com/display/HotSpotInternals/LogCompilation+overview
+https://wikis.oracle.com/display/HotSpotInternals/PrintCompilation
+https://wikis.oracle.com/display/HotSpotInternals/LogCompilation+tool
--- a/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/CallSite.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/CallSite.java	Fri Jan 11 10:38:38 2013 -0800
@@ -38,6 +38,7 @@
     private String reason;
     private List<CallSite> calls;
     private int endNodes;
+    private int endLiveNodes;
     private double timeStamp;

     CallSite() {
@@ -106,7 +107,7 @@
             }
         }
         if (getEndNodes() > 0) {
-          stream.printf(" (end time: %6.4f nodes: %d)", getTimeStamp(), getEndNodes());
+            stream.printf(" (end time: %6.4f nodes: %d live: %d)", getTimeStamp(), getEndNodes(), getEndLiveNodes());
         }
         stream.println("");
         if (getReceiver() != null) {
@@ -195,6 +196,14 @@
         return endNodes;
     }

+    void setEndLiveNodes(int n) {
+        endLiveNodes = n;
+    }
+
+    public int getEndLiveNodes() {
+        return endLiveNodes;
+    }
+
     void setTimeStamp(double time) {
         timeStamp = time;
     }
--- a/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/LogCompilation.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/LogCompilation.java	Fri Jan 11 10:38:38 2013 -0800
@@ -37,13 +37,13 @@
 public class LogCompilation extends DefaultHandler implements ErrorHandler, Constants {

     public static void usage(int exitcode) {
-        System.out.println("Usage: LogCompilation [ -v ] [ -c ] [ -s ] [ -e | -N ] file1 ...");
+        System.out.println("Usage: LogCompilation [ -v ] [ -c ] [ -s ] [ -e | -n ] file1 ...");
         System.out.println("  -c:   clean up malformed 1.5 xml");
         System.out.println("  -i:   print inlining decisions");
         System.out.println("  -S:   print compilation statistics");
         System.out.println("  -s:   sort events by start time");
         System.out.println("  -e:   sort events by elapsed time");
-        System.out.println("  -N:   sort events by name and start");
+        System.out.println("  -n:   sort events by name and start");
         System.exit(exitcode);
     }

@@ -137,7 +137,11 @@
                         v2 = Integer.valueOf(0);
                     }
                     phaseNodes.put(phase.getName(), Integer.valueOf(v2.intValue() + phase.getNodes()));
-                    out.printf("\t%s %6.4f %d %d\n", phase.getName(), phase.getElapsedTime(), phase.getStartNodes(), phase.getNodes());
+                    /* Print phase name, elapsed time, nodes at the start of the phase,
+                       nodes created in the phase, live nodes at the start of the phase,
+                       live nodes added in the phase.
+                    */
+                    out.printf("\t%s %6.4f %d %d %d %d\n", phase.getName(), phase.getElapsedTime(), phase.getStartNodes(), phase.getNodes(), phase.getStartLiveNodes(), phase.getLiveNodes());
                 }
             } else if (e instanceof MakeNotEntrantEvent) {
                 MakeNotEntrantEvent mne = (MakeNotEntrantEvent) e;
--- a/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/LogParser.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/LogParser.java	Fri Jan 11 10:38:38 2013 -0800
@@ -224,7 +224,6 @@
         throw new InternalError("can't find " + name);
     }
     int indent = 0;
-    String compile_id;

     String type(String id) {
         String result = types.get(id);
@@ -268,12 +267,18 @@
         if (qname.equals("phase")) {
             Phase p = new Phase(search(atts, "name"),
                     Double.parseDouble(search(atts, "stamp")),
-                    Integer.parseInt(search(atts, "nodes")));
+                    Integer.parseInt(search(atts, "nodes", "0")),
+                    Integer.parseInt(search(atts, "live")));
             phaseStack.push(p);
         } else if (qname.equals("phase_done")) {
             Phase p = phaseStack.pop();
-            p.setEndNodes(Integer.parseInt(search(atts, "nodes")));
+            if (! p.getId().equals(search(atts, "name"))) {
+                System.out.println("phase: " + p.getId());
+                throw new InternalError("phase name mismatch");
+            }
             p.setEnd(Double.parseDouble(search(atts, "stamp")));
+            p.setEndNodes(Integer.parseInt(search(atts, "nodes", "0")));
+            p.setEndLiveNodes(Integer.parseInt(search(atts, "live")));
             compile.getPhases().add(p);
         } else if (qname.equals("task")) {
             compile = new Compilation(Integer.parseInt(search(atts, "compile_id", "-1")));
@@ -317,13 +322,16 @@
             m.setName(search(atts, "name"));
             m.setReturnType(type(search(atts, "return")));
             m.setArguments(search(atts, "arguments", "void"));
-            m.setBytes(search(atts, "bytes"));
-            m.setIICount(search(atts, "iicount"));
-            m.setFlags(search(atts, "flags"));
+
+            if (search(atts, "unloaded", "0").equals("0")) {
+               m.setBytes(search(atts, "bytes"));
+               m.setIICount(search(atts, "iicount"));
+               m.setFlags(search(atts, "flags"));
+            }
             methods.put(id, m);
         } else if (qname.equals("call")) {
             site = new CallSite(bci, method(search(atts, "method")));
-            site.setCount(Integer.parseInt(search(atts, "count")));
+            site.setCount(Integer.parseInt(search(atts, "count", "0")));
             String receiver = atts.getValue("receiver");
             if (receiver != null) {
                 site.setReceiver(type(receiver));
@@ -406,6 +414,7 @@
         } else if (qname.equals("parse_done")) {
             CallSite call = scopes.pop();
             call.setEndNodes(Integer.parseInt(search(atts, "nodes", "1")));
+            call.setEndLiveNodes(Integer.parseInt(search(atts, "live", "1")));
             call.setTimeStamp(Double.parseDouble(search(atts, "stamp")));
             scopes.push(call);
         }
--- a/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/Phase.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/tools/LogCompilation/src/com/sun/hotspot/tools/compiler/Phase.java	Fri Jan 11 10:38:38 2013 -0800
@@ -30,10 +30,13 @@

     private final int startNodes;
     private int endNodes;
+    private final int startLiveNodes;
+    private int endLiveNodes;

-    Phase(String n, double s, int nodes) {
+    Phase(String n, double s, int nodes, int live) {
         super(s, n);
         startNodes = nodes;
+        startLiveNodes = live;
     }

     int getNodes() {
@@ -55,6 +58,22 @@
     public int getEndNodes() {
         return endNodes;
     }
+    /* Number of live nodes added by the phase */
+    int getLiveNodes() {
+        return getEndLiveNodes() - getStartLiveNodes();
+    }
+
+    void setEndLiveNodes(int n) {
+        endLiveNodes = n;
+    }
+
+    public int getStartLiveNodes() {
+        return startLiveNodes;
+    }
+
+    public int getEndLiveNodes() {
+        return endLiveNodes;
+    }

     @Override
     public void print(PrintStream stream) {
--- a/src/share/vm/c1/c1_Compilation.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/c1/c1_Compilation.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -129,7 +129,15 @@
   CHECK_BAILOUT();

   // setup ir
+  CompileLog* log = this->log();
+  if (log != NULL) {
+    log->begin_head("parse method='%d' ",
+                    log->identify(_method));
+    log->stamp();
+    log->end_head();
+  }
   _hir = new IR(this, method(), osr_bci());
+  if (log)  log->done("parse");
   if (!_hir->is_valid()) {
     bailout("invalid parsing");
     return;
--- a/src/share/vm/c1/c1_GraphBuilder.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/c1/c1_GraphBuilder.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1836,7 +1836,7 @@
   // check if we could do inlining
   if (!PatchALot && Inline && klass->is_loaded() &&
       (klass->is_initialized() || klass->is_interface() && target->holder()->is_initialized())
-      && target->will_link(klass, callee_holder, code)) {
+      && target->is_loaded()) {
     // callee is known => check if we have static binding
     assert(target->is_loaded(), "callee must be known");
     if (code == Bytecodes::_invokestatic  ||
--- a/src/share/vm/ci/bcEscapeAnalyzer.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/ci/bcEscapeAnalyzer.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -282,7 +282,7 @@
   ciMethod* inline_target = NULL;
   if (target->is_loaded() && klass->is_loaded()
       && (klass->is_initialized() || klass->is_interface() && target->holder()->is_initialized())
-      && target->will_link(klass, callee_holder, code)) {
+      && target->is_loaded()) {
     if (code == Bytecodes::_invokestatic
         || code == Bytecodes::_invokespecial
         || code == Bytecodes::_invokevirtual && target->is_final_method()) {
--- a/src/share/vm/ci/ciField.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/ci/ciField.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -366,10 +366,12 @@
 // ------------------------------------------------------------------
 // ciField::print
 void ciField::print() {
-  tty->print("<ciField ");
+  tty->print("<ciField name=");
   _holder->print_name();
   tty->print(".");
   _name->print_symbol();
+  tty->print(" signature=");
+  _signature->print_symbol();
   tty->print(" offset=%d type=", _offset);
   if (_type != NULL) _type->print_name();
   else               tty->print("(reference)");
--- a/src/share/vm/ci/ciMethod.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/ci/ciMethod.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -735,6 +735,24 @@
 }

 // ------------------------------------------------------------------
+// ciMethod::get_field_at_bci
+ciField* ciMethod::get_field_at_bci(int bci, bool &will_link) {
+  ciBytecodeStream iter(this);
+  iter.reset_to_bci(bci);
+  iter.next();
+  return iter.get_field(will_link);
+}
+
+// ------------------------------------------------------------------
+// ciMethod::get_method_at_bci
+ciMethod* ciMethod::get_method_at_bci(int bci, bool &will_link, ciSignature* *declared_signature) {
+  ciBytecodeStream iter(this);
+  iter.reset_to_bci(bci);
+  iter.next();
+  return iter.get_method(will_link, declared_signature);
+}
+
+// ------------------------------------------------------------------
 // Adjust a CounterData count to be commensurate with
 // interpreter_invocation_count.  If the MDO exists for
 // only 25% of the time the method exists, then the
@@ -869,25 +887,6 @@
 }

 // ------------------------------------------------------------------
-// ciMethod::will_link
-//
-// Will this method link in a specific calling context?
-bool ciMethod::will_link(ciKlass* accessing_klass,
-                         ciKlass* declared_method_holder,
-                         Bytecodes::Code bc) {
-  if (!is_loaded()) {
-    // Method lookup failed.
-    return false;
-  }
-
-  // The link checks have been front-loaded into the get_method
-  // call.  This method (ciMethod::will_link()) will be removed
-  // in the future.
-
-  return true;
-}
-
-// ------------------------------------------------------------------
 // ciMethod::should_exclude
 //
 // Should this method be excluded from compilation?
--- a/src/share/vm/ci/ciMethod.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/ci/ciMethod.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -224,6 +224,9 @@
   ciCallProfile call_profile_at_bci(int bci);
   int           interpreter_call_site_count(int bci);

+  ciField*      get_field_at_bci( int bci, bool &will_link);
+  ciMethod*     get_method_at_bci(int bci, bool &will_link, ciSignature* *declared_signature);
+
   // Given a certain calling environment, find the monomorphic target
   // for the call.  Return NULL if the call is not monomorphic in
   // its calling environment.
@@ -239,9 +242,6 @@
   int resolve_vtable_index(ciKlass* caller, ciKlass* receiver);

   // Compilation directives
-  bool will_link(ciKlass* accessing_klass,
-                 ciKlass* declared_method_holder,
-                 Bytecodes::Code bc);
   bool should_exclude();
   bool should_inline();
   bool should_not_inline();
--- a/src/share/vm/ci/ciSignature.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/ci/ciSignature.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -57,12 +57,14 @@
   ciSymbol* as_symbol() const                    { return _symbol; }
   ciKlass*  accessing_klass() const              { return _accessing_klass; }

-  ciType* return_type() const;
-  ciType* type_at(int index) const;
+  ciType*   return_type() const;
+  ciType*   type_at(int index) const;

   int       size() const                         { return _size; }
   int       count() const                        { return _count; }

+  int       arg_size_for_bc(Bytecodes::Code bc)  { return size() + (Bytecodes::has_receiver(bc) ? 1 : 0); }
+
   bool equals(ciSignature* that);

   void print_signature();
--- a/src/share/vm/compiler/compilerOracle.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/compiler/compilerOracle.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -538,6 +538,7 @@

   if (match != NULL) {
     if (!_quiet) {
+      ResourceMark rm;
       tty->print("CompilerOracle: %s ", command_names[command]);
       match->print();
     }
--- a/src/share/vm/interpreter/bytecodes.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/interpreter/bytecodes.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -423,7 +423,9 @@
   static bool        is_zero_const  (Code code)    { return (code == _aconst_null || code == _iconst_0
                                                            || code == _fconst_0 || code == _dconst_0); }
   static bool        is_invoke      (Code code)    { return (_invokevirtual <= code && code <= _invokedynamic); }
-
+  static bool        has_receiver   (Code code)    { assert(is_invoke(code), "");  return code == _invokevirtual ||
+                                                                                          code == _invokespecial ||
+                                                                                          code == _invokeinterface; }
   static bool        has_optional_appendix(Code code) { return code == _invokedynamic || code == _invokehandle; }

   static int         compute_flags  (const char* format, int more_flags = 0);  // compute the flags
--- a/src/share/vm/interpreter/interpreterRuntime.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/interpreter/interpreterRuntime.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -427,7 +427,7 @@

     // exception handler lookup
     KlassHandle h_klass(THREAD, h_exception->klass());
-    handler_bci = h_method->fast_exception_handler_bci_for(h_klass, current_bci, THREAD);
+    handler_bci = methodOopDesc::fast_exception_handler_bci_for(h_method, h_klass, current_bci, THREAD);
     if (HAS_PENDING_EXCEPTION) {
       // We threw an exception while trying to find the exception handler.
       // Transfer the new exception to the exception handle which will
--- a/src/share/vm/oops/methodOop.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/oops/methodOop.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -109,16 +109,16 @@
   return buf;
 }

-int  methodOopDesc::fast_exception_handler_bci_for(KlassHandle ex_klass, int throw_bci, TRAPS) {
+int  methodOopDesc::fast_exception_handler_bci_for(methodHandle mh, KlassHandle ex_klass, int throw_bci, TRAPS) {
   // exception table holds quadruple entries of the form (beg_bci, end_bci, handler_bci, klass_index)
   // access exception table
-  ExceptionTable table(this);
+  ExceptionTable table(mh());
   int length = table.length();
   // iterate through all entries sequentially
-  constantPoolHandle pool(THREAD, constants());
+  constantPoolHandle pool(THREAD, mh->constants());
   for (int i = 0; i < length; i ++) {
     //reacquire the table in case a GC happened
-    ExceptionTable table(this);
+    ExceptionTable table(mh());
     int beg_bci = table.start_pc(i);
     int end_bci = table.end_pc(i);
     assert(beg_bci <= end_bci, "inconsistent exception table");
--- a/src/share/vm/oops/methodOop.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/oops/methodOop.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -306,7 +306,7 @@
   // exception handler which caused the exception to be thrown, which
   // is needed for proper retries. See, for example,
   // InterpreterRuntime::exception_handler_for_exception.
-  int fast_exception_handler_bci_for(KlassHandle ex_klass, int throw_bci, TRAPS);
+  static int fast_exception_handler_bci_for(methodHandle mh, KlassHandle ex_klass, int throw_bci, TRAPS);

   // method data access
   methodDataOop method_data() const              {
--- a/src/share/vm/opto/addnode.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/addnode.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -189,6 +189,11 @@
       set_req(1, addx);
       set_req(2, a22);
       progress = this;
+      PhaseIterGVN *igvn = phase->is_IterGVN();
+      if (add2->outcnt() == 0 && igvn) {
+        // add disconnected.
+        igvn->_worklist.push(add2);
+      }
     }
   }

@@ -624,6 +629,11 @@
     if( t22->singleton() && (t22 != Type::TOP) ) {  // Right input is an add of a constant?
       set_req(Address, phase->transform(new (phase->C) AddPNode(in(Base),in(Address),add->in(1))));
       set_req(Offset, add->in(2));
+      PhaseIterGVN *igvn = phase->is_IterGVN();
+      if (add->outcnt() == 0 && igvn) {
+        // add disconnected.
+        igvn->_worklist.push((Node*)add);
+      }
       return this;              // Made progress
     }
   }
--- a/src/share/vm/opto/block.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/block.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -292,7 +292,7 @@
   void needed_for_next_call(Node *this_call, VectorSet &next_call, Block_Array &bbs);
   bool schedule_local(PhaseCFG *cfg, Matcher &m, GrowableArray<int> &ready_cnt, VectorSet &next_call);
   // Cleanup if any code lands between a Call and his Catch
-  void call_catch_cleanup(Block_Array &bbs);
+  void call_catch_cleanup(Block_Array &bbs, Compile *C);
   // Detect implicit-null-check opportunities.  Basically, find NULL checks
   // with suitable memory ops nearby.  Use the memory op to do the NULL check.
   // I can generate a memory op if there is not one nearby.
--- a/src/share/vm/opto/bytecodeInfo.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/bytecodeInfo.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -45,7 +45,8 @@
   _method(callee),
   _site_invoke_ratio(site_invoke_ratio),
   _max_inline_level(max_inline_level),
-  _count_inline_bcs(method()->code_size_for_inlining())
+  _count_inline_bcs(method()->code_size_for_inlining()),
+  _subtrees(c->comp_arena(), 2, 0, NULL)
 {
   NOT_PRODUCT(_count_inlines = 0;)
   if (_caller_jvms != NULL) {
@@ -208,16 +209,18 @@
   if ( callee_method->dont_inline())                        return "don't inline by annotation";
   if ( callee_method->has_unloaded_classes_in_signature())  return "unloaded signature classes";

-  if (callee_method->force_inline() || callee_method->should_inline()) {
+  if (callee_method->should_inline()) {
     // ignore heuristic controls on inlining
     return NULL;
   }

   // Now perform checks which are heuristic

-  if (callee_method->has_compiled_code() &&
-      callee_method->instructions_size(CompLevel_full_optimization) > InlineSmallCode) {
+  if (!callee_method->force_inline()) {
+    if (callee_method->has_compiled_code() &&
+        callee_method->instructions_size(CompLevel_full_optimization) > InlineSmallCode) {
     return "already compiled into a big method";
+    }
   }

   // don't inline exception code unless the top method belongs to an
@@ -270,12 +273,15 @@
 //-----------------------------try_to_inline-----------------------------------
 // return NULL if ok, reason for not inlining otherwise
 // Relocated from "InliningClosure::try_to_inline"
-const char* InlineTree::try_to_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result) {
-
+const char* InlineTree::try_to_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result, bool& should_delay) {
   // Old algorithm had funny accumulating BC-size counters
   if (UseOldInlining && ClipInlining
       && (int)count_inline_bcs() >= DesiredMethodLimit) {
-    return "size > DesiredMethodLimit";
+    if (!callee_method->force_inline() || !IncrementalInline) {
+      return "size > DesiredMethodLimit";
+    } else if (!C->inlining_incrementally()) {
+      should_delay = true;
+    }
   }

   const char *msg = NULL;
@@ -296,8 +302,13 @@
   if (callee_method->code_size() > MaxTrivialSize) {

     // don't inline into giant methods
-    if (C->unique() > (uint)NodeCountInliningCutoff) {
-      return "NodeCountInliningCutoff";
+    if (C->over_inlining_cutoff()) {
+      if ((!callee_method->force_inline() && !caller_method->is_compiled_lambda_form())
+          || !IncrementalInline) {
+        return "NodeCountInliningCutoff";
+      } else {
+        should_delay = true;
+      }
     }

     if ((!UseInterpreter || CompileTheWorld) &&
@@ -316,7 +327,11 @@
     return "not an accessor";
   }
   if (inline_level() > _max_inline_level) {
-    return "inlining too deep";
+    if (!callee_method->force_inline() || !IncrementalInline) {
+      return "inlining too deep";
+    } else if (!C->inlining_incrementally()) {
+      should_delay = true;
+    }
   }

   // detect direct and indirect recursive inlining
@@ -341,7 +356,11 @@

   if (UseOldInlining && ClipInlining
       && (int)count_inline_bcs() + size >= DesiredMethodLimit) {
-    return "size > DesiredMethodLimit";
+    if (!callee_method->force_inline() || !IncrementalInline) {
+      return "size > DesiredMethodLimit";
+    } else if (!C->inlining_incrementally()) {
+      should_delay = true;
+    }
   }

   // ok, inline this method
@@ -396,7 +415,7 @@
 //------------------------------print_inlining---------------------------------
 // Really, the failure_msg can be a success message also.
 void InlineTree::print_inlining(ciMethod* callee_method, int caller_bci, const char* failure_msg) const {
-  CompileTask::print_inlining(callee_method, inline_level(), caller_bci, failure_msg ? failure_msg : "inline");
+  C->print_inlining(callee_method, inline_level(), caller_bci, failure_msg ? failure_msg : "inline");
   if (callee_method == NULL)  tty->print(" callee not monotonic or profiled");
   if (Verbose && callee_method) {
     const InlineTree *top = this;
@@ -406,8 +425,9 @@
 }

 //------------------------------ok_to_inline-----------------------------------
-WarmCallInfo* InlineTree::ok_to_inline(ciMethod* callee_method, JVMState* jvms, ciCallProfile& profile, WarmCallInfo* initial_wci) {
+WarmCallInfo* InlineTree::ok_to_inline(ciMethod* callee_method, JVMState* jvms, ciCallProfile& profile, WarmCallInfo* initial_wci, bool& should_delay) {
   assert(callee_method != NULL, "caller checks for optimized virtual!");
+  assert(!should_delay, "should be initialized to false");
 #ifdef ASSERT
   // Make sure the incoming jvms has the same information content as me.
   // This means that we can eventually make this whole class AllStatic.
@@ -437,7 +457,7 @@

   // Check if inlining policy says no.
   WarmCallInfo wci = *(initial_wci);
-  failure_msg = try_to_inline(callee_method, caller_method, caller_bci, profile, &wci);
+  failure_msg = try_to_inline(callee_method, caller_method, caller_bci, profile, &wci, should_delay);
   if (failure_msg != NULL && C->log() != NULL) {
     C->log()->inline_fail(failure_msg);
   }
--- a/src/share/vm/opto/c2_globals.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/c2_globals.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -115,6 +115,12 @@
   notproduct(bool, VerifyOpto, false,                                       \
           "Apply more time consuming verification during compilation")      \
                                                                             \
+  notproduct(bool, VerifyIdealNodeCount, false,                             \
+          "Verify that tracked dead ideal node count is accurate")          \
+                                                                            \
+  notproduct(bool, PrintIdealNodeCount, false,                              \
+          "Print liveness counts of ideal nodes")                           \
+                                                                            \
   notproduct(bool, VerifyOptoOopOffsets, false,                             \
           "Check types of base addresses in field references")              \
                                                                             \
@@ -600,6 +606,16 @@
                                                                             \
   develop(bool, VerifyAliases, false,                                       \
           "perform extra checks on the results of alias analysis")          \
+                                                                            \
+  product(bool, IncrementalInline, true,                                    \
+          "do post parse inlining")                                         \
+                                                                            \
+  develop(bool, AlwaysIncrementalInline, false,                             \
+          "do all inlining incrementally")                                  \
+                                                                            \
+  product(intx, LiveNodeCountInliningCutoff, 20000,                         \
+          "max number of live nodes in a method")                           \
+

 C2_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG)
--- a/src/share/vm/opto/callGenerator.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/callGenerator.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -139,7 +139,7 @@
   if (!is_static) {
     // Make an explicit receiver null_check as part of this call.
     // Since we share a map with the caller, his JVMS gets adjusted.
-    kit.null_check_receiver(method());
+    kit.null_check_receiver_before_call(method());
     if (kit.stopped()) {
       // And dump it back to the caller, decorated with any exceptions:
       return kit.transfer_exceptions_into_jvms();
@@ -207,7 +207,7 @@
        >= (uint)ImplicitNullCheckThreshold))) {
     // Make an explicit receiver null_check as part of this call.
     // Since we share a map with the caller, his JVMS gets adjusted.
-    receiver = kit.null_check_receiver(method());
+    receiver = kit.null_check_receiver_before_call(method());
     if (kit.stopped()) {
       // And dump it back to the caller, decorated with any exceptions:
       return kit.transfer_exceptions_into_jvms();
@@ -262,8 +262,11 @@

 // Allow inlining decisions to be delayed
 class LateInlineCallGenerator : public DirectCallGenerator {
+ protected:
   CallGenerator* _inline_cg;

+  virtual bool do_late_inline_check(JVMState* jvms) { return true; }
+
  public:
   LateInlineCallGenerator(ciMethod* method, CallGenerator* inline_cg) :
     DirectCallGenerator(method, true), _inline_cg(inline_cg) {}
@@ -274,9 +277,14 @@
   virtual void do_late_inline();

   virtual JVMState* generate(JVMState* jvms) {
+    Compile *C = Compile::current();
+    C->print_inlining_skip(this);
+
     // Record that this call site should be revisited once the main
     // parse is finished.
-    Compile::current()->add_late_inline(this);
+    if (!is_mh_late_inline()) {
+      C->add_late_inline(this);
+    }

     // Emit the CallStaticJava and request separate projections so
     // that the late inlining logic can distinguish between fall
@@ -285,15 +293,33 @@
     return DirectCallGenerator::generate(jvms);
   }

+  virtual void print_inlining_late(const char* msg) {
+    CallNode* call = call_node();
+    Compile* C = Compile::current();
+    C->print_inlining_insert(this);
+    C->print_inlining(method(), call->jvms()->depth()-1, call->jvms()->bci(), msg);
+  }
+
 };

-
 void LateInlineCallGenerator::do_late_inline() {
   // Can't inline it
   if (call_node() == NULL || call_node()->outcnt() == 0 ||
       call_node()->in(0) == NULL || call_node()->in(0)->is_top())
     return;

+  for (int i1 = 0; i1 < method()->arg_size(); i1++) {
+    if (call_node()->in(TypeFunc::Parms + i1)->is_top()) {
+      assert(Compile::current()->inlining_incrementally(), "shouldn't happen during parsing");
+      return;
+    }
+  }
+
+  if (call_node()->in(TypeFunc::Memory)->is_top()) {
+    assert(Compile::current()->inlining_incrementally(), "shouldn't happen during parsing");
+    return;
+  }
+
   CallStaticJavaNode* call = call_node();

   // Make a clone of the JVMState that appropriate to use for driving a parse
@@ -307,7 +333,9 @@

   // Make sure the state is a MergeMem for parsing.
   if (!map->in(TypeFunc::Memory)->is_MergeMem()) {
-    map->set_req(TypeFunc::Memory, MergeMemNode::make(C, map->in(TypeFunc::Memory)));
+    Node* mem = MergeMemNode::make(C, map->in(TypeFunc::Memory));
+    C->initial_gvn()->set_type_bottom(mem);
+    map->set_req(TypeFunc::Memory, mem);
   }

   // Make enough space for the expression stack and transfer the incoming arguments
@@ -320,6 +348,13 @@
     }
   }

+  if (!do_late_inline_check(jvms)) {
+    map->disconnect_inputs(NULL, C);
+    return;
+  }
+
+  C->print_inlining_insert(this);
+
   CompileLog* log = C->log();
   if (log != NULL) {
     log->head("late_inline method='%d'", log->identify(method()));
@@ -354,6 +389,10 @@
     result = (result_size == 1) ? kit.pop() : kit.pop_pair();
   }

+  C->set_has_loops(C->has_loops() || _inline_cg->method()->has_loops());
+  C->env()->notice_inlined_method(_inline_cg->method());
+  C->set_inlining_progress(true);
+
   kit.replace_call(call, result);
 }

@@ -362,6 +401,83 @@
   return new LateInlineCallGenerator(method, inline_cg);
 }

+class LateInlineMHCallGenerator : public LateInlineCallGenerator {
+  ciMethod* _caller;
+  int _attempt;
+  bool _input_not_const;
+
+  virtual bool do_late_inline_check(JVMState* jvms);
+  virtual bool already_attempted() const { return _attempt > 0; }
+
+ public:
+  LateInlineMHCallGenerator(ciMethod* caller, ciMethod* callee, bool input_not_const) :
+    LateInlineCallGenerator(callee, NULL), _caller(caller), _attempt(0), _input_not_const(input_not_const) {}
+
+  virtual bool is_mh_late_inline() const { return true; }
+
+  virtual JVMState* generate(JVMState* jvms) {
+    JVMState* new_jvms = LateInlineCallGenerator::generate(jvms);
+    if (_input_not_const) {
+      // inlining won't be possible so no need to enqueue right now.
+      call_node()->set_generator(this);
+    } else {
+      Compile::current()->add_late_inline(this);
+    }
+    return new_jvms;
+  }
+
+  virtual void print_inlining_late(const char* msg) {
+    if (!_input_not_const) return;
+    LateInlineCallGenerator::print_inlining_late(msg);
+  }
+};
+
+bool LateInlineMHCallGenerator::do_late_inline_check(JVMState* jvms) {
+
+  CallGenerator* cg = for_method_handle_inline(jvms, _caller, method(), _input_not_const);
+
+  if (!_input_not_const) {
+    _attempt++;
+  }
+
+  if (cg != NULL) {
+    assert(!cg->is_late_inline() && cg->is_inline(), "we're doing late inlining");
+    _inline_cg = cg;
+    Compile::current()->dec_number_of_mh_late_inlines();
+    return true;
+  }
+
+  call_node()->set_generator(this);
+  return false;
+}
+
+CallGenerator* CallGenerator::for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const) {
+  Compile::current()->inc_number_of_mh_late_inlines();
+  CallGenerator* cg = new LateInlineMHCallGenerator(caller, callee, input_not_const);
+  return cg;
+}
+
+class LateInlineStringCallGenerator : public LateInlineCallGenerator {
+
+ public:
+  LateInlineStringCallGenerator(ciMethod* method, CallGenerator* inline_cg) :
+    LateInlineCallGenerator(method, inline_cg) {}
+
+  virtual JVMState* generate(JVMState* jvms) {
+    Compile *C = Compile::current();
+    C->print_inlining_skip(this);
+
+    C->add_string_late_inline(this);
+
+    JVMState* new_jvms =  DirectCallGenerator::generate(jvms);
+    return new_jvms;
+  }
+};
+
+CallGenerator* CallGenerator::for_string_late_inline(ciMethod* method, CallGenerator* inline_cg) {
+  return new LateInlineStringCallGenerator(method, inline_cg);
+}
+

 //---------------------------WarmCallGenerator--------------------------------
 // Internal class which handles initial deferral of inlining decisions.
@@ -491,7 +607,7 @@
               jvms->bci(), log->identify(_predicted_receiver));
   }

-  receiver = kit.null_check_receiver(method());
+  receiver = kit.null_check_receiver_before_call(method());
   if (kit.stopped()) {
     return kit.transfer_exceptions_into_jvms();
   }
@@ -580,35 +696,52 @@
 }


-CallGenerator* CallGenerator::for_method_handle_call(JVMState* jvms, ciMethod* caller, ciMethod* callee) {
+CallGenerator* CallGenerator::for_method_handle_call(JVMState* jvms, ciMethod* caller, ciMethod* callee, bool delayed_forbidden) {
   assert(callee->is_method_handle_intrinsic() ||
          callee->is_compiled_lambda_form(), "for_method_handle_call mismatch");
-  CallGenerator* cg = CallGenerator::for_method_handle_inline(jvms, caller, callee);
-  if (cg != NULL)
-    return cg;
-  return CallGenerator::for_direct_call(callee);
+  bool input_not_const;
+  CallGenerator* cg = CallGenerator::for_method_handle_inline(jvms, caller, callee, input_not_const);
+  Compile* C = Compile::current();
+  if (cg != NULL) {
+    if (!delayed_forbidden && AlwaysIncrementalInline) {
+      return CallGenerator::for_late_inline(callee, cg);
+    } else {
+      return cg;
+    }
+  }
+  int bci = jvms->bci();
+  ciCallProfile profile = caller->call_profile_at_bci(bci);
+  int call_site_count = caller->scale_count(profile.count());
+
+  if (IncrementalInline && call_site_count > 0 &&
+      (input_not_const || !C->inlining_incrementally() || C->over_inlining_cutoff())) {
+    return CallGenerator::for_mh_late_inline(caller, callee, input_not_const);
+  } else {
+    return CallGenerator::for_direct_call(callee);
+  }
 }

-CallGenerator* CallGenerator::for_method_handle_inline(JVMState* jvms, ciMethod* caller, ciMethod* callee) {
+CallGenerator* CallGenerator::for_method_handle_inline(JVMState* jvms, ciMethod* caller, ciMethod* callee, bool& input_not_const) {
   GraphKit kit(jvms);
   PhaseGVN& gvn = kit.gvn();
   Compile* C = kit.C;
   vmIntrinsics::ID iid = callee->intrinsic_id();
+  input_not_const = true;
   switch (iid) {
   case vmIntrinsics::_invokeBasic:
     {
-      // get MethodHandle receiver
+      // Get MethodHandle receiver:
       Node* receiver = kit.argument(0);
       if (receiver->Opcode() == Op_ConP) {
+        input_not_const = false;
         const TypeOopPtr* oop_ptr = receiver->bottom_type()->is_oopptr();
         ciMethod* target = oop_ptr->const_oop()->as_method_handle()->get_vmtarget();
         guarantee(!target->is_method_handle_intrinsic(), "should not happen");  // XXX remove
         const int vtable_index = methodOopDesc::invalid_vtable_index;
-        CallGenerator* cg = C->call_generator(target, vtable_index, false, jvms, true, PROB_ALWAYS);
+        CallGenerator* cg = C->call_generator(target, vtable_index, false, jvms, true, PROB_ALWAYS, true, true);
+        assert (!cg->is_late_inline() || cg->is_mh_late_inline(), "no late inline here");
         if (cg != NULL && cg->is_inline())
           return cg;
-      } else {
-        if (PrintInlining)  CompileTask::print_inlining(callee, jvms->depth() - 1, jvms->bci(), "receiver not constant");
       }
     }
     break;
@@ -618,9 +751,10 @@
   case vmIntrinsics::_linkToSpecial:
   case vmIntrinsics::_linkToInterface:
     {
-      // pop MemberName argument
+      // Get MemberName argument:
       Node* member_name = kit.argument(callee->arg_size() - 1);
       if (member_name->Opcode() == Op_ConP) {
+        input_not_const = false;
         const TypeOopPtr* oop_ptr = member_name->bottom_type()->is_oopptr();
         ciMethod* target = oop_ptr->const_oop()->as_member_name()->get_vmtarget();

@@ -655,7 +789,8 @@
         }
         const int vtable_index = methodOopDesc::invalid_vtable_index;
         const bool call_is_virtual = target->is_abstract();  // FIXME workaround
-        CallGenerator* cg = C->call_generator(target, vtable_index, call_is_virtual, jvms, true, PROB_ALWAYS);
+        CallGenerator* cg = C->call_generator(target, vtable_index, call_is_virtual, jvms, true, PROB_ALWAYS, true, true);
+        assert (!cg->is_late_inline() || cg->is_mh_late_inline(), "no late inline here");
         if (cg != NULL && cg->is_inline())
           return cg;
       }
--- a/src/share/vm/opto/callGenerator.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/callGenerator.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -68,6 +68,12 @@

   // is_late_inline: supports conversion of call into an inline
   virtual bool      is_late_inline() const      { return false; }
+  // same but for method handle calls
+  virtual bool      is_mh_late_inline() const   { return false; }
+
+  // for method handle calls: have we tried inlinining the call already?
+  virtual bool      already_attempted() const   { ShouldNotReachHere(); return false; }
+
   // Replace the call with an inline version of the code
   virtual void do_late_inline() { ShouldNotReachHere(); }

@@ -112,11 +118,13 @@
   static CallGenerator* for_virtual_call(ciMethod* m, int vtable_index);  // virtual, interface
   static CallGenerator* for_dynamic_call(ciMethod* m);   // invokedynamic

-  static CallGenerator* for_method_handle_call(  JVMState* jvms, ciMethod* caller, ciMethod* callee);
-  static CallGenerator* for_method_handle_inline(JVMState* jvms, ciMethod* caller, ciMethod* callee);
+  static CallGenerator* for_method_handle_call(  JVMState* jvms, ciMethod* caller, ciMethod* callee, bool delayed_forbidden);
+  static CallGenerator* for_method_handle_inline(JVMState* jvms, ciMethod* caller, ciMethod* callee, bool& input_not_const);

   // How to generate a replace a direct call with an inline version
   static CallGenerator* for_late_inline(ciMethod* m, CallGenerator* inline_cg);
+  static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const);
+  static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg);

   // How to make a call but defer the decision whether to inline or not.
   static CallGenerator* for_warm_call(WarmCallInfo* ci,
@@ -147,9 +155,11 @@
                                                 CallGenerator* cg);
   virtual Node* generate_predicate(JVMState* jvms) { return NULL; };

-  static void print_inlining(ciMethod* callee, int inline_level, int bci, const char* msg) {
+  virtual void print_inlining_late(const char* msg) { ShouldNotReachHere(); }
+
+  static void print_inlining(Compile* C, ciMethod* callee, int inline_level, int bci, const char* msg) {
     if (PrintInlining)
-      CompileTask::print_inlining(callee, inline_level, bci, msg);
+      C->print_inlining(callee, inline_level, bci, msg);
   }
 };
--- a/src/share/vm/opto/callnode.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/callnode.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "ci/bcEscapeAnalyzer.hpp"
 #include "compiler/oopMap.hpp"
+#include "opto/callGenerator.hpp"
 #include "opto/callnode.hpp"
 #include "opto/escape.hpp"
 #include "opto/locknode.hpp"
@@ -746,7 +747,7 @@
         projs->fallthrough_ioproj = pn;
       for (DUIterator j = pn->outs(); pn->has_out(j); j++) {
         Node* e = pn->out(j);
-        if (e->Opcode() == Op_CreateEx && e->in(0)->is_CatchProj()) {
+        if (e->Opcode() == Op_CreateEx && e->in(0)->is_CatchProj() && e->outcnt() > 0) {
           assert(projs->exobj == NULL, "only one");
           projs->exobj = e;
         }
@@ -770,16 +771,38 @@
   // and the exception object may not exist if an exception handler
   // swallows the exception but all the other must exist and be found.
   assert(projs->fallthrough_proj      != NULL, "must be found");
-  assert(projs->fallthrough_catchproj != NULL, "must be found");
-  assert(projs->fallthrough_memproj   != NULL, "must be found");
-  assert(projs->fallthrough_ioproj    != NULL, "must be found");
-  assert(projs->catchall_catchproj    != NULL, "must be found");
+  assert(Compile::current()->inlining_incrementally() || projs->fallthrough_catchproj != NULL, "must be found");
+  assert(Compile::current()->inlining_incrementally() || projs->fallthrough_memproj   != NULL, "must be found");
+  assert(Compile::current()->inlining_incrementally() || projs->fallthrough_ioproj    != NULL, "must be found");
+  assert(Compile::current()->inlining_incrementally() || projs->catchall_catchproj    != NULL, "must be found");
   if (separate_io_proj) {
-    assert(projs->catchall_memproj      != NULL, "must be found");
-    assert(projs->catchall_ioproj       != NULL, "must be found");
+    assert(Compile::current()->inlining_incrementally() || projs->catchall_memproj    != NULL, "must be found");
+    assert(Compile::current()->inlining_incrementally() || projs->catchall_ioproj     != NULL, "must be found");
   }
 }

+Node *CallNode::Ideal(PhaseGVN *phase, bool can_reshape) {
+  CallGenerator* cg = generator();
+  if (can_reshape && cg != NULL && cg->is_mh_late_inline() && !cg->already_attempted()) {
+    // Check whether this MH handle call becomes a candidate for inlining
+    ciMethod* callee = cg->method();
+    vmIntrinsics::ID iid = callee->intrinsic_id();
+    if (iid == vmIntrinsics::_invokeBasic) {
+      if (in(TypeFunc::Parms)->Opcode() == Op_ConP) {
+        phase->C->prepend_late_inline(cg);
+        set_generator(NULL);
+      }
+    } else {
+      assert(callee->has_member_arg(), "wrong type of call?");
+      if (in(TypeFunc::Parms + callee->arg_size() - 1)->Opcode() == Op_ConP) {
+        phase->C->prepend_late_inline(cg);
+        set_generator(NULL);
+      }
+    }
+  }
+  return SafePointNode::Ideal(phase, can_reshape);
+}
+

 //=============================================================================
 uint CallJavaNode::size_of() const { return sizeof(*this); }
--- a/src/share/vm/opto/callnode.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/callnode.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -344,17 +344,26 @@
   OopMap *oop_map() const { return _oop_map; }
   void set_oop_map(OopMap *om) { _oop_map = om; }

+ private:
+  void verify_input(JVMState* jvms, uint idx) const {
+    assert(verify_jvms(jvms), "jvms must match");
+    Node* n = in(idx);
+    assert((!n->bottom_type()->isa_long() && !n->bottom_type()->isa_double()) ||
+           in(idx + 1)->is_top(), "2nd half of long/double");
+  }
+
+ public:
   // Functionality from old debug nodes which has changed
   Node *local(JVMState* jvms, uint idx) const {
-    assert(verify_jvms(jvms), "jvms must match");
+    verify_input(jvms, jvms->locoff() + idx);
     return in(jvms->locoff() + idx);
   }
   Node *stack(JVMState* jvms, uint idx) const {
-    assert(verify_jvms(jvms), "jvms must match");
+    verify_input(jvms, jvms->stkoff() + idx);
     return in(jvms->stkoff() + idx);
   }
   Node *argument(JVMState* jvms, uint idx) const {
-    assert(verify_jvms(jvms), "jvms must match");
+    verify_input(jvms, jvms->argoff() + idx);
     return in(jvms->argoff() + idx);
   }
   Node *monitor_box(JVMState* jvms, uint idx) const {
@@ -498,6 +507,7 @@
   Node* exobj;
 };

+class CallGenerator;

 //------------------------------CallNode---------------------------------------
 // Call nodes now subsume the function of debug nodes at callsites, so they
@@ -508,26 +518,31 @@
   const TypeFunc *_tf;        // Function type
   address      _entry_point;  // Address of method being called
   float        _cnt;          // Estimate of number of times called
+  CallGenerator* _generator;  // corresponding CallGenerator for some late inline calls

   CallNode(const TypeFunc* tf, address addr, const TypePtr* adr_type)
     : SafePointNode(tf->domain()->cnt(), NULL, adr_type),
       _tf(tf),
       _entry_point(addr),
-      _cnt(COUNT_UNKNOWN)
+      _cnt(COUNT_UNKNOWN),
+      _generator(NULL)
   {
     init_class_id(Class_Call);
   }

-  const TypeFunc* tf()        const { return _tf; }
-  const address entry_point() const { return _entry_point; }
-  const float   cnt()         const { return _cnt; }
+  const TypeFunc* tf()         const { return _tf; }
+  const address  entry_point() const { return _entry_point; }
+  const float    cnt()         const { return _cnt; }
+  CallGenerator* generator()   const { return _generator; }

-  void set_tf(const TypeFunc* tf) { _tf = tf; }
-  void set_entry_point(address p) { _entry_point = p; }
-  void set_cnt(float c)           { _cnt = c; }
+  void set_tf(const TypeFunc* tf)       { _tf = tf; }
+  void set_entry_point(address p)       { _entry_point = p; }
+  void set_cnt(float c)                 { _cnt = c; }
+  void set_generator(CallGenerator* cg) { _generator = cg; }

   virtual const Type *bottom_type() const;
   virtual const Type *Value( PhaseTransform *phase ) const;
+  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
   virtual Node *Identity( PhaseTransform *phase ) { return this; }
   virtual uint        cmp( const Node &n ) const;
   virtual uint        size_of() const = 0;
--- a/src/share/vm/opto/cfgnode.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/cfgnode.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -363,6 +363,49 @@
   return true; // The Region node is unreachable - it is dead.
 }

+bool RegionNode::try_clean_mem_phi(PhaseGVN *phase) {
+  // Incremental inlining + PhaseStringOpts sometimes produce:
+  //
+  // cmpP with 1 top input
+  //           |
+  //          If
+  //         /  \
+  //   IfFalse  IfTrue  /- Some Node
+  //         \  /      /    /
+  //        Region    / /-MergeMem
+  //             \---Phi
+  //
+  //
+  // It's expected by PhaseStringOpts that the Region goes away and is
+  // replaced by If's control input but because there's still a Phi,
+  // the Region stays in the graph. The top input from the cmpP is
+  // propagated forward and a subgraph that is useful goes away. The
+  // code below replaces the Phi with the MergeMem so that the Region
+  // is simplified.
+
+  PhiNode* phi = has_unique_phi();
+  if (phi && phi->type() == Type::MEMORY && req() == 3 && phi->is_diamond_phi(true)) {
+    MergeMemNode* m = NULL;
+    assert(phi->req() == 3, "same as region");
+    for (uint i = 1; i < 3; ++i) {
+      Node *mem = phi->in(i);
+      if (mem && mem->is_MergeMem() && in(i)->outcnt() == 1) {
+        // Nothing is control-dependent on path #i except the region itself.
+        m = mem->as_MergeMem();
+        uint j = 3 - i;
+        Node* other = phi->in(j);
+        if (other && other == m->base_memory()) {
+          // m is a successor memory to other, and is not pinned inside the diamond, so push it out.
+          // This will allow the diamond to collapse completely.
+          phase->is_IterGVN()->replace_node(phi, m);
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 //------------------------------Ideal------------------------------------------
 // Return a node which is more "ideal" than the current node.  Must preserve
 // the CFG, but we can still strip out dead paths.
@@ -375,6 +418,10 @@
   bool has_phis = false;
   if (can_reshape) {            // Need DU info to check for Phi users
     has_phis = (has_phi() != NULL);       // Cache result
+    if (has_phis && try_clean_mem_phi(phase)) {
+      has_phis = false;
+    }
+
     if (!has_phis) {            // No Phi users?  Nothing merging?
       for (uint i = 1; i < req()-1; i++) {
         Node *if1 = in(i);
@@ -1005,7 +1052,9 @@
 //------------------------------is_diamond_phi---------------------------------
 // Does this Phi represent a simple well-shaped diamond merge?  Return the
 // index of the true path or 0 otherwise.
-int PhiNode::is_diamond_phi() const {
+// If check_control_only is true, do not inspect the If node at the
+// top, and return -1 (not an edge number) on success.
+int PhiNode::is_diamond_phi(bool check_control_only) const {
   // Check for a 2-path merge
   Node *region = in(0);
   if( !region ) return 0;
@@ -1018,6 +1067,7 @@
   Node *iff = ifp1->in(0);
   if( !iff || !iff->is_If() ) return 0;
   if( iff != ifp2->in(0) ) return 0;
+  if (check_control_only)  return -1;
   // Check for a proper bool/cmp
   const Node *b = iff->in(1);
   if( !b->is_Bool() ) return 0;
@@ -1566,6 +1616,10 @@
     Node* n = in(j);            // Get the input
     if (rc == NULL || phase->type(rc) == Type::TOP) {
       if (n != top) {           // Not already top?
+        PhaseIterGVN *igvn = phase->is_IterGVN();
+        if (can_reshape && igvn != NULL) {
+          igvn->_worklist.push(r);
+        }
         set_req(j, top);        // Nuke it down
         progress = this;        // Record progress
       }
--- a/src/share/vm/opto/cfgnode.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/cfgnode.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -95,6 +95,7 @@
   virtual Node *Identity( PhaseTransform *phase );
   virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
   virtual const RegMask &out_RegMask() const;
+  bool try_clean_mem_phi(PhaseGVN *phase);
 };

 //------------------------------JProjNode--------------------------------------
@@ -181,7 +182,7 @@
   LoopSafety simple_data_loop_check(Node *in) const;
   // Is it unsafe data loop? It becomes a dead loop if this phi node removed.
   bool is_unsafe_data_reference(Node *in) const;
-  int  is_diamond_phi() const;
+  int  is_diamond_phi(bool check_control_only = false) const;
   virtual int Opcode() const;
   virtual bool pinned() const { return in(0) != 0; }
   virtual const TypePtr *adr_type() const { verify_adr_type(true); return _adr_type; }
--- a/src/share/vm/opto/chaitin.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/chaitin.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1495,7 +1495,7 @@
             cisc->ins_req(1,src);         // Requires a memory edge
           }
           b->_nodes.map(j,cisc);          // Insert into basic block
-          n->subsume_by(cisc); // Correct graph
+          n->subsume_by(cisc, C); // Correct graph
           //
           ++_used_cisc_instructions;
 #ifndef PRODUCT
--- a/src/share/vm/opto/compile.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/compile.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -134,7 +134,7 @@

 void Compile::register_intrinsic(CallGenerator* cg) {
   if (_intrinsics == NULL) {
-    _intrinsics = new GrowableArray<CallGenerator*>(60);
+    _intrinsics = new (comp_arena())GrowableArray<CallGenerator*>(comp_arena(), 60, 0, NULL);
   }
   // This code is stolen from ciObjectFactory::insert.
   // Really, GrowableArray should have methods for
@@ -316,7 +316,12 @@
 }


-
+static inline bool not_a_node(const Node* n) {
+  if (n == NULL)                   return true;
+  if (((intptr_t)n & 1) != 0)      return true;  // uninitialized, etc.
+  if (*(address*)n == badAddress)  return true;  // kill by Node::destruct
+  return false;
+}

 // Identify all nodes that are reachable from below, useful.
 // Use breadth-first pass that records state in a Unique_Node_List,
@@ -337,12 +342,42 @@
     uint max = n->len();
     for( uint i = 0; i < max; ++i ) {
       Node *m = n->in(i);
-      if( m == NULL ) continue;
+      if (not_a_node(m))  continue;
       useful.push(m);
     }
   }
 }

+// Update dead_node_list with any missing dead nodes using useful
+// list. Consider all non-useful nodes to be useless i.e., dead nodes.
+void Compile::update_dead_node_list(Unique_Node_List &useful) {
+  uint max_idx = unique();
+  VectorSet& useful_node_set = useful.member_set();
+
+  for (uint node_idx = 0; node_idx < max_idx; node_idx++) {
+    // If node with index node_idx is not in useful set,
+    // mark it as dead in dead node list.
+    if (! useful_node_set.test(node_idx) ) {
+      record_dead_node(node_idx);
+    }
+  }
+}
+
+void Compile::remove_useless_late_inlines(GrowableArray<CallGenerator*>* inlines, Unique_Node_List &useful) {
+  int shift = 0;
+  for (int i = 0; i < inlines->length(); i++) {
+    CallGenerator* cg = inlines->at(i);
+    CallNode* call = cg->call_node();
+    if (shift > 0) {
+      inlines->at_put(i-shift, cg);
+    }
+    if (!useful.member(call)) {
+      shift++;
+    }
+  }
+  inlines->trunc_to(inlines->length()-shift);
+}
+
 // Disconnect all useless nodes by disconnecting those at the boundary.
 void Compile::remove_useless_nodes(Unique_Node_List &useful) {
   uint next = 0;
@@ -372,6 +407,9 @@
       remove_macro_node(n);
     }
   }
+  // clean up the late inline lists
+  remove_useless_late_inlines(&_string_late_inlines, useful);
+  remove_useless_late_inlines(&_late_inlines, useful);
   debug_only(verify_graph_edges(true/*check for no_dead_code*/);)
 }

@@ -582,11 +620,21 @@
                   _inner_loops(0),
                   _scratch_const_size(-1),
                   _in_scratch_emit_size(false),
+                  _dead_node_list(comp_arena()),
+                  _dead_node_count(0),
 #ifndef PRODUCT
                   _trace_opto_output(TraceOptoOutput || method()->has_option("TraceOptoOutput")),
                   _printer(IdealGraphPrinter::printer()),
 #endif
-                  _congraph(NULL) {
+                  _congraph(NULL),
+                  _late_inlines(comp_arena(), 2, 0, NULL),
+                  _string_late_inlines(comp_arena(), 2, 0, NULL),
+                  _late_inlines_pos(0),
+                  _number_of_mh_late_inlines(0),
+                  _inlining_progress(false),
+                  _inlining_incrementally(false),
+                  _print_inlining_list(NULL),
+                  _print_inlining(0) {
   C = this;

   CompileWrapper cw(this);
@@ -642,6 +690,9 @@
   PhaseGVN gvn(node_arena(), estimated_size);
   set_initial_gvn(&gvn);

+  if (PrintInlining) {
+    _print_inlining_list = new (comp_arena())GrowableArray<PrintInliningBuffer>(comp_arena(), 1, 1, PrintInliningBuffer());
+  }
   { // Scope for timing the parser
     TracePhase t3("parse", &_t_parser, true);

@@ -708,28 +759,13 @@
       rethrow_exceptions(kit.transfer_exceptions_into_jvms());
     }

-    if (!failing() && has_stringbuilder()) {
-      {
-        // remove useless nodes to make the usage analysis simpler
-        ResourceMark rm;
-        PhaseRemoveUseless pru(initial_gvn(), &for_igvn);
-      }
-
-      {
-        ResourceMark rm;
-        print_method("Before StringOpts", 3);
-        PhaseStringOpts pso(initial_gvn(), &for_igvn);
-        print_method("After StringOpts", 3);
-      }
-
-      // now inline anything that we skipped the first time around
-      while (_late_inlines.length() > 0) {
-        CallGenerator* cg = _late_inlines.pop();
-        cg->do_late_inline();
-        if (failing())  return;
-      }
+    assert(IncrementalInline || (_late_inlines.length() == 0 && !has_mh_late_inlines()), "incremental inlining is off");
+
+    if (_late_inlines.length() == 0 && !has_mh_late_inlines() && !failing() && has_stringbuilder()) {
+      inline_string_calls(true);
     }
-    assert(_late_inlines.length() == 0, "should have been processed");
+
+    if (failing())  return;

     print_method("Before RemoveUseless", 3);

@@ -873,7 +909,14 @@
     _trace_opto_output(TraceOptoOutput),
     _printer(NULL),
 #endif
-    _congraph(NULL) {
+    _dead_node_list(comp_arena()),
+    _dead_node_count(0),
+    _congraph(NULL),
+    _number_of_mh_late_inlines(0),
+    _inlining_progress(false),
+    _inlining_incrementally(false),
+    _print_inlining_list(NULL),
+    _print_inlining(0) {
   C = this;

 #ifndef PRODUCT
@@ -1080,6 +1123,72 @@
   assert(_top == NULL || top()->is_top(), "");
 }

+#ifdef ASSERT
+uint Compile::count_live_nodes_by_graph_walk() {
+  Unique_Node_List useful(comp_arena());
+  // Get useful node list by walking the graph.
+  identify_useful_nodes(useful);
+  return useful.size();
+}
+
+void Compile::print_missing_nodes() {
+
+  // Return if CompileLog is NULL and PrintIdealNodeCount is false.
+  if ((_log == NULL) && (! PrintIdealNodeCount)) {
+    return;
+  }
+
+  // This is an expensive function. It is executed only when the user
+  // specifies VerifyIdealNodeCount option or otherwise knows the
+  // additional work that needs to be done to identify reachable nodes
+  // by walking the flow graph and find the missing ones using
+  // _dead_node_list.
+
+  Unique_Node_List useful(comp_arena());
+  // Get useful node list by walking the graph.
+  identify_useful_nodes(useful);
+
+  uint l_nodes = C->live_nodes();
+  uint l_nodes_by_walk = useful.size();
+
+  if (l_nodes != l_nodes_by_walk) {
+    if (_log != NULL) {
+      _log->begin_head("mismatched_nodes count='%d'", abs((int) (l_nodes - l_nodes_by_walk)));
+      _log->stamp();
+      _log->end_head();
+    }
+    VectorSet& useful_member_set = useful.member_set();
+    int last_idx = l_nodes_by_walk;
+    for (int i = 0; i < last_idx; i++) {
+      if (useful_member_set.test(i)) {
+        if (_dead_node_list.test(i)) {
+          if (_log != NULL) {
+            _log->elem("mismatched_node_info node_idx='%d' type='both live and dead'", i);
+          }
+          if (PrintIdealNodeCount) {
+            // Print the log message to tty
+              tty->print_cr("mismatched_node idx='%d' both live and dead'", i);
+              useful.at(i)->dump();
+          }
+        }
+      }
+      else if (! _dead_node_list.test(i)) {
+        if (_log != NULL) {
+          _log->elem("mismatched_node_info node_idx='%d' type='neither live nor dead'", i);
+        }
+        if (PrintIdealNodeCount) {
+          // Print the log message to tty
+          tty->print_cr("mismatched_node idx='%d' type='neither live nor dead'", i);
+        }
+      }
+    }
+    if (_log != NULL) {
+      _log->tail("mismatched_nodes");
+    }
+  }
+}
+#endif
+
 #ifndef PRODUCT
 void Compile::verify_top(Node* tn) const {
   if (tn != NULL) {
@@ -1671,6 +1780,124 @@
   assert(predicate_count()==0, "should be clean!");
 }

+// StringOpts and late inlining of string methods
+void Compile::inline_string_calls(bool parse_time) {
+  {
+    // remove useless nodes to make the usage analysis simpler
+    ResourceMark rm;
+    PhaseRemoveUseless pru(initial_gvn(), for_igvn());
+  }
+
+  {
+    ResourceMark rm;
+    print_method("Before StringOpts", 3);
+    PhaseStringOpts pso(initial_gvn(), for_igvn());
+    print_method("After StringOpts", 3);
+  }
+
+  // now inline anything that we skipped the first time around
+  if (!parse_time) {
+    _late_inlines_pos = _late_inlines.length();
+  }
+
+  while (_string_late_inlines.length() > 0) {
+    CallGenerator* cg = _string_late_inlines.pop();
+    cg->do_late_inline();
+    if (failing())  return;
+  }
+  _string_late_inlines.trunc_to(0);
+}
+
+void Compile::inline_incrementally_one(PhaseIterGVN& igvn) {
+  assert(IncrementalInline, "incremental inlining should be on");
+  PhaseGVN* gvn = initial_gvn();
+
+  set_inlining_progress(false);
+  for_igvn()->clear();
+  gvn->replace_with(&igvn);
+
+  int i = 0;
+
+  for (; i <_late_inlines.length() && !inlining_progress(); i++) {
+    CallGenerator* cg = _late_inlines.at(i);
+    _late_inlines_pos = i+1;
+    cg->do_late_inline();
+    if (failing())  return;
+  }
+  int j = 0;
+  for (; i < _late_inlines.length(); i++, j++) {
+    _late_inlines.at_put(j, _late_inlines.at(i));
+  }
+  _late_inlines.trunc_to(j);
+
+  {
+    ResourceMark rm;
+    PhaseRemoveUseless pru(C->initial_gvn(), C->for_igvn());
+  }
+
+  igvn = PhaseIterGVN(gvn);
+}
+
+// Perform incremental inlining until bound on number of live nodes is reached
+void Compile::inline_incrementally(PhaseIterGVN& igvn) {
+  PhaseGVN* gvn = initial_gvn();
+
+  set_inlining_incrementally(true);
+  set_inlining_progress(true);
+  uint low_live_nodes = 0;
+
+  while(inlining_progress() && _late_inlines.length() > 0) {
+
+    if (live_nodes() > (uint)LiveNodeCountInliningCutoff) {
+      if (low_live_nodes < (uint)LiveNodeCountInliningCutoff * 8 / 10) {
+        // PhaseIdealLoop is expensive so we only try it once we are
+        // out of loop and we only try it again if the previous helped
+        // got the number of nodes down significantly
+        PhaseIdealLoop ideal_loop( igvn, false, true );
+        if (failing())  return;
+        low_live_nodes = live_nodes();
+        _major_progress = true;
+      }
+
+      if (live_nodes() > (uint)LiveNodeCountInliningCutoff) {
+        break;
+      }
+    }
+
+    inline_incrementally_one(igvn);
+
+    if (failing())  return;
+
+    igvn.optimize();
+
+    if (failing())  return;
+  }
+
+  assert( igvn._worklist.size() == 0, "should be done with igvn" );
+
+  if (_string_late_inlines.length() > 0) {
+    assert(has_stringbuilder(), "inconsistent");
+    for_igvn()->clear();
+    initial_gvn()->replace_with(&igvn);
+
+    inline_string_calls(false);
+
+    if (failing())  return;
+
+    {
+      ResourceMark rm;
+      PhaseRemoveUseless pru(initial_gvn(), for_igvn());
+    }
+
+    igvn = PhaseIterGVN(gvn);
+
+    igvn.optimize();
+  }
+
+  set_inlining_incrementally(false);
+}
+
+
 //------------------------------Optimize---------------------------------------
 // Given a graph, optimize it.
 void Compile::Optimize() {
@@ -1703,6 +1930,12 @@

   if (failing())  return;

+  inline_incrementally(igvn);
+
+  print_method("Incremental Inline", 2);
+
+  if (failing())  return;
+
   // Perform escape analysis
   if (_do_escape_analysis && ConnectionGraph::has_candidates(this)) {
     if (has_loops()) {
@@ -1825,6 +2058,7 @@

  } // (End scope of igvn; run destructor if necessary for asserts.)

+ dump_inlining();
   // A method with only infinite loops has no edges entering loops from root
   {
     NOT_PRODUCT( TracePhase t2("graphReshape", &_t_graphReshaping, TimeCompiler); )
@@ -2098,7 +2332,7 @@

 // Eliminate trivially redundant StoreCMs and accumulate their
 // precedence edges.
-static void eliminate_redundant_card_marks(Node* n) {
+void Compile::eliminate_redundant_card_marks(Node* n) {
   assert(n->Opcode() == Op_StoreCM, "expected StoreCM");
   if (n->in(MemNode::Address)->outcnt() > 1) {
     // There are multiple users of the same address so it might be
@@ -2133,7 +2367,7 @@
         // Eliminate the previous StoreCM
         prev->set_req(MemNode::Memory, mem->in(MemNode::Memory));
         assert(mem->outcnt() == 0, "should be dead");
-        mem->disconnect_inputs(NULL);
+        mem->disconnect_inputs(NULL, this);
       } else {
         prev = mem;
       }
@@ -2144,7 +2378,7 @@

 //------------------------------final_graph_reshaping_impl----------------------
 // Implement items 1-5 from final_graph_reshaping below.
-static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
+void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {

   if ( n->outcnt() == 0 ) return; // dead node
   uint nop = n->Opcode();
@@ -2174,8 +2408,7 @@

 #ifdef ASSERT
   if( n->is_Mem() ) {
-    Compile* C = Compile::current();
-    int alias_idx = C->get_alias_index(n->as_Mem()->adr_type());
+    int alias_idx = get_alias_index(n->as_Mem()->adr_type());
     assert( n->in(0) != NULL || alias_idx != Compile::AliasIdxRaw ||
             // oop will be recorded in oop map if load crosses safepoint
             n->is_Load() && (n->as_Load()->bottom_type()->isa_oopptr() ||
@@ -2224,7 +2457,7 @@
     break;
   case Op_Opaque1:              // Remove Opaque Nodes before matching
   case Op_Opaque2:              // Remove Opaque Nodes before matching
-    n->subsume_by(n->in(1));
+    n->subsume_by(n->in(1), this);
     break;
   case Op_CallStaticJava:
   case Op_CallJava:
@@ -2344,8 +2577,7 @@
         Node* nn = NULL;

         // Look for existing ConN node of the same exact type.
-        Compile* C = Compile::current();
-        Node* r  = C->root();
+        Node* r  = root();
         uint cnt = r->outcnt();
         for (uint i = 0; i < cnt; i++) {
           Node* m = r->raw_out(i);
@@ -2358,11 +2590,11 @@
         if (nn != NULL) {
           // Decode a narrow oop to match address
           // [R12 + narrow_oop_reg<<3 + offset]
-          nn = new (C) DecodeNNode(nn, t);
+          nn = new (this) DecodeNNode(nn, t);
           n->set_req(AddPNode::Base, nn);
           n->set_req(AddPNode::Address, nn);
           if (addp->outcnt() == 0) {
-            addp->disconnect_inputs(NULL);
+            addp->disconnect_inputs(NULL, this);
           }
         }
       }
@@ -2374,7 +2606,6 @@
 #ifdef _LP64
   case Op_CastPP:
     if (n->in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks()) {
-      Compile* C = Compile::current();
       Node* in1 = n->in(1);
       const Type* t = n->bottom_type();
       Node* new_in1 = in1->clone();
@@ -2403,9 +2634,9 @@
         new_in1->set_req(0, n->in(0));
       }

-      n->subsume_by(new_in1);
+      n->subsume_by(new_in1, this);
       if (in1->outcnt() == 0) {
-        in1->disconnect_inputs(NULL);
+        in1->disconnect_inputs(NULL, this);
       }
     }
     break;
@@ -2422,7 +2653,6 @@
       }
       assert(in1->is_DecodeN(), "sanity");

-      Compile* C = Compile::current();
       Node* new_in2 = NULL;
       if (in2->is_DecodeN()) {
         new_in2 = in2->in(1);
@@ -2433,7 +2663,7 @@
           // oops implicit null check is not generated.
           // This will allow to generate normal oop implicit null check.
           if (Matcher::gen_narrow_oop_implicit_null_checks())
-            new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
+            new_in2 = ConNode::make(this, TypeNarrowOop::NULL_PTR);
           //
           // This transformation together with CastPP transformation above
           // will generated code for implicit NULL checks for compressed oops.
@@ -2472,17 +2702,17 @@
           //    NullCheck base_reg
           //
         } else if (t->isa_oopptr()) {
-          new_in2 = ConNode::make(C, t->make_narrowoop());
+          new_in2 = ConNode::make(this, t->make_narrowoop());
         }
       }
       if (new_in2 != NULL) {
-        Node* cmpN = new (C) CmpNNode(in1->in(1), new_in2);
-        n->subsume_by( cmpN );
+        Node* cmpN = new (this) CmpNNode(in1->in(1), new_in2);
+        n->subsume_by(cmpN, this);
         if (in1->outcnt() == 0) {
-          in1->disconnect_inputs(NULL);
+          in1->disconnect_inputs(NULL, this);
         }
         if (in2->outcnt() == 0) {
-          in2->disconnect_inputs(NULL);
+          in2->disconnect_inputs(NULL, this);
         }
       }
     }
@@ -2498,18 +2728,17 @@
   case Op_EncodeP: {
     Node* in1 = n->in(1);
     if (in1->is_DecodeN()) {
-      n->subsume_by(in1->in(1));
+      n->subsume_by(in1->in(1), this);
     } else if (in1->Opcode() == Op_ConP) {
-      Compile* C = Compile::current();
       const Type* t = in1->bottom_type();
       if (t == TypePtr::NULL_PTR) {
-        n->subsume_by(ConNode::make(C, TypeNarrowOop::NULL_PTR));
+        n->subsume_by(ConNode::make(this, TypeNarrowOop::NULL_PTR), this);
       } else if (t->isa_oopptr()) {
-        n->subsume_by(ConNode::make(C, t->make_narrowoop()));
+        n->subsume_by(ConNode::make(this, t->make_narrowoop()), this);
       }
     }
     if (in1->outcnt() == 0) {
-      in1->disconnect_inputs(NULL);
+      in1->disconnect_inputs(NULL, this);
     }
     break;
   }
@@ -2532,7 +2761,7 @@
           }
         }
         assert(proj != NULL, "must be found");
-        p->subsume_by(proj);
+        p->subsume_by(proj, this);
       }
     }
     break;
@@ -2552,7 +2781,7 @@
           unique_in = NULL;
       }
       if (unique_in != NULL) {
-        n->subsume_by(unique_in);
+        n->subsume_by(unique_in, this);
       }
     }
     break;
@@ -2565,16 +2794,15 @@
       Node* d = n->find_similar(Op_DivI);
       if (d) {
         // Replace them with a fused divmod if supported
-        Compile* C = Compile::current();
         if (Matcher::has_match_rule(Op_DivModI)) {
-          DivModINode* divmod = DivModINode::make(C, n);
-          d->subsume_by(divmod->div_proj());
-          n->subsume_by(divmod->mod_proj());
+          DivModINode* divmod = DivModINode::make(this, n);
+          d->subsume_by(divmod->div_proj(), this);
+          n->subsume_by(divmod->mod_proj(), this);
         } else {
           // replace a%b with a-((a/b)*b)
-          Node* mult = new (C) MulINode(d, d->in(2));
-          Node* sub  = new (C) SubINode(d->in(1), mult);
-          n->subsume_by( sub );
+          Node* mult = new (this) MulINode(d, d->in(2));
+          Node* sub  = new (this) SubINode(d->in(1), mult);
+          n->subsume_by(sub, this);
         }
       }
     }
@@ -2586,16 +2814,15 @@
       Node* d = n->find_similar(Op_DivL);
       if (d) {
         // Replace them with a fused divmod if supported
-        Compile* C = Compile::current();
         if (Matcher::has_match_rule(Op_DivModL)) {
-          DivModLNode* divmod = DivModLNode::make(C, n);
-          d->subsume_by(divmod->div_proj());
-          n->subsume_by(divmod->mod_proj());
+          DivModLNode* divmod = DivModLNode::make(this, n);
+          d->subsume_by(divmod->div_proj(), this);
+          n->subsume_by(divmod->mod_proj(), this);
         } else {
           // replace a%b with a-((a/b)*b)
-          Node* mult = new (C) MulLNode(d, d->in(2));
-          Node* sub  = new (C) SubLNode(d->in(1), mult);
-          n->subsume_by( sub );
+          Node* mult = new (this) MulLNode(d, d->in(2));
+          Node* sub  = new (this) SubLNode(d->in(1), mult);
+          n->subsume_by(sub, this);
         }
       }
     }
@@ -2614,8 +2841,8 @@
     if (n->req()-1 > 2) {
       // Replace many operand PackNodes with a binary tree for matching
       PackNode* p = (PackNode*) n;
-      Node* btp = p->binary_tree_pack(Compile::current(), 1, n->req());
-      n->subsume_by(btp);
+      Node* btp = p->binary_tree_pack(this, 1, n->req());
+      n->subsume_by(btp, this);
     }
     break;
   case Op_Loop:
@@ -2639,18 +2866,16 @@
       if (t != NULL && t->is_con()) {
         juint shift = t->get_con();
         if (shift > mask) { // Unsigned cmp
-          Compile* C = Compile::current();
-          n->set_req(2, ConNode::make(C, TypeInt::make(shift & mask)));
+          n->set_req(2, ConNode::make(this, TypeInt::make(shift & mask)));
         }
       } else {
         if (t == NULL || t->_lo < 0 || t->_hi > (int)mask) {
-          Compile* C = Compile::current();
-          Node* shift = new (C) AndINode(in2, ConNode::make(C, TypeInt::make(mask)));
+          Node* shift = new (this) AndINode(in2, ConNode::make(this, TypeInt::make(mask)));
           n->set_req(2, shift);
         }
       }
       if (in2->outcnt() == 0) { // Remove dead node
-        in2->disconnect_inputs(NULL);
+        in2->disconnect_inputs(NULL, this);
       }
     }
     break;
@@ -2668,7 +2893,7 @@
 //------------------------------final_graph_reshaping_walk---------------------
 // Replacing Opaque nodes with their input in final_graph_reshaping_impl(),
 // requires that the walk visits a node's inputs before visiting the node.
-static void final_graph_reshaping_walk( Node_Stack &nstack, Node *root, Final_Reshape_Counts &frc ) {
+void Compile::final_graph_reshaping_walk( Node_Stack &nstack, Node *root, Final_Reshape_Counts &frc ) {
   ResourceArea *area = Thread::current()->resource_area();
   Unique_Node_List sfpt(area);

@@ -2734,7 +2959,7 @@
           n->set_req(j, in->in(1));
         }
         if (in->outcnt() == 0) {
-          in->disconnect_inputs(NULL);
+          in->disconnect_inputs(NULL, this);
         }
       }
     }
@@ -3007,7 +3232,8 @@
 }

 Compile::TracePhase::TracePhase(const char* name, elapsedTimer* accumulator, bool dolog)
-  : TraceTime(NULL, accumulator, false NOT_PRODUCT( || TimeCompiler ), false)
+  : TraceTime(NULL, accumulator, false NOT_PRODUCT( || TimeCompiler ), false),
+    _phase_name(name), _dolog(dolog)
 {
   if (dolog) {
     C = Compile::current();
@@ -3017,15 +3243,34 @@
     _log = NULL;
   }
   if (_log != NULL) {
-    _log->begin_head("phase name='%s' nodes='%d'", name, C->unique());
+    _log->begin_head("phase name='%s' nodes='%d' live='%d'", _phase_name, C->unique(), C->live_nodes());
     _log->stamp();
     _log->end_head();
   }
 }

 Compile::TracePhase::~TracePhase() {
+
+  C = Compile::current();
+  if (_dolog) {
+    _log = C->log();
+  } else {
+    _log = NULL;
+  }
+
+#ifdef ASSERT
+  if (PrintIdealNodeCount) {
+    tty->print_cr("phase name='%s' nodes='%d' live='%d' live_graph_walk='%d'",
+                  _phase_name, C->unique(), C->live_nodes(), C->count_live_nodes_by_graph_walk());
+  }
+
+  if (VerifyIdealNodeCount) {
+    Compile::current()->print_missing_nodes();
+  }
+#endif
+
   if (_log != NULL) {
-    _log->done("phase nodes='%d'", C->unique());
+    _log->done("phase name='%s' nodes='%d' live='%d'", _phase_name, C->unique(), C->live_nodes());
   }
 }

@@ -3226,3 +3471,33 @@
     cb.consts()->relocate((address) constant_addr, relocInfo::internal_word_type);
   }
 }
+
+void Compile::dump_inlining() {
+  if (PrintInlining) {
+    // Print inlining message for candidates that we couldn't inline
+    // for lack of space or non constant receiver
+    for (int i = 0; i < _late_inlines.length(); i++) {
+      CallGenerator* cg = _late_inlines.at(i);
+      cg->print_inlining_late("live nodes > LiveNodeCountInliningCutoff");
+    }
+    Unique_Node_List useful;
+    useful.push(root());
+    for (uint next = 0; next < useful.size(); ++next) {
+      Node* n  = useful.at(next);
+      if (n->is_Call() && n->as_Call()->generator() != NULL && n->as_Call()->generator()->call_node() == n) {
+        CallNode* call = n->as_Call();
+        CallGenerator* cg = call->generator();
+        cg->print_inlining_late("receiver not constant");
+      }
+      uint max = n->len();
+      for ( uint i = 0; i < max; ++i ) {
+        Node *m = n->in(i);
+        if ( m == NULL ) continue;
+        useful.push(m);
+      }
+    }
+    for (int i = 0; i < _print_inlining_list->length(); i++) {
+      tty->print(_print_inlining_list->at(i).ss()->as_string());
+    }
+  }
+}
--- a/src/share/vm/opto/compile.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/compile.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -30,6 +30,7 @@
 #include "code/debugInfoRec.hpp"
 #include "code/exceptionHandlerTable.hpp"
 #include "compiler/compilerOracle.hpp"
+#include "compiler/compileBroker.hpp"
 #include "libadt/dict.hpp"
 #include "libadt/port.hpp"
 #include "libadt/vectset.hpp"
@@ -75,6 +76,8 @@
 class Unique_Node_List;
 class nmethod;
 class WarmCallInfo;
+class Node_Stack;
+struct Final_Reshape_Counts;

 //------------------------------Compile----------------------------------------
 // This class defines a top-level Compiler invocation.
@@ -98,6 +101,8 @@
    private:
     Compile*    C;
     CompileLog* _log;
+    const char* _phase_name;
+    bool _dolog;
    public:
     TracePhase(const char* name, elapsedTimer* accumulator, bool dolog);
     ~TracePhase();
@@ -259,6 +264,8 @@
   int                   _orig_pc_slot_offset_in_bytes;

   int                   _major_progress;        // Count of something big happening
+  bool                  _inlining_progress;     // progress doing incremental inlining?
+  bool                  _inlining_incrementally;// Are we doing incremental inlining (post parse)
   bool                  _has_loops;             // True if the method _may_ have some loops
   bool                  _has_split_ifs;         // True if the method _may_ have some split-if
   bool                  _has_unsafe_access;     // True if the method _may_ produce faults in unsafe loads or stores.
@@ -297,6 +304,9 @@

   // Node management
   uint                  _unique;                // Counter for unique Node indices
+  VectorSet             _dead_node_list;        // Set of dead nodes
+  uint                  _dead_node_count;       // Number of dead nodes; VectorSet::Size() is O(N).
+                                                // So use this to keep count and make the call O(1).
   debug_only(static int _debug_idx;)            // Monotonic counter (not reset), use -XX:BreakAtNode=<idx>
   Arena                 _node_arena;            // Arena for new-space Nodes
   Arena                 _old_arena;             // Arena for old-space Nodes, lifetime during xform
@@ -343,9 +353,69 @@
   Unique_Node_List*     _for_igvn;              // Initial work-list for next round of Iterative GVN
   WarmCallInfo*         _warm_calls;            // Sorted work-list for heat-based inlining.

-  GrowableArray<CallGenerator*> _late_inlines;  // List of CallGenerators to be revisited after
-                                                // main parsing has finished.
+  GrowableArray<CallGenerator*> _late_inlines;        // List of CallGenerators to be revisited after
+                                                      // main parsing has finished.
+  GrowableArray<CallGenerator*> _string_late_inlines; // same but for string operations
+
+  int                           _late_inlines_pos;    // Where in the queue should the next late inlining candidate go (emulate depth first inlining)
+  uint                          _number_of_mh_late_inlines; // number of method handle late inlining still pending
+
+
+  // Inlining may not happen in parse order which would make
+  // PrintInlining output confusing. Keep track of PrintInlining
+  // pieces in order.
+  class PrintInliningBuffer : public ResourceObj {
+   private:
+    CallGenerator* _cg;
+    stringStream* _ss;
+
+   public:
+    PrintInliningBuffer()
+      : _cg(NULL) { _ss = new stringStream(); }
+
+    stringStream* ss() const { return _ss; }
+    CallGenerator* cg() const { return _cg; }
+    void set_cg(CallGenerator* cg) { _cg = cg; }
+  };
+
+  GrowableArray<PrintInliningBuffer>* _print_inlining_list;
+  int _print_inlining;
+
+ public:

+  outputStream* print_inlining_stream() const {
+    return _print_inlining_list->at(_print_inlining).ss();
+  }
+
+  void print_inlining_skip(CallGenerator* cg) {
+    if (PrintInlining) {
+      _print_inlining_list->at(_print_inlining).set_cg(cg);
+      _print_inlining++;
+      _print_inlining_list->insert_before(_print_inlining, PrintInliningBuffer());
+    }
+  }
+
+  void print_inlining_insert(CallGenerator* cg) {
+    if (PrintInlining) {
+      for (int i = 0; i < _print_inlining_list->length(); i++) {
+        if (_print_inlining_list->at(i).cg() == cg) {
+          _print_inlining_list->insert_before(i+1, PrintInliningBuffer());
+          _print_inlining = i+1;
+          _print_inlining_list->at(i).set_cg(NULL);
+          return;
+        }
+      }
+      ShouldNotReachHere();
+    }
+  }
+
+  void print_inlining(ciMethod* method, int inline_level, int bci, const char* msg = NULL) {
+    stringStream ss;
+    CompileTask::print_inlining(&ss, method, inline_level, bci, msg);
+    print_inlining_stream()->print(ss.as_string());
+  }
+
+ private:
   // Matching, CFG layout, allocation, code generation
   PhaseCFG*             _cfg;                   // Results of CFG finding
   bool                  _select_24_bit_instr;   // We selected an instruction with a 24-bit result
@@ -412,6 +482,10 @@
   int               fixed_slots() const         { assert(_fixed_slots >= 0, "");         return _fixed_slots; }
   void          set_fixed_slots(int n)          { _fixed_slots = n; }
   int               major_progress() const      { return _major_progress; }
+  void          set_inlining_progress(bool z)   { _inlining_progress = z; }
+  int               inlining_progress() const   { return _inlining_progress; }
+  void          set_inlining_incrementally(bool z) { _inlining_incrementally = z; }
+  int               inlining_incrementally() const { return _inlining_incrementally; }
   void          set_major_progress()            { _major_progress++; }
   void        clear_major_progress()            { _major_progress = 0; }
   int               num_loop_opts() const       { return _num_loop_opts; }
@@ -518,7 +592,7 @@
   ciEnv*            env() const                 { return _env; }
   CompileLog*       log() const                 { return _log; }
   bool              failing() const             { return _env->failing() || _failure_reason != NULL; }
-  const char* failure_reason() { return _failure_reason; }
+  const char*       failure_reason() { return _failure_reason; }
   bool              failure_reason_is(const char* r) { return (r==_failure_reason) || (r!=NULL && _failure_reason!=NULL && strcmp(r, _failure_reason)==0); }

   void record_failure(const char* reason);
@@ -533,7 +607,7 @@
     record_method_not_compilable(reason, true);
   }
   bool check_node_count(uint margin, const char* reason) {
-    if (unique() + margin > (uint)MaxNodeLimit) {
+    if (live_nodes() + margin > (uint)MaxNodeLimit) {
       record_method_not_compilable(reason);
       return true;
     } else {
@@ -542,25 +616,41 @@
   }

   // Node management
-  uint              unique() const              { return _unique; }
-  uint         next_unique()                    { return _unique++; }
-  void          set_unique(uint i)              { _unique = i; }
-  static int        debug_idx()                 { return debug_only(_debug_idx)+0; }
-  static void   set_debug_idx(int i)            { debug_only(_debug_idx = i); }
-  Arena*            node_arena()                { return &_node_arena; }
-  Arena*            old_arena()                 { return &_old_arena; }
-  RootNode*         root() const                { return _root; }
-  void          set_root(RootNode* r)           { _root = r; }
-  StartNode*        start() const;              // (Derived from root.)
+  uint         unique() const              { return _unique; }
+  uint         next_unique()               { return _unique++; }
+  void         set_unique(uint i)          { _unique = i; }
+  static int   debug_idx()                 { return debug_only(_debug_idx)+0; }
+  static void  set_debug_idx(int i)        { debug_only(_debug_idx = i); }
+  Arena*       node_arena()                { return &_node_arena; }
+  Arena*       old_arena()                 { return &_old_arena; }
+  RootNode*    root() const                { return _root; }
+  void         set_root(RootNode* r)       { _root = r; }
+  StartNode*   start() const;              // (Derived from root.)
   void         init_start(StartNode* s);
-  Node*             immutable_memory();
+  Node*        immutable_memory();

-  Node*             recent_alloc_ctl() const    { return _recent_alloc_ctl; }
-  Node*             recent_alloc_obj() const    { return _recent_alloc_obj; }
-  void          set_recent_alloc(Node* ctl, Node* obj) {
+  Node*        recent_alloc_ctl() const    { return _recent_alloc_ctl; }
+  Node*        recent_alloc_obj() const    { return _recent_alloc_obj; }
+  void         set_recent_alloc(Node* ctl, Node* obj) {
                                                   _recent_alloc_ctl = ctl;
                                                   _recent_alloc_obj = obj;
-                                                }
+                                           }
+  void         record_dead_node(uint idx)  { if (_dead_node_list.test_set(idx)) return;
+                                             _dead_node_count++;
+                                           }
+  uint         dead_node_count()           { return _dead_node_count; }
+  void         reset_dead_node_list()      { _dead_node_list.Reset();
+                                             _dead_node_count = 0;
+                                           }
+  uint          live_nodes() const         {
+    int  val = _unique - _dead_node_count;
+    assert (val >= 0, err_msg_res("number of tracked dead nodes %d more than created nodes %d", _unique, _dead_node_count));
+            return (uint) val;
+                                           }
+#ifdef ASSERT
+  uint         count_live_nodes_by_graph_walk();
+  void         print_missing_nodes();
+#endif

   // Constant table
   ConstantTable&   constant_table() { return _constant_table; }
@@ -634,7 +724,7 @@

   // Decide how to build a call.
   // The profile factor is a discount to apply to this site's interp. profile.
-  CallGenerator*    call_generator(ciMethod* call_method, int vtable_index, bool call_is_virtual, JVMState* jvms, bool allow_inline, float profile_factor, bool allow_intrinsics = true);
+  CallGenerator*    call_generator(ciMethod* call_method, int vtable_index, bool call_is_virtual, JVMState* jvms, bool allow_inline, float profile_factor, bool allow_intrinsics = true, bool delayed_forbidden = false);
   bool should_delay_inlining(ciMethod* call_method, JVMState* jvms);

   // Report if there were too many traps at a current method and bci.
@@ -662,14 +752,46 @@


   void              identify_useful_nodes(Unique_Node_List &useful);
-  void              remove_useless_nodes  (Unique_Node_List &useful);
+  void              update_dead_node_list(Unique_Node_List &useful);
+  void              remove_useless_nodes (Unique_Node_List &useful);

   WarmCallInfo*     warm_calls() const          { return _warm_calls; }
   void          set_warm_calls(WarmCallInfo* l) { _warm_calls = l; }
   WarmCallInfo* pop_warm_call();

   // Record this CallGenerator for inlining at the end of parsing.
-  void              add_late_inline(CallGenerator* cg) { _late_inlines.push(cg); }
+  void              add_late_inline(CallGenerator* cg)        {
+    _late_inlines.insert_before(_late_inlines_pos, cg);
+    _late_inlines_pos++;
+  }
+
+  void              prepend_late_inline(CallGenerator* cg)    {
+    _late_inlines.insert_before(0, cg);
+  }
+
+  void              add_string_late_inline(CallGenerator* cg) {
+    _string_late_inlines.push(cg);
+  }
+
+  void remove_useless_late_inlines(GrowableArray<CallGenerator*>* inlines, Unique_Node_List &useful);
+
+  void dump_inlining();
+
+  bool over_inlining_cutoff() const {
+    if (!inlining_incrementally()) {
+      return unique() > (uint)NodeCountInliningCutoff;
+    } else {
+      return live_nodes() > (uint)LiveNodeCountInliningCutoff;
+    }
+  }
+
+  void inc_number_of_mh_late_inlines() { _number_of_mh_late_inlines++; }
+  void dec_number_of_mh_late_inlines() { assert(_number_of_mh_late_inlines > 0, "_number_of_mh_late_inlines < 0 !"); _number_of_mh_late_inlines--; }
+  bool has_mh_late_inlines() const     { return _number_of_mh_late_inlines > 0; }
+
+  void inline_incrementally_one(PhaseIterGVN& igvn);
+  void inline_incrementally(PhaseIterGVN& igvn);
+  void inline_string_calls(bool parse_time);

   // Matching, CFG layout, allocation, code generation
   PhaseCFG*         cfg()                       { return _cfg; }
@@ -876,6 +998,11 @@
   static juint  _intrinsic_hist_count[vmIntrinsics::ID_LIMIT];
   static jubyte _intrinsic_hist_flags[vmIntrinsics::ID_LIMIT];
 #endif
+  // Function calls made by the public function final_graph_reshaping.
+  // No need to be made public as they are not called elsewhere.
+  void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc);
+  void final_graph_reshaping_walk( Node_Stack &nstack, Node *root, Final_Reshape_Counts &frc );
+  void eliminate_redundant_card_marks(Node* n);

  public:
--- a/src/share/vm/opto/doCall.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/doCall.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -41,25 +41,30 @@
 #include "prims/nativeLookup.hpp"
 #include "runtime/sharedRuntime.hpp"

-void trace_type_profile(ciMethod *method, int depth, int bci, ciMethod *prof_method, ciKlass *prof_klass, int site_count, int receiver_count) {
+void trace_type_profile(Compile* C, ciMethod *method, int depth, int bci, ciMethod *prof_method, ciKlass *prof_klass, int site_count, int receiver_count) {
   if (TraceTypeProfile || PrintInlining NOT_PRODUCT(|| PrintOptoInlining)) {
+    outputStream* out = tty;
     if (!PrintInlining) {
       if (NOT_PRODUCT(!PrintOpto &&) !PrintCompilation) {
         method->print_short_name();
         tty->cr();
       }
       CompileTask::print_inlining(prof_method, depth, bci);
+    } else {
+      out = C->print_inlining_stream();
     }
-    CompileTask::print_inline_indent(depth);
-    tty->print(" \\-> TypeProfile (%d/%d counts) = ", receiver_count, site_count);
-    prof_klass->name()->print_symbol();
-    tty->cr();
+    CompileTask::print_inline_indent(depth, out);
+    out->print(" \\-> TypeProfile (%d/%d counts) = ", receiver_count, site_count);
+    stringStream ss;
+    prof_klass->name()->print_symbol_on(&ss);
+    out->print(ss.as_string());
+    out->cr();
   }
 }

 CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool call_is_virtual,
                                        JVMState* jvms, bool allow_inline,
-                                       float prof_factor, bool allow_intrinsics) {
+                                       float prof_factor, bool allow_intrinsics, bool delayed_forbidden) {
   ciMethod*       caller   = jvms->method();
   int             bci      = jvms->bci();
   Bytecodes::Code bytecode = caller->java_code_at_bci(bci);
@@ -126,7 +131,9 @@
   // MethodHandle.invoke* are native methods which obviously don't
   // have bytecodes and so normal inlining fails.
   if (callee->is_method_handle_intrinsic()) {
-    return CallGenerator::for_method_handle_call(jvms, caller, callee);
+    CallGenerator* cg = CallGenerator::for_method_handle_call(jvms, caller, callee, delayed_forbidden);
+    assert (cg == NULL || !delayed_forbidden || !cg->is_late_inline() || cg->is_mh_late_inline(), "unexpected CallGenerator");
+    return cg;
   }

   // Do not inline strict fp into non-strict code, or the reverse
@@ -157,20 +164,27 @@
       WarmCallInfo scratch_ci;
       if (!UseOldInlining)
         scratch_ci.init(jvms, callee, profile, prof_factor);
-      WarmCallInfo* ci = ilt->ok_to_inline(callee, jvms, profile, &scratch_ci);
+      bool should_delay = false;
+      WarmCallInfo* ci = ilt->ok_to_inline(callee, jvms, profile, &scratch_ci, should_delay);
       assert(ci != &scratch_ci, "do not let this pointer escape");
       bool allow_inline   = (ci != NULL && !ci->is_cold());
       bool require_inline = (allow_inline && ci->is_hot());

       if (allow_inline) {
         CallGenerator* cg = CallGenerator::for_inline(callee, expected_uses);
-        if (require_inline && cg != NULL && should_delay_inlining(callee, jvms)) {
+
+        if (require_inline && cg != NULL) {
           // Delay the inlining of this method to give us the
           // opportunity to perform some high level optimizations
           // first.
-          return CallGenerator::for_late_inline(callee, cg);
+          if (should_delay_inlining(callee, jvms)) {
+            assert(!delayed_forbidden, "strange");
+            return CallGenerator::for_string_late_inline(callee, cg);
+          } else if ((should_delay || AlwaysIncrementalInline) && !delayed_forbidden) {
+            return CallGenerator::for_late_inline(callee, cg);
+          }
         }
-        if (cg == NULL) {
+        if (cg == NULL || should_delay) {
           // Fall through.
         } else if (require_inline || !InlineWarmCalls) {
           return cg;
@@ -234,13 +248,13 @@
           }
           if (miss_cg != NULL) {
             if (next_hit_cg != NULL) {
-              trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
+              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), next_receiver_method, profile.receiver(1), site_count, profile.receiver_count(1));
               // We don't need to record dependency on a receiver here and below.
               // Whenever we inline, the dependency is added by Parse::Parse().
               miss_cg = CallGenerator::for_predicted_call(profile.receiver(1), miss_cg, next_hit_cg, PROB_MAX);
             }
             if (miss_cg != NULL) {
-              trace_type_profile(jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count);
+              trace_type_profile(C, jvms->method(), jvms->depth() - 1, jvms->bci(), receiver_method, profile.receiver(0), site_count, receiver_count);
               CallGenerator* cg = CallGenerator::for_predicted_call(profile.receiver(0), miss_cg, hit_cg, profile.receiver_prob(0));
               if (cg != NULL)  return cg;
             }
@@ -335,7 +349,7 @@
     return true;
   }

-  assert(dest_method->will_link(method()->holder(), klass, bc()), "dest_method: typeflow responsibility");
+  assert(dest_method->is_loaded(), "dest_method: typeflow responsibility");
   return false;
 }

@@ -351,7 +365,7 @@
   // Set frequently used booleans
   const bool is_virtual = bc() == Bytecodes::_invokevirtual;
   const bool is_virtual_or_interface = is_virtual || bc() == Bytecodes::_invokeinterface;
-  const bool has_receiver = is_virtual_or_interface || bc() == Bytecodes::_invokespecial;
+  const bool has_receiver = Bytecodes::has_receiver(bc());

   // Find target being called
   bool             will_link;
@@ -381,6 +395,8 @@
   // Note:  In the absence of miranda methods, an abstract class K can perform
   // an invokevirtual directly on an interface method I.m if K implements I.

+  // orig_callee is the resolved callee which's signature includes the
+  // appendix argument.
   const int nargs = orig_callee->arg_size();

   // Push appendix argument (MethodType, CallSite, etc.), if one.
@@ -573,7 +589,7 @@
       }
       // If there is going to be a trap, put it at the next bytecode:
       set_bci(iter().next_bci());
-      do_null_assert(peek(), T_OBJECT);
+      null_assert(peek());
       set_bci(iter().cur_bci()); // put it back
     }
   }
--- a/src/share/vm/opto/escape.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/escape.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -2313,7 +2313,7 @@
       }
     }
   }
-  if ((int)C->unique() + 2*NodeLimitFudgeFactor > MaxNodeLimit) {
+  if ((int) (C->live_nodes() + 2*NodeLimitFudgeFactor) > MaxNodeLimit) {
     if (C->do_escape_analysis() == true && !C->failing()) {
       // Retry compilation without escape analysis.
       // If this is the first failure, the sentinel string will "stick"
--- a/src/share/vm/opto/gcm.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/gcm.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1359,7 +1359,7 @@
   // If we inserted any instructions between a Call and his CatchNode,
   // clone the instructions on all paths below the Catch.
   for( i=0; i < _num_blocks; i++ )
-    _blocks[i]->call_catch_cleanup(_bbs);
+    _blocks[i]->call_catch_cleanup(_bbs, C);

 #ifndef PRODUCT
   if (trace_opto_pipelining()) {
--- a/src/share/vm/opto/graphKit.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/graphKit.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -93,6 +93,16 @@
   return jvms;
 }

+//--------------------------------sync_jvms_for_reexecute---------------------
+// Make sure our current jvms agrees with our parse state.  This version
+// uses the reexecute_sp for reexecuting bytecodes.
+JVMState* GraphKit::sync_jvms_for_reexecute() {
+  JVMState* jvms = this->jvms();
+  jvms->set_bci(bci());          // Record the new bci in the JVMState
+  jvms->set_sp(reexecute_sp());  // Record the new sp in the JVMState
+  return jvms;
+}
+
 #ifdef ASSERT
 bool GraphKit::jvms_in_sync() const {
   Parse* parse = is_Parse();
@@ -143,7 +153,7 @@
 void GraphKit::stop_and_kill_map() {
   SafePointNode* dead_map = stop();
   if (dead_map != NULL) {
-    dead_map->disconnect_inputs(NULL); // Mark the map as killed.
+    dead_map->disconnect_inputs(NULL, C); // Mark the map as killed.
     assert(dead_map->is_killed(), "must be so marked");
   }
 }
@@ -826,7 +836,16 @@
   // Walk the inline list to fill in the correct set of JVMState's
   // Also fill in the associated edges for each JVMState.

-  JVMState* youngest_jvms = sync_jvms();
+  // If the bytecode needs to be reexecuted we need to put
+  // the arguments back on the stack.
+  const bool should_reexecute = jvms()->should_reexecute();
+  JVMState* youngest_jvms = should_reexecute ? sync_jvms_for_reexecute() : sync_jvms();
+
+  // NOTE: set_bci (called from sync_jvms) might reset the reexecute bit to
+  // undefined if the bci is different.  This is normal for Parse but it
+  // should not happen for LibraryCallKit because only one bci is processed.
+  assert(!is_LibraryCallKit() || (jvms()->should_reexecute() == should_reexecute),
+         "in LibraryCallKit the reexecute bit should not change");

   // If we are guaranteed to throw, we can prune everything but the
   // input to the current bytecode.
@@ -860,7 +879,7 @@
   }

   // Presize the call:
-  debug_only(uint non_debug_edges = call->req());
+  DEBUG_ONLY(uint non_debug_edges = call->req());
   call->add_req_batch(top(), youngest_jvms->debug_depth());
   assert(call->req() == non_debug_edges + youngest_jvms->debug_depth(), "");

@@ -965,7 +984,7 @@
   assert(call->jvms()->debug_depth() == call->req() - non_debug_edges, "");
 }

-bool GraphKit::compute_stack_effects(int& inputs, int& depth, bool for_parse) {
+bool GraphKit::compute_stack_effects(int& inputs, int& depth) {
   Bytecodes::Code code = java_bc();
   if (code == Bytecodes::_wide) {
     code = method()->java_code_at_bci(bci() + 1);
@@ -1005,14 +1024,11 @@
   case Bytecodes::_getfield:
   case Bytecodes::_putfield:
     {
+      bool ignored_will_link;
+      ciField* field = method()->get_field_at_bci(bci(), ignored_will_link);
+      int      size  = field->type()->size();
       bool is_get = (depth >= 0), is_static = (depth & 1);
-      ciBytecodeStream iter(method());
-      iter.reset_to_bci(bci());
-      iter.next();
-      bool ignored_will_link;
-      ciField* field = iter.get_field(ignored_will_link);
-      int      size  = field->type()->size();
-      inputs  = (is_static ? 0 : 1);
+      inputs = (is_static ? 0 : 1);
       if (is_get) {
         depth = size - inputs;
       } else {
@@ -1028,26 +1044,11 @@
   case Bytecodes::_invokedynamic:
   case Bytecodes::_invokeinterface:
     {
-      ciBytecodeStream iter(method());
-      iter.reset_to_bci(bci());
-      iter.next();
       bool ignored_will_link;
       ciSignature* declared_signature = NULL;
-      ciMethod* callee = iter.get_method(ignored_will_link, &declared_signature);
+      ciMethod* ignored_callee = method()->get_method_at_bci(bci(), ignored_will_link, &declared_signature);
       assert(declared_signature != NULL, "cannot be null");
-      // (Do not use ciMethod::arg_size(), because
-      // it might be an unloaded method, which doesn't
-      // know whether it is static or not.)
-      if (for_parse) {
-        // Case 1: When called from parse we are *before* the invoke (in the
-        //         caller) and need to to adjust the inputs by an appendix
-        //         argument that will be pushed implicitly.
-        inputs = callee->invoke_arg_size(code) - (iter.has_appendix() ? 1 : 0);
-      } else {
-        // Case 2: Here we are *after* the invoke (in the callee) and need to
-        //         remove any appendix arguments that were popped.
-        inputs = callee->invoke_arg_size(code) - (callee->has_member_arg() ? 1 : 0);
-      }
+      inputs   = declared_signature->arg_size_for_bc(code);
       int size = declared_signature->return_type()->size();
       depth = size - inputs;
     }
@@ -1178,7 +1179,7 @@
   Node *chk = NULL;
   switch(type) {
     case T_LONG   : chk = new (C) CmpLNode(value, _gvn.zerocon(T_LONG)); break;
-    case T_INT    : chk = new (C) CmpINode( value, _gvn.intcon(0)); break;
+    case T_INT    : chk = new (C) CmpINode(value, _gvn.intcon(0)); break;
     case T_ARRAY  : // fall through
       type = T_OBJECT;  // simplify further tests
     case T_OBJECT : {
@@ -1229,7 +1230,8 @@
       break;
     }

-    default      : ShouldNotReachHere();
+    default:
+      fatal(err_msg_res("unexpected type: %s", type2name(type)));
   }
   assert(chk != NULL, "sanity check");
   chk = _gvn.transform(chk);
@@ -1769,11 +1771,21 @@
   CallProjections callprojs;
   call->extract_projections(&callprojs, true);

+  Node* init_mem = call->in(TypeFunc::Memory);
+  Node* final_mem = final_state->in(TypeFunc::Memory);
+  Node* final_ctl = final_state->in(TypeFunc::Control);
+  Node* final_io = final_state->in(TypeFunc::I_O);
+
   // Replace all the old call edges with the edges from the inlining result
-  C->gvn_replace_by(callprojs.fallthrough_catchproj, final_state->in(TypeFunc::Control));
-  C->gvn_replace_by(callprojs.fallthrough_memproj,   final_state->in(TypeFunc::Memory));
-  C->gvn_replace_by(callprojs.fallthrough_ioproj,    final_state->in(TypeFunc::I_O));
-  Node* final_mem = final_state->in(TypeFunc::Memory);
+  if (callprojs.fallthrough_catchproj != NULL) {
+    C->gvn_replace_by(callprojs.fallthrough_catchproj, final_ctl);
+  }
+  if (callprojs.fallthrough_memproj != NULL) {
+    C->gvn_replace_by(callprojs.fallthrough_memproj,   final_mem);
+  }
+  if (callprojs.fallthrough_ioproj != NULL) {
+    C->gvn_replace_by(callprojs.fallthrough_ioproj,    final_io);
+  }

   // Replace the result with the new result if it exists and is used
   if (callprojs.resproj != NULL && result != NULL) {
@@ -1782,10 +1794,15 @@

   if (ejvms == NULL) {
     // No exception edges to simply kill off those paths
-    C->gvn_replace_by(callprojs.catchall_catchproj, C->top());
-    C->gvn_replace_by(callprojs.catchall_memproj,   C->top());
-    C->gvn_replace_by(callprojs.catchall_ioproj,    C->top());
-
+    if (callprojs.catchall_catchproj != NULL) {
+      C->gvn_replace_by(callprojs.catchall_catchproj, C->top());
+    }
+    if (callprojs.catchall_memproj != NULL) {
+      C->gvn_replace_by(callprojs.catchall_memproj,   C->top());
+    }
+    if (callprojs.catchall_ioproj != NULL) {
+      C->gvn_replace_by(callprojs.catchall_ioproj,    C->top());
+    }
     // Replace the old exception object with top
     if (callprojs.exobj != NULL) {
       C->gvn_replace_by(callprojs.exobj, C->top());
@@ -1797,10 +1814,15 @@
     SafePointNode* ex_map = ekit.combine_and_pop_all_exception_states();

     Node* ex_oop = ekit.use_exception_state(ex_map);
-
-    C->gvn_replace_by(callprojs.catchall_catchproj, ekit.control());
-    C->gvn_replace_by(callprojs.catchall_memproj,   ekit.reset_memory());
-    C->gvn_replace_by(callprojs.catchall_ioproj,    ekit.i_o());
+    if (callprojs.catchall_catchproj != NULL) {
+      C->gvn_replace_by(callprojs.catchall_catchproj, ekit.control());
+    }
+    if (callprojs.catchall_memproj != NULL) {
+      C->gvn_replace_by(callprojs.catchall_memproj,   ekit.reset_memory());
+    }
+    if (callprojs.catchall_ioproj != NULL) {
+      C->gvn_replace_by(callprojs.catchall_ioproj,    ekit.i_o());
+    }

     // Replace the old exception object with the newly created one
     if (callprojs.exobj != NULL) {
@@ -1809,7 +1831,7 @@
   }

   // Disconnect the call from the graph
-  call->disconnect_inputs(NULL);
+  call->disconnect_inputs(NULL, C);
   C->gvn_replace_by(call, C->top());

   // Clean up any MergeMems that feed other MergeMems since the
@@ -1861,15 +1883,17 @@
   // occurs here, the runtime will make sure an MDO exists.  There is
   // no need to call method()->ensure_method_data() at this point.

+  // Set the stack pointer to the right value for reexecution:
+  set_sp(reexecute_sp());
+
 #ifdef ASSERT
   if (!must_throw) {
     // Make sure the stack has at least enough depth to execute
     // the current bytecode.
-    int inputs, ignore;
-    if (compute_stack_effects(inputs, ignore)) {
-      assert(sp() >= inputs, "must have enough JVMS stack to execute");
-      // It is a frequent error in library_call.cpp to issue an
-      // uncommon trap with the _sp value already popped.
+    int inputs, ignored_depth;
+    if (compute_stack_effects(inputs, ignored_depth)) {
+      assert(sp() >= inputs, err_msg_res("must have enough JVMS stack to execute %s: sp=%d, inputs=%d",
+             Bytecodes::name(java_bc()), sp(), inputs));
     }
   }
 #endif
@@ -1900,7 +1924,8 @@
   case Deoptimization::Action_make_not_compilable:
     break;
   default:
-    assert(false, "bad action");
+    fatal(err_msg_res("unknown action %d: %s", action, Deoptimization::trap_action_name(action)));
+    break;
 #endif
   }

@@ -2667,7 +2692,7 @@
       case SSC_always_false:
         // It needs a null check because a null will *pass* the cast check.
         // A non-null value will always produce an exception.
-        return do_null_assert(obj, T_OBJECT);
+        return null_assert(obj);
       }
     }
   }
@@ -2786,7 +2811,7 @@
   mb->init_req(TypeFunc::Control, control());
   mb->init_req(TypeFunc::Memory,  reset_memory());
   Node* membar = _gvn.transform(mb);
-  set_control(_gvn.transform(new (C) ProjNode(membar,TypeFunc::Control) ));
+  set_control(_gvn.transform(new (C) ProjNode(membar, TypeFunc::Control)));
   set_all_memory_call(membar);
   return membar;
 }
@@ -2975,7 +3000,7 @@
   set_control( _gvn.transform(new (C) ProjNode(allocx, TypeFunc::Control) ) );
   // create memory projection for i_o
   set_memory ( _gvn.transform( new (C) ProjNode(allocx, TypeFunc::Memory, true) ), rawidx );
-  make_slow_call_ex(allocx, env()->OutOfMemoryError_klass(), true);
+  make_slow_call_ex(allocx, env()->Throwable_klass(), true);

   // create a memory projection as for the normal control path
   Node* malloc = _gvn.transform(new (C) ProjNode(allocx, TypeFunc::Memory));
@@ -3148,7 +3173,7 @@
     Node* cmp_lh = _gvn.transform( new(C) CmpINode(layout_val, intcon(layout_con)) );
     Node* bol_lh = _gvn.transform( new(C) BoolNode(cmp_lh, BoolTest::eq) );
     { BuildCutout unless(this, bol_lh, PROB_MAX);
-      _sp += nargs;
+      inc_sp(nargs);
       uncommon_trap(Deoptimization::Reason_class_check,
                     Deoptimization::Action_maybe_recompile);
     }
@@ -3391,7 +3416,7 @@
   {
     PreserveJVMState pjvms(this);
     set_control(iffalse);
-    _sp += nargs;
+    inc_sp(nargs);
     uncommon_trap(reason, Deoptimization::Action_maybe_recompile);
   }
   Node* iftrue = _gvn.transform(new (C) IfTrueNode(iff));
--- a/src/share/vm/opto/graphKit.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/graphKit.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -41,6 +41,7 @@
 class FastLockNode;
 class FastUnlockNode;
 class IdealKit;
+class LibraryCallKit;
 class Parse;
 class RootNode;

@@ -60,11 +61,13 @@
   PhaseGVN         &_gvn;       // Some optimizations while parsing
   SafePointNode*    _map;       // Parser map from JVM to Nodes
   SafePointNode*    _exceptions;// Parser map(s) for exception state(s)
-  int               _sp;        // JVM Expression Stack Pointer
   int               _bci;       // JVM Bytecode Pointer
   ciMethod*         _method;    // JVM Current Method

  private:
+  int               _sp;        // JVM Expression Stack Pointer; don't modify directly!
+
+ private:
   SafePointNode*     map_not_null() const {
     assert(_map != NULL, "must call stopped() to test for reset compiler map");
     return _map;
@@ -80,7 +83,8 @@
   }
 #endif

-  virtual Parse* is_Parse() const { return NULL; }
+  virtual Parse*          is_Parse()          const { return NULL; }
+  virtual LibraryCallKit* is_LibraryCallKit() const { return NULL; }

   ciEnv*        env()           const { return _env; }
   PhaseGVN&     gvn()           const { return _gvn; }
@@ -141,7 +145,7 @@
                                         _bci = jvms->bci();
                                         _method = jvms->has_method() ? jvms->method() : NULL; }
   void set_map(SafePointNode* m)      { _map = m; debug_only(verify_map()); }
-  void set_sp(int i)                  { assert(i >= 0, "must be non-negative"); _sp = i; }
+  void set_sp(int sp)                 { assert(sp >= 0, err_msg_res("sp must be non-negative: %d", sp)); _sp = sp; }
   void clean_stack(int from_sp); // clear garbage beyond from_sp to top

   void inc_sp(int i)                  { set_sp(sp() + i); }
@@ -149,7 +153,9 @@
   void set_bci(int bci)               { _bci = bci; }

   // Make sure jvms has current bci & sp.
-  JVMState* sync_jvms()     const;
+  JVMState* sync_jvms() const;
+  JVMState* sync_jvms_for_reexecute();
+
 #ifdef ASSERT
   // Make sure JVMS has an updated copy of bci and sp.
   // Also sanity-check method, depth, and monitor depth.
@@ -286,7 +292,7 @@
   // How many stack inputs does the current BC consume?
   // And, how does the stack change after the bytecode?
   // Returns false if unknown.
-  bool compute_stack_effects(int& inputs, int& depth, bool for_parse = false);
+  bool compute_stack_effects(int& inputs, int& depth);

   // Add a fixed offset to a pointer
   Node* basic_plus_adr(Node* base, Node* ptr, intptr_t offset) {
@@ -337,20 +343,37 @@
   Node* load_object_klass(Node* object);
   // Find out the length of an array.
   Node* load_array_length(Node* array);
+
+
   // Helper function to do a NULL pointer check or ZERO check based on type.
-  Node* null_check_common(Node* value, BasicType type,
-                          bool assert_null, Node* *null_control);
   // Throw an exception if a given value is null.
   // Return the value cast to not-null.
   // Be clever about equivalent dominating null checks.
-  Node* do_null_check(Node* value, BasicType type) {
-    return null_check_common(value, type, false, NULL);
+  Node* null_check_common(Node* value, BasicType type,
+                          bool assert_null = false, Node* *null_control = NULL);
+  Node* null_check(Node* value, BasicType type = T_OBJECT) {
+    return null_check_common(value, type);
+  }
+  Node* null_check_receiver() {
+    assert(argument(0)->bottom_type()->isa_ptr(), "must be");
+    return null_check(argument(0));
+  }
+  Node* zero_check_int(Node* value) {
+    assert(value->bottom_type()->basic_type() == T_INT,
+        err_msg_res("wrong type: %s", type2name(value->bottom_type()->basic_type())));
+    return null_check_common(value, T_INT);
+  }
+  Node* zero_check_long(Node* value) {
+    assert(value->bottom_type()->basic_type() == T_LONG,
+        err_msg_res("wrong type: %s", type2name(value->bottom_type()->basic_type())));
+    return null_check_common(value, T_LONG);
   }
   // Throw an uncommon trap if a given value is __not__ null.
   // Return the value cast to null, and be clever about dominating checks.
-  Node* do_null_assert(Node* value, BasicType type) {
-    return null_check_common(value, type, true, NULL);
+  Node* null_assert(Node* value, BasicType type = T_OBJECT) {
+    return null_check_common(value, type, true);
   }
+
   // Null check oop.  Return null-path control into (*null_control).
   // Return a cast-not-null node which depends on the not-null control.
   // If never_see_null, use an uncommon trap (*null_control sees a top).
@@ -371,9 +394,9 @@
   // Replace all occurrences of one node by another.
   void replace_in_map(Node* old, Node* neww);

-  void  push(Node* n)     { map_not_null();        _map->set_stack(_map->_jvms,   _sp++, n); }
-  Node* pop()             { map_not_null(); return _map->stack(    _map->_jvms, --_sp); }
-  Node* peek(int off = 0) { map_not_null(); return _map->stack(    _map->_jvms,   _sp - off - 1); }
+  void  push(Node* n)     { map_not_null();        _map->set_stack(_map->_jvms,   _sp++        , n); }
+  Node* pop()             { map_not_null(); return _map->stack(    _map->_jvms, --_sp             ); }
+  Node* peek(int off = 0) { map_not_null(); return _map->stack(    _map->_jvms,   _sp - off - 1   ); }

   void push_pair(Node* ldval) {
     push(ldval);
@@ -580,19 +603,15 @@

   //---------- help for generating calls --------------

-  // Do a null check on the receiver, which is in argument(0).
-  Node* null_check_receiver(ciMethod* callee) {
+  // Do a null check on the receiver as it would happen before the call to
+  // callee (with all arguments still on the stack).
+  Node* null_check_receiver_before_call(ciMethod* callee) {
     assert(!callee->is_static(), "must be a virtual method");
-    int nargs = 1 + callee->signature()->size();
-    // Null check on self without removing any arguments.  The argument
-    // null check technically happens in the wrong place, which can lead to
-    // invalid stack traces when the primitive is inlined into a method
-    // which handles NullPointerExceptions.
-    Node* receiver = argument(0);
-    _sp += nargs;
-    receiver = do_null_check(receiver, T_OBJECT);
-    _sp -= nargs;
-    return receiver;
+    const int nargs = callee->arg_size();
+    inc_sp(nargs);
+    Node* n = null_check_receiver();
+    dec_sp(nargs);
+    return n;
   }

   // Fill in argument edges for the call from argument(0), argument(1), ...
@@ -645,6 +664,9 @@
                   klass, reason_string, must_throw, keep_exact_action);
   }

+  // SP when bytecode needs to be reexecuted.
+  virtual int reexecute_sp() { return sp(); }
+
   // Report if there were too many traps at the current method and bci.
   // Report if a trap was recorded, and/or PerMethodTrapLimit was exceeded.
   // If there is no MDO at all, report no trap unless told to assume it.
--- a/src/share/vm/opto/ifg.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/ifg.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -573,7 +573,7 @@
               (n2lidx(def) && !liveout.member(n2lidx(def)) ) ) {
             b->_nodes.remove(j - 1);
             if( lrgs(r)._def == n ) lrgs(r)._def = 0;
-            n->disconnect_inputs(NULL);
+            n->disconnect_inputs(NULL, C);
             _cfg._bbs.map(n->_idx,NULL);
             n->replace_by(C->top());
             // Since yanking a Node from block, high pressure moves up one
--- a/src/share/vm/opto/lcm.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/lcm.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1005,7 +1005,7 @@
 //------------------------------call_catch_cleanup-----------------------------
 // If we inserted any instructions between a Call and his CatchNode,
 // clone the instructions on all paths below the Catch.
-void Block::call_catch_cleanup(Block_Array &bbs) {
+void Block::call_catch_cleanup(Block_Array &bbs, Compile* C) {

   // End of region to clone
   uint end = end_idx();
@@ -1067,7 +1067,7 @@

   // Remove the now-dead cloned ops
   for(uint i3 = beg; i3 < end; i3++ ) {
-    _nodes[beg]->disconnect_inputs(NULL);
+    _nodes[beg]->disconnect_inputs(NULL, C);
     _nodes.remove(beg);
   }

@@ -1080,7 +1080,7 @@
       Node *n = sb->_nodes[j];
       if (n->outcnt() == 0 &&
           (!n->is_Proj() || n->as_Proj()->in(0)->outcnt() == 1) ){
-        n->disconnect_inputs(NULL);
+        n->disconnect_inputs(NULL, C);
         sb->_nodes.remove(j);
         new_cnt--;
       }
--- a/src/share/vm/opto/library_call.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/library_call.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -67,30 +67,64 @@
 // Local helper class for LibraryIntrinsic:
 class LibraryCallKit : public GraphKit {
  private:
-  LibraryIntrinsic* _intrinsic;   // the library intrinsic being called
+  LibraryIntrinsic* _intrinsic;     // the library intrinsic being called
+  Node*             _result;        // the result node, if any
+  int               _reexecute_sp;  // the stack pointer when bytecode needs to be reexecuted

   const TypeOopPtr* sharpen_unsafe_type(Compile::AliasType* alias_type, const TypePtr *adr_type, bool is_native_ptr = false);

  public:
-  LibraryCallKit(JVMState* caller, LibraryIntrinsic* intrinsic)
-    : GraphKit(caller),
-      _intrinsic(intrinsic)
+  LibraryCallKit(JVMState* jvms, LibraryIntrinsic* intrinsic)
+    : GraphKit(jvms),
+      _intrinsic(intrinsic),
+      _result(NULL)
   {
+    // Check if this is a root compile.  In that case we don't have a caller.
+    if (!jvms->has_method()) {
+      _reexecute_sp = sp();
+    } else {
+      // Find out how many arguments the interpreter needs when deoptimizing
+      // and save the stack pointer value so it can used by uncommon_trap.
+      // We find the argument count by looking at the declared signature.
+      bool ignored_will_link;
+      ciSignature* declared_signature = NULL;
+      ciMethod* ignored_callee = caller()->get_method_at_bci(bci(), ignored_will_link, &declared_signature);
+      const int nargs = declared_signature->arg_size_for_bc(caller()->java_code_at_bci(bci()));
+      _reexecute_sp = sp() + nargs;  // "push" arguments back on stack
+    }
   }

+  virtual LibraryCallKit* is_LibraryCallKit() const { return (LibraryCallKit*)this; }
+
   ciMethod*         caller()    const    { return jvms()->method(); }
   int               bci()       const    { return jvms()->bci(); }
   LibraryIntrinsic* intrinsic() const    { return _intrinsic; }
   vmIntrinsics::ID  intrinsic_id() const { return _intrinsic->intrinsic_id(); }
   ciMethod*         callee()    const    { return _intrinsic->method(); }
-  ciSignature*      signature() const    { return callee()->signature(); }
-  int               arg_size()  const    { return callee()->arg_size(); }

   bool try_to_inline();
   Node* try_to_predicate();

+  void push_result() {
+    // Push the result onto the stack.
+    if (!stopped() && result() != NULL) {
+      BasicType bt = result()->bottom_type()->basic_type();
+      push_node(bt, result());
+    }
+  }
+
+ private:
+  void fatal_unexpected_iid(vmIntrinsics::ID iid) {
+    fatal(err_msg_res("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
+  }
+
+  void  set_result(Node* n) { assert(_result == NULL, "only set once"); _result = n; }
+  void  set_result(RegionNode* region, PhiNode* value);
+  Node*     result() { return _result; }
+
+  virtual int reexecute_sp() { return _reexecute_sp; }
+
   // Helper functions to inline natives
-  void push_result(RegionNode* region, PhiNode* value);
   Node* generate_guard(Node* test, RegionNode* region, float true_prob);
   Node* generate_slow_guard(Node* test, RegionNode* region);
   Node* generate_fair_guard(Node* test, RegionNode* region);
@@ -108,21 +142,19 @@
                               bool disjoint_bases, const char* &name, bool dest_uninitialized);
   Node* load_mirror_from_klass(Node* klass);
   Node* load_klass_from_mirror_common(Node* mirror, bool never_see_null,
-                                      int nargs,
                                       RegionNode* region, int null_path,
                                       int offset);
-  Node* load_klass_from_mirror(Node* mirror, bool never_see_null, int nargs,
+  Node* load_klass_from_mirror(Node* mirror, bool never_see_null,
                                RegionNode* region, int null_path) {
     int offset = java_lang_Class::klass_offset_in_bytes();
-    return load_klass_from_mirror_common(mirror, never_see_null, nargs,
+    return load_klass_from_mirror_common(mirror, never_see_null,
                                          region, null_path,
                                          offset);
   }
   Node* load_array_klass_from_mirror(Node* mirror, bool never_see_null,
-                                     int nargs,
                                      RegionNode* region, int null_path) {
     int offset = java_lang_Class::array_klass_offset_in_bytes();
-    return load_klass_from_mirror_common(mirror, never_see_null, nargs,
+    return load_klass_from_mirror_common(mirror, never_see_null,
                                          region, null_path,
                                          offset);
   }
@@ -161,16 +193,14 @@
   bool inline_string_indexOf();
   Node* string_indexOf(Node* string_object, ciTypeArray* target_array, jint offset, jint cache_i, jint md2_i);
   bool inline_string_equals();
-  Node* pop_math_arg();
+  Node* round_double_node(Node* n);
   bool runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName);
   bool inline_math_native(vmIntrinsics::ID id);
   bool inline_trig(vmIntrinsics::ID id);
-  bool inline_trans(vmIntrinsics::ID id);
-  bool inline_abs(vmIntrinsics::ID id);
-  bool inline_sqrt(vmIntrinsics::ID id);
+  bool inline_math(vmIntrinsics::ID id);
+  bool inline_exp();
+  bool inline_pow();
   void finish_pow_exp(Node* result, Node* x, Node* y, const TypeFunc* call_type, address funcAddr, const char* funcName);
-  bool inline_pow(vmIntrinsics::ID id);
-  bool inline_exp(vmIntrinsics::ID id);
   bool inline_min_max(vmIntrinsics::ID id);
   Node* generate_min_max(vmIntrinsics::ID id, Node* x, Node* y);
   // This returns Type::AnyPtr, RawPtr, or OopPtr.
@@ -179,7 +209,7 @@
   // Helper for inline_unsafe_access.
   // Generates the guards that check whether the result of
   // Unsafe.getObject should be recorded in an SATB log buffer.
-  void insert_pre_barrier(Node* base_oop, Node* offset, Node* pre_val, int nargs, bool need_mem_bar);
+  void insert_pre_barrier(Node* base_oop, Node* offset, Node* pre_val, bool need_mem_bar);
   bool inline_unsafe_access(bool is_native_ptr, bool is_store, BasicType type, bool is_volatile);
   bool inline_unsafe_prefetch(bool is_native_ptr, bool is_store, bool is_static);
   bool inline_unsafe_allocate();
@@ -253,11 +283,7 @@
   bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind);
   bool inline_unsafe_ordered_store(BasicType type);
   bool inline_fp_conversions(vmIntrinsics::ID id);
-  bool inline_numberOfLeadingZeros(vmIntrinsics::ID id);
-  bool inline_numberOfTrailingZeros(vmIntrinsics::ID id);
-  bool inline_bitCount(vmIntrinsics::ID id);
-  bool inline_reverseBytes(vmIntrinsics::ID id);
-
+  bool inline_number_methods(vmIntrinsics::ID id);
   bool inline_reference_get();
   bool inline_aescrypt_Block(vmIntrinsics::ID id);
   bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
@@ -321,15 +347,18 @@
   switch (id) {
   case vmIntrinsics::_compareTo:
     if (!SpecialStringCompareTo)  return NULL;
+    if (!Matcher::match_rule_supported(Op_StrComp))  return NULL;
     break;
   case vmIntrinsics::_indexOf:
     if (!SpecialStringIndexOf)  return NULL;
     break;
   case vmIntrinsics::_equals:
     if (!SpecialStringEquals)  return NULL;
+    if (!Matcher::match_rule_supported(Op_StrEquals))  return NULL;
     break;
   case vmIntrinsics::_equalsC:
     if (!SpecialArraysEquals)  return NULL;
+    if (!Matcher::match_rule_supported(Op_AryEq))  return NULL;
     break;
   case vmIntrinsics::_arraycopy:
     if (!InlineArrayCopy)  return NULL;
@@ -382,6 +411,19 @@
     if (!Matcher::match_rule_supported(Op_CountTrailingZerosL)) return NULL;
     break;

+  case vmIntrinsics::_reverseBytes_c:
+    if (!Matcher::match_rule_supported(Op_ReverseBytesUS)) return NULL;
+    break;
+  case vmIntrinsics::_reverseBytes_s:
+    if (!Matcher::match_rule_supported(Op_ReverseBytesS))  return NULL;
+    break;
+  case vmIntrinsics::_reverseBytes_i:
+    if (!Matcher::match_rule_supported(Op_ReverseBytesI))  return NULL;
+    break;
+  case vmIntrinsics::_reverseBytes_l:
+    if (!Matcher::match_rule_supported(Op_ReverseBytesL))  return NULL;
+    break;
+
   case vmIntrinsics::_Reference_get:
     // Use the intrinsic version of Reference.get() so that the value in
     // the referent field can be registered by the G1 pre-barrier code.
@@ -488,10 +530,13 @@
     tty->print_cr("Intrinsic %s", str);
   }
 #endif
-
+  ciMethod* callee = kit.callee();
+  const int bci    = kit.bci();
+
+  // Try to inline the intrinsic.
   if (kit.try_to_inline()) {
     if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) {
-      CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
+      C->print_inlining(callee, jvms->depth() - 1, bci, is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
     }
     C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_worked);
     if (C->log()) {
@@ -500,6 +545,8 @@
                      (is_virtual() ? " virtual='1'" : ""),
                      C->unique() - nodes);
     }
+    // Push the result from the inlined method onto the stack.
+    kit.push_result();
     return kit.transfer_exceptions_into_jvms();
   }

@@ -508,12 +555,12 @@
     if (jvms->has_method()) {
       // Not a root compile.
       const char* msg = is_virtual() ? "failed to inline (intrinsic, virtual)" : "failed to inline (intrinsic)";
-      CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), msg);
+      C->print_inlining(callee, jvms->depth() - 1, bci, msg);
     } else {
       // Root compile
       tty->print("Did not generate intrinsic %s%s at bci:%d in",
                vmIntrinsics::name_at(intrinsic_id()),
-               (is_virtual() ? " (virtual)" : ""), kit.bci());
+               (is_virtual() ? " (virtual)" : ""), bci);
     }
   }
   C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed);
@@ -532,9 +579,15 @@
     tty->print_cr("Predicate for intrinsic %s", str);
   }
 #endif
+  ciMethod* callee = kit.callee();
+  const int bci    = kit.bci();

   Node* slow_ctl = kit.try_to_predicate();
   if (!kit.failing()) {
+    if (PrintIntrinsics || PrintInlining NOT_PRODUCT( || PrintOptoInlining) ) {
+      C->print_inlining(callee, jvms->depth() - 1, bci, is_virtual() ? "(intrinsic, virtual)" : "(intrinsic)");
+    }
+    C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_worked);
     if (C->log()) {
       C->log()->elem("predicate_intrinsic id='%s'%s nodes='%d'",
                      vmIntrinsics::name_at(intrinsic_id()),
@@ -549,12 +602,12 @@
     if (jvms->has_method()) {
       // Not a root compile.
       const char* msg = "failed to generate predicate for intrinsic";
-      CompileTask::print_inlining(kit.callee(), jvms->depth() - 1, kit.bci(), msg);
+      C->print_inlining(kit.callee(), jvms->depth() - 1, bci, msg);
     } else {
       // Root compile
-      tty->print("Did not generate predicate for intrinsic %s%s at bci:%d in",
-               vmIntrinsics::name_at(intrinsic_id()),
-               (is_virtual() ? " (virtual)" : ""), kit.bci());
+      C->print_inlining_stream()->print("Did not generate predicate for intrinsic %s%s at bci:%d in",
+                                        vmIntrinsics::name_at(intrinsic_id()),
+                                        (is_virtual() ? " (virtual)" : ""), bci);
     }
   }
   C->gather_intrinsic_statistics(intrinsic_id(), is_virtual(), Compile::_intrinsic_failed);
@@ -566,6 +619,7 @@
   const bool is_store       = true;
   const bool is_native_ptr  = true;
   const bool is_static      = true;
+  const bool is_volatile    = true;

   if (!jvms()->has_method()) {
     // Root JVMState has a null method.
@@ -575,13 +629,11 @@
   }
   assert(merged_memory(), "");

+
   switch (intrinsic_id()) {
-  case vmIntrinsics::_hashCode:
-    return inline_native_hashcode(intrinsic()->is_virtual(), !is_static);
-  case vmIntrinsics::_identityHashCode:
-    return inline_native_hashcode(/*!virtual*/ false, is_static);
-  case vmIntrinsics::_getClass:
-    return inline_native_getClass();
+  case vmIntrinsics::_hashCode:                 return inline_native_hashcode(intrinsic()->is_virtual(), !is_static);
+  case vmIntrinsics::_identityHashCode:         return inline_native_hashcode(/*!virtual*/ false,         is_static);
+  case vmIntrinsics::_getClass:                 return inline_native_getClass();

   case vmIntrinsics::_dsin:
   case vmIntrinsics::_dcos:
@@ -592,203 +644,114 @@
   case vmIntrinsics::_dexp:
   case vmIntrinsics::_dlog:
   case vmIntrinsics::_dlog10:
-  case vmIntrinsics::_dpow:
-    return inline_math_native(intrinsic_id());
+  case vmIntrinsics::_dpow:                     return inline_math_native(intrinsic_id());

   case vmIntrinsics::_min:
-  case vmIntrinsics::_max:
-    return inline_min_max(intrinsic_id());
-
-  case vmIntrinsics::_arraycopy:
-    return inline_arraycopy();
-
-  case vmIntrinsics::_compareTo:
-    return inline_string_compareTo();
-  case vmIntrinsics::_indexOf:
-    return inline_string_indexOf();
-  case vmIntrinsics::_equals:
-    return inline_string_equals();
-
-  case vmIntrinsics::_getObject:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_OBJECT, false);
-  case vmIntrinsics::_getBoolean:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_BOOLEAN, false);
-  case vmIntrinsics::_getByte:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_BYTE, false);
-  case vmIntrinsics::_getShort:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_SHORT, false);
-  case vmIntrinsics::_getChar:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_CHAR, false);
-  case vmIntrinsics::_getInt:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_INT, false);
-  case vmIntrinsics::_getLong:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_LONG, false);
-  case vmIntrinsics::_getFloat:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_FLOAT, false);
-  case vmIntrinsics::_getDouble:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_DOUBLE, false);
-
-  case vmIntrinsics::_putObject:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_OBJECT, false);
-  case vmIntrinsics::_putBoolean:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_BOOLEAN, false);
-  case vmIntrinsics::_putByte:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_BYTE, false);
-  case vmIntrinsics::_putShort:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_SHORT, false);
-  case vmIntrinsics::_putChar:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_CHAR, false);
-  case vmIntrinsics::_putInt:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_INT, false);
-  case vmIntrinsics::_putLong:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_LONG, false);
-  case vmIntrinsics::_putFloat:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_FLOAT, false);
-  case vmIntrinsics::_putDouble:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_DOUBLE, false);
-
-  case vmIntrinsics::_getByte_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_BYTE, false);
-  case vmIntrinsics::_getShort_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_SHORT, false);
-  case vmIntrinsics::_getChar_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_CHAR, false);
-  case vmIntrinsics::_getInt_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_INT, false);
-  case vmIntrinsics::_getLong_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_LONG, false);
-  case vmIntrinsics::_getFloat_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_FLOAT, false);
-  case vmIntrinsics::_getDouble_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_DOUBLE, false);
-  case vmIntrinsics::_getAddress_raw:
-    return inline_unsafe_access(is_native_ptr, !is_store, T_ADDRESS, false);
-
-  case vmIntrinsics::_putByte_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_BYTE, false);
-  case vmIntrinsics::_putShort_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_SHORT, false);
-  case vmIntrinsics::_putChar_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_CHAR, false);
-  case vmIntrinsics::_putInt_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_INT, false);
-  case vmIntrinsics::_putLong_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_LONG, false);
-  case vmIntrinsics::_putFloat_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_FLOAT, false);
-  case vmIntrinsics::_putDouble_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_DOUBLE, false);
-  case vmIntrinsics::_putAddress_raw:
-    return inline_unsafe_access(is_native_ptr, is_store, T_ADDRESS, false);
-
-  case vmIntrinsics::_getObjectVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_OBJECT, true);
-  case vmIntrinsics::_getBooleanVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_BOOLEAN, true);
-  case vmIntrinsics::_getByteVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_BYTE, true);
-  case vmIntrinsics::_getShortVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_SHORT, true);
-  case vmIntrinsics::_getCharVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_CHAR, true);
-  case vmIntrinsics::_getIntVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_INT, true);
-  case vmIntrinsics::_getLongVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_LONG, true);
-  case vmIntrinsics::_getFloatVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_FLOAT, true);
-  case vmIntrinsics::_getDoubleVolatile:
-    return inline_unsafe_access(!is_native_ptr, !is_store, T_DOUBLE, true);
-
-  case vmIntrinsics::_putObjectVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_OBJECT, true);
-  case vmIntrinsics::_putBooleanVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_BOOLEAN, true);
-  case vmIntrinsics::_putByteVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_BYTE, true);
-  case vmIntrinsics::_putShortVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_SHORT, true);
-  case vmIntrinsics::_putCharVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_CHAR, true);
-  case vmIntrinsics::_putIntVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_INT, true);
-  case vmIntrinsics::_putLongVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_LONG, true);
-  case vmIntrinsics::_putFloatVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_FLOAT, true);
-  case vmIntrinsics::_putDoubleVolatile:
-    return inline_unsafe_access(!is_native_ptr, is_store, T_DOUBLE, true);
-
-  case vmIntrinsics::_prefetchRead:
-    return inline_unsafe_prefetch(!is_native_ptr, !is_store, !is_static);
-  case vmIntrinsics::_prefetchWrite:
-    return inline_unsafe_prefetch(!is_native_ptr, is_store, !is_static);
-  case vmIntrinsics::_prefetchReadStatic:
-    return inline_unsafe_prefetch(!is_native_ptr, !is_store, is_static);
-  case vmIntrinsics::_prefetchWriteStatic:
-    return inline_unsafe_prefetch(!is_native_ptr, is_store, is_static);
-
-  case vmIntrinsics::_compareAndSwapObject:
-    return inline_unsafe_load_store(T_OBJECT, LS_cmpxchg);
-  case vmIntrinsics::_compareAndSwapInt:
-    return inline_unsafe_load_store(T_INT, LS_cmpxchg);
-  case vmIntrinsics::_compareAndSwapLong:
-    return inline_unsafe_load_store(T_LONG, LS_cmpxchg);
-
-  case vmIntrinsics::_putOrderedObject:
-    return inline_unsafe_ordered_store(T_OBJECT);
-  case vmIntrinsics::_putOrderedInt:
-    return inline_unsafe_ordered_store(T_INT);
-  case vmIntrinsics::_putOrderedLong:
-    return inline_unsafe_ordered_store(T_LONG);
-
-  case vmIntrinsics::_getAndAddInt:
-    return inline_unsafe_load_store(T_INT, LS_xadd);
-  case vmIntrinsics::_getAndAddLong:
-    return inline_unsafe_load_store(T_LONG, LS_xadd);
-  case vmIntrinsics::_getAndSetInt:
-    return inline_unsafe_load_store(T_INT, LS_xchg);
-  case vmIntrinsics::_getAndSetLong:
-    return inline_unsafe_load_store(T_LONG, LS_xchg);
-  case vmIntrinsics::_getAndSetObject:
-    return inline_unsafe_load_store(T_OBJECT, LS_xchg);
-
-  case vmIntrinsics::_currentThread:
-    return inline_native_currentThread();
-  case vmIntrinsics::_isInterrupted:
-    return inline_native_isInterrupted();
+  case vmIntrinsics::_max:                      return inline_min_max(intrinsic_id());
+
+  case vmIntrinsics::_arraycopy:                return inline_arraycopy();
+
+  case vmIntrinsics::_compareTo:                return inline_string_compareTo();
+  case vmIntrinsics::_indexOf:                  return inline_string_indexOf();
+  case vmIntrinsics::_equals:                   return inline_string_equals();
+
+  case vmIntrinsics::_getObject:                return inline_unsafe_access(!is_native_ptr, !is_store, T_OBJECT,  !is_volatile);
+  case vmIntrinsics::_getBoolean:               return inline_unsafe_access(!is_native_ptr, !is_store, T_BOOLEAN, !is_volatile);
+  case vmIntrinsics::_getByte:                  return inline_unsafe_access(!is_native_ptr, !is_store, T_BYTE,    !is_volatile);
+  case vmIntrinsics::_getShort:                 return inline_unsafe_access(!is_native_ptr, !is_store, T_SHORT,   !is_volatile);
+  case vmIntrinsics::_getChar:                  return inline_unsafe_access(!is_native_ptr, !is_store, T_CHAR,    !is_volatile);
+  case vmIntrinsics::_getInt:                   return inline_unsafe_access(!is_native_ptr, !is_store, T_INT,     !is_volatile);
+  case vmIntrinsics::_getLong:                  return inline_unsafe_access(!is_native_ptr, !is_store, T_LONG,    !is_volatile);
+  case vmIntrinsics::_getFloat:                 return inline_unsafe_access(!is_native_ptr, !is_store, T_FLOAT,   !is_volatile);
+  case vmIntrinsics::_getDouble:                return inline_unsafe_access(!is_native_ptr, !is_store, T_DOUBLE,  !is_volatile);
+
+  case vmIntrinsics::_putObject:                return inline_unsafe_access(!is_native_ptr,  is_store, T_OBJECT,  !is_volatile);
+  case vmIntrinsics::_putBoolean:               return inline_unsafe_access(!is_native_ptr,  is_store, T_BOOLEAN, !is_volatile);
+  case vmIntrinsics::_putByte:                  return inline_unsafe_access(!is_native_ptr,  is_store, T_BYTE,    !is_volatile);
+  case vmIntrinsics::_putShort:                 return inline_unsafe_access(!is_native_ptr,  is_store, T_SHORT,   !is_volatile);
+  case vmIntrinsics::_putChar:                  return inline_unsafe_access(!is_native_ptr,  is_store, T_CHAR,    !is_volatile);
+  case vmIntrinsics::_putInt:                   return inline_unsafe_access(!is_native_ptr,  is_store, T_INT,     !is_volatile);
+  case vmIntrinsics::_putLong:                  return inline_unsafe_access(!is_native_ptr,  is_store, T_LONG,    !is_volatile);
+  case vmIntrinsics::_putFloat:                 return inline_unsafe_access(!is_native_ptr,  is_store, T_FLOAT,   !is_volatile);
+  case vmIntrinsics::_putDouble:                return inline_unsafe_access(!is_native_ptr,  is_store, T_DOUBLE,  !is_volatile);
+
+  case vmIntrinsics::_getByte_raw:              return inline_unsafe_access( is_native_ptr, !is_store, T_BYTE,    !is_volatile);
+  case vmIntrinsics::_getShort_raw:             return inline_unsafe_access( is_native_ptr, !is_store, T_SHORT,   !is_volatile);
+  case vmIntrinsics::_getChar_raw:              return inline_unsafe_access( is_native_ptr, !is_store, T_CHAR,    !is_volatile);
+  case vmIntrinsics::_getInt_raw:               return inline_unsafe_access( is_native_ptr, !is_store, T_INT,     !is_volatile);
+  case vmIntrinsics::_getLong_raw:              return inline_unsafe_access( is_native_ptr, !is_store, T_LONG,    !is_volatile);
+  case vmIntrinsics::_getFloat_raw:             return inline_unsafe_access( is_native_ptr, !is_store, T_FLOAT,   !is_volatile);
+  case vmIntrinsics::_getDouble_raw:            return inline_unsafe_access( is_native_ptr, !is_store, T_DOUBLE,  !is_volatile);
+  case vmIntrinsics::_getAddress_raw:           return inline_unsafe_access( is_native_ptr, !is_store, T_ADDRESS, !is_volatile);
+
+  case vmIntrinsics::_putByte_raw:              return inline_unsafe_access( is_native_ptr,  is_store, T_BYTE,    !is_volatile);
+  case vmIntrinsics::_putShort_raw:             return inline_unsafe_access( is_native_ptr,  is_store, T_SHORT,   !is_volatile);
+  case vmIntrinsics::_putChar_raw:              return inline_unsafe_access( is_native_ptr,  is_store, T_CHAR,    !is_volatile);
+  case vmIntrinsics::_putInt_raw:               return inline_unsafe_access( is_native_ptr,  is_store, T_INT,     !is_volatile);
+  case vmIntrinsics::_putLong_raw:              return inline_unsafe_access( is_native_ptr,  is_store, T_LONG,    !is_volatile);
+  case vmIntrinsics::_putFloat_raw:             return inline_unsafe_access( is_native_ptr,  is_store, T_FLOAT,   !is_volatile);
+  case vmIntrinsics::_putDouble_raw:            return inline_unsafe_access( is_native_ptr,  is_store, T_DOUBLE,  !is_volatile);
+  case vmIntrinsics::_putAddress_raw:           return inline_unsafe_access( is_native_ptr,  is_store, T_ADDRESS, !is_volatile);
+
+  case vmIntrinsics::_getObjectVolatile:        return inline_unsafe_access(!is_native_ptr, !is_store, T_OBJECT,   is_volatile);
+  case vmIntrinsics::_getBooleanVolatile:       return inline_unsafe_access(!is_native_ptr, !is_store, T_BOOLEAN,  is_volatile);
+  case vmIntrinsics::_getByteVolatile:          return inline_unsafe_access(!is_native_ptr, !is_store, T_BYTE,     is_volatile);
+  case vmIntrinsics::_getShortVolatile:         return inline_unsafe_access(!is_native_ptr, !is_store, T_SHORT,    is_volatile);
+  case vmIntrinsics::_getCharVolatile:          return inline_unsafe_access(!is_native_ptr, !is_store, T_CHAR,     is_volatile);
+  case vmIntrinsics::_getIntVolatile:           return inline_unsafe_access(!is_native_ptr, !is_store, T_INT,      is_volatile);
+  case vmIntrinsics::_getLongVolatile:          return inline_unsafe_access(!is_native_ptr, !is_store, T_LONG,     is_volatile);
+  case vmIntrinsics::_getFloatVolatile:         return inline_unsafe_access(!is_native_ptr, !is_store, T_FLOAT,    is_volatile);
+  case vmIntrinsics::_getDoubleVolatile:        return inline_unsafe_access(!is_native_ptr, !is_store, T_DOUBLE,   is_volatile);
+
+  case vmIntrinsics::_putObjectVolatile:        return inline_unsafe_access(!is_native_ptr,  is_store, T_OBJECT,   is_volatile);
+  case vmIntrinsics::_putBooleanVolatile:       return inline_unsafe_access(!is_native_ptr,  is_store, T_BOOLEAN,  is_volatile);
+  case vmIntrinsics::_putByteVolatile:          return inline_unsafe_access(!is_native_ptr,  is_store, T_BYTE,     is_volatile);
+  case vmIntrinsics::_putShortVolatile:         return inline_unsafe_access(!is_native_ptr,  is_store, T_SHORT,    is_volatile);
+  case vmIntrinsics::_putCharVolatile:          return inline_unsafe_access(!is_native_ptr,  is_store, T_CHAR,     is_volatile);
+  case vmIntrinsics::_putIntVolatile:           return inline_unsafe_access(!is_native_ptr,  is_store, T_INT,      is_volatile);
+  case vmIntrinsics::_putLongVolatile:          return inline_unsafe_access(!is_native_ptr,  is_store, T_LONG,     is_volatile);
+  case vmIntrinsics::_putFloatVolatile:         return inline_unsafe_access(!is_native_ptr,  is_store, T_FLOAT,    is_volatile);
+  case vmIntrinsics::_putDoubleVolatile:        return inline_unsafe_access(!is_native_ptr,  is_store, T_DOUBLE,   is_volatile);
+
+  case vmIntrinsics::_prefetchRead:             return inline_unsafe_prefetch(!is_native_ptr, !is_store, !is_static);
+  case vmIntrinsics::_prefetchWrite:            return inline_unsafe_prefetch(!is_native_ptr,  is_store, !is_static);
+  case vmIntrinsics::_prefetchReadStatic:       return inline_unsafe_prefetch(!is_native_ptr, !is_store,  is_static);
+  case vmIntrinsics::_prefetchWriteStatic:      return inline_unsafe_prefetch(!is_native_ptr,  is_store,  is_static);
+
+  case vmIntrinsics::_compareAndSwapObject:     return inline_unsafe_load_store(T_OBJECT, LS_cmpxchg);
+  case vmIntrinsics::_compareAndSwapInt:        return inline_unsafe_load_store(T_INT,    LS_cmpxchg);
+  case vmIntrinsics::_compareAndSwapLong:       return inline_unsafe_load_store(T_LONG,   LS_cmpxchg);
+
+  case vmIntrinsics::_putOrderedObject:         return inline_unsafe_ordered_store(T_OBJECT);
+  case vmIntrinsics::_putOrderedInt:            return inline_unsafe_ordered_store(T_INT);
+  case vmIntrinsics::_putOrderedLong:           return inline_unsafe_ordered_store(T_LONG);
+
+  case vmIntrinsics::_getAndAddInt:             return inline_unsafe_load_store(T_INT,    LS_xadd);
+  case vmIntrinsics::_getAndAddLong:            return inline_unsafe_load_store(T_LONG,   LS_xadd);
+  case vmIntrinsics::_getAndSetInt:             return inline_unsafe_load_store(T_INT,    LS_xchg);
+  case vmIntrinsics::_getAndSetLong:            return inline_unsafe_load_store(T_LONG,   LS_xchg);
+  case vmIntrinsics::_getAndSetObject:          return inline_unsafe_load_store(T_OBJECT, LS_xchg);
+
+  case vmIntrinsics::_currentThread:            return inline_native_currentThread();
+  case vmIntrinsics::_isInterrupted:            return inline_native_isInterrupted();

 #ifdef TRACE_HAVE_INTRINSICS
-  case vmIntrinsics::_classID:
-    return inline_native_classID();
-  case vmIntrinsics::_threadID:
-    return inline_native_threadID();
-  case vmIntrinsics::_counterTime:
-    return inline_native_time_funcs(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), "counterTime");
+  case vmIntrinsics::_classID:                  return inline_native_classID();
+  case vmIntrinsics::_threadID:                 return inline_native_threadID();
+  case vmIntrinsics::_counterTime:              return inline_native_time_funcs(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), "counterTime");
 #endif
-  case vmIntrinsics::_currentTimeMillis:
-    return inline_native_time_funcs(CAST_FROM_FN_PTR(address, os::javaTimeMillis), "currentTimeMillis");
-  case vmIntrinsics::_nanoTime:
-    return inline_native_time_funcs(CAST_FROM_FN_PTR(address, os::javaTimeNanos), "nanoTime");
-  case vmIntrinsics::_allocateInstance:
-    return inline_unsafe_allocate();
-  case vmIntrinsics::_copyMemory:
-    return inline_unsafe_copyMemory();
-  case vmIntrinsics::_newArray:
-    return inline_native_newArray();
-  case vmIntrinsics::_getLength:
-    return inline_native_getLength();
-  case vmIntrinsics::_copyOf:
-    return inline_array_copyOf(false);
-  case vmIntrinsics::_copyOfRange:
-    return inline_array_copyOf(true);
-  case vmIntrinsics::_equalsC:
-    return inline_array_equals();
-  case vmIntrinsics::_clone:
-    return inline_native_clone(intrinsic()->is_virtual());
-
-  case vmIntrinsics::_isAssignableFrom:
-    return inline_native_subtype_check();
+  case vmIntrinsics::_currentTimeMillis:        return inline_native_time_funcs(CAST_FROM_FN_PTR(address, os::javaTimeMillis), "currentTimeMillis");
+  case vmIntrinsics::_nanoTime:                 return inline_native_time_funcs(CAST_FROM_FN_PTR(address, os::javaTimeNanos), "nanoTime");
+  case vmIntrinsics::_allocateInstance:         return inline_unsafe_allocate();
+  case vmIntrinsics::_copyMemory:               return inline_unsafe_copyMemory();
+  case vmIntrinsics::_newArray:                 return inline_native_newArray();
+  case vmIntrinsics::_getLength:                return inline_native_getLength();
+  case vmIntrinsics::_copyOf:                   return inline_array_copyOf(false);
+  case vmIntrinsics::_copyOfRange:              return inline_array_copyOf(true);
+  case vmIntrinsics::_equalsC:                  return inline_array_equals();
+  case vmIntrinsics::_clone:                    return inline_native_clone(intrinsic()->is_virtual());
+
+  case vmIntrinsics::_isAssignableFrom:         return inline_native_subtype_check();

   case vmIntrinsics::_isInstance:
   case vmIntrinsics::_getModifiers:
@@ -797,44 +760,32 @@
   case vmIntrinsics::_isPrimitive:
   case vmIntrinsics::_getSuperclass:
   case vmIntrinsics::_getComponentType:
-  case vmIntrinsics::_getClassAccessFlags:
-    return inline_native_Class_query(intrinsic_id());
+  case vmIntrinsics::_getClassAccessFlags:      return inline_native_Class_query(intrinsic_id());

   case vmIntrinsics::_floatToRawIntBits:
   case vmIntrinsics::_floatToIntBits:
   case vmIntrinsics::_intBitsToFloat:
   case vmIntrinsics::_doubleToRawLongBits:
   case vmIntrinsics::_doubleToLongBits:
-  case vmIntrinsics::_longBitsToDouble:
-    return inline_fp_conversions(intrinsic_id());
+  case vmIntrinsics::_longBitsToDouble:         return inline_fp_conversions(intrinsic_id());

   case vmIntrinsics::_numberOfLeadingZeros_i:
   case vmIntrinsics::_numberOfLeadingZeros_l:
-    return inline_numberOfLeadingZeros(intrinsic_id());
-
   case vmIntrinsics::_numberOfTrailingZeros_i:
   case vmIntrinsics::_numberOfTrailingZeros_l:
-    return inline_numberOfTrailingZeros(intrinsic_id());
-
   case vmIntrinsics::_bitCount_i:
   case vmIntrinsics::_bitCount_l:
-    return inline_bitCount(intrinsic_id());
-
   case vmIntrinsics::_reverseBytes_i:
   case vmIntrinsics::_reverseBytes_l:
   case vmIntrinsics::_reverseBytes_s:
-  case vmIntrinsics::_reverseBytes_c:
-    return inline_reverseBytes((vmIntrinsics::ID) intrinsic_id());
-
-  case vmIntrinsics::_getCallerClass:
-    return inline_native_Reflection_getCallerClass();
-
-  case vmIntrinsics::_Reference_get:
-    return inline_reference_get();
+  case vmIntrinsics::_reverseBytes_c:           return inline_number_methods(intrinsic_id());
+
+  case vmIntrinsics::_getCallerClass:           return inline_native_Reflection_getCallerClass();
+
+  case vmIntrinsics::_Reference_get:            return inline_reference_get();

   case vmIntrinsics::_aescrypt_encryptBlock:
-  case vmIntrinsics::_aescrypt_decryptBlock:
-    return inline_aescrypt_Block(intrinsic_id());
+  case vmIntrinsics::_aescrypt_decryptBlock:    return inline_aescrypt_Block(intrinsic_id());

   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
@@ -883,13 +834,13 @@
   }
 }

-//------------------------------push_result------------------------------
+//------------------------------set_result-------------------------------
 // Helper function for finishing intrinsics.
-void LibraryCallKit::push_result(RegionNode* region, PhiNode* value) {
+void LibraryCallKit::set_result(RegionNode* region, PhiNode* value) {
   record_for_igvn(region);
   set_control(_gvn.transform(region));
-  BasicType value_type = value->type()->basic_type();
-  push_node(value_type, _gvn.transform(value));
+  set_result( _gvn.transform(value));
+  assert(value->type()->basic_type() == result()->bottom_type()->basic_type(), "sanity");
 }

 //------------------------------generate_guard---------------------------
@@ -1078,7 +1029,6 @@
 // to Int nodes containing the lenghts of str1 and str2.
 //
 Node* LibraryCallKit::make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2) {
-
   Node* result = NULL;
   switch (opcode) {
   case Op_StrIndexOf:
@@ -1105,51 +1055,23 @@
 }

 //------------------------------inline_string_compareTo------------------------
+// public int java.lang.String.compareTo(String anotherString);
 bool LibraryCallKit::inline_string_compareTo() {
-
-  if (!Matcher::has_match_rule(Op_StrComp)) return false;
-
-  _sp += 2;
-  Node *argument = pop();  // pop non-receiver first:  it was pushed second
-  Node *receiver = pop();
-
-  // Null check on self without removing any arguments.  The argument
-  // null check technically happens in the wrong place, which can lead to
-  // invalid stack traces when string compare is inlined into a method
-  // which handles NullPointerExceptions.
-  _sp += 2;
-  receiver = do_null_check(receiver, T_OBJECT);
-  argument = do_null_check(argument, T_OBJECT);
-  _sp -= 2;
+  Node* receiver = null_check(argument(0));
+  Node* arg      = null_check(argument(1));
   if (stopped()) {
     return true;
   }
-
-  Node* compare = make_string_method_node(Op_StrComp, receiver, argument);
-  push(compare);
+  set_result(make_string_method_node(Op_StrComp, receiver, arg));
   return true;
 }

 //------------------------------inline_string_equals------------------------
 bool LibraryCallKit::inline_string_equals() {
-
-  if (!Matcher::has_match_rule(Op_StrEquals)) return false;
-
-  int nargs = 2;
-  _sp += nargs;
-  Node* argument = pop();  // pop non-receiver first:  it was pushed second
-  Node* receiver = pop();
-
-  // Null check on self without removing any arguments.  The argument
-  // null check technically happens in the wrong place, which can lead to
-  // invalid stack traces when string compare is inlined into a method
-  // which handles NullPointerExceptions.
-  _sp += nargs;
-  receiver = do_null_check(receiver, T_OBJECT);
-  //should not do null check for argument for String.equals(), because spec
-  //allows to specify NULL as argument.
-  _sp -= nargs;
-
+  Node* receiver = null_check_receiver();
+  // NOTE: Do not null check argument for String.equals() because spec
+  // allows to specify NULL as argument.
+  Node* argument = this->argument(1);
   if (stopped()) {
     return true;
   }
@@ -1173,9 +1095,7 @@
   ciInstanceKlass* klass = env()->String_klass();

   if (!stopped()) {
-    _sp += nargs;          // gen_instanceof might do an uncommon trap
     Node* inst = gen_instanceof(argument, makecon(TypeKlassPtr::make(klass)));
-    _sp -= nargs;
     Node* cmp  = _gvn.transform(new (C) CmpINode(inst, intcon(1)));
     Node* bol  = _gvn.transform(new (C) BoolNode(cmp, BoolTest::ne));

@@ -1207,7 +1127,7 @@
     Node* receiver_cnt  = load_String_length(no_ctrl, receiver);

     // Get start addr of argument
-    Node* argument_val   = load_String_value(no_ctrl, argument);
+    Node* argument_val    = load_String_value(no_ctrl, argument);
     Node* argument_offset = load_String_offset(no_ctrl, argument);
     Node* argument_start = array_element_address(argument_val, argument_offset, T_CHAR);

@@ -1236,24 +1156,15 @@
   set_control(_gvn.transform(region));
   record_for_igvn(region);

-  push(_gvn.transform(phi));
-
+  set_result(_gvn.transform(phi));
   return true;
 }

 //------------------------------inline_array_equals----------------------------
 bool LibraryCallKit::inline_array_equals() {
-
-  if (!Matcher::has_match_rule(Op_AryEq)) return false;
-
-  _sp += 2;
-  Node *argument2 = pop();
-  Node *argument1 = pop();
-
-  Node* equals =
-    _gvn.transform(new (C) AryEqNode(control(), memory(TypeAryPtr::CHARS),
-                                        argument1, argument2) );
-  push(equals);
+  Node* arg1 = argument(0);
+  Node* arg2 = argument(1);
+  set_result(_gvn.transform(new (C) AryEqNode(control(), memory(TypeAryPtr::CHARS), arg1, arg2)));
   return true;
 }

@@ -1325,7 +1236,7 @@
   float likely   = PROB_LIKELY(0.9);
   float unlikely = PROB_UNLIKELY(0.9);

-  const int nargs = 2; // number of arguments to push back for uncommon trap in predicate
+  const int nargs = 0; // no arguments to push back for uncommon trap in predicate

   Node* source        = load_String_value(no_ctrl, string_object);
   Node* sourceOffset  = load_String_offset(no_ctrl, string_object);
@@ -1396,10 +1307,8 @@

 //------------------------------inline_string_indexOf------------------------
 bool LibraryCallKit::inline_string_indexOf() {
-
-  _sp += 2;
-  Node *argument = pop();  // pop non-receiver first:  it was pushed second
-  Node *receiver = pop();
+  Node* receiver = argument(0);
+  Node* arg      = argument(1);

   Node* result;
   // Disable the use of pcmpestri until it can be guaranteed that
@@ -1409,15 +1318,8 @@
     // Generate SSE4.2 version of indexOf
     // We currently only have match rules that use SSE4.2

-    // Null check on self without removing any arguments.  The argument
-    // null check technically happens in the wrong place, which can lead to
-    // invalid stack traces when string compare is inlined into a method
-    // which handles NullPointerExceptions.
-    _sp += 2;
-    receiver = do_null_check(receiver, T_OBJECT);
-    argument = do_null_check(argument, T_OBJECT);
-    _sp -= 2;
-
+    receiver = null_check(receiver);
+    arg      = null_check(arg);
     if (stopped()) {
       return true;
     }
@@ -1439,12 +1341,12 @@
     Node* source_cnt  = load_String_length(no_ctrl, receiver);

     // Get start addr of substring
-    Node* substr = load_String_value(no_ctrl, argument);
-    Node* substr_offset = load_String_offset(no_ctrl, argument);
+    Node* substr = load_String_value(no_ctrl, arg);
+    Node* substr_offset = load_String_offset(no_ctrl, arg);
     Node* substr_start = array_element_address(substr, substr_offset, T_CHAR);

     // Get length of source string
-    Node* substr_cnt  = load_String_length(no_ctrl, argument);
+    Node* substr_cnt  = load_String_length(no_ctrl, arg);

     // Check for substr count > string count
     Node* cmp = _gvn.transform( new(C) CmpINode(substr_cnt, source_cnt) );
@@ -1477,10 +1379,10 @@

   } else { // Use LibraryCallKit::string_indexOf
     // don't intrinsify if argument isn't a constant string.
-    if (!argument->is_Con()) {
+    if (!arg->is_Con()) {
      return false;
     }
-    const TypeOopPtr* str_type = _gvn.type(argument)->isa_oopptr();
+    const TypeOopPtr* str_type = _gvn.type(arg)->isa_oopptr();
     if (str_type == NULL) {
       return false;
     }
@@ -1511,21 +1413,15 @@
      return false;
     }

-    // Null check on self without removing any arguments.  The argument
-    // null check technically happens in the wrong place, which can lead to
-    // invalid stack traces when string compare is inlined into a method
-    // which handles NullPointerExceptions.
-    _sp += 2;
-    receiver = do_null_check(receiver, T_OBJECT);
-    // No null check on the argument is needed since it's a constant String oop.
-    _sp -= 2;
+    receiver = null_check(receiver, T_OBJECT);
+    // NOTE: No null check on the argument is needed since it's a constant String oop.
     if (stopped()) {
       return true;
     }

     // The null string as a pattern always returns 0 (match at beginning of string)
     if (c == 0) {
-      push(intcon(0));
+      set_result(intcon(0));
       return true;
     }

@@ -1548,47 +1444,54 @@

     result = string_indexOf(receiver, pat, o, cache, md2);
   }
-
-  push(result);
+  set_result(result);
   return true;
 }

-//--------------------------pop_math_arg--------------------------------
-// Pop a double argument to a math function from the stack
-// rounding it if necessary.
-Node * LibraryCallKit::pop_math_arg() {
-  Node *arg = pop_pair();
-  if( Matcher::strict_fp_requires_explicit_rounding && UseSSE<=1 )
-    arg = _gvn.transform( new (C) RoundDoubleNode(0, arg) );
-  return arg;
+//--------------------------round_double_node--------------------------------
+// Round a double node if necessary.
+Node* LibraryCallKit::round_double_node(Node* n) {
+  if (Matcher::strict_fp_requires_explicit_rounding && UseSSE <= 1)
+    n = _gvn.transform(new (C) RoundDoubleNode(0, n));
+  return n;
+}
+
+//------------------------------inline_math-----------------------------------
+// public static double Math.abs(double)
+// public static double Math.sqrt(double)
+// public static double Math.log(double)
+// public static double Math.log10(double)
+bool LibraryCallKit::inline_math(vmIntrinsics::ID id) {
+  Node* arg = round_double_node(argument(0));
+  Node* n;
+  switch (id) {
+  case vmIntrinsics::_dabs:   n = new (C) AbsDNode(    arg);  break;
+  case vmIntrinsics::_dsqrt:  n = new (C) SqrtDNode(0, arg);  break;
+  case vmIntrinsics::_dlog:   n = new (C) LogDNode(    arg);  break;
+  case vmIntrinsics::_dlog10: n = new (C) Log10DNode(  arg);  break;
+  default:  fatal_unexpected_iid(id);  break;
+  }
+  set_result(_gvn.transform(n));
+  return true;
 }

 //------------------------------inline_trig----------------------------------
 // Inline sin/cos/tan instructions, if possible.  If rounding is required, do
 // argument reduction which will turn into a fast/slow diamond.
 bool LibraryCallKit::inline_trig(vmIntrinsics::ID id) {
-  _sp += arg_size();            // restore stack pointer
-  Node* arg = pop_math_arg();
-  Node* trig = NULL;
+  Node* arg = round_double_node(argument(0));
+  Node* n = NULL;

   switch (id) {
-  case vmIntrinsics::_dsin:
-    trig = _gvn.transform((Node*)new (C) SinDNode(arg));
-    break;
-  case vmIntrinsics::_dcos:
-    trig = _gvn.transform((Node*)new (C) CosDNode(arg));
-    break;
-  case vmIntrinsics::_dtan:
-    trig = _gvn.transform((Node*)new (C) TanDNode(arg));
-    break;
-  default:
-    assert(false, "bad intrinsic was passed in");
-    return false;
+  case vmIntrinsics::_dsin:  n = new (C) SinDNode(arg);  break;
+  case vmIntrinsics::_dcos:  n = new (C) CosDNode(arg);  break;
+  case vmIntrinsics::_dtan:  n = new (C) TanDNode(arg);  break;
+  default:  fatal_unexpected_iid(id);  break;
   }
+  n = _gvn.transform(n);

   // Rounding required?  Check for argument reduction!
-  if( Matcher::strict_fp_requires_explicit_rounding ) {
-
+  if (Matcher::strict_fp_requires_explicit_rounding) {
     static const double     pi_4 =  0.7853981633974483;
     static const double neg_pi_4 = -0.7853981633974483;
     // pi/2 in 80-bit extended precision
@@ -1623,8 +1526,8 @@
     // probably do the math inside the SIN encoding.

     // Make the merge point
-    RegionNode *r = new (C) RegionNode(3);
-    Node *phi = new (C) PhiNode(r,Type::DOUBLE);
+    RegionNode* r = new (C) RegionNode(3);
+    Node* phi = new (C) PhiNode(r, Type::DOUBLE);

     // Flatten arg so we need only 1 test
     Node *abs = _gvn.transform(new (C) AbsDNode(arg));
@@ -1639,7 +1542,7 @@
     set_control(opt_iff(r,iff));

     // Set fast path result
-    phi->init_req(2,trig);
+    phi->init_req(2, n);

     // Slow path - non-blocking leaf call
     Node* call = NULL;
@@ -1661,37 +1564,18 @@
       break;
     }
     assert(control()->in(0) == call, "");
-    Node* slow_result = _gvn.transform(new (C) ProjNode(call,TypeFunc::Parms));
-    r->init_req(1,control());
-    phi->init_req(1,slow_result);
+    Node* slow_result = _gvn.transform(new (C) ProjNode(call, TypeFunc::Parms));
+    r->init_req(1, control());
+    phi->init_req(1, slow_result);

     // Post-merge
     set_control(_gvn.transform(r));
     record_for_igvn(r);
-    trig = _gvn.transform(phi);
+    n = _gvn.transform(phi);

     C->set_has_split_ifs(true); // Has chance for split-if optimization
   }
-  // Push result back on JVM stack
-  push_pair(trig);
-  return true;
-}
-
-//------------------------------inline_sqrt-------------------------------------
-// Inline square root instruction, if possible.
-bool LibraryCallKit::inline_sqrt(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_dsqrt, "Not square root");
-  _sp += arg_size();        // restore stack pointer
-  push_pair(_gvn.transform(new (C) SqrtDNode(0, pop_math_arg())));
-  return true;
-}
-
-//------------------------------inline_abs-------------------------------------
-// Inline absolute value instruction, if possible.
-bool LibraryCallKit::inline_abs(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_dabs, "Not absolute value");
-  _sp += arg_size();        // restore stack pointer
-  push_pair(_gvn.transform(new (C) AbsDNode(pop_math_arg())));
+  set_result(n);
   return true;
 }

@@ -1700,24 +1584,18 @@
   //result=(result.isNaN())? funcAddr():result;
   // Check: If isNaN() by checking result!=result? then either trap
   // or go to runtime
-  Node* cmpisnan = _gvn.transform(new (C) CmpDNode(result,result));
+  Node* cmpisnan = _gvn.transform(new (C) CmpDNode(result, result));
   // Build the boolean node
-  Node* bolisnum = _gvn.transform( new (C) BoolNode(cmpisnan, BoolTest::eq) );
+  Node* bolisnum = _gvn.transform(new (C) BoolNode(cmpisnan, BoolTest::eq));

   if (!too_many_traps(Deoptimization::Reason_intrinsic)) {
-    {
-      BuildCutout unless(this, bolisnum, PROB_STATIC_FREQUENT);
-      // End the current control-flow path
-      push_pair(x);
-      if (y != NULL) {
-        push_pair(y);
-      }
+    { BuildCutout unless(this, bolisnum, PROB_STATIC_FREQUENT);
       // The pow or exp intrinsic returned a NaN, which requires a call
       // to the runtime.  Recompile with the runtime call.
       uncommon_trap(Deoptimization::Reason_intrinsic,
                     Deoptimization::Action_make_not_entrant);
     }
-    push_pair(result);
+    set_result(result);
   } else {
     // If this inlining ever returned NaN in the past, we compile a call
     // to the runtime to properly handle corner cases
@@ -1727,7 +1605,7 @@
     Node* if_fast = _gvn.transform( new (C) IfTrueNode(iff) );

     if (!if_slow->is_top()) {
-      RegionNode* result_region = new(C) RegionNode(3);
+      RegionNode* result_region = new (C) RegionNode(3);
       PhiNode*    result_val = new (C) PhiNode(result_region, Type::DOUBLE);

       result_region->init_req(1, if_fast);
@@ -1747,9 +1625,9 @@

       result_region->init_req(2, control());
       result_val->init_req(2, value);
-      push_result(result_region, result_val);
+      set_result(result_region, result_val);
     } else {
-      push_pair(result);
+      set_result(result);
     }
   }
 }
@@ -1757,25 +1635,19 @@
 //------------------------------inline_exp-------------------------------------
 // Inline exp instructions, if possible.  The Intel hardware only misses
 // really odd corner cases (+/- Infinity).  Just uncommon-trap them.
-bool LibraryCallKit::inline_exp(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_dexp, "Not exp");
-
-  _sp += arg_size();        // restore stack pointer
-  Node *x = pop_math_arg();
-  Node *result = _gvn.transform(new (C) ExpDNode(0,x));
-
-  finish_pow_exp(result, x, NULL, OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");
+bool LibraryCallKit::inline_exp() {
+  Node* arg = round_double_node(argument(0));
+  Node* n   = _gvn.transform(new (C) ExpDNode(0, arg));
+
+  finish_pow_exp(n, arg, NULL, OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");

   C->set_has_split_ifs(true); // Has chance for split-if optimization
-
   return true;
 }

 //------------------------------inline_pow-------------------------------------
 // Inline power instructions, if possible.
-bool LibraryCallKit::inline_pow(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_dpow, "Not pow");
-
+bool LibraryCallKit::inline_pow() {
   // Pseudocode for pow
   // if (x <= 0.0) {
   //   long longy = (long)y;
@@ -1793,15 +1665,14 @@
   // }
   // return result;

-  _sp += arg_size();        // restore stack pointer
-  Node* y = pop_math_arg();
-  Node* x = pop_math_arg();
+  Node* x = round_double_node(argument(0));
+  Node* y = round_double_node(argument(2));

   Node* result = NULL;

   if (!too_many_traps(Deoptimization::Reason_intrinsic)) {
     // Short form: skip the fancy tests and just check for NaN result.
-    result = _gvn.transform( new (C) PowDNode(0, x, y) );
+    result = _gvn.transform(new (C) PowDNode(0, x, y));
   } else {
     // If this inlining ever returned NaN in the past, include all
     // checks + call to the runtime.
@@ -1919,55 +1790,23 @@
     // Post merge
     set_control(_gvn.transform(r));
     record_for_igvn(r);
-    result=_gvn.transform(phi);
+    result = _gvn.transform(phi);
   }

   finish_pow_exp(result, x, y, OptoRuntime::Math_DD_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dpow), "POW");

   C->set_has_split_ifs(true); // Has chance for split-if optimization
-
-  return true;
-}
-
-//------------------------------inline_trans-------------------------------------
-// Inline transcendental instructions, if possible.  The Intel hardware gets
-// these right, no funny corner cases missed.
-bool LibraryCallKit::inline_trans(vmIntrinsics::ID id) {
-  _sp += arg_size();        // restore stack pointer
-  Node* arg = pop_math_arg();
-  Node* trans = NULL;
-
-  switch (id) {
-  case vmIntrinsics::_dlog:
-    trans = _gvn.transform((Node*)new (C) LogDNode(arg));
-    break;
-  case vmIntrinsics::_dlog10:
-    trans = _gvn.transform((Node*)new (C) Log10DNode(arg));
-    break;
-  default:
-    assert(false, "bad intrinsic was passed in");
-    return false;
-  }
-
-  // Push result back on JVM stack
-  push_pair(trans);
   return true;
 }

 //------------------------------runtime_math-----------------------------
 bool LibraryCallKit::runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName) {
-  Node* a = NULL;
-  Node* b = NULL;
-
   assert(call_type == OptoRuntime::Math_DD_D_Type() || call_type == OptoRuntime::Math_D_D_Type(),
          "must be (DD)D or (D)D type");

   // Inputs
-  _sp += arg_size();        // restore stack pointer
-  if (call_type == OptoRuntime::Math_DD_D_Type()) {
-    b = pop_math_arg();
-  }
-  a = pop_math_arg();
+  Node* a = round_double_node(argument(0));
+  Node* b = (call_type == OptoRuntime::Math_DD_D_Type()) ? round_double_node(argument(2)) : NULL;

   const TypePtr* no_memory_effects = NULL;
   Node* trig = make_runtime_call(RC_LEAF, call_type, funcAddr, funcName,
@@ -1979,43 +1818,43 @@
   assert(value_top == top(), "second value must be top");
 #endif

-  push_pair(value);
+  set_result(value);
   return true;
 }

 //------------------------------inline_math_native-----------------------------
 bool LibraryCallKit::inline_math_native(vmIntrinsics::ID id) {
+#define FN_PTR(f) CAST_FROM_FN_PTR(address, f)
   switch (id) {
     // These intrinsics are not properly supported on all hardware
-  case vmIntrinsics::_dcos: return Matcher::has_match_rule(Op_CosD) ? inline_trig(id) :
-    runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dcos), "COS");
-  case vmIntrinsics::_dsin: return Matcher::has_match_rule(Op_SinD) ? inline_trig(id) :
-    runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dsin), "SIN");
-  case vmIntrinsics::_dtan: return Matcher::has_match_rule(Op_TanD) ? inline_trig(id) :
-    runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dtan), "TAN");
-
-  case vmIntrinsics::_dlog:   return Matcher::has_match_rule(Op_LogD) ? inline_trans(id) :
-    runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dlog), "LOG");
-  case vmIntrinsics::_dlog10: return Matcher::has_match_rule(Op_Log10D) ? inline_trans(id) :
-    runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dlog10), "LOG10");
+  case vmIntrinsics::_dcos:   return Matcher::has_match_rule(Op_CosD)   ? inline_trig(id) :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dcos),   "COS");
+  case vmIntrinsics::_dsin:   return Matcher::has_match_rule(Op_SinD)   ? inline_trig(id) :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dsin),   "SIN");
+  case vmIntrinsics::_dtan:   return Matcher::has_match_rule(Op_TanD)   ? inline_trig(id) :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dtan),   "TAN");
+
+  case vmIntrinsics::_dlog:   return Matcher::has_match_rule(Op_LogD)   ? inline_math(id) :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dlog),   "LOG");
+  case vmIntrinsics::_dlog10: return Matcher::has_match_rule(Op_Log10D) ? inline_math(id) :
+    runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dlog10), "LOG10");

     // These intrinsics are supported on all hardware
-  case vmIntrinsics::_dsqrt: return Matcher::has_match_rule(Op_SqrtD) ? inline_sqrt(id) : false;
-  case vmIntrinsics::_dabs:  return Matcher::has_match_rule(Op_AbsD)  ? inline_abs(id)  : false;
-
-  case vmIntrinsics::_dexp:  return
-    Matcher::has_match_rule(Op_ExpD) ? inline_exp(id) :
-    runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");
-  case vmIntrinsics::_dpow:  return
-    Matcher::has_match_rule(Op_PowD) ? inline_pow(id) :
-    runtime_math(OptoRuntime::Math_DD_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dpow), "POW");
+  case vmIntrinsics::_dsqrt:  return Matcher::has_match_rule(Op_SqrtD)  ? inline_math(id) : false;
+  case vmIntrinsics::_dabs:   return Matcher::has_match_rule(Op_AbsD)   ? inline_math(id) : false;
+
+  case vmIntrinsics::_dexp:   return Matcher::has_match_rule(Op_ExpD)   ? inline_exp()    :
+    runtime_math(OptoRuntime::Math_D_D_Type(),  FN_PTR(SharedRuntime::dexp),  "EXP");
+  case vmIntrinsics::_dpow:   return Matcher::has_match_rule(Op_PowD)   ? inline_pow()    :
+    runtime_math(OptoRuntime::Math_DD_D_Type(), FN_PTR(SharedRuntime::dpow),  "POW");
+#undef FN_PTR

    // These intrinsics are not yet correctly implemented
   case vmIntrinsics::_datan2:
     return false;

   default:
-    ShouldNotReachHere();
+    fatal_unexpected_iid(id);
     return false;
   }
 }
@@ -2030,8 +1869,7 @@

 //----------------------------inline_min_max-----------------------------------
 bool LibraryCallKit::inline_min_max(vmIntrinsics::ID id) {
-  push(generate_min_max(id, argument(0), argument(1)));
-
+  set_result(generate_min_max(id, argument(0), argument(1)));
   return true;
 }

@@ -2254,99 +2092,37 @@
   }
 }

-//-------------------inline_numberOfLeadingZeros_int/long-----------------------
-// inline int Integer.numberOfLeadingZeros(int)
-// inline int Long.numberOfLeadingZeros(long)
-bool LibraryCallKit::inline_numberOfLeadingZeros(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_numberOfLeadingZeros_i || id == vmIntrinsics::_numberOfLeadingZeros_l, "not numberOfLeadingZeros");
-  if (id == vmIntrinsics::_numberOfLeadingZeros_i && !Matcher::match_rule_supported(Op_CountLeadingZerosI)) return false;
-  if (id == vmIntrinsics::_numberOfLeadingZeros_l && !Matcher::match_rule_supported(Op_CountLeadingZerosL)) return false;
-  _sp += arg_size();  // restore stack pointer
+//--------------------------inline_number_methods-----------------------------
+// inline int     Integer.numberOfLeadingZeros(int)
+// inline int        Long.numberOfLeadingZeros(long)
+//
+// inline int     Integer.numberOfTrailingZeros(int)
+// inline int        Long.numberOfTrailingZeros(long)
+//
+// inline int     Integer.bitCount(int)
+// inline int        Long.bitCount(long)
+//
+// inline char  Character.reverseBytes(char)
+// inline short     Short.reverseBytes(short)
+// inline int     Integer.reverseBytes(int)
+// inline long       Long.reverseBytes(long)
+bool LibraryCallKit::inline_number_methods(vmIntrinsics::ID id) {
+  Node* arg = argument(0);
+  Node* n;
   switch (id) {
-  case vmIntrinsics::_numberOfLeadingZeros_i:
-    push(_gvn.transform(new (C) CountLeadingZerosINode(pop())));
-    break;
-  case vmIntrinsics::_numberOfLeadingZeros_l:
-    push(_gvn.transform(new (C) CountLeadingZerosLNode(pop_pair())));
-    break;
-  default:
-    ShouldNotReachHere();
-  }
-  return true;
-}
-
-//-------------------inline_numberOfTrailingZeros_int/long----------------------
-// inline int Integer.numberOfTrailingZeros(int)
-// inline int Long.numberOfTrailingZeros(long)
-bool LibraryCallKit::inline_numberOfTrailingZeros(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_numberOfTrailingZeros_i || id == vmIntrinsics::_numberOfTrailingZeros_l, "not numberOfTrailingZeros");
-  if (id == vmIntrinsics::_numberOfTrailingZeros_i && !Matcher::match_rule_supported(Op_CountTrailingZerosI)) return false;
-  if (id == vmIntrinsics::_numberOfTrailingZeros_l && !Matcher::match_rule_supported(Op_CountTrailingZerosL)) return false;
-  _sp += arg_size();  // restore stack pointer
-  switch (id) {
-  case vmIntrinsics::_numberOfTrailingZeros_i:
-    push(_gvn.transform(new (C) CountTrailingZerosINode(pop())));
-    break;
-  case vmIntrinsics::_numberOfTrailingZeros_l:
-    push(_gvn.transform(new (C) CountTrailingZerosLNode(pop_pair())));
-    break;
-  default:
-    ShouldNotReachHere();
+  case vmIntrinsics::_numberOfLeadingZeros_i:   n = new (C) CountLeadingZerosINode( arg);  break;
+  case vmIntrinsics::_numberOfLeadingZeros_l:   n = new (C) CountLeadingZerosLNode( arg);  break;
+  case vmIntrinsics::_numberOfTrailingZeros_i:  n = new (C) CountTrailingZerosINode(arg);  break;
+  case vmIntrinsics::_numberOfTrailingZeros_l:  n = new (C) CountTrailingZerosLNode(arg);  break;
+  case vmIntrinsics::_bitCount_i:               n = new (C) PopCountINode(          arg);  break;
+  case vmIntrinsics::_bitCount_l:               n = new (C) PopCountLNode(          arg);  break;
+  case vmIntrinsics::_reverseBytes_c:           n = new (C) ReverseBytesUSNode(0,   arg);  break;
+  case vmIntrinsics::_reverseBytes_s:           n = new (C) ReverseBytesSNode( 0,   arg);  break;
+  case vmIntrinsics::_reverseBytes_i:           n = new (C) ReverseBytesINode( 0,   arg);  break;
+  case vmIntrinsics::_reverseBytes_l:           n = new (C) ReverseBytesLNode( 0,   arg);  break;
+  default:  fatal_unexpected_iid(id);  break;
   }
-  return true;
-}
-
-//----------------------------inline_bitCount_int/long-----------------------
-// inline int Integer.bitCount(int)
-// inline int Long.bitCount(long)
-bool LibraryCallKit::inline_bitCount(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_bitCount_i || id == vmIntrinsics::_bitCount_l, "not bitCount");
-  if (id == vmIntrinsics::_bitCount_i && !Matcher::has_match_rule(Op_PopCountI)) return false;
-  if (id == vmIntrinsics::_bitCount_l && !Matcher::has_match_rule(Op_PopCountL)) return false;
-  _sp += arg_size();  // restore stack pointer
-  switch (id) {
-  case vmIntrinsics::_bitCount_i:
-    push(_gvn.transform(new (C) PopCountINode(pop())));
-    break;
-  case vmIntrinsics::_bitCount_l:
-    push(_gvn.transform(new (C) PopCountLNode(pop_pair())));
-    break;
-  default:
-    ShouldNotReachHere();
-  }
-  return true;
-}
-
-//----------------------------inline_reverseBytes_int/long/char/short-------------------
-// inline Integer.reverseBytes(int)
-// inline Long.reverseBytes(long)
-// inline Character.reverseBytes(char)
-// inline Short.reverseBytes(short)
-bool LibraryCallKit::inline_reverseBytes(vmIntrinsics::ID id) {
-  assert(id == vmIntrinsics::_reverseBytes_i || id == vmIntrinsics::_reverseBytes_l ||
-         id == vmIntrinsics::_reverseBytes_c || id == vmIntrinsics::_reverseBytes_s,
-         "not reverse Bytes");
-  if (id == vmIntrinsics::_reverseBytes_i && !Matcher::has_match_rule(Op_ReverseBytesI))  return false;
-  if (id == vmIntrinsics::_reverseBytes_l && !Matcher::has_match_rule(Op_ReverseBytesL))  return false;
-  if (id == vmIntrinsics::_reverseBytes_c && !Matcher::has_match_rule(Op_ReverseBytesUS)) return false;
-  if (id == vmIntrinsics::_reverseBytes_s && !Matcher::has_match_rule(Op_ReverseBytesS))  return false;
-  _sp += arg_size();  // restore stack pointer
-  switch (id) {
-  case vmIntrinsics::_reverseBytes_i:
-    push(_gvn.transform(new (C) ReverseBytesINode(0, pop())));
-    break;
-  case vmIntrinsics::_reverseBytes_l:
-    push_pair(_gvn.transform(new (C) ReverseBytesLNode(0, pop_pair())));
-    break;
-  case vmIntrinsics::_reverseBytes_c:
-    push(_gvn.transform(new (C) ReverseBytesUSNode(0, pop())));
-    break;
-  case vmIntrinsics::_reverseBytes_s:
-    push(_gvn.transform(new (C) ReverseBytesSNode(0, pop())));
-    break;
-  default:
-    ;
-  }
+  set_result(_gvn.transform(n));
   return true;
 }

@@ -2356,7 +2132,7 @@

 // Helper that guards and inserts a pre-barrier.
 void LibraryCallKit::insert_pre_barrier(Node* base_oop, Node* offset,
-                                        Node* pre_val, int nargs, bool need_mem_bar) {
+                                        Node* pre_val, bool need_mem_bar) {
   // We could be accessing the referent field of a reference object. If so, when G1
   // is enabled, we need to log the value in the referent field in an SATB buffer.
   // This routine performs some compile time filters and generates suitable
@@ -2406,8 +2182,8 @@
   //   }
   // }

-  float likely  = PROB_LIKELY(0.999);
-  float unlikely  = PROB_UNLIKELY(0.999);
+  float likely   = PROB_LIKELY(  0.999);
+  float unlikely = PROB_UNLIKELY(0.999);

   IdealKit ideal(this);
 #define __ ideal.
@@ -2419,9 +2195,7 @@
       sync_kit(ideal);

       Node* ref_klass_con = makecon(TypeKlassPtr::make(env()->Reference_klass()));
-      _sp += nargs;  // gen_instanceof might do an uncommon trap
       Node* is_instof = gen_instanceof(base_oop, ref_klass_con);
-      _sp -= nargs;

       // Update IdealKit memory and control from graphKit.
       __ sync_kit(this);
@@ -2505,7 +2279,7 @@
   {
     ResourceMark rm;
     // Check the signatures.
-    ciSignature* sig = signature();
+    ciSignature* sig = callee()->signature();
 #ifdef ASSERT
     if (!is_store) {
       // Object getObject(Object base, int/long offset), etc.
@@ -2543,42 +2317,19 @@

   C->set_has_unsafe_access(true);  // Mark eventual nmethod as "unsafe".

-  int type_words = type2size[ (type == T_ADDRESS) ? T_LONG : type ];
-
-  // Argument words:  "this" plus (oop/offset) or (lo/hi) args plus maybe 1 or 2 value words
-  int nargs = 1 + (is_native_ptr ? 2 : 3) + (is_store ? type_words : 0);
-  assert(callee()->arg_size() == nargs, "must be");
-
-  debug_only(int saved_sp = _sp);
-  _sp += nargs;
-
-  Node* val;
-  debug_only(val = (Node*)(uintptr_t)-1);
-
-
-  if (is_store) {
-    // Get the value being stored.  (Pop it first; it was pushed last.)
-    switch (type) {
-    case T_DOUBLE:
-    case T_LONG:
-    case T_ADDRESS:
-      val = pop_pair();
-      break;
-    default:
-      val = pop();
-    }
-  }
+  Node* receiver = argument(0);  // type: oop

   // Build address expression.  See the code in inline_unsafe_prefetch.
-  Node *adr;
-  Node *heap_base_oop = top();
+  Node* adr;
+  Node* heap_base_oop = top();
   Node* offset = top();
+  Node* val;

   if (!is_native_ptr) {
+    // The base is either a Java object or a value produced by Unsafe.staticFieldBase
+    Node* base = argument(1);  // type: oop
     // The offset is a value produced by Unsafe.staticFieldOffset or Unsafe.objectFieldOffset
-    offset = pop_pair();
-    // The base is either a Java object or a value produced by Unsafe.staticFieldBase
-    Node* base   = pop();
+    offset = argument(2);  // type: long
     // We currently rely on the cookies produced by Unsafe.xxxFieldOffset
     // to be plain byte offsets, which are also the same as those accepted
     // by oopDesc::field_base.
@@ -2588,18 +2339,14 @@
     offset = ConvL2X(offset);
     adr = make_unsafe_address(base, offset);
     heap_base_oop = base;
+    val = is_store ? argument(4) : NULL;
   } else {
-    Node* ptr = pop_pair();
-    // Adjust Java long to machine word:
-    ptr = ConvL2X(ptr);
+    Node* ptr = argument(1);  // type: long
+    ptr = ConvL2X(ptr);  // adjust Java long to machine word
     adr = make_unsafe_address(NULL, ptr);
+    val = is_store ? argument(3) : NULL;
   }

-  // Pop receiver last:  it was pushed first.
-  Node *receiver = pop();
-
-  assert(saved_sp == _sp, "must have correct argument count");
-
   const TypePtr *adr_type = _gvn.type(adr)->isa_ptr();

   // First guess at the value type.
@@ -2633,13 +2380,7 @@
     }
   }

-  // Null check on self without removing any arguments.  The argument
-  // null check technically happens in the wrong place, which can lead to
-  // invalid stack traces when the primitive is inlined into a method
-  // which handles NullPointerExceptions.
-  _sp += nargs;
-  do_null_check(receiver, T_OBJECT);
-  _sp -= nargs;
+  receiver = null_check(receiver);
   if (stopped()) {
     return true;
   }
@@ -2671,34 +2412,36 @@

   if (!is_store) {
     Node* p = make_load(control(), adr, value_type, type, adr_type, is_volatile);
-    // load value and push onto stack
+    // load value
     switch (type) {
     case T_BOOLEAN:
     case T_CHAR:
     case T_BYTE:
     case T_SHORT:
     case T_INT:
+    case T_LONG:
     case T_FLOAT:
-      push(p);
+    case T_DOUBLE:
       break;
     case T_OBJECT:
       if (need_read_barrier) {
-        insert_pre_barrier(heap_base_oop, offset, p, nargs, !(is_volatile || need_mem_bar));
+        insert_pre_barrier(heap_base_oop, offset, p, !(is_volatile || need_mem_bar));
       }
-      push(p);
       break;
     case T_ADDRESS:
       // Cast to an int type.
-      p = _gvn.transform( new (C) CastP2XNode(NULL,p) );
+      p = _gvn.transform(new (C) CastP2XNode(NULL, p));
       p = ConvX2L(p);
-      push_pair(p);
+      break;
+    default:
+      fatal(err_msg_res("unexpected type %d: %s", type, type2name(type)));
       break;
-    case T_DOUBLE:
-    case T_LONG:
-      push_pair( p );
-      break;
-    default: ShouldNotReachHere();
     }
+    // The load node has the control of the preceding MemBarCPUOrder.  All
+    // following nodes will have the control of the MemBarCPUOrder inserted at
+    // the end of this method.  So, pushing the load onto the stack at a later
+    // point is fine.
+    set_result(p);
   } else {
     // place effect of store into memory
     switch (type) {
@@ -2762,7 +2505,7 @@
   {
     ResourceMark rm;
     // Check the signatures.
-    ciSignature* sig = signature();
+    ciSignature* sig = callee()->signature();
 #ifdef ASSERT
     // Object getObject(Object base, int/long offset), etc.
     BasicType rtype = sig->return_type()->basic_type();
@@ -2780,19 +2523,21 @@

   C->set_has_unsafe_access(true);  // Mark eventual nmethod as "unsafe".

-  // Argument words:  "this" if not static, plus (oop/offset) or (lo/hi) args
-  int nargs = (is_static ? 0 : 1) + (is_native_ptr ? 2 : 3);
-
-  debug_only(int saved_sp = _sp);
-  _sp += nargs;
+  const int idx = is_static ? 0 : 1;
+  if (!is_static) {
+    null_check_receiver();
+    if (stopped()) {
+      return true;
+    }
+  }

   // Build address expression.  See the code in inline_unsafe_access.
   Node *adr;
   if (!is_native_ptr) {
+    // The base is either a Java object or a value produced by Unsafe.staticFieldBase
+    Node* base   = argument(idx + 0);  // type: oop
     // The offset is a value produced by Unsafe.staticFieldOffset or Unsafe.objectFieldOffset
-    Node* offset = pop_pair();
-    // The base is either a Java object or a value produced by Unsafe.staticFieldBase
-    Node* base   = pop();
+    Node* offset = argument(idx + 1);  // type: long
     // We currently rely on the cookies produced by Unsafe.xxxFieldOffset
     // to be plain byte offsets, which are also the same as those accepted
     // by oopDesc::field_base.
@@ -2802,31 +2547,11 @@
     offset = ConvL2X(offset);
     adr = make_unsafe_address(base, offset);
   } else {
-    Node* ptr = pop_pair();
-    // Adjust Java long to machine word:
-    ptr = ConvL2X(ptr);
+    Node* ptr = argument(idx + 0);  // type: long
+    ptr = ConvL2X(ptr);  // adjust Java long to machine word
     adr = make_unsafe_address(NULL, ptr);
   }

-  if (is_static) {
-    assert(saved_sp == _sp, "must have correct argument count");
-  } else {
-    // Pop receiver last:  it was pushed first.
-    Node *receiver = pop();
-    assert(saved_sp == _sp, "must have correct argument count");
-
-    // Null check on self without removing any arguments.  The argument
-    // null check technically happens in the wrong place, which can lead to
-    // invalid stack traces when the primitive is inlined into a method
-    // which handles NullPointerExceptions.
-    _sp += nargs;
-    do_null_check(receiver, T_OBJECT);
-    _sp -= nargs;
-    if (stopped()) {
-      return true;
-    }
-  }
-
   // Generate the read or write prefetch
   Node *prefetch;
   if (is_store) {
@@ -2841,7 +2566,22 @@
 }

 //----------------------------inline_unsafe_load_store----------------------------
-
+// This method serves a couple of different customers (depending on LoadStoreKind):
+//
+// LS_cmpxchg:
+//   public final native boolean compareAndSwapObject(Object o, long offset, Object expected, Object x);
+//   public final native boolean compareAndSwapInt(   Object o, long offset, int    expected, int    x);
+//   public final native boolean compareAndSwapLong(  Object o, long offset, long   expected, long   x);
+//
+// LS_xadd:
+//   public int  getAndAddInt( Object o, long offset, int  delta)
+//   public long getAndAddLong(Object o, long offset, long delta)
+//
+// LS_xchg:
+//   int    getAndSet(Object o, long offset, int    newValue)
+//   long   getAndSet(Object o, long offset, long   newValue)
+//   Object getAndSet(Object o, long offset, Object newValue)
+//
 bool LibraryCallKit::inline_unsafe_load_store(BasicType type, LoadStoreKind kind) {
   // This basic scheme here is the same as inline_unsafe_access, but
   // differs in enough details that combining them would make the code
@@ -2856,7 +2596,8 @@
   BasicType rtype;
   {
     ResourceMark rm;
-    ciSignature* sig = signature();
+    // Check the signatures.
+    ciSignature* sig = callee()->signature();
     rtype = sig->return_type()->basic_type();
     if (kind == LS_xadd || kind == LS_xchg) {
       // Check the signatures.
@@ -2881,28 +2622,31 @@
   }
 #endif //PRODUCT

-  // number of stack slots per value argument (1 or 2)
-  int type_words = type2size[type];
-
   C->set_has_unsafe_access(true);  // Mark eventual nmethod as "unsafe".

-  // Argument words:  "this" plus oop plus offset (plus oldvalue) plus newvalue/delta;
-  int nargs = 1 + 1 + 2  + ((kind == LS_cmpxchg) ? type_words : 0) + type_words;
-
-  // pop arguments: newval, offset, base, and receiver
-  debug_only(int saved_sp = _sp);
-  _sp += nargs;
-  Node* newval   = (type_words == 1) ? pop() : pop_pair();
-  Node* oldval   = (kind == LS_cmpxchg) ? ((type_words == 1) ? pop() : pop_pair()) : NULL;
-  Node *offset   = pop_pair();
-  Node *base     = pop();
-  Node *receiver = pop();
-  assert(saved_sp == _sp, "must have correct argument count");
-
-  //  Null check receiver.
-  _sp += nargs;
-  do_null_check(receiver, T_OBJECT);
-  _sp -= nargs;
+  // Get arguments:
+  Node* receiver = NULL;
+  Node* base     = NULL;
+  Node* offset   = NULL;
+  Node* oldval   = NULL;
+  Node* newval   = NULL;
+  if (kind == LS_cmpxchg) {
+    const bool two_slot_type = type2size[type] == 2;
+    receiver = argument(0);  // type: oop
+    base     = argument(1);  // type: oop
+    offset   = argument(2);  // type: long
+    oldval   = argument(4);  // type: oop, int, or long
+    newval   = argument(two_slot_type ? 6 : 5);  // type: oop, int, or long
+  } else if (kind == LS_xadd || kind == LS_xchg){
+    receiver = argument(0);  // type: oop
+    base     = argument(1);  // type: oop
+    offset   = argument(2);  // type: long
+    oldval   = NULL;
+    newval   = argument(4);  // type: oop, int, or long
+  }
+
+  // Null check receiver.
+  receiver = null_check(receiver);
   if (stopped()) {
     return true;
   }
@@ -3008,7 +2752,7 @@
     post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true);
     break;
   default:
-    ShouldNotReachHere();
+    fatal(err_msg_res("unexpected type %d: %s", type, type2name(type)));
     break;
   }

@@ -3029,10 +2773,14 @@
 #endif

   assert(type2size[load_store->bottom_type()->basic_type()] == type2size[rtype], "result type should match");
-  push_node(load_store->bottom_type()->basic_type(), load_store);
+  set_result(load_store);
   return true;
 }

+//----------------------------inline_unsafe_ordered_store----------------------
+// public native void sun.misc.Unsafe.putOrderedObject(Object o, long offset, Object x);
+// public native void sun.misc.Unsafe.putOrderedInt(Object o, long offset, int x);
+// public native void sun.misc.Unsafe.putOrderedLong(Object o, long offset, long x);
 bool LibraryCallKit::inline_unsafe_ordered_store(BasicType type) {
   // This is another variant of inline_unsafe_access, differing in
   // that it always issues store-store ("release") barrier and ensures
@@ -3044,7 +2792,7 @@
   {
     ResourceMark rm;
     // Check the signatures.
-    ciSignature* sig = signature();
+    ciSignature* sig = callee()->signature();
 #ifdef ASSERT
     BasicType rtype = sig->return_type()->basic_type();
     assert(rtype == T_VOID, "must return void");
@@ -3055,27 +2803,16 @@
   }
 #endif //PRODUCT

-  // number of stack slots per value argument (1 or 2)
-  int type_words = type2size[type];
-
   C->set_has_unsafe_access(true);  // Mark eventual nmethod as "unsafe".

-  // Argument words:  "this" plus oop plus offset plus value;
-  int nargs = 1 + 1 + 2 + type_words;
-
-  // pop arguments: val, offset, base, and receiver
-  debug_only(int saved_sp = _sp);
-  _sp += nargs;
-  Node* val      = (type_words == 1) ? pop() : pop_pair();
-  Node *offset   = pop_pair();
-  Node *base     = pop();
-  Node *receiver = pop();
-  assert(saved_sp == _sp, "must have correct argument count");
-
-  //  Null check receiver.
-  _sp += nargs;
-  do_null_check(receiver, T_OBJECT);
-  _sp -= nargs;
+  // Get arguments:
+  Node* receiver = argument(0);  // type: oop
+  Node* base     = argument(1);  // type: oop
+  Node* offset   = argument(2);  // type: long
+  Node* val      = argument(4);  // type: oop, int, or long
+
+  // Null check receiver.
+  receiver = null_check(receiver);
   if (stopped()) {
     return true;
   }
@@ -3092,7 +2829,7 @@
   insert_mem_bar(Op_MemBarRelease);
   insert_mem_bar(Op_MemBarCPUOrder);
   // Ensure that the store is atomic for longs:
-  bool require_atomic_access = true;
+  const bool require_atomic_access = true;
   Node* store;
   if (type == T_OBJECT) // reference stores need a store barrier.
     store = store_oop_to_unknown(control(), base, adr, adr_type, val, type);
@@ -3103,20 +2840,17 @@
   return true;
 }

+//----------------------------inline_unsafe_allocate---------------------------
+// public native Object sun.mics.Unsafe.allocateInstance(Class<?> cls);
 bool LibraryCallKit::inline_unsafe_allocate() {
   if (callee()->is_static())  return false;  // caller must have the capability!
-  int nargs = 1 + 1;
-  assert(signature()->size() == nargs-1, "alloc has 1 argument");
-  null_check_receiver(callee());  // check then ignore argument(0)
-  _sp += nargs;  // set original stack for use by uncommon_trap
-  Node* cls = do_null_check(argument(1), T_OBJECT);
-  _sp -= nargs;
+
+  null_check_receiver();  // null-check, then ignore
+  Node* cls = null_check(argument(1));
   if (stopped())  return true;

-  Node* kls = load_klass_from_mirror(cls, false, nargs, NULL, 0);
-  _sp += nargs;  // set original stack for use by uncommon_trap
-  kls = do_null_check(kls, T_OBJECT);
-  _sp -= nargs;
+  Node* kls = load_klass_from_mirror(cls, false, NULL, 0);
+  kls = null_check(kls);
   if (stopped())  return true;  // argument was like int.class

   // Note:  The argument might still be an illegal value like
@@ -3127,12 +2861,11 @@
   // can generate code to load it as unsigned byte.
   Node* inst = make_load(NULL, insp, TypeInt::UBYTE, T_BOOLEAN);
   Node* bits = intcon(instanceKlass::fully_initialized);
-  Node* test = _gvn.transform( new (C) SubINode(inst, bits) );
+  Node* test = _gvn.transform(new (C) SubINode(inst, bits));
   // The 'test' is non-zero if we need to take a slow path.

   Node* obj = new_instance(kls, test);
-  push(obj);
-
+  set_result(obj);
   return true;
 }

@@ -3143,15 +2876,10 @@
  * return myklass->trace_id & ~0x3
  */
 bool LibraryCallKit::inline_native_classID() {
-  int nargs = 1 + 1;
-  null_check_receiver(callee());  // check then ignore argument(0)
-  _sp += nargs;
-  Node* cls = do_null_check(argument(1), T_OBJECT);
-  _sp -= nargs;
-  Node* kls = load_klass_from_mirror(cls, false, nargs, NULL, 0);
-  _sp += nargs;
-  kls = do_null_check(kls, T_OBJECT);
-  _sp -= nargs;
+  null_check_receiver();  // null-check, then ignore
+  Node* cls = null_check(argument(1), T_OBJECT);
+  Node* kls = load_klass_from_mirror(cls, false, NULL, 0);
+  kls = null_check(kls, T_OBJECT);
   ByteSize offset = TRACE_ID_OFFSET;
   Node* insp = basic_plus_adr(kls, in_bytes(offset));
   Node* tvalue = make_load(NULL, insp, TypeLong::LONG, T_LONG);
@@ -3162,7 +2890,7 @@

   const TypePtr *adr_type = _gvn.type(insp)->isa_ptr();
   store_to_memory(control(), insp, orl, T_LONG, adr_type);
-  push_pair(andl);
+  set_result(andl);
   return true;
 }

@@ -3177,13 +2905,12 @@
   size_t thread_id_size = OSThread::thread_id_size();
   if (thread_id_size == (size_t) BytesPerLong) {
     threadid = ConvL2I(make_load(control(), p, TypeLong::LONG, T_LONG));
-    push(threadid);
   } else if (thread_id_size == (size_t) BytesPerInt) {
     threadid = make_load(control(), p, TypeInt::INT, T_INT);
-    push(threadid);
   } else {
     ShouldNotReachHere();
   }
+  set_result(threadid);
   return true;
 }
 #endif
@@ -3192,29 +2919,28 @@
 // inline code for System.currentTimeMillis() and System.nanoTime()
 // these have the same type and signature
 bool LibraryCallKit::inline_native_time_funcs(address funcAddr, const char* funcName) {
-  const TypeFunc *tf = OptoRuntime::void_long_Type();
+  const TypeFunc* tf = OptoRuntime::void_long_Type();
   const TypePtr* no_memory_effects = NULL;
   Node* time = make_runtime_call(RC_LEAF, tf, funcAddr, funcName, no_memory_effects);
   Node* value = _gvn.transform(new (C) ProjNode(time, TypeFunc::Parms+0));
 #ifdef ASSERT
-  Node* value_top = _gvn.transform(new (C) ProjNode(time, TypeFunc::Parms + 1));
+  Node* value_top = _gvn.transform(new (C) ProjNode(time, TypeFunc::Parms+1));
   assert(value_top == top(), "second value must be top");
 #endif
-  push_pair(value);
+  set_result(value);
   return true;
 }

 //------------------------inline_native_currentThread------------------
 bool LibraryCallKit::inline_native_currentThread() {
   Node* junk = NULL;
-  push(generate_current_thread(junk));
+  set_result(generate_current_thread(junk));
   return true;
 }

 //------------------------inline_native_isInterrupted------------------
+// private native boolean java.lang.Thread.isInterrupted(boolean ClearInterrupted);
 bool LibraryCallKit::inline_native_isInterrupted() {
-  const int nargs = 1+1;  // receiver + boolean
-  assert(nargs == arg_size(), "sanity");
   // Add a fast path to t.isInterrupted(clear_int):
   //   (t == Thread.current() && (!TLS._osthread._interrupted || !clear_int))
   //   ? TLS._osthread._interrupted : /*slow path:*/ t.isInterrupted(clear_int)
@@ -3226,14 +2952,23 @@

   // We only go to the fast case code if we pass two guards.
   // Paths which do not pass are accumulated in the slow_region.
+
+  enum {
+    no_int_result_path   = 1, // t == Thread.current() && !TLS._osthread._interrupted
+    no_clear_result_path = 2, // t == Thread.current() &&  TLS._osthread._interrupted && !clear_int
+    slow_result_path     = 3, // slow path: t.isInterrupted(clear_int)
+    PATH_LIMIT
+  };
+
+  // Ensure that it's not possible to move the load of TLS._osthread._interrupted flag
+  // out of the function.
+  insert_mem_bar(Op_MemBarCPUOrder);
+
+  RegionNode* result_rgn = new (C) RegionNode(PATH_LIMIT);
+  PhiNode*    result_val = new (C) PhiNode(result_rgn, TypeInt::BOOL);
+
   RegionNode* slow_region = new (C) RegionNode(1);
   record_for_igvn(slow_region);
-  RegionNode* result_rgn = new (C) RegionNode(1+3); // fast1, fast2, slow
-  PhiNode*    result_val = new (C) PhiNode(result_rgn, TypeInt::BOOL);
-  enum { no_int_result_path   = 1,
-         no_clear_result_path = 2,
-         slow_result_path     = 3
-  };

   // (a) Receiving thread must be the current thread.
   Node* rec_thr = argument(0);
@@ -3242,14 +2977,13 @@
   Node* cmp_thr = _gvn.transform( new (C) CmpPNode(cur_thr, rec_thr) );
   Node* bol_thr = _gvn.transform( new (C) BoolNode(cmp_thr, BoolTest::ne) );

-  bool known_current_thread = (_gvn.type(bol_thr) == TypeInt::ZERO);
-  if (!known_current_thread)
-    generate_slow_guard(bol_thr, slow_region);
+  generate_slow_guard(bol_thr, slow_region);

   // (b) Interrupt bit on TLS must be false.
   Node* p = basic_plus_adr(top()/*!oop*/, tls_ptr, in_bytes(JavaThread::osthread_offset()));
   Node* osthread = make_load(NULL, p, TypeRawPtr::NOTNULL, T_ADDRESS);
   p = basic_plus_adr(top()/*!oop*/, osthread, in_bytes(OSThread::interrupted_offset()));
+
   // Set the control input on the field _interrupted read to prevent it floating up.
   Node* int_bit = make_load(control(), p, TypeInt::BOOL, T_INT);
   Node* cmp_bit = _gvn.transform( new (C) CmpINode(int_bit, intcon(0)) );
@@ -3294,27 +3028,24 @@
     Node* slow_val = set_results_for_java_call(slow_call);
     // this->control() comes from set_results_for_java_call

-    // If we know that the result of the slow call will be true, tell the optimizer!
-    if (known_current_thread)  slow_val = intcon(1);
-
     Node* fast_io  = slow_call->in(TypeFunc::I_O);
     Node* fast_mem = slow_call->in(TypeFunc::Memory);
+
     // These two phis are pre-filled with copies of of the fast IO and Memory
-    Node* io_phi   = PhiNode::make(result_rgn, fast_io,  Type::ABIO);
-    Node* mem_phi  = PhiNode::make(result_rgn, fast_mem, Type::MEMORY, TypePtr::BOTTOM);
+    PhiNode* result_mem  = PhiNode::make(result_rgn, fast_mem, Type::MEMORY, TypePtr::BOTTOM);
+    PhiNode* result_io   = PhiNode::make(result_rgn, fast_io,  Type::ABIO);

     result_rgn->init_req(slow_result_path, control());
-    io_phi    ->init_req(slow_result_path, i_o());
-    mem_phi   ->init_req(slow_result_path, reset_memory());
+    result_io ->init_req(slow_result_path, i_o());
+    result_mem->init_req(slow_result_path, reset_memory());
     result_val->init_req(slow_result_path, slow_val);

-    set_all_memory( _gvn.transform(mem_phi) );
-    set_i_o(        _gvn.transform(io_phi) );
+    set_all_memory(_gvn.transform(result_mem));
+    set_i_o(       _gvn.transform(result_io));
   }

-  push_result(result_rgn, result_val);
   C->set_has_split_ifs(true); // Has chance for split-if optimization
-
+  set_result(result_rgn, result_val);
   return true;
 }

@@ -3334,7 +3065,6 @@
 // If the region is NULL, force never_see_null = true.
 Node* LibraryCallKit::load_klass_from_mirror_common(Node* mirror,
                                                     bool never_see_null,
-                                                    int nargs,
                                                     RegionNode* region,
                                                     int null_path,
                                                     int offset) {
@@ -3342,7 +3072,6 @@
   Node* p = basic_plus_adr(mirror, offset);
   const TypeKlassPtr*  kls_type = TypeKlassPtr::OBJECT_OR_NULL;
   Node* kls = _gvn.transform( LoadKlassNode::make(_gvn, immutable_memory(), p, TypeRawPtr::BOTTOM, kls_type) );
-  _sp += nargs; // any deopt will start just before call to enclosing method
   Node* null_ctl = top();
   kls = null_check_oop(kls, &null_ctl, never_see_null);
   if (region != NULL) {
@@ -3351,7 +3080,6 @@
   } else {
     assert(null_ctl == top(), "no loose ends");
   }
-  _sp -= nargs;
   return kls;
 }

@@ -3376,7 +3104,6 @@

 //-------------------------inline_native_Class_query-------------------
 bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
-  int nargs = 1+0;  // just the Class mirror, in most cases
   const Type* return_type = TypeInt::BOOL;
   Node* prim_return_value = top();  // what happens if it's a primitive class?
   bool never_see_null = !too_many_traps(Deoptimization::Reason_null_check);
@@ -3384,11 +3111,14 @@

   enum { _normal_path = 1, _prim_path = 2, PATH_LIMIT };

+  Node* mirror = argument(0);
+  Node* obj    = top();
+
   switch (id) {
   case vmIntrinsics::_isInstance:
-    nargs = 1+1;  // the Class mirror, plus the object getting queried about
     // nothing is an instance of a primitive type
     prim_return_value = intcon(0);
+    obj = argument(1);
     break;
   case vmIntrinsics::_getModifiers:
     prim_return_value = intcon(JVM_ACC_ABSTRACT | JVM_ACC_FINAL | JVM_ACC_PUBLIC);
@@ -3419,12 +3149,10 @@
     return_type = TypeInt::INT;  // not bool!  6297094
     break;
   default:
-    ShouldNotReachHere();
+    fatal_unexpected_iid(id);
+    break;
   }

-  Node* mirror =                      argument(0);
-  Node* obj    = (nargs <= 1)? top(): argument(1);
-
   const TypeInstPtr* mirror_con = _gvn.type(mirror)->isa_instptr();
   if (mirror_con == NULL)  return false;  // cannot happen?

@@ -3451,9 +3179,7 @@
   // For Reflection.getClassAccessFlags(), the null check occurs in
   // the wrong place; see inline_unsafe_access(), above, for a similar
   // situation.
-  _sp += nargs;  // set original stack for use by uncommon_trap
-  mirror = do_null_check(mirror, T_OBJECT);
-  _sp -= nargs;
+  mirror = null_check(mirror);
   // If mirror or obj is dead, only null-path is taken.
   if (stopped())  return true;

@@ -3461,11 +3187,10 @@

   // Now load the mirror's klass metaobject, and null-check it.
   // Side-effects region with the control path if the klass is null.
-  Node* kls = load_klass_from_mirror(mirror, never_see_null, nargs,
-                                     region, _prim_path);
+  Node* kls = load_klass_from_mirror(mirror, never_see_null, region, _prim_path);
   // If kls is null, we have a primitive mirror.
   phi->init_req(_prim_path, prim_return_value);
-  if (stopped()) { push_result(region, phi); return true; }
+  if (stopped()) { set_result(region, phi); return true; }

   Node* p;  // handy temp
   Node* null_ctl;
@@ -3476,9 +3201,7 @@
   switch (id) {
   case vmIntrinsics::_isInstance:
     // nothing is an instance of a primitive type
-    _sp += nargs;          // gen_instanceof might do an uncommon trap
     query_value = gen_instanceof(obj, kls);
-    _sp -= nargs;
     break;

   case vmIntrinsics::_getModifiers:
@@ -3553,16 +3276,16 @@
     break;

   default:
-    ShouldNotReachHere();
+    fatal_unexpected_iid(id);
+    break;
   }

   // Fall-through is the normal case of a query to a real class.
   phi->init_req(1, query_value);
   region->init_req(1, control());

-  push_result(region, phi);
   C->set_has_split_ifs(true); // Has chance for split-if optimization
-
+  set_result(region, phi);
   return true;
 }

@@ -3570,8 +3293,6 @@
 // This intrinsic takes the JNI calls out of the heart of
 // UnsafeFieldAccessorImpl.set, which improves Field.set, readObject, etc.
 bool LibraryCallKit::inline_native_subtype_check() {
-  int nargs = 1+1;  // the Class mirror, plus the other class getting examined
-
   // Pull both arguments off the stack.
   Node* args[2];                // two java.lang.Class mirrors: superc, subc
   args[0] = argument(0);
@@ -3602,11 +3323,9 @@
   int which_arg;
   for (which_arg = 0; which_arg <= 1; which_arg++) {
     Node* arg = args[which_arg];
-    _sp += nargs;  // set original stack for use by uncommon_trap
-    arg = do_null_check(arg, T_OBJECT);
-    _sp -= nargs;
+    arg = null_check(arg);
     if (stopped())  break;
-    args[which_arg] = _gvn.transform(arg);
+    args[which_arg] = arg;

     Node* p = basic_plus_adr(arg, class_klass_offset);
     Node* kls = LoadKlassNode::make(_gvn, immutable_memory(), p, adr_type, kls_type);
@@ -3618,9 +3337,7 @@
   for (which_arg = 0; which_arg <= 1; which_arg++) {
     Node* kls = klasses[which_arg];
     Node* null_ctl = top();
-    _sp += nargs;  // set original stack for use by uncommon_trap
     kls = null_check_oop(kls, &null_ctl, never_see_null);
-    _sp -= nargs;
     int prim_path = (which_arg == 0 ? _prim_0_path : _prim_1_path);
     region->init_req(prim_path, null_ctl);
     if (stopped())  break;
@@ -3670,8 +3387,7 @@
   }

   set_control(_gvn.transform(region));
-  push(_gvn.transform(phi));
-
+  set_result(_gvn.transform(phi));
   return true;
 }

@@ -3719,14 +3435,12 @@


 //-----------------------inline_native_newArray--------------------------
+// private static native Object java.lang.reflect.newArray(Class<?> componentType, int length);
 bool LibraryCallKit::inline_native_newArray() {
-  int nargs = 2;
   Node* mirror    = argument(0);
   Node* count_val = argument(1);

-  _sp += nargs;  // set original stack for use by uncommon_trap
-  mirror = do_null_check(mirror, T_OBJECT);
-  _sp -= nargs;
+  mirror = null_check(mirror);
   // If mirror or obj is dead, only null-path is taken.
   if (stopped())  return true;

@@ -3740,7 +3454,6 @@

   bool never_see_null = !too_many_traps(Deoptimization::Reason_null_check);
   Node* klass_node = load_array_klass_from_mirror(mirror, never_see_null,
-                                                  nargs,
                                                   result_reg, _slow_path);
   Node* normal_ctl   = control();
   Node* no_array_ctl = result_reg->in(_slow_path);
@@ -3767,7 +3480,7 @@
     // Normal case:  The array type has been cached in the java.lang.Class.
     // The following call works fine even if the array type is polymorphic.
     // It could be a dynamic mix of int[], boolean[], Object[], etc.
-    Node* obj = new_array(klass_node, count_val, nargs);
+    Node* obj = new_array(klass_node, count_val, 0);  // no arguments to push
     result_reg->init_req(_normal_path, control());
     result_val->init_req(_normal_path, obj);
     result_io ->init_req(_normal_path, i_o());
@@ -3777,23 +3490,18 @@
   // Return the combined state.
   set_i_o(        _gvn.transform(result_io)  );
   set_all_memory( _gvn.transform(result_mem) );
-  push_result(result_reg, result_val);
+
   C->set_has_split_ifs(true); // Has chance for split-if optimization
-
+  set_result(result_reg, result_val);
   return true;
 }

 //----------------------inline_native_getLength--------------------------
+// public static native int java.lang.reflect.Array.getLength(Object array);
 bool LibraryCallKit::inline_native_getLength() {
   if (too_many_traps(Deoptimization::Reason_intrinsic))  return false;

-  int nargs = 1;
-  Node* array = argument(0);
-
-  _sp += nargs;  // set original stack for use by uncommon_trap
-  array = do_null_check(array, T_OBJECT);
-  _sp -= nargs;
-
+  Node* array = null_check(argument(0));
   // If array is dead, only null-path is taken.
   if (stopped())  return true;

@@ -3803,7 +3511,6 @@
   if (non_array != NULL) {
     PreserveJVMState pjvms(this);
     set_control(non_array);
-    _sp += nargs;  // push the arguments back on the stack
     uncommon_trap(Deoptimization::Reason_intrinsic,
                   Deoptimization::Action_maybe_recompile);
   }
@@ -3813,19 +3520,21 @@

   // The works fine even if the array type is polymorphic.
   // It could be a dynamic mix of int[], boolean[], Object[], etc.
-  push( load_array_length(array) );
-
-  C->set_has_split_ifs(true); // Has chance for split-if optimization
-
+  Node* result = load_array_length(array);
+
+  C->set_has_split_ifs(true);  // Has chance for split-if optimization
+  set_result(result);
   return true;
 }

 //------------------------inline_array_copyOf----------------------------
+// public static <T,U> T[] java.util.Arrays.copyOf(     U[] original, int newLength,         Class<? extends T[]> newType);
+// public static <T,U> T[] java.util.Arrays.copyOfRange(U[] original, int from,      int to, Class<? extends T[]> newType);
 bool LibraryCallKit::inline_array_copyOf(bool is_copyOfRange) {
+  return false;
   if (too_many_traps(Deoptimization::Reason_intrinsic))  return false;

-  // Restore the stack and pop off the arguments.
-  int nargs = 3 + (is_copyOfRange? 1: 0);
+  // Get the arguments.
   Node* original          = argument(0);
   Node* start             = is_copyOfRange? argument(1): intcon(0);
   Node* end               = is_copyOfRange? argument(2): argument(1);
@@ -3833,23 +3542,21 @@

   Node* newcopy;

-  //set the original stack and the reexecute bit for the interpreter to reexecute
-  //the bytecode that invokes Arrays.copyOf if deoptimization happens
+  // Set the original stack and the reexecute bit for the interpreter to reexecute
+  // the bytecode that invokes Arrays.copyOf if deoptimization happens.
   { PreserveReexecuteState preexecs(this);
-    _sp += nargs;
     jvms()->set_should_reexecute(true);

-    array_type_mirror = do_null_check(array_type_mirror, T_OBJECT);
-    original          = do_null_check(original, T_OBJECT);
+    array_type_mirror = null_check(array_type_mirror);
+    original          = null_check(original);

     // Check if a null path was taken unconditionally.
     if (stopped())  return true;

     Node* orig_length = load_array_length(original);

-    Node* klass_node = load_klass_from_mirror(array_type_mirror, false, 0,
-                                              NULL, 0);
-    klass_node = do_null_check(klass_node, T_OBJECT);
+    Node* klass_node = load_klass_from_mirror(array_type_mirror, false, NULL, 0);
+    klass_node = null_check(klass_node);

     RegionNode* bailout = new (C) RegionNode(1);
     record_for_igvn(bailout);
@@ -3872,7 +3579,7 @@

     Node* length = end;
     if (_gvn.type(start) != TypeInt::ZERO) {
-      length = _gvn.transform( new (C) SubINode(end, start) );
+      length = _gvn.transform(new (C) SubINode(end, start));
     }

     // Bail out if length is negative.
@@ -3883,19 +3590,18 @@

     if (bailout->req() > 1) {
       PreserveJVMState pjvms(this);
-      set_control( _gvn.transform(bailout) );
+      set_control(_gvn.transform(bailout));
       uncommon_trap(Deoptimization::Reason_intrinsic,
                     Deoptimization::Action_maybe_recompile);
     }

     if (!stopped()) {
-
       // How many elements will we copy from the original?
       // The answer is MinI(orig_length - start, length).
-      Node* orig_tail = _gvn.transform( new(C) SubINode(orig_length, start) );
+      Node* orig_tail = _gvn.transform(new (C) SubINode(orig_length, start));
       Node* moved = generate_min_max(vmIntrinsics::_min, orig_tail, length);

-      newcopy = new_array(klass_node, length, 0);
+      newcopy = new_array(klass_node, length, 0);  // no argments to push

       // Generate a direct call to the right arraycopy function(s).
       // We know the copy is disjoint but we might not know if the
@@ -3910,14 +3616,12 @@
                          original, start, newcopy, intcon(0), moved,
                          disjoint_bases, length_never_negative);
     }
-  } //original reexecute and sp are set back here
-
-  if(!stopped()) {
-    push(newcopy);
-  }
+  } // original reexecute is set back here

   C->set_has_split_ifs(true); // Has chance for split-if optimization
-
+  if (!stopped()) {
+    set_result(newcopy);
+  }
   return true;
 }

@@ -3969,7 +3673,7 @@
                            SharedRuntime::get_resolve_static_call_stub(),
                            method, bci());
   } else if (is_virtual) {
-    null_check_receiver(method);
+    null_check_receiver();
     int vtable_index = methodOopDesc::invalid_vtable_index;
     if (UseInlineCaches) {
       // Suppress the vtable call
@@ -3983,7 +3687,7 @@
                           SharedRuntime::get_resolve_virtual_call_stub(),
                           method, vtable_index, bci());
   } else {  // neither virtual nor static:  opt_virtual
-    null_check_receiver(method);
+    null_check_receiver();
     slow_call = new(C) CallStaticJavaNode(tf,
                                 SharedRuntime::get_resolve_opt_virtual_call_stub(),
                                 method, bci());
@@ -4012,7 +3716,7 @@
   Node* obj = NULL;
   if (!is_static) {
     // Check for hashing null object
-    obj = null_check_receiver(callee());
+    obj = null_check_receiver();
     if (stopped())  return true;        // unconditionally null
     result_reg->init_req(_null_path, top());
     result_val->init_req(_null_path, top());
@@ -4028,9 +3732,9 @@

   // Unconditionally null?  Then return right away.
   if (stopped()) {
-    set_control( result_reg->in(_null_path) );
+    set_control( result_reg->in(_null_path));
     if (!stopped())
-      push(      result_val ->in(_null_path) );
+      set_result(result_val->in(_null_path));
     return true;
   }

@@ -4103,8 +3807,7 @@
   if (!stopped()) {
     // No need for PreserveJVMState, because we're using up the present state.
     set_all_memory(init_mem);
-    vmIntrinsics::ID hashCode_id = vmIntrinsics::_hashCode;
-    if (is_static)   hashCode_id = vmIntrinsics::_identityHashCode;
+    vmIntrinsics::ID hashCode_id = is_static ? vmIntrinsics::_identityHashCode : vmIntrinsics::_hashCode;
     CallJavaNode* slow_call = generate_method_call(hashCode_id, is_virtual, is_static);
     Node* slow_result = set_results_for_java_call(slow_call);
     // this->control() comes from set_results_for_java_call
@@ -4117,48 +3820,38 @@
   // Return the combined state.
   set_i_o(        _gvn.transform(result_io)  );
   set_all_memory( _gvn.transform(result_mem) );
-  push_result(result_reg, result_val);
-
+
+  set_result(result_reg, result_val);
   return true;
 }

 //---------------------------inline_native_getClass----------------------------
+// public final native Class<?> java.lang.Object.getClass();
+//
 // Build special case code for calls to getClass on an object.
 bool LibraryCallKit::inline_native_getClass() {
-  Node* obj = null_check_receiver(callee());
+  Node* obj = null_check_receiver();
   if (stopped())  return true;
-  push( load_mirror_from_klass(load_object_klass(obj)) );
+  set_result(load_mirror_from_klass(load_object_klass(obj)));
   return true;
 }

 //-----------------inline_native_Reflection_getCallerClass---------------------
+// public static native Class<?> sun.reflect.Reflection.getCallerClass(int realFramesToSkip);
+//
 // In the presence of deep enough inlining, getCallerClass() becomes a no-op.
 //
 // NOTE that this code must perform the same logic as
 // vframeStream::security_get_caller_frame in that it must skip
 // Method.invoke() and auxiliary frames.
-
-
-
-
 bool LibraryCallKit::inline_native_Reflection_getCallerClass() {
-  ciMethod*       method = callee();
-
 #ifndef PRODUCT
   if ((PrintIntrinsics || PrintInlining || PrintOptoInlining) && Verbose) {
     tty->print_cr("Attempting to inline sun.reflect.Reflection.getCallerClass");
   }
 #endif

-  debug_only(int saved_sp = _sp);
-
-  // Argument words:  (int depth)
-  int nargs = 1;
-
-  _sp += nargs;
-  Node* caller_depth_node = pop();
-
-  assert(saved_sp == _sp, "must have correct argument count");
+  Node* caller_depth_node = argument(0);

   // The depth value must be a constant in order for the runtime call
   // to be eliminated.
@@ -4230,7 +3923,8 @@
       tty->print_cr("  Bailing out because caller depth (%d) exceeded inlining depth (%d)", caller_depth_type->get_con(), _depth);
       tty->print_cr("  JVM state at this point:");
       for (int i = _depth; i >= 1; i--) {
-        tty->print_cr("   %d) %s", i, jvms()->of_depth(i)->method()->name()->as_utf8());
+        ciMethod* m = jvms()->of_depth(i)->method();
+        tty->print_cr("   %d) %s.%s", i, m->holder()->name()->as_utf8(), m->name()->as_utf8());
       }
     }
 #endif
@@ -4240,14 +3934,17 @@
   // Acquire method holder as java.lang.Class
   ciInstanceKlass* caller_klass  = caller_jvms->method()->holder();
   ciInstance*      caller_mirror = caller_klass->java_mirror();
+
   // Push this as a constant
-  push(makecon(TypeInstPtr::make(caller_mirror)));
+  set_result(makecon(TypeInstPtr::make(caller_mirror)));
+
 #ifndef PRODUCT
   if ((PrintIntrinsics || PrintInlining || PrintOptoInlining) && Verbose) {
     tty->print_cr("  Succeeded: caller = %s.%s, caller depth = %d, depth = %d", caller_klass->name()->as_utf8(), caller_jvms->method()->name()->as_utf8(), caller_depth_type->get_con(), _depth);
     tty->print_cr("  JVM state at this point:");
     for (int i = _depth; i >= 1; i--) {
-      tty->print_cr("   %d) %s", i, jvms()->of_depth(i)->method()->name()->as_utf8());
+      ciMethod* m = jvms()->of_depth(i)->method();
+      tty->print_cr("   %d) %s.%s", i, m->holder()->name()->as_utf8(), m->name()->as_utf8());
     }
   }
 #endif
@@ -4283,36 +3980,23 @@
 }

 bool LibraryCallKit::inline_fp_conversions(vmIntrinsics::ID id) {
-  // restore the arguments
-  _sp += arg_size();
+  Node* arg = argument(0);
+  Node* result;

   switch (id) {
-  case vmIntrinsics::_floatToRawIntBits:
-    push(_gvn.transform( new (C) MoveF2INode(pop())));
-    break;
-
-  case vmIntrinsics::_intBitsToFloat:
-    push(_gvn.transform( new (C) MoveI2FNode(pop())));
-    break;
-
-  case vmIntrinsics::_doubleToRawLongBits:
-    push_pair(_gvn.transform( new (C) MoveD2LNode(pop_pair())));
-    break;
-
-  case vmIntrinsics::_longBitsToDouble:
-    push_pair(_gvn.transform( new (C) MoveL2DNode(pop_pair())));
-    break;
+  case vmIntrinsics::_floatToRawIntBits:    result = new (C) MoveF2INode(arg);  break;
+  case vmIntrinsics::_intBitsToFloat:       result = new (C) MoveI2FNode(arg);  break;
+  case vmIntrinsics::_doubleToRawLongBits:  result = new (C) MoveD2LNode(arg);  break;
+  case vmIntrinsics::_longBitsToDouble:     result = new (C) MoveL2DNode(arg);  break;

   case vmIntrinsics::_doubleToLongBits: {
-    Node* value = pop_pair();
-
     // two paths (plus control) merge in a wood
     RegionNode *r = new (C) RegionNode(3);
     Node *phi = new (C) PhiNode(r, TypeLong::LONG);

-    Node *cmpisnan = _gvn.transform( new (C) CmpDNode(value, value));
+    Node *cmpisnan = _gvn.transform(new (C) CmpDNode(arg, arg));
     // Build the boolean node
-    Node *bolisnan = _gvn.transform( new (C) BoolNode( cmpisnan, BoolTest::ne ) );
+    Node *bolisnan = _gvn.transform(new (C) BoolNode(cmpisnan, BoolTest::ne));

     // Branch either way.
     // NaN case is less traveled, which makes all the difference.
@@ -4330,35 +4014,30 @@
     r->init_req(1, iftrue);

     // Else fall through
-    Node *iffalse = _gvn.transform( new (C) IfFalseNode(opt_ifisnan) );
+    Node *iffalse = _gvn.transform(new (C) IfFalseNode(opt_ifisnan));
     set_control(iffalse);

-    phi->init_req(2, _gvn.transform( new (C) MoveD2LNode(value)));
+    phi->init_req(2, _gvn.transform(new (C) MoveD2LNode(arg)));
     r->init_req(2, iffalse);

     // Post merge
     set_control(_gvn.transform(r));
     record_for_igvn(r);

-    Node* result = _gvn.transform(phi);
+    C->set_has_split_ifs(true); // Has chance for split-if optimization
+    result = phi;
     assert(result->bottom_type()->isa_long(), "must be");
-    push_pair(result);
-
-    C->set_has_split_ifs(true); // Has chance for split-if optimization
-
     break;
   }

   case vmIntrinsics::_floatToIntBits: {
-    Node* value = pop();
-
     // two paths (plus control) merge in a wood
     RegionNode *r = new (C) RegionNode(3);
     Node *phi = new (C) PhiNode(r, TypeInt::INT);

-    Node *cmpisnan = _gvn.transform( new (C) CmpFNode(value, value));
+    Node *cmpisnan = _gvn.transform(new (C) CmpFNode(arg, arg));
     // Build the boolean node
-    Node *bolisnan = _gvn.transform( new (C) BoolNode( cmpisnan, BoolTest::ne ) );
+    Node *bolisnan = _gvn.transform(new (C) BoolNode(cmpisnan, BoolTest::ne));

     // Branch either way.
     // NaN case is less traveled, which makes all the difference.
@@ -4376,29 +4055,27 @@
     r->init_req(1, iftrue);

     // Else fall through
-    Node *iffalse = _gvn.transform( new (C) IfFalseNode(opt_ifisnan) );
+    Node *iffalse = _gvn.transform(new (C) IfFalseNode(opt_ifisnan));
     set_control(iffalse);

-    phi->init_req(2, _gvn.transform( new (C) MoveF2INode(value)));
+    phi->init_req(2, _gvn.transform(new (C) MoveF2INode(arg)));
     r->init_req(2, iffalse);

     // Post merge
     set_control(_gvn.transform(r));
     record_for_igvn(r);

-    Node* result = _gvn.transform(phi);
+    C->set_has_split_ifs(true); // Has chance for split-if optimization
+    result = phi;
     assert(result->bottom_type()->isa_int(), "must be");
-    push(result);
-
-    C->set_has_split_ifs(true); // Has chance for split-if optimization
-
     break;
   }

   default:
-    ShouldNotReachHere();
+    fatal_unexpected_iid(id);
+    break;
   }
-
+  set_result(_gvn.transform(result));
   return true;
 }

@@ -4409,23 +4086,19 @@
 #endif //_LP64

 //----------------------inline_unsafe_copyMemory-------------------------
+// public native void sun.misc.Unsafe.copyMemory(Object srcBase, long srcOffset, Object destBase, long destOffset, long bytes);
 bool LibraryCallKit::inline_unsafe_copyMemory() {
   if (callee()->is_static())  return false;  // caller must have the capability!
-  int nargs = 1 + 5 + 3;  // 5 args:  (src: ptr,off, dst: ptr,off, size)
-  assert(signature()->size() == nargs-1, "copy has 5 arguments");
-  null_check_receiver(callee());  // check then ignore argument(0)
+  null_check_receiver();  // null-check receiver
   if (stopped())  return true;

   C->set_has_unsafe_access(true);  // Mark eventual nmethod as "unsafe".

-  Node* src_ptr = argument(1);
-  Node* src_off = ConvL2X(argument(2));
-  assert(argument(3)->is_top(), "2nd half of long");
-  Node* dst_ptr = argument(4);
-  Node* dst_off = ConvL2X(argument(5));
-  assert(argument(6)->is_top(), "2nd half of long");
-  Node* size    = ConvL2X(argument(7));
-  assert(argument(8)->is_top(), "2nd half of long");
+  Node* src_ptr =         argument(1);   // type: oop
+  Node* src_off = ConvL2X(argument(2));  // type: long
+  Node* dst_ptr =         argument(4);   // type: oop
+  Node* dst_off = ConvL2X(argument(5));  // type: long
+  Node* size    = ConvL2X(argument(7));  // type: long

   assert(Unsafe_field_offset_to_byte_offset(11) == 11,
          "fieldOffset must be byte-scaled");
@@ -4545,6 +4218,8 @@
 }

 //------------------------inline_native_clone----------------------------
+// protected native Object java.lang.Object.clone();
+//
 // Here are the simple edge cases:
 //  null receiver => normal trap
 //  virtual and clone was overridden => slow path to out-of-line clone
@@ -4561,20 +4236,16 @@
 // can be sharply typed as an object array, a type array, or an instance.
 //
 bool LibraryCallKit::inline_native_clone(bool is_virtual) {
-  int nargs = 1;
   PhiNode* result_val;

-  //set the original stack and the reexecute bit for the interpreter to reexecute
-  //the bytecode that invokes Object.clone if deoptimization happens
+  // Set the reexecute bit for the interpreter to reexecute
+  // the bytecode that invokes Object.clone if deoptimization happens.
   { PreserveReexecuteState preexecs(this);
     jvms()->set_should_reexecute(true);

-    //null_check_receiver will adjust _sp (push and pop)
-    Node* obj = null_check_receiver(callee());
+    Node* obj = null_check_receiver();
     if (stopped())  return true;

-    _sp += nargs;
-
     Node* obj_klass = load_object_klass(obj);
     const TypeKlassPtr* tklass = _gvn.type(obj_klass)->isa_klassptr();
     const TypeOopPtr*   toop   = ((tklass != NULL)
@@ -4611,7 +4282,7 @@
       set_control(array_ctl);
       Node* obj_length = load_array_length(obj);
       Node* obj_size  = NULL;
-      Node* alloc_obj = new_array(obj_klass, obj_length, 0, &obj_size);
+      Node* alloc_obj = new_array(obj_klass, obj_length, 0, &obj_size);  // no arguments to push

       if (!use_ReduceInitialCardMarks()) {
         // If it is an oop array, it requires very special treatment,
@@ -4711,10 +4382,9 @@
     set_control(    _gvn.transform(result_reg) );
     set_i_o(        _gvn.transform(result_i_o) );
     set_all_memory( _gvn.transform(result_mem) );
-  } //original reexecute and sp are set back here
-
-  push(_gvn.transform(result_val));
-
+  } // original reexecute is set back here
+
+  set_result(_gvn.transform(result_val));
   return true;
 }

@@ -4755,25 +4425,25 @@


 //------------------------------inline_arraycopy-----------------------
+// public static native void java.lang.System.arraycopy(Object src,  int  srcPos,
+//                                                      Object dest, int destPos,
+//                                                      int length);
 bool LibraryCallKit::inline_arraycopy() {
-  // Restore the stack and pop off the arguments.
-  int nargs = 5;  // 2 oops, 3 ints, no size_t or long
-  assert(callee()->signature()->size() == nargs, "copy has 5 arguments");
-
-  Node *src         = argument(0);
-  Node *src_offset  = argument(1);
-  Node *dest        = argument(2);
-  Node *dest_offset = argument(3);
-  Node *length      = argument(4);
+  // Get the arguments.
+  Node* src         = argument(0);  // type: oop
+  Node* src_offset  = argument(1);  // type: int
+  Node* dest        = argument(2);  // type: oop
+  Node* dest_offset = argument(3);  // type: int
+  Node* length      = argument(4);  // type: int

   // Compile time checks.  If any of these checks cannot be verified at compile time,
   // we do not make a fast path for this call.  Instead, we let the call remain as it
   // is.  The checks we choose to mandate at compile time are:
   //
   // (1) src and dest are arrays.
-  const Type* src_type = src->Value(&_gvn);
+  const Type* src_type  = src->Value(&_gvn);
   const Type* dest_type = dest->Value(&_gvn);
-  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_src  = src_type->isa_aryptr();
   const TypeAryPtr* top_dest = dest_type->isa_aryptr();
   if (top_src  == NULL || top_src->klass()  == NULL ||
       top_dest == NULL || top_dest->klass() == NULL) {
@@ -4828,15 +4498,13 @@
   record_for_igvn(slow_region);

   // (3) operands must not be null
-  // We currently perform our null checks with the do_null_check routine.
+  // We currently perform our null checks with the null_check routine.
   // This means that the null exceptions will be reported in the caller
   // rather than (correctly) reported inside of the native arraycopy call.
   // This should be corrected, given time.  We do our null check with the
   // stack pointer restored.
-  _sp += nargs;
-  src  = do_null_check(src,  T_ARRAY);
-  dest = do_null_check(dest, T_ARRAY);
-  _sp -= nargs;
+  src  = null_check(src,  T_ARRAY);
+  dest = null_check(dest, T_ARRAY);

   // (4) src_offset must not be negative.
   generate_negative_guard(src_offset, slow_region);
@@ -5179,7 +4847,7 @@
   slow_control = top();
   if (slow_region != NULL)
     slow_control = _gvn.transform(slow_region);
-  debug_only(slow_region = (RegionNode*)badAddress);
+  DEBUG_ONLY(slow_region = (RegionNode*)badAddress);

   set_control(checked_control);
   if (!stopped()) {
@@ -5674,33 +5342,22 @@
 }

 //----------------------------inline_reference_get----------------------------
-
+// public T java.lang.ref.Reference.get();
 bool LibraryCallKit::inline_reference_get() {
-  const int nargs = 1; // self
-
-  guarantee(java_lang_ref_Reference::referent_offset > 0,
-            "should have already been set");
-
-  int referent_offset = java_lang_ref_Reference::referent_offset;
-
-  // Restore the stack and pop off the argument
-  _sp += nargs;
-  Node *reference_obj = pop();
-
-  // Null check on self without removing any arguments.
-  _sp += nargs;
-  reference_obj = do_null_check(reference_obj, T_OBJECT);
-  _sp -= nargs;;
-
+  const int referent_offset = java_lang_ref_Reference::referent_offset;
+  guarantee(referent_offset > 0, "should have already been set");
+
+  // Get the argument:
+  Node* reference_obj = null_check_receiver();
   if (stopped()) return true;

-  Node *adr = basic_plus_adr(reference_obj, reference_obj, referent_offset);
+  Node* adr = basic_plus_adr(reference_obj, reference_obj, referent_offset);

   ciInstanceKlass* klass = env()->Object_klass();
   const TypeOopPtr* object_type = TypeOopPtr::make_from_klass(klass);

   Node* no_ctrl = NULL;
-  Node *result = make_load(no_ctrl, adr, object_type, T_OBJECT);
+  Node* result = make_load(no_ctrl, adr, object_type, T_OBJECT);

   // Use the pre-barrier to record the value in the referent field
   pre_barrier(false /* do_load */,
@@ -5713,7 +5370,7 @@
   // across safepoint since GC can change its value.
   insert_mem_bar(Op_MemBarCPUOrder);

-  push(result);
+  set_result(result);
   return true;
 }

@@ -5770,15 +5427,11 @@
   }
   if (stubAddr == NULL) return false;

-  // Restore the stack and pop off the arguments.
-  int nargs = 5;  // this + 2 oop/offset combos
-  assert(callee()->signature()->size() == nargs-1, "encryptBlock has 4 arguments");
-
-  Node *aescrypt_object  = argument(0);
-  Node *src         = argument(1);
-  Node *src_offset  = argument(2);
-  Node *dest        = argument(3);
-  Node *dest_offset = argument(4);
+  Node* aescrypt_object = argument(0);
+  Node* src             = argument(1);
+  Node* src_offset      = argument(2);
+  Node* dest            = argument(3);
+  Node* dest_offset     = argument(4);

   // (1) src and dest are arrays.
   const Type* src_type = src->Value(&_gvn);
@@ -5829,16 +5482,12 @@
   }
   if (stubAddr == NULL) return false;

-
-  // Restore the stack and pop off the arguments.
-  int nargs = 6;  // this + oop/offset + len + oop/offset
-  assert(callee()->signature()->size() == nargs-1, "wrong number of arguments");
-  Node *cipherBlockChaining_object  = argument(0);
-  Node *src         = argument(1);
-  Node *src_offset  = argument(2);
-  Node *len         = argument(3);
-  Node *dest        = argument(4);
-  Node *dest_offset = argument(5);
+  Node* cipherBlockChaining_object = argument(0);
+  Node* src                        = argument(1);
+  Node* src_offset                 = argument(2);
+  Node* len                        = argument(3);
+  Node* dest                       = argument(4);
+  Node* dest_offset                = argument(5);

   // (1) src and dest are arrays.
   const Type* src_type = src->Value(&_gvn);
@@ -5920,11 +5569,8 @@
 //
 Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting) {
   // First, check receiver for NULL since it is virtual method.
-  int nargs = arg_size();
   Node* objCBC = argument(0);
-  _sp += nargs;
-  objCBC = do_null_check(objCBC, T_OBJECT);
-  _sp -= nargs;
+  objCBC = null_check(objCBC);

   if (stopped()) return NULL; // Always NULL

@@ -5948,9 +5594,7 @@
   }
   ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();

-  _sp += nargs;          // gen_instanceof might do an uncommon trap
   Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt)));
-  _sp -= nargs;
   Node* cmp_instof  = _gvn.transform(new (C) CmpINode(instof, intcon(1)));
   Node* bool_instof  = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne));

@@ -5966,7 +5610,7 @@
   RegionNode* region = new(C) RegionNode(3);
   region->init_req(1, instof_false);
   Node* src = argument(1);
-  Node *dest = argument(4);
+  Node* dest = argument(4);
   Node* cmp_src_dest = _gvn.transform(new (C) CmpPNode(src, dest));
   Node* bool_src_dest = _gvn.transform(new (C) BoolNode(cmp_src_dest, BoolTest::eq));
   Node* src_dest_conjoint = generate_guard(bool_src_dest, NULL, PROB_MIN);
@@ -5974,7 +5618,4 @@

   record_for_igvn(region);
   return _gvn.transform(region);
-
 }
-
-
--- a/src/share/vm/opto/locknode.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/locknode.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -165,7 +165,7 @@
   kill_dead_locals();

   // Null check; get casted pointer.
-  Node *obj = do_null_check(peek(), T_OBJECT);
+  Node* obj = null_check(peek());
   // Check for locking null object
   if (stopped()) return;
--- a/src/share/vm/opto/loopTransform.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/loopTransform.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -269,10 +269,10 @@
 bool IdealLoopTree::policy_peeling( PhaseIdealLoop *phase ) const {
   Node *test = ((IdealLoopTree*)this)->tail();
   int  body_size = ((IdealLoopTree*)this)->_body.size();
-  int  uniq      = phase->C->unique();
+  int  live_node_count = phase->C->live_nodes();
   // Peeling does loop cloning which can result in O(N^2) node construction
   if( body_size > 255 /* Prevent overflow for large body_size */
-      || (body_size * body_size + uniq > MaxNodeLimit) ) {
+      || (body_size * body_size + live_node_count > MaxNodeLimit) ) {
     return false;           // too large to safely clone
   }
   while( test != _head ) {      // Scan till run off top of loop
@@ -601,7 +601,7 @@
     return false;
   if (new_body_size > unroll_limit ||
       // Unrolling can result in a large amount of node construction
-      new_body_size >= MaxNodeLimit - phase->C->unique()) {
+      new_body_size >= MaxNodeLimit - (uint) phase->C->live_nodes()) {
     return false;
   }

@@ -2268,7 +2268,7 @@

   // Skip next optimizations if running low on nodes. Note that
   // policy_unswitching and policy_maximally_unroll have this check.
-  uint nodes_left = MaxNodeLimit - phase->C->unique();
+  uint nodes_left = MaxNodeLimit - (uint) phase->C->live_nodes();
   if ((2 * _body.size()) > nodes_left) {
     return true;
   }
--- a/src/share/vm/opto/loopUnswitch.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/loopUnswitch.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -59,7 +59,7 @@
   if (!_head->is_Loop()) {
     return false;
   }
-  uint nodes_left = MaxNodeLimit - phase->C->unique();
+  uint nodes_left = MaxNodeLimit - phase->C->live_nodes();
   if (2 * _body.size() > nodes_left) {
     return false; // Too speculative if running low on nodes.
   }
--- a/src/share/vm/opto/loopopts.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/loopopts.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -729,7 +729,7 @@
   for (DUIterator_Fast imax, i = region->fast_outs(imax); i < imax; i++) {
     weight += region->fast_out(i)->outcnt();
   }
-  int nodes_left = MaxNodeLimit - C->unique();
+  int nodes_left = MaxNodeLimit - C->live_nodes();
   if (weight * 8 > nodes_left) {
 #ifndef PRODUCT
     if (PrintOpto)
--- a/src/share/vm/opto/macro.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/macro.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -2262,7 +2262,7 @@
   Node *slow_ctrl = _fallthroughproj->clone();
   transform_later(slow_ctrl);
   _igvn.hash_delete(_fallthroughproj);
-  _fallthroughproj->disconnect_inputs(NULL);
+  _fallthroughproj->disconnect_inputs(NULL, C);
   region->init_req(1, slow_ctrl);
   // region inputs are now complete
   transform_later(region);
@@ -2327,7 +2327,7 @@
   Node *slow_ctrl = _fallthroughproj->clone();
   transform_later(slow_ctrl);
   _igvn.hash_delete(_fallthroughproj);
-  _fallthroughproj->disconnect_inputs(NULL);
+  _fallthroughproj->disconnect_inputs(NULL, C);
   region->init_req(1, slow_ctrl);
   // region inputs are now complete
   transform_later(region);
--- a/src/share/vm/opto/matcher.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/matcher.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -342,6 +342,7 @@
   // Reset node counter so MachNodes start with _idx at 0
   int nodes = C->unique(); // save value
   C->set_unique(0);
+  C->reset_dead_node_list();

   // Recursively match trees from old space into new space.
   // Correct leaves of new-space Nodes; they point to old-space.
--- a/src/share/vm/opto/memnode.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/memnode.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -2716,10 +2716,8 @@
     zend  = phase->transform( new(C) URShiftXNode(zend,  shift) );
   }

+  // Bulk clear double-words
   Node* zsize = phase->transform( new(C) SubXNode(zend, zbase) );
-  Node* zinit = phase->zerocon((unit == BytesPerLong) ? T_LONG : T_INT);
-
-  // Bulk clear double-words
   Node* adr = phase->transform( new(C) AddPNode(dest, dest, start_offset) );
   mem = new (C) ClearArrayNode(ctl, mem, zsize, adr);
   return phase->transform(mem);
--- a/src/share/vm/opto/node.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/node.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -57,7 +57,7 @@
   int new_debug_idx = old_debug_idx+1;
   if (new_debug_idx > 0) {
     // Arrange that the lowest five decimal digits of _debug_idx
-    // will repeat thos of _idx.  In case this is somehow pathological,
+    // will repeat those of _idx. In case this is somehow pathological,
     // we continue to assign negative numbers (!) consecutively.
     const int mod = 100000;
     int bump = (int)(_idx - new_debug_idx) % mod;
@@ -67,7 +67,7 @@
   }
   Compile::set_debug_idx(new_debug_idx);
   set_debug_idx( new_debug_idx );
-  assert(Compile::current()->unique() < (uint)MaxNodeLimit, "Node limit exceeded");
+  assert(Compile::current()->unique() < (UINT_MAX - 1), "Node limit exceeded UINT_MAX");
   if (BreakAtNode != 0 && (_debug_idx == BreakAtNode || (int)_idx == BreakAtNode)) {
     tty->print_cr("BreakAtNode: _idx=%d _debug_idx=%d", _idx, _debug_idx);
     BREAKPOINT;
@@ -802,7 +802,7 @@
 //-------------------------disconnect_inputs-----------------------------------
 // NULL out all inputs to eliminate incoming Def-Use edges.
 // Return the number of edges between 'n' and 'this'
-int Node::disconnect_inputs(Node *n) {
+int Node::disconnect_inputs(Node *n, Compile* C) {
   int edges_to_n = 0;

   uint cnt = req();
@@ -824,6 +824,9 @@

   // Node::destruct requires all out edges be deleted first
   // debug_only(destruct();)   // no reuse benefit expected
+  if (edges_to_n == 0) {
+    C->record_dead_node(_idx);
+  }
   return edges_to_n;
 }
--- a/src/share/vm/opto/node.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/node.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -406,7 +406,7 @@
   int replace_edge(Node* old, Node* neww);
   // NULL out all inputs to eliminate incoming Def-Use edges.
   // Return the number of edges between 'n' and 'this'
-  int  disconnect_inputs(Node *n);
+  int  disconnect_inputs(Node *n, Compile *c);

   // Quickly, return true if and only if I am Compile::current()->top().
   bool is_top() const {
@@ -454,9 +454,9 @@
   void replace_by(Node* new_node);
   // Globally replace this node by a given new node, updating all uses
   // and cutting input edges of old node.
-  void subsume_by(Node* new_node) {
+  void subsume_by(Node* new_node, Compile* c) {
     replace_by(new_node);
-    disconnect_inputs(NULL);
+    disconnect_inputs(NULL, c);
   }
   void set_req_X( uint i, Node *n, PhaseIterGVN *igvn );
   // Find the one non-null required input.  RegionNode only
--- a/src/share/vm/opto/output.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/output.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -513,7 +513,7 @@
           }
           adjust_block_start += diff;
           b->_nodes.map(idx, replacement);
-          mach->subsume_by(replacement);
+          mach->subsume_by(replacement, C);
           mach = replacement;
           progress = true;

@@ -1426,7 +1426,7 @@
               jmp_rule[i]   = mach->rule();
 #endif
               b->_nodes.map(j, replacement);
-              mach->subsume_by(replacement);
+              mach->subsume_by(replacement, C);
               n    = replacement;
               mach = replacement;
             }
--- a/src/share/vm/opto/parse.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/parse.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -70,7 +70,7 @@
   InlineTree *build_inline_tree_for_callee(ciMethod* callee_method,
                                            JVMState* caller_jvms,
                                            int caller_bci);
-  const char* try_to_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result);
+  const char* try_to_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result, bool& should_delay);
   const char* should_inline(ciMethod* callee_method, ciMethod* caller_method, int caller_bci, ciCallProfile& profile, WarmCallInfo* wci_result) const;
   const char* should_not_inline(ciMethod* callee_method, ciMethod* caller_method, WarmCallInfo* wci_result) const;
   void        print_inlining(ciMethod *callee_method, int caller_bci, const char *failure_msg) const;
@@ -107,7 +107,7 @@
   // and may be accessed by find_subtree_from_root.
   // The call_method is the dest_method for a special or static invocation.
   // The call_method is an optimized virtual method candidate otherwise.
-  WarmCallInfo* ok_to_inline(ciMethod *call_method, JVMState* caller_jvms, ciCallProfile& profile, WarmCallInfo* wci);
+  WarmCallInfo* ok_to_inline(ciMethod *call_method, JVMState* caller_jvms, ciCallProfile& profile, WarmCallInfo* wci, bool& should_delay);

   // Information about inlined method
   JVMState*   caller_jvms()       const { return _caller_jvms; }
--- a/src/share/vm/opto/parse1.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/parse1.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -601,8 +601,8 @@
   set_map(entry_map);
   do_exits();

-  if (log)  log->done("parse nodes='%d' memory='%d'",
-                      C->unique(), C->node_arena()->used());
+  if (log)  log->done("parse nodes='%d' live='%d' memory='%d'",
+                      C->unique(), C->live_nodes(), C->node_arena()->used());
 }

 //---------------------------do_all_blocks-------------------------------------
@@ -1008,7 +1008,7 @@
   // If this is an inlined method, we may have to do a receiver null check.
   if (_caller->has_method() && is_normal_parse() && !method()->is_static()) {
     GraphKit kit(_caller);
-    kit.null_check_receiver(method());
+    kit.null_check_receiver_before_call(method());
     _caller = kit.transfer_exceptions_into_jvms();
     if (kit.stopped()) {
       _exits.add_exception_states_from(_caller);
@@ -1398,7 +1398,7 @@
 #ifdef ASSERT
     int pre_bc_sp = sp();
     int inputs, depth;
-    bool have_se = !stopped() && compute_stack_effects(inputs, depth, /*for_parse*/ true);
+    bool have_se = !stopped() && compute_stack_effects(inputs, depth);
     assert(!have_se || pre_bc_sp >= inputs, err_msg_res("have enough stack to execute this BC: pre_bc_sp=%d, inputs=%d", pre_bc_sp, inputs));
 #endif //ASSERT
--- a/src/share/vm/opto/parse2.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/parse2.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -48,7 +48,7 @@
   const Type* elem = Type::TOP;
   Node* adr = array_addressing(elem_type, 0, &elem);
   if (stopped())  return;     // guaranteed null or range check
-  _sp -= 2;                   // Pop array and index
+  dec_sp(2);                  // Pop array and index
   const TypeAryPtr* adr_type = TypeAryPtr::get_array_body_type(elem_type);
   Node* ld = make_load(control(), adr, elem, elem_type, adr_type);
   push(ld);
@@ -60,7 +60,7 @@
   Node* adr = array_addressing(elem_type, 1);
   if (stopped())  return;     // guaranteed null or range check
   Node* val = pop();
-  _sp -= 2;                   // Pop array and index
+  dec_sp(2);                  // Pop array and index
   const TypeAryPtr* adr_type = TypeAryPtr::get_array_body_type(elem_type);
   store_to_memory(control(), adr, val, elem_type, adr_type);
 }
@@ -73,7 +73,7 @@
   Node *ary   = peek(1+vals);   // in case of exception

   // Null check the array base, with correct stack contents
-  ary = do_null_check(ary, T_ARRAY);
+  ary = null_check(ary, T_ARRAY);
   // Compile-time detect of null-exception?
   if (stopped())  return top();

@@ -681,7 +681,7 @@

 void Parse::do_irem() {
   // Must keep both values on the expression-stack during null-check
-  do_null_check(peek(), T_INT);
+  zero_check_int(peek());
   // Compile-time detect of null-exception?
   if (stopped())  return;

@@ -958,7 +958,7 @@
   DEBUG_ONLY(sync_jvms());   // argument(n) requires a synced jvms
   assert(argument(0) != NULL, "must exist");
   assert(bc_depth == 1 || argument(1) != NULL, "two must exist");
-  _sp += bc_depth;
+  inc_sp(bc_depth);
   return bc_depth;
 }

@@ -1581,8 +1581,8 @@
     set_pair_local( iter().get_index(), dstore_rounding(pop_pair()) );
     break;

-  case Bytecodes::_pop:  _sp -= 1;   break;
-  case Bytecodes::_pop2: _sp -= 2;   break;
+  case Bytecodes::_pop:  dec_sp(1);   break;
+  case Bytecodes::_pop2: dec_sp(2);   break;
   case Bytecodes::_swap:
     a = pop();
     b = pop();
@@ -1650,7 +1650,7 @@

   case Bytecodes::_arraylength: {
     // Must do null-check with value on expression stack
-    Node *ary = do_null_check(peek(), T_ARRAY);
+    Node *ary = null_check(peek(), T_ARRAY);
     // Compile-time detect of null-exception?
     if (stopped())  return;
     a = pop();
@@ -1667,15 +1667,15 @@
   case Bytecodes::_laload: {
     a = array_addressing(T_LONG, 0);
     if (stopped())  return;     // guaranteed null or range check
-    _sp -= 2;                   // Pop array and index
-    push_pair( make_load(control(), a, TypeLong::LONG, T_LONG, TypeAryPtr::LONGS));
+    dec_sp(2);                  // Pop array and index
+    push_pair(make_load(control(), a, TypeLong::LONG, T_LONG, TypeAryPtr::LONGS));
     break;
   }
   case Bytecodes::_daload: {
     a = array_addressing(T_DOUBLE, 0);
     if (stopped())  return;     // guaranteed null or range check
-    _sp -= 2;                   // Pop array and index
-    push_pair( make_load(control(), a, Type::DOUBLE, T_DOUBLE, TypeAryPtr::DOUBLES));
+    dec_sp(2);                  // Pop array and index
+    push_pair(make_load(control(), a, Type::DOUBLE, T_DOUBLE, TypeAryPtr::DOUBLES));
     break;
   }
   case Bytecodes::_bastore: array_store(T_BYTE);  break;
@@ -1699,7 +1699,7 @@
     a = array_addressing(T_LONG, 2);
     if (stopped())  return;     // guaranteed null or range check
     c = pop_pair();
-    _sp -= 2;                   // Pop array and index
+    dec_sp(2);                  // Pop array and index
     store_to_memory(control(), a, c, T_LONG, TypeAryPtr::LONGS);
     break;
   }
@@ -1707,7 +1707,7 @@
     a = array_addressing(T_DOUBLE, 2);
     if (stopped())  return;     // guaranteed null or range check
     c = pop_pair();
-    _sp -= 2;                   // Pop array and index
+    dec_sp(2);                  // Pop array and index
     c = dstore_rounding(c);
     store_to_memory(control(), a, c, T_DOUBLE, TypeAryPtr::DOUBLES);
     break;
@@ -1733,7 +1733,7 @@
     break;
   case Bytecodes::_idiv:
     // Must keep both values on the expression-stack during null-check
-    do_null_check(peek(), T_INT);
+    zero_check_int(peek());
     // Compile-time detect of null-exception?
     if (stopped())  return;
     b = pop();
@@ -2041,7 +2041,7 @@
   case Bytecodes::_lrem:
     // Must keep both values on the expression-stack during null-check
     assert(peek(0) == top(), "long word order");
-    do_null_check(peek(1), T_LONG);
+    zero_check_long(peek(1));
     // Compile-time detect of null-exception?
     if (stopped())  return;
     b = pop_pair();
@@ -2053,7 +2053,7 @@
   case Bytecodes::_ldiv:
     // Must keep both values on the expression-stack during null-check
     assert(peek(0) == top(), "long word order");
-    do_null_check(peek(1), T_LONG);
+    zero_check_long(peek(1));
     // Compile-time detect of null-exception?
     if (stopped())  return;
     b = pop_pair();
@@ -2175,7 +2175,7 @@

   case Bytecodes::_athrow:
     // null exception oop throws NULL pointer exception
-    do_null_check(peek(), T_OBJECT);
+    null_check(peek());
     if (stopped())  return;
     // Hook the thrown exception directly to subsequent handlers.
     if (BailoutToInterpreterForThrows) {
--- a/src/share/vm/opto/parse3.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/parse3.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -116,7 +116,7 @@
   Node* obj;
   if (is_field) {
     int obj_depth = is_get ? 0 : field->type()->size();
-    obj = do_null_check(peek(obj_depth), T_OBJECT);
+    obj = null_check(peek(obj_depth));
     // Compile-time detect of null-exception?
     if (stopped())  return;

@@ -126,11 +126,11 @@
 #endif

     if (is_get) {
-      --_sp;  // pop receiver before getting
+      (void) pop();  // pop receiver before getting
       do_get_xxx(obj, field, is_field);
     } else {
       do_put_xxx(obj, field, is_field);
-      --_sp;  // pop receiver after putting
+      (void) pop();  // pop receiver after putting
     }
   } else {
     const TypeInstPtr* tip = TypeInstPtr::make(field_holder->java_mirror());
@@ -230,7 +230,7 @@
     }
     // If there is going to be a trap, put it at the next bytecode:
     set_bci(iter().next_bci());
-    do_null_assert(peek(), T_OBJECT);
+    null_assert(peek());
     set_bci(iter().cur_bci()); // put it back
   }

@@ -463,7 +463,7 @@
     // Note: the reexecute bit will be set in GraphKit::add_safepoint_edges()
     // when AllocateArray node for newarray is created.
     { PreserveReexecuteState preexecs(this);
-      _sp += ndimensions;
+      inc_sp(ndimensions);
       // Pass 0 as nargs since uncommon trap code does not need to restore stack.
       obj = expand_multianewarray(array_klass, &length[0], ndimensions, 0);
     } //original reexecute and sp are set back here
@@ -492,7 +492,7 @@
     // Create a java array for dimension sizes
     Node* dims = NULL;
     { PreserveReexecuteState preexecs(this);
-      _sp += ndimensions;
+      inc_sp(ndimensions);
       Node* dims_array_klass = makecon(TypeKlassPtr::make(ciArrayKlass::make(ciType::make(T_INT))));
       dims = new_array(dims_array_klass, intcon(ndimensions), 0);

@@ -509,6 +509,7 @@
                           makecon(TypeKlassPtr::make(array_klass)),
                           dims);
   }
+  make_slow_call_ex(c, env()->Throwable_klass(), false);

   Node* res = _gvn.transform(new (C) ProjNode(c, TypeFunc::Parms));
--- a/src/share/vm/opto/parseHelper.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/parseHelper.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -84,7 +84,7 @@
                        C->log()->identify(tp->klass()));
       }
     }
-    do_null_assert(obj, T_OBJECT);
+    null_assert(obj);
     assert( stopped() || _gvn.type(peek())->higher_equal(TypePtr::NULL_PTR), "what's left behind is null" );
     if (!stopped()) {
       profile_null_checkcast();
@@ -116,7 +116,7 @@
       C->log()->elem("assert_null reason='instanceof' klass='%d'",
                      C->log()->identify(klass));
     }
-    do_null_assert(peek(), T_OBJECT);
+    null_assert(peek());
     assert( stopped() || _gvn.type(peek())->higher_equal(TypePtr::NULL_PTR), "what's left behind is null" );
     if (!stopped()) {
       // The object is now known to be null.
@@ -139,10 +139,10 @@
 // pull array from stack and check that the store is valid
 void Parse::array_store_check() {

-  // Shorthand access to array store elements
-  Node *obj = stack(_sp-1);
-  Node *idx = stack(_sp-2);
-  Node *ary = stack(_sp-3);
+  // Shorthand access to array store elements without popping them.
+  Node *obj = peek(0);
+  Node *idx = peek(1);
+  Node *ary = peek(2);

   if (_gvn.type(obj) == TypePtr::NULL_PTR) {
     // There's never a type check on null values.
--- a/src/share/vm/opto/phaseX.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/phaseX.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -75,6 +75,13 @@
   // nh->_sentinel must be in the current node space
 }

+void NodeHash::replace_with(NodeHash *nh) {
+  debug_only(_table = (Node**)badAddress);   // interact correctly w/ operator=
+  // just copy in all the fields
+  *this = *nh;
+  // nh->_sentinel must be in the current node space
+}
+
 //------------------------------hash_find--------------------------------------
 // Find in hash table
 Node *NodeHash::hash_find( const Node *n ) {
@@ -383,6 +390,8 @@

   // Identify nodes that are reachable from below, useful.
   C->identify_useful_nodes(_useful);
+  // Update dead node list
+  C->update_dead_node_list(_useful);

   // Remove all useless nodes from PhaseValues' recorded types
   // Must be done before disconnecting nodes to preserve hash-table-invariant
@@ -1190,7 +1199,7 @@
             }
           }
         }
-
+        C->record_dead_node(dead->_idx);
         if (dead->is_macro()) {
           C->remove_macro_node(dead);
         }
@@ -1199,6 +1208,11 @@
           continue;
         }
       }
+      // Constant node that has no out-edges and has only one in-edge from
+      // root is usually dead. However, sometimes reshaping walk makes
+      // it reachable by adding use edges. So, we will NOT count Con nodes
+      // as dead to be conservative about the dead node count at any
+      // given time.
     }

     // Aggressively kill globally dead uses
--- a/src/share/vm/opto/phaseX.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/phaseX.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -92,6 +92,7 @@
   }

   void   remove_useless_nodes(VectorSet &useful); // replace with sentinel
+  void replace_with(NodeHash* nh);

   Node  *sentinel() { return _sentinel; }

@@ -386,6 +387,11 @@
   Node  *transform( Node *n );
   Node  *transform_no_reclaim( Node *n );

+  void replace_with(PhaseGVN* gvn) {
+    _table.replace_with(&gvn->_table);
+    _types = gvn->_types;
+  }
+
   // Check for a simple dead loop when a data node references itself.
   DEBUG_ONLY(void dead_loop_check(Node *n);)
 };
--- a/src/share/vm/opto/postaloc.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/postaloc.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -146,7 +146,7 @@
       }
     }
     // Disconnect control and remove precedence edges if any exist
-    old->disconnect_inputs(NULL);
+    old->disconnect_inputs(NULL, C);
   }
   return blk_adjust;
 }
@@ -513,7 +513,7 @@
         b->_nodes.remove(j--); phi_dex--;
         _cfg._bbs.map(phi->_idx,NULL);
         phi->replace_by(u);
-        phi->disconnect_inputs(NULL);
+        phi->disconnect_inputs(NULL, C);
         continue;
       }
       // Note that if value[pidx] exists, then we merged no new values here
--- a/src/share/vm/opto/reg_split.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/reg_split.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -747,7 +747,7 @@
             if( i >= cnt ) {    // Found one unique input
               assert(Find_id(n) == Find_id(u), "should be the same lrg");
               n->replace_by(u); // Then replace with unique input
-              n->disconnect_inputs(NULL);
+              n->disconnect_inputs(NULL, C);
               b->_nodes.remove(insidx);
               insidx--;
               b->_ihrp_index--;
--- a/src/share/vm/opto/runtime.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/runtime.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -989,7 +989,7 @@
       // since we're notifying the VM on every catch.
       // Force deoptimization and the rest of the lookup
       // will be fine.
-      deoptimize_caller_frame(thread, true);
+      deoptimize_caller_frame(thread);
     }

     // Check the stack guard pages.  If enabled, look for handler in this frame;
@@ -1143,17 +1143,22 @@


 void OptoRuntime::deoptimize_caller_frame(JavaThread *thread, bool doit) {
-  // Deoptimize frame
-  if (doit) {
-    // Called from within the owner thread, so no need for safepoint
-    RegisterMap reg_map(thread);
-    frame stub_frame = thread->last_frame();
-    assert(stub_frame.is_runtime_frame() || exception_blob()->contains(stub_frame.pc()), "sanity check");
-    frame caller_frame = stub_frame.sender(&reg_map);
+  // Deoptimize the caller before continuing, as the compiled
+  // exception handler table may not be valid.
+  if (!StressCompiledExceptionHandlers && doit) {
+    deoptimize_caller_frame(thread);
+  }
+}

-    // Deoptimize the caller frame.
-    Deoptimization::deoptimize_frame(thread, caller_frame.id());
-  }
+void OptoRuntime::deoptimize_caller_frame(JavaThread *thread) {
+  // Called from within the owner thread, so no need for safepoint
+  RegisterMap reg_map(thread);
+  frame stub_frame = thread->last_frame();
+  assert(stub_frame.is_runtime_frame() || exception_blob()->contains(stub_frame.pc()), "sanity check");
+  frame caller_frame = stub_frame.sender(&reg_map);
+
+  // Deoptimize the caller frame.
+  Deoptimization::deoptimize_frame(thread, caller_frame.id());
 }
--- a/src/share/vm/opto/runtime.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/runtime.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -174,6 +174,7 @@
   static address handle_exception_C       (JavaThread* thread);
   static address handle_exception_C_helper(JavaThread* thread, nmethod*& nm);
   static address rethrow_C                (oopDesc* exception, JavaThread *thread, address return_pc );
+  static void deoptimize_caller_frame     (JavaThread *thread);
   static void deoptimize_caller_frame     (JavaThread *thread, bool doit);
   static bool is_deoptimized_caller_frame (JavaThread *thread);
--- a/src/share/vm/opto/stringopts.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/stringopts.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -241,13 +241,13 @@

       _stringopts->gvn()->transform(call);
       C->gvn_replace_by(uct, call);
-      uct->disconnect_inputs(NULL);
+      uct->disconnect_inputs(NULL, C);
     }
   }

   void cleanup() {
     // disconnect the hook node
-    _arguments->disconnect_inputs(NULL);
+    _arguments->disconnect_inputs(NULL, _stringopts->C);
   }
 };

@@ -265,7 +265,8 @@
     } else if (n->is_IfTrue()) {
       Compile* C = _stringopts->C;
       C->gvn_replace_by(n, n->in(0)->in(0));
-      C->gvn_replace_by(n->in(0), C->top());
+      // get rid of the other projection
+      C->gvn_replace_by(n->in(0)->as_If()->proj_out(false), C->top());
     }
   }
 }
@@ -358,7 +359,7 @@
     C->gvn_replace_by(mem_proj, mem);
   }
   C->gvn_replace_by(init, C->top());
-  init->disconnect_inputs(NULL);
+  init->disconnect_inputs(NULL, C);
 }

 Node_List PhaseStringOpts::collect_toString_calls() {
@@ -439,7 +440,7 @@
       }
       // Find the constructor call
       Node* result = alloc->result_cast();
-      if (result == NULL || !result->is_CheckCastPP()) {
+      if (result == NULL || !result->is_CheckCastPP() || alloc->in(TypeFunc::Memory)->is_top()) {
         // strange looking allocation
 #ifndef PRODUCT
         if (PrintOptimizeStringConcat) {
@@ -744,7 +745,9 @@
       ctrl_path.push(cn);
       ctrl_path.push(cn->proj_out(0));
       ctrl_path.push(cn->proj_out(0)->unique_out());
-      ctrl_path.push(cn->proj_out(0)->unique_out()->as_Catch()->proj_out(0));
+      if (cn->proj_out(0)->unique_out()->as_Catch()->proj_out(0) != NULL) {
+        ctrl_path.push(cn->proj_out(0)->unique_out()->as_Catch()->proj_out(0));
+      }
     } else {
       ShouldNotReachHere();
     }
@@ -762,6 +765,12 @@
     } else if (ptr->is_IfTrue()) {
       IfNode* iff = ptr->in(0)->as_If();
       BoolNode* b = iff->in(1)->isa_Bool();
+
+      if (b == NULL) {
+        fail = true;
+        break;
+      }
+
       Node* cmp = b->in(1);
       Node* v1 = cmp->in(1);
       Node* v2 = cmp->in(2);
@@ -826,6 +835,9 @@
           ptr->in(1)->in(0) != NULL && ptr->in(1)->in(0)->is_If()) {
         // Simple diamond.
         // XXX should check for possibly merging stores.  simple data merges are ok.
+        // The IGVN will make this simple diamond go away when it
+        // transforms the Region. Make sure it sees it.
+        Compile::current()->record_for_igvn(ptr);
         ptr = ptr->in(1)->in(0)->in(0);
         continue;
       }
@@ -1408,75 +1420,80 @@
                       Deoptimization::Action_make_not_entrant);
   }

-  // length now contains the number of characters needed for the
-  // char[] so create a new AllocateArray for the char[]
-  Node* char_array = NULL;
-  {
-    PreserveReexecuteState preexecs(&kit);
-    // The original jvms is for an allocation of either a String or
-    // StringBuffer so no stack adjustment is necessary for proper
-    // reexecution.  If we deoptimize in the slow path the bytecode
-    // will be reexecuted and the char[] allocation will be thrown away.
-    kit.jvms()->set_should_reexecute(true);
-    char_array = kit.new_array(__ makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_CHAR))),
-                               length, 1);
-  }
+  Node* result;
+  if (!kit.stopped()) {
+
+    // length now contains the number of characters needed for the
+    // char[] so create a new AllocateArray for the char[]
+    Node* char_array = NULL;
+    {
+      PreserveReexecuteState preexecs(&kit);
+      // The original jvms is for an allocation of either a String or
+      // StringBuffer so no stack adjustment is necessary for proper
+      // reexecution.  If we deoptimize in the slow path the bytecode
+      // will be reexecuted and the char[] allocation will be thrown away.
+      kit.jvms()->set_should_reexecute(true);
+      char_array = kit.new_array(__ makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_CHAR))),
+                                 length, 1);
+    }
+
+    // Mark the allocation so that zeroing is skipped since the code
+    // below will overwrite the entire array
+    AllocateArrayNode* char_alloc = AllocateArrayNode::Ideal_array_allocation(char_array, _gvn);
+    char_alloc->maybe_set_complete(_gvn);

-  // Mark the allocation so that zeroing is skipped since the code
-  // below will overwrite the entire array
-  AllocateArrayNode* char_alloc = AllocateArrayNode::Ideal_array_allocation(char_array, _gvn);
-  char_alloc->maybe_set_complete(_gvn);
-
-  // Now copy the string representations into the final char[]
-  Node* start = __ intcon(0);
-  for (int argi = 0; argi < sc->num_arguments(); argi++) {
-    Node* arg = sc->argument(argi);
-    switch (sc->mode(argi)) {
-      case StringConcat::IntMode: {
-        Node* end = __ AddI(start, string_sizes->in(argi));
-        // getChars words backwards so pass the ending point as well as the start
-        int_getChars(kit, arg, char_array, start, end);
-        start = end;
-        break;
+    // Now copy the string representations into the final char[]
+    Node* start = __ intcon(0);
+    for (int argi = 0; argi < sc->num_arguments(); argi++) {
+      Node* arg = sc->argument(argi);
+      switch (sc->mode(argi)) {
+        case StringConcat::IntMode: {
+          Node* end = __ AddI(start, string_sizes->in(argi));
+          // getChars words backwards so pass the ending point as well as the start
+          int_getChars(kit, arg, char_array, start, end);
+          start = end;
+          break;
+        }
+        case StringConcat::StringNullCheckMode:
+        case StringConcat::StringMode: {
+          start = copy_string(kit, arg, char_array, start);
+          break;
+        }
+        case StringConcat::CharMode: {
+          __ store_to_memory(kit.control(), kit.array_element_address(char_array, start, T_CHAR),
+                             arg, T_CHAR, char_adr_idx);
+          start = __ AddI(start, __ intcon(1));
+          break;
+        }
+        default:
+          ShouldNotReachHere();
       }
-      case StringConcat::StringNullCheckMode:
-      case StringConcat::StringMode: {
-        start = copy_string(kit, arg, char_array, start);
-        break;
-      }
-      case StringConcat::CharMode: {
-        __ store_to_memory(kit.control(), kit.array_element_address(char_array, start, T_CHAR),
-                           arg, T_CHAR, char_adr_idx);
-        start = __ AddI(start, __ intcon(1));
-        break;
-      }
-      default:
-        ShouldNotReachHere();
     }
-  }

-  // If we're not reusing an existing String allocation then allocate one here.
-  Node* result = sc->string_alloc();
-  if (result == NULL) {
-    PreserveReexecuteState preexecs(&kit);
-    // The original jvms is for an allocation of either a String or
-    // StringBuffer so no stack adjustment is necessary for proper
-    // reexecution.
-    kit.jvms()->set_should_reexecute(true);
-    result = kit.new_instance(__ makecon(TypeKlassPtr::make(C->env()->String_klass())));
+    // If we're not reusing an existing String allocation then allocate one here.
+    result = sc->string_alloc();
+    if (result == NULL) {
+      PreserveReexecuteState preexecs(&kit);
+      // The original jvms is for an allocation of either a String or
+      // StringBuffer so no stack adjustment is necessary for proper
+      // reexecution.
+      kit.jvms()->set_should_reexecute(true);
+      result = kit.new_instance(__ makecon(TypeKlassPtr::make(C->env()->String_klass())));
+    }
+
+    // Intialize the string
+    if (java_lang_String::has_offset_field()) {
+      kit.store_String_offset(kit.control(), result, __ intcon(0));
+      kit.store_String_length(kit.control(), result, length);
+    }
+    kit.store_String_value(kit.control(), result, char_array);
+  } else {
+    result = C->top();
   }
-
-  // Intialize the string
-  if (java_lang_String::has_offset_field()) {
-    kit.store_String_offset(kit.control(), result, __ intcon(0));
-    kit.store_String_length(kit.control(), result, length);
-  }
-  kit.store_String_value(kit.control(), result, char_array);
-
   // hook up the outgoing control and result
   kit.replace_call(sc->end(), result);

   // Unhook any hook nodes
-  string_sizes->disconnect_inputs(NULL);
+  string_sizes->disconnect_inputs(NULL, C);
   sc->cleanup();
 }
--- a/src/share/vm/opto/type.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/opto/type.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -225,8 +225,10 @@
   const TypeInt    *isa_int() const;             // Returns NULL if not an Int
   const TypeLong   *is_long() const;
   const TypeLong   *isa_long() const;            // Returns NULL if not a Long
+  const TypeD      *isa_double() const;          // Returns NULL if not a Double{Top,Con,Bot}
   const TypeD      *is_double_constant() const;  // Asserts it is a DoubleCon
   const TypeD      *isa_double_constant() const; // Returns NULL if not a DoubleCon
+  const TypeF      *isa_float() const;           // Returns NULL if not a Float{Top,Con,Bot}
   const TypeF      *is_float_constant() const;   // Asserts it is a FloatCon
   const TypeF      *isa_float_constant() const;  // Returns NULL if not a FloatCon
   const TypeTuple  *is_tuple() const;            // Collection of fields, NOT a pointer
@@ -1141,24 +1143,6 @@
   return ((TypeD*)this)->_d;
 }

-inline const TypeF *Type::is_float_constant() const {
-  assert( _base == FloatCon, "Not a Float" );
-  return (TypeF*)this;
-}
-
-inline const TypeF *Type::isa_float_constant() const {
-  return ( _base == FloatCon ? (TypeF*)this : NULL);
-}
-
-inline const TypeD *Type::is_double_constant() const {
-  assert( _base == DoubleCon, "Not a Double" );
-  return (TypeD*)this;
-}
-
-inline const TypeD *Type::isa_double_constant() const {
-  return ( _base == DoubleCon ? (TypeD*)this : NULL);
-}
-
 inline const TypeInt *Type::is_int() const {
   assert( _base == Int, "Not an Int" );
   return (TypeInt*)this;
@@ -1177,6 +1161,36 @@
   return ( _base == Long ? (TypeLong*)this : NULL);
 }

+inline const TypeF *Type::isa_float() const {
+  return ((_base == FloatTop ||
+           _base == FloatCon ||
+           _base == FloatBot) ? (TypeF*)this : NULL);
+}
+
+inline const TypeF *Type::is_float_constant() const {
+  assert( _base == FloatCon, "Not a Float" );
+  return (TypeF*)this;
+}
+
+inline const TypeF *Type::isa_float_constant() const {
+  return ( _base == FloatCon ? (TypeF*)this : NULL);
+}
+
+inline const TypeD *Type::isa_double() const {
+  return ((_base == DoubleTop ||
+           _base == DoubleCon ||
+           _base == DoubleBot) ? (TypeD*)this : NULL);
+}
+
+inline const TypeD *Type::is_double_constant() const {
+  assert( _base == DoubleCon, "Not a Double" );
+  return (TypeD*)this;
+}
+
+inline const TypeD *Type::isa_double_constant() const {
+  return ( _base == DoubleCon ? (TypeD*)this : NULL);
+}
+
 inline const TypeTuple *Type::is_tuple() const {
   assert( _base == Tuple, "Not a Tuple" );
   return (TypeTuple*)this;
--- a/src/share/vm/prims/jvmtiExport.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/prims/jvmtiExport.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1305,15 +1305,17 @@
         vframeStream st(thread);
         assert(!st.at_end(), "cannot be at end");
         methodOop current_method = NULL;
+        methodHandle current_mh = methodHandle(thread, current_method);
         int current_bci = -1;
         do {
           current_method = st.method();
+          current_mh = methodHandle(thread, current_method);
           current_bci = st.bci();
           do {
             should_repeat = false;
             KlassHandle eh_klass(thread, exception_handle()->klass());
-            current_bci = current_method->fast_exception_handler_bci_for(
-              eh_klass, current_bci, THREAD);
+            current_bci = methodOopDesc::fast_exception_handler_bci_for(
+              current_mh, eh_klass, current_bci, THREAD);
             if (HAS_PENDING_EXCEPTION) {
               exception_handle = KlassHandle(thread, PENDING_EXCEPTION);
               CLEAR_PENDING_EXCEPTION;
@@ -1328,8 +1330,7 @@
           catch_jmethodID = 0;
           current_bci = 0;
         } else {
-          catch_jmethodID = jem.to_jmethodID(
-                                     methodHandle(thread, current_method));
+          catch_jmethodID = jem.to_jmethodID(current_mh);
         }

         JvmtiJavaThreadEventTransition jet(thread);
--- a/src/share/vm/prims/methodHandles.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/prims/methodHandles.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1170,8 +1170,8 @@
     // Walk all nmethods depending on this call site.
     MutexLocker mu(Compile_lock, thread);
     Universe::flush_dependents_on(call_site, target);
+    java_lang_invoke_CallSite::set_target(call_site(), target());
   }
-  java_lang_invoke_CallSite::set_target(call_site(), target());
 }
 JVM_END

@@ -1182,8 +1182,8 @@
     // Walk all nmethods depending on this call site.
     MutexLocker mu(Compile_lock, thread);
     Universe::flush_dependents_on(call_site, target);
+    java_lang_invoke_CallSite::set_target_volatile(call_site(), target());
   }
-  java_lang_invoke_CallSite::set_target_volatile(call_site(), target());
 }
 JVM_END
--- a/src/share/vm/runtime/arguments.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/runtime/arguments.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -3228,6 +3228,18 @@
   if (!EliminateLocks) {
     EliminateNestedLocks = false;
   }
+  if (!Inline) {
+    IncrementalInline = false;
+  }
+#ifndef PRODUCT
+  if (!IncrementalInline) {
+    AlwaysIncrementalInline = false;
+  }
+#endif
+  if (IncrementalInline && FLAG_IS_DEFAULT(MaxNodeLimit)) {
+    // incremental inlining: bump MaxNodeLimit
+    FLAG_SET_DEFAULT(MaxNodeLimit, (intx)75000);
+  }
 #endif

   if (PrintAssembly && FLAG_IS_DEFAULT(DebugNonSafepoints)) {
--- a/src/share/vm/runtime/deoptimization.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/runtime/deoptimization.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -1242,8 +1242,8 @@
   nmethodLocker nl(fr.pc());

   // Log a message
-  Events::log_deopt_message(thread, "Uncommon trap %d fr.pc " INTPTR_FORMAT,
-                            trap_request, fr.pc());
+  Events::log(thread, "Uncommon trap: trap_request=" PTR32_FORMAT " fr.pc=" INTPTR_FORMAT,
+              trap_request, fr.pc());

   {
     ResourceMark rm;
@@ -1274,6 +1274,11 @@
     methodDataHandle trap_mdo
       (THREAD, get_method_data(thread, trap_method, create_if_missing));

+    // Log a message
+    Events::log_deopt_message(thread, "Uncommon trap: reason=%s action=%s pc=" INTPTR_FORMAT " method=%s @ %d",
+                              trap_reason_name(reason), trap_action_name(action), fr.pc(),
+                              trap_method->name_and_sig_as_C_string(), trap_bci);
+
     // Print a bunch of diagnostics, if requested.
     if (TraceDeoptimization || LogCompilation) {
       ResourceMark rm;
--- a/src/share/vm/runtime/globals.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/runtime/globals.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -922,6 +922,9 @@
   develop(bool, PrintExceptionHandlers, false,                              \
           "Print exception handler tables for all nmethods when generated") \
                                                                             \
+  develop(bool, StressCompiledExceptionHandlers, false,                     \
+         "Exercise compiled exception handlers")                            \
+                                                                            \
   develop(bool, InterceptOSException, false,                                \
           "Starts debugger when an implicit OS (e.g., NULL) "               \
           "exception happens")                                              \
--- a/src/share/vm/runtime/sharedRuntime.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/runtime/sharedRuntime.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -642,7 +642,7 @@
       bool skip_scope_increment = false;
       // exception handler lookup
       KlassHandle ek (THREAD, exception->klass());
-      handler_bci = sd->method()->fast_exception_handler_bci_for(ek, bci, THREAD);
+      handler_bci = methodOopDesc::fast_exception_handler_bci_for(sd->method(), ek, bci, THREAD);
       if (HAS_PENDING_EXCEPTION) {
         recursive_exception = true;
         // We threw an exception while trying to find the exception handler.
--- a/src/share/vm/runtime/thread.cpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/runtime/thread.cpp	Fri Jan 11 10:38:38 2013 -0800
@@ -2171,7 +2171,7 @@
           // BiasedLocking needs an updated RegisterMap for the revoke monitors pass
           RegisterMap reg_map(this, UseBiasedLocking);
           frame compiled_frame = f.sender(&reg_map);
-          if (compiled_frame.can_be_deoptimized()) {
+          if (!StressCompiledExceptionHandlers && compiled_frame.can_be_deoptimized()) {
             Deoptimization::deoptimize(this, compiled_frame, &reg_map);
           }
         }
--- a/src/share/vm/utilities/events.hpp	Wed Jan 09 20:33:26 2013 -0800
+++ b/src/share/vm/utilities/events.hpp	Fri Jan 11 10:38:38 2013 -0800
@@ -135,11 +135,11 @@
 };

 // A simple wrapper class for fixed size text messages.
-class StringLogMessage : public FormatBuffer<132> {
+class StringLogMessage : public FormatBuffer<256> {
  public:
   // Wrap this buffer in a stringStream.
   stringStream stream() {
-    return stringStream(_buf, sizeof(_buf));
+    return stringStream(_buf, size());
   }
 };
--- a/test/compiler/6865265/StackOverflowBug.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/test/compiler/6865265/StackOverflowBug.java	Fri Jan 11 10:38:38 2013 -0800
@@ -28,7 +28,7 @@
  * @summary JVM crashes with "missing exception handler" error
  * @author volker.simonis@sap.com
  *
- * @run main/othervm -XX:CompileThreshold=100 -Xbatch -Xss224k StackOverflowBug
+ * @run main/othervm -XX:CompileThreshold=100 -Xbatch -Xss248k StackOverflowBug
  */
--- a/test/compiler/7184394/TestAESBase.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/test/compiler/7184394/TestAESBase.java	Fri Jan 11 10:38:38 2013 -0800
@@ -54,7 +54,6 @@
   String paddingStr = "PKCS5Padding";
   AlgorithmParameters algParams;
   SecretKey key;
-  int ivLen;

   static int numThreads = 0;
   int  threadId;
@@ -68,7 +67,7 @@

   public void prepare() {
     try {
-    System.out.println("\nmsgSize=" + msgSize + ", key size=" + keySize + ", reInit=" + !noReinit + ", checkOutput=" + checkOutput);
+    System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput);

       int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
       byte keyBytes[] = new byte[keyLenBytes];
@@ -90,10 +89,14 @@
       cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
       dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");

-      ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
-      IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
-
-      cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      if (mode.equals("CBC")) {
+        int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
+        IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
+        cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+      } else {
+        algParams = cipher.getParameters();
+        cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+      }
       algParams = cipher.getParameters();
       dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
       if (threadId == 0) {
--- a/test/compiler/7184394/TestAESMain.java	Wed Jan 09 20:33:26 2013 -0800
+++ b/test/compiler/7184394/TestAESMain.java	Fri Jan 11 10:38:38 2013 -0800
@@ -27,7 +27,8 @@
  * @bug 7184394
  * @summary add intrinsics to use AES instructions
  *
- * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB TestAESMain
  *
  * @author Tom Deneau
  */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/8004741/Test8004741.java	Fri Jan 11 10:38:38 2013 -0800
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test Test8004741.java
+ * @bug 8004741
+ * @summary Missing compiled exception handle table entry for multidimensional array allocation
+ * @run main/othervm -Xmx64m -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:+StressCompiledExceptionHandlers Test8004741
+ *
+ */
+
+import java.util.*;
+
+public class Test8004741 extends Thread {
+
+  static int[][] test(int a, int b) throws Exception {
+    int[][] ar = null;
+    try {
+      ar = new int[a][b];
+    } catch (Error e) {
+      System.out.println("test got Error");
+      passed = true;
+      throw(e);
+    } catch (Exception e) {
+      System.out.println("test got Exception");
+      throw(e);
+    }
+    return ar;
+  }
+
+  static boolean passed = false;
+
+  public void run() {
+      System.out.println("test started");
+      try {
+        while(true) {
+          test(2,20000);
+        }
+      } catch (ThreadDeath e) {
+        System.out.println("test got ThreadDeath");
+        passed = true;
+      } catch (Error e) {
+        e.printStackTrace();
+        System.out.println("test got Error");
+      } catch (Exception e) {
+        e.printStackTrace();
+        System.out.println("test got Exception");
+      }
+  }
+
+  public static void main(String[] args) throws Exception {
+    for (int n = 0; n < 11000; n++) {
+      test(2, 20);
+    }
+
+    // First test exception catch
+    Test8004741 t = new Test8004741();
+
+    passed = false;
+    t.start();
+    Thread.sleep(1000);
+    t.stop();
+
+    Thread.sleep(5000);
+    t.join();
+    if (passed) {
+      System.out.println("PASSED");
+    } else {
+      System.out.println("FAILED");
+      System.exit(97);
+    }
+  }
+
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/8005033/Test8005033.java	Fri Jan 11 10:38:38 2013 -0800
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2012 SAP AG.  All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8005033
+ * @summary On sparcv9, C2's intrinsic for Integer.bitCount(OV) returns wrong result if OV is the result of an operation with int overflow.
+ * @run main/othervm -Xcomp -XX:CompileOnly=Test8005033::testBitCount Test8005033
+ * @author Richard Reingruber richard DOT reingruber AT sap DOT com
+ */
+
+public class Test8005033 {
+    public static int MINUS_ONE = -1;
+
+    public static void main(String[] args) {
+        System.out.println("EXECUTING test.");
+        Integer.bitCount(1);   // load class
+        int expectedBitCount = 0;
+        int calculatedBitCount = testBitCount();
+        if (expectedBitCount != calculatedBitCount) {
+            throw new InternalError("got " + calculatedBitCount + " but expected " + expectedBitCount);
+        }
+        System.out.println("SUCCESSFULLY passed test.");
+    }
+
+    // testBitCount will be compiled using the Integer.bitCount() intrinsic if possible
+    private static int testBitCount() {
+        return Integer.bitCount(MINUS_ONE+1);   // -1 + 1 => int overflow
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/8005419/Test8005419.java	Fri Jan 11 10:38:38 2013 -0800
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8005419
+ * @summary Improve intrinsics code performance on x86 by using AVX2
+ * @run main/othervm -Xbatch -Xmx64m Test8005419
+ *
+ */
+
+public class Test8005419 {
+    public static int SIZE = 64;
+
+    public static void main(String[] args) {
+        char[] a = new char[SIZE];
+        char[] b = new char[SIZE];
+
+        for (int i = 16; i < SIZE; i++) {
+          a[i] = (char)i;
+          b[i] = (char)i;
+        }
+        String s1 = new String(a);
+        String s2 = new String(b);
+
+        // Warm up
+        boolean failed = false;
+        int result = 0;
+        for (int i = 0; i < 10000; i++) {
+          result += test(s1, s2);
+        }
+        for (int i = 0; i < 10000; i++) {
+          result += test(s1, s2);
+        }
+        for (int i = 0; i < 10000; i++) {
+          result += test(s1, s2);
+        }
+        if (result != 0) failed = true;
+
+        System.out.println("Start testing");
+        // Compare same string
+        result = test(s1, s1);
+        if (result != 0) {
+          failed = true;
+          System.out.println("Failed same: result = " + result + ", expected 0");
+        }
+        // Compare equal strings
+        for (int i = 1; i <= SIZE; i++) {
+          s1 = new String(a, 0, i);
+          s2 = new String(b, 0, i);
+          result = test(s1, s2);
+          if (result != 0) {
+            failed = true;
+            System.out.println("Failed equals s1[" + i + "], s2[" + i + "]: result = " + result + ", expected 0");
+          }
+        }
+        // Compare equal strings but different sizes
+        for (int i = 1; i <= SIZE; i++) {
+          s1 = new String(a, 0, i);
+          for (int j = 1; j <= SIZE; j++) {
+            s2 = new String(b, 0, j);
+            result = test(s1, s2);
+            if (result != (i-j)) {
+              failed = true;
+              System.out.println("Failed diff size s1[" + i + "], s2[" + j + "]: result = " + result + ", expected " + (i-j));
+            }
+          }
+        }
+        // Compare strings with one char different and different sizes
+        for (int i = 1; i <= SIZE; i++) {
+          s1 = new String(a, 0, i);
+          for (int j = 0; j < i; j++) {
+            b[j] -= 3; // change char
+            s2 = new String(b, 0, i);
+            result = test(s1, s2);
+            int chdiff = a[j] - b[j];
+            if (result != chdiff) {
+              failed = true;
+              System.out.println("Failed diff char s1[" + i + "], s2[" + i + "]: result = " + result + ", expected " + chdiff);
+            }
+            result = test(s2, s1);
+            chdiff = b[j] - a[j];
+            if (result != chdiff) {
+              failed = true;
+              System.out.println("Failed diff char s2[" + i + "], s1[" + i + "]: result = " + result + ", expected " + chdiff);
+            }
+            b[j] += 3; // restore
+          }
+        }
+        if (failed) {
+          System.out.println("FAILED");
+          System.exit(97);
+        }
+        System.out.println("PASSED");
+    }
+
+    private static int test(String str1, String str2) {
+        return str1.compareTo(str2);
+    }
+}