changeset 2885:d8cb48376797

7097546: Optimize use of CMOVE instructions Summary: Avoid CMove in a loop if possible. May generate CMove if it could be moved outside a loop. Reviewed-by: never
author kvn
date Wed, 26 Oct 2011 06:08:56 -0700
parents e69a66a1457b
children cec1757a0134
files src/cpu/sparc/vm/sparc.ad src/cpu/x86/vm/x86_32.ad src/cpu/x86/vm/x86_64.ad src/share/vm/compiler/compileBroker.cpp src/share/vm/opto/loopopts.cpp src/share/vm/opto/matcher.hpp
diffstat 6 files changed, 144 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/sparc/vm/sparc.ad	Tue Oct 25 12:51:13 2011 -0700
+++ b/src/cpu/sparc/vm/sparc.ad	Wed Oct 26 06:08:56 2011 -0700
@@ -1860,6 +1860,14 @@
 // Threshold size for cleararray.
 const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
+// No additional cost for CMOVL.
+const int Matcher::long_cmove_cost() { return 0; }
+
+// CMOVF/CMOVD are expensive on T4 and on SPARC64.
+const int Matcher::float_cmove_cost() {
+  return (VM_Version::is_T4() || VM_Version::is_sparc64()) ? ConditionalMoveLimit : 0;
+}
+
 // Should the Matcher clone shifts on addressing modes, expecting them to
 // be subsumed into complex addressing expressions or compute them into
 // registers?  True for Intel but false for most RISCs
--- a/src/cpu/x86/vm/x86_32.ad	Tue Oct 25 12:51:13 2011 -0700
+++ b/src/cpu/x86/vm/x86_32.ad	Wed Oct 26 06:08:56 2011 -0700
@@ -1393,6 +1393,12 @@
 // Threshold size for cleararray.
 const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
+// Needs 2 CMOV's for longs.
+const int Matcher::long_cmove_cost() { return 1; }
+
+// No CMOVF/CMOVD with SSE/SSE2
+const int Matcher::float_cmove_cost() { return (UseSSE>=1) ? ConditionalMoveLimit : 0; }
+
 // Should the Matcher clone shifts on addressing modes, expecting them to
 // be subsumed into complex addressing expressions or compute them into
 // registers?  True for Intel but false for most RISCs
@@ -7905,6 +7911,40 @@
 
 //----------Conditional Move---------------------------------------------------
 // Conditional move
+instruct jmovI_reg(cmpOp cop, eFlagsReg cr, eRegI dst, eRegI src) %{
+  predicate(!VM_Version::supports_cmov() );
+  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  format %{ "J$cop,us skip\t# signed cmove\n\t"
+            "MOV    $dst,$src\n"
+      "skip:" %}
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movl($dst$$Register, $src$$Register);
+    __ bind(Lskip);
+  %}
+  ins_pipe( pipe_cmov_reg );
+%}
+
+instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src) %{
+  predicate(!VM_Version::supports_cmov() );
+  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  format %{ "J$cop,us skip\t# unsigned cmove\n\t"
+            "MOV    $dst,$src\n"
+      "skip:" %}
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movl($dst$$Register, $src$$Register);
+    __ bind(Lskip);
+  %}
+  ins_pipe( pipe_cmov_reg );
+%}
+
 instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
--- a/src/cpu/x86/vm/x86_64.ad	Tue Oct 25 12:51:13 2011 -0700
+++ b/src/cpu/x86/vm/x86_64.ad	Wed Oct 26 06:08:56 2011 -0700
@@ -1993,6 +1993,12 @@
 // Threshold size for cleararray.
 const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
+// No additional cost for CMOVL.
+const int Matcher::long_cmove_cost() { return 0; }
+
+// No CMOVF/CMOVD with SSE2
+const int Matcher::float_cmove_cost() { return ConditionalMoveLimit; }
+
 // Should the Matcher clone shifts on addressing modes, expecting them
 // to be subsumed into complex addressing expressions or compute them
 // into registers?  True for Intel but false for most RISCs
--- a/src/share/vm/compiler/compileBroker.cpp	Tue Oct 25 12:51:13 2011 -0700
+++ b/src/share/vm/compiler/compileBroker.cpp	Wed Oct 26 06:08:56 2011 -0700
@@ -1722,11 +1722,11 @@
       if (PrintCompilation) {
         const char* reason = ci_env.failure_reason();
         if (compilable == ciEnv::MethodCompilable_not_at_tier) {
-            tty->print_cr("%3d   COMPILE SKIPPED: %s (retry at different tier)", compile_id, reason);
+          tty->print_cr("%4d   COMPILE SKIPPED: %s (retry at different tier)", compile_id, reason);
         } else if (compilable == ciEnv::MethodCompilable_never) {
-          tty->print_cr("%3d   COMPILE SKIPPED: %s (not retryable)", compile_id, reason);
+          tty->print_cr("%4d   COMPILE SKIPPED: %s (not retryable)", compile_id, reason);
         } else if (compilable == ciEnv::MethodCompilable) {
-          tty->print_cr("%3d   COMPILE SKIPPED: %s", compile_id, reason);
+          tty->print_cr("%4d   COMPILE SKIPPED: %s", compile_id, reason);
         }
       }
     } else {
@@ -1743,6 +1743,13 @@
 
   collect_statistics(thread, time, task);
 
+  if (PrintCompilation && PrintInlining) {
+    tty->print("%7d ", (int) tty->time_stamp().milliseconds());  // print timestamp
+    tty->print("%4d ", compile_id);    // print compilation number
+    tty->print("%s ", (is_osr ? "%" : " "));
+    tty->print_cr("size: %d time: %d inlined: %d bytes", task->code()->total_size(), time.milliseconds(), task->num_inlined_bytecodes());
+  }
+
   if (compilable == ciEnv::MethodCompilable_never) {
     if (is_osr) {
       method->set_not_osr_compilable();
--- a/src/share/vm/opto/loopopts.cpp	Tue Oct 25 12:51:13 2011 -0700
+++ b/src/share/vm/opto/loopopts.cpp	Wed Oct 26 06:08:56 2011 -0700
@@ -28,6 +28,7 @@
 #include "opto/connode.hpp"
 #include "opto/divnode.hpp"
 #include "opto/loopnode.hpp"
+#include "opto/matcher.hpp"
 #include "opto/mulnode.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/subnode.hpp"
@@ -472,46 +473,50 @@
 // 1 or 2 items with a total of 1 or 2 ops executed speculatively.
 Node *PhaseIdealLoop::conditional_move( Node *region ) {
 
-  assert( region->is_Region(), "sanity check" );
-  if( region->req() != 3 ) return NULL;
+  assert(region->is_Region(), "sanity check");
+  if (region->req() != 3) return NULL;
 
   // Check for CFG diamond
   Node *lp = region->in(1);
   Node *rp = region->in(2);
-  if( !lp || !rp ) return NULL;
+  if (!lp || !rp) return NULL;
   Node *lp_c = lp->in(0);
-  if( lp_c == NULL || lp_c != rp->in(0) || !lp_c->is_If() ) return NULL;
+  if (lp_c == NULL || lp_c != rp->in(0) || !lp_c->is_If()) return NULL;
   IfNode *iff = lp_c->as_If();
 
-  // Check for highly predictable branch.  No point in CMOV'ing if
-  // we are going to predict accurately all the time.
-  // %%% This hides patterns produced by utility methods like Math.min.
-  if( iff->_prob < PROB_UNLIKELY_MAG(3) ||
-      iff->_prob > PROB_LIKELY_MAG(3) )
-    return NULL;
-
   // Check for ops pinned in an arm of the diamond.
   // Can't remove the control flow in this case
-  if( lp->outcnt() > 1 ) return NULL;
-  if( rp->outcnt() > 1 ) return NULL;
+  if (lp->outcnt() > 1) return NULL;
+  if (rp->outcnt() > 1) return NULL;
+
+  IdealLoopTree* r_loop = get_loop(region);
+  assert(r_loop == get_loop(iff), "sanity");
+  // Always convert to CMOVE if all results are used only outside this loop.
+  bool used_inside_loop = (r_loop == _ltree_root);
 
   // Check profitability
   int cost = 0;
   int phis = 0;
   for (DUIterator_Fast imax, i = region->fast_outs(imax); i < imax; i++) {
     Node *out = region->fast_out(i);
-    if( !out->is_Phi() ) continue; // Ignore other control edges, etc
+    if (!out->is_Phi()) continue; // Ignore other control edges, etc
     phis++;
     PhiNode* phi = out->as_Phi();
-    switch (phi->type()->basic_type()) {
-    case T_LONG:
-      cost++;                   // Probably encodes as 2 CMOV's
+    BasicType bt = phi->type()->basic_type();
+    switch (bt) {
+    case T_FLOAT:
+    case T_DOUBLE: {
+      cost += Matcher::float_cmove_cost(); // Could be very expensive
+      break;
+    }
+    case T_LONG: {
+      cost += Matcher::long_cmove_cost(); // May encodes as 2 CMOV's
+    }
     case T_INT:                 // These all CMOV fine
-    case T_FLOAT:
-    case T_DOUBLE:
-    case T_ADDRESS:             // (RawPtr)
+    case T_ADDRESS: {           // (RawPtr)
       cost++;
       break;
+    }
     case T_NARROWOOP: // Fall through
     case T_OBJECT: {            // Base oops are OK, but not derived oops
       const TypeOopPtr *tp = phi->type()->make_ptr()->isa_oopptr();
@@ -524,7 +529,7 @@
       // relevant bases.  This puts the allocator in the business of
       // manufacturing expensive instructions, generally a bad plan.
       // Just Say No to Conditionally-Moved Derived Pointers.
-      if( tp && tp->offset() != 0 )
+      if (tp && tp->offset() != 0)
         return NULL;
       cost++;
       break;
@@ -533,39 +538,64 @@
       return NULL;              // In particular, can't do memory or I/O
     }
     // Add in cost any speculative ops
-    for( uint j = 1; j < region->req(); j++ ) {
+    for (uint j = 1; j < region->req(); j++) {
       Node *proj = region->in(j);
       Node *inp = phi->in(j);
       if (get_ctrl(inp) == proj) { // Found local op
         cost++;
         // Check for a chain of dependent ops; these will all become
         // speculative in a CMOV.
-        for( uint k = 1; k < inp->req(); k++ )
+        for (uint k = 1; k < inp->req(); k++)
           if (get_ctrl(inp->in(k)) == proj)
-            return NULL;        // Too much speculative goo
+            cost += ConditionalMoveLimit; // Too much speculative goo
       }
     }
     // See if the Phi is used by a Cmp or Narrow oop Decode/Encode.
     // This will likely Split-If, a higher-payoff operation.
     for (DUIterator_Fast kmax, k = phi->fast_outs(kmax); k < kmax; k++) {
       Node* use = phi->fast_out(k);
-      if( use->is_Cmp() || use->is_DecodeN() || use->is_EncodeP() )
-        return NULL;
+      if (use->is_Cmp() || use->is_DecodeN() || use->is_EncodeP())
+        cost += ConditionalMoveLimit;
+      // Is there a use inside the loop?
+      // Note: check only basic types since CMoveP is pinned.
+      if (!used_inside_loop && is_java_primitive(bt)) {
+        IdealLoopTree* u_loop = get_loop(has_ctrl(use) ? get_ctrl(use) : use);
+        if (r_loop == u_loop || r_loop->is_member(u_loop)) {
+          used_inside_loop = true;
+        }
+      }
     }
   }
-  if( cost >= ConditionalMoveLimit ) return NULL; // Too much goo
   Node* bol = iff->in(1);
-  assert( bol->Opcode() == Op_Bool, "" );
+  assert(bol->Opcode() == Op_Bool, "");
   int cmp_op = bol->in(1)->Opcode();
   // It is expensive to generate flags from a float compare.
   // Avoid duplicated float compare.
-  if( phis > 1 && (cmp_op == Op_CmpF || cmp_op == Op_CmpD)) return NULL;
+  if (phis > 1 && (cmp_op == Op_CmpF || cmp_op == Op_CmpD)) return NULL;
+
+  float infrequent_prob = PROB_UNLIKELY_MAG(3);
+  // Ignore cost and blocks frequency if CMOVE can be moved outside the loop.
+  if (used_inside_loop) {
+    if (cost >= ConditionalMoveLimit) return NULL; // Too much goo
+
+    // BlockLayoutByFrequency optimization moves infrequent branch
+    // from hot path. No point in CMOV'ing in such case (110 is used
+    // instead of 100 to take into account not exactness of float value).
+    if (BlockLayoutByFrequency) {
+      infrequent_prob = MAX2(infrequent_prob, (float)BlockLayoutMinDiamondPercentage/110.0f);
+    }
+  }
+  // Check for highly predictable branch.  No point in CMOV'ing if
+  // we are going to predict accurately all the time.
+  if (iff->_prob < infrequent_prob ||
+      iff->_prob > (1.0f - infrequent_prob))
+    return NULL;
 
   // --------------
   // Now replace all Phis with CMOV's
   Node *cmov_ctrl = iff->in(0);
   uint flip = (lp->Opcode() == Op_IfTrue);
-  while( 1 ) {
+  while (1) {
     PhiNode* phi = NULL;
     for (DUIterator_Fast imax, i = region->fast_outs(imax); i < imax; i++) {
       Node *out = region->fast_out(i);
@@ -576,15 +606,15 @@
     }
     if (phi == NULL)  break;
 #ifndef PRODUCT
-    if( PrintOpto && VerifyLoopOptimizations ) tty->print_cr("CMOV");
+    if (PrintOpto && VerifyLoopOptimizations) tty->print_cr("CMOV");
 #endif
     // Move speculative ops
-    for( uint j = 1; j < region->req(); j++ ) {
+    for (uint j = 1; j < region->req(); j++) {
       Node *proj = region->in(j);
       Node *inp = phi->in(j);
       if (get_ctrl(inp) == proj) { // Found local op
 #ifndef PRODUCT
-        if( PrintOpto && VerifyLoopOptimizations ) {
+        if (PrintOpto && VerifyLoopOptimizations) {
           tty->print("  speculate: ");
           inp->dump();
         }
@@ -596,7 +626,14 @@
     register_new_node( cmov, cmov_ctrl );
     _igvn.replace_node( phi, cmov );
 #ifndef PRODUCT
-    if( VerifyLoopOptimizations ) verify();
+    if (TraceLoopOpts) {
+      tty->print("CMOV  ");
+      r_loop->dump_head();
+      if (Verbose)
+        bol->in(1)->dump(1);
+        cmov->dump(1);
+    }
+    if (VerifyLoopOptimizations) verify();
 #endif
   }
 
@@ -676,14 +713,14 @@
 
   // Split 'n' through the merge point if it is profitable
   Node *phi = split_thru_phi( n, n_blk, policy );
-  if( !phi ) return n;
+  if (!phi) return n;
 
   // Found a Phi to split thru!
   // Replace 'n' with the new phi
   _igvn.replace_node( n, phi );
   // Moved a load around the loop, 'en-registering' something.
-  if( n_blk->Opcode() == Op_Loop && n->is_Load() &&
-      !phi->in(LoopNode::LoopBackControl)->is_Load() )
+  if (n_blk->is_Loop() && n->is_Load() &&
+      !phi->in(LoopNode::LoopBackControl)->is_Load())
     C->set_major_progress();
 
   return phi;
--- a/src/share/vm/opto/matcher.hpp	Tue Oct 25 12:51:13 2011 -0700
+++ b/src/share/vm/opto/matcher.hpp	Wed Oct 26 06:08:56 2011 -0700
@@ -360,6 +360,12 @@
   // Anything this size or smaller may get converted to discrete scalar stores.
   static const int init_array_short_size;
 
+  // Some hardware needs 2 CMOV's for longs.
+  static const int long_cmove_cost();
+
+  // Some hardware have expensive CMOV for float and double.
+  static const int float_cmove_cost();
+
   // Should the Matcher clone shifts on addressing modes, expecting them to
   // be subsumed into complex addressing expressions or compute them into
   // registers?  True for Intel but false for most RISCs