# HG changeset patch
# User ysr
# Date 1255683946 25200
# Node ID aa001a20bd617448556286bc9fde8d32a6221861
# Parent 6bdfda9a712077ebfb9c3b5ba4f0659be5c0791c
6888898: CMS: ReduceInitialCardMarks unsafe in the presence of cms precleaning
6889757: G1: enable card mark elision for initializing writes from compiled code (ReduceInitialCardMarks)
Summary: Defer the (compiler-elided) card-mark upon a slow-path allocation until after the store and before the next subsequent safepoint; G1 now answers yes to can_elide_tlab_store_barriers().
Reviewed-by: jcoomes, kvn, never

diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -992,11 +992,39 @@
 
   // Can a compiler initialize a new object without store barriers?
   // This permission only extends from the creation of a new object
-  // via a TLAB up to the first subsequent safepoint.
+  // via a TLAB up to the first subsequent safepoint. If such permission
+  // is granted for this heap type, the compiler promises to call
+  // defer_store_barrier() below on any slow path allocation of
+  // a new object for which such initializing store barriers will
+  // have been elided. G1, like CMS, allows this, but should be
+  // ready to provide a compensating write barrier as necessary
+  // if that storage came out of a non-young region. The efficiency
+  // of this implementation depends crucially on being able to
+  // answer very efficiently in constant time whether a piece of
+  // storage in the heap comes from a young region or not.
+  // See ReduceInitialCardMarks.
   virtual bool can_elide_tlab_store_barriers() const {
-    // Since G1's TLAB's may, on occasion, come from non-young regions
-    // as well. (Is there a flag controlling that? XXX)
-    return false;
+    return true;
+  }
+
+  bool is_in_young(oop obj) {
+    HeapRegion* hr = heap_region_containing(obj);
+    return hr != NULL && hr->is_young();
+  }
+
+  // We don't need barriers for initializing stores to objects
+  // in the young gen: for the SATB pre-barrier, there is no
+  // pre-value that needs to be remembered; for the remembered-set
+  // update logging post-barrier, we don't maintain remembered set
+  // information for young gen objects. Note that non-generational
+  // G1 does not have any "young" objects, should not elide
+  // the rs logging barrier and so should always answer false below.
+  // However, non-generational G1 (-XX:-G1Gen) appears to have
+  // bit-rotted so was not tested below.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) {
+    assert(G1Gen || !is_in_young(new_obj),
+           "Non-generational G1 should never return true below");
+    return is_in_young(new_obj);
   }
 
   // Can a compiler elide a store barrier when it writes
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -314,41 +314,6 @@
   return false;
 }
 
-// Static method
-bool ParallelScavengeHeap::is_in_young(oop* p) {
-  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
-  assert(heap->kind() == CollectedHeap::ParallelScavengeHeap,
-         "Must be ParallelScavengeHeap");
-
-  PSYoungGen* young_gen = heap->young_gen();
-
-  if (young_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  return false;
-}
-
-// Static method
-bool ParallelScavengeHeap::is_in_old_or_perm(oop* p) {
-  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
-  assert(heap->kind() == CollectedHeap::ParallelScavengeHeap,
-         "Must be ParallelScavengeHeap");
-
-  PSOldGen* old_gen = heap->old_gen();
-  PSPermGen* perm_gen = heap->perm_gen();
-
-  if (old_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  if (perm_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  return false;
-}
-
 // There are two levels of allocation policy here.
 //
 // When an allocation request fails, the requesting thread must invoke a VM
@@ -764,6 +729,13 @@
   CollectedHeap::resize_all_tlabs();
 }
 
+bool ParallelScavengeHeap::can_elide_initializing_store_barrier(oop new_obj) {
+  // We don't need barriers for stores to objects in the
+  // young gen and, a fortiori, for initializing stores to
+  // objects therein.
+  return is_in_young(new_obj);
+}
+
 // This method is used by System.gc() and JVMTI.
 void ParallelScavengeHeap::collect(GCCause::Cause cause) {
   assert(!Heap_lock->owned_by_self(),
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -129,8 +129,8 @@
     return perm_gen()->is_in(p);
   }
 
-  static bool is_in_young(oop *p);        // reserved part
-  static bool is_in_old_or_perm(oop *p);  // reserved part
+  inline bool is_in_young(oop p);         // reserved part
+  inline bool is_in_old_or_perm(oop p);   // reserved part
 
   // Memory allocation. "gc_time_limit_was_exceeded" will
   // be set to true if the adaptive size policy determine that
@@ -191,6 +191,10 @@
     return true;
   }
 
+  // Return true if we don't need a store barrier for
+  // initializing stores to an object at this address.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj);
+
   // Can a compiler elide a store barrier when it writes
   // a permanent oop into the heap? Applies when the compiler
   // is storing x to the heap, where x->is_perm() is true.
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -41,3 +41,11 @@
     PSMarkSweep::invoke(maximum_compaction);
   }
 }
+
+inline bool ParallelScavengeHeap::is_in_young(oop p) {
+  return young_gen()->is_in_reserved(p);
+}
+
+inline bool ParallelScavengeHeap::is_in_old_or_perm(oop p) {
+  return old_gen()->is_in_reserved(p) || perm_gen()->is_in_reserved(p);
+}
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_interface/collectedHeap.cpp
--- a/src/share/vm/gc_interface/collectedHeap.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -137,6 +137,89 @@
   return obj;
 }
 
+void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
+  MemRegion deferred = thread->deferred_card_mark();
+  if (!deferred.is_empty()) {
+    {
+      // Verify that the storage points to a parsable object in heap
+      DEBUG_ONLY(oop old_obj = oop(deferred.start());)
+      assert(is_in(old_obj), "Not in allocated heap");
+      assert(!can_elide_initializing_store_barrier(old_obj),
+             "Else should have been filtered in defer_store_barrier()");
+      assert(!is_in_permanent(old_obj), "Sanity: not expected");
+      assert(old_obj->is_oop(true), "Not an oop");
+      assert(old_obj->is_parsable(), "Will not be concurrently parsable");
+      assert(deferred.word_size() == (size_t)(old_obj->size()),
+             "Mismatch: multiple objects?");
+    }
+    BarrierSet* bs = barrier_set();
+    assert(bs->has_write_region_opt(), "No write_region() on BarrierSet");
+    bs->write_region(deferred);
+    // "Clear" the deferred_card_mark field
+    thread->set_deferred_card_mark(MemRegion());
+  }
+  assert(thread->deferred_card_mark().is_empty(), "invariant");
+}
+
+// Helper for ReduceInitialCardMarks. For performance,
+// compiled code may elide card-marks for initializing stores
+// to a newly allocated object along the fast-path. We
+// compensate for such elided card-marks as follows:
+// (a) Generational, non-concurrent collectors, such as
+// GenCollectedHeap(ParNew,DefNew,Tenured) and
+// ParallelScavengeHeap(ParallelGC, ParallelOldGC)
+// need the card-mark if and only if the region is
+// in the old gen, and do not care if the card-mark
+// succeeds or precedes the initializing stores themselves,
+// so long as the card-mark is completed before the next
+// scavenge. For all these cases, we can do a card mark
+// at the point at which we do a slow path allocation
+// in the old gen. For uniformity, however, we end
+// up using the same scheme (see below) for all three
+// cases (deferring the card-mark appropriately).
+// (b) GenCollectedHeap(ConcurrentMarkSweepGeneration) requires
+// in addition that the card-mark for an old gen allocated
+// object strictly follow any associated initializing stores.
+// In these cases, the memRegion remembered below is
+// used to card-mark the entire region either just before the next
+// slow-path allocation by this thread or just before the next scavenge or
+// CMS-associated safepoint, whichever of these events happens first.
+// (The implicit assumption is that the object has been fully
+// initialized by this point, a fact that we assert when doing the
+// card-mark.)
+// (c) G1CollectedHeap(G1) uses two kinds of write barriers. When a
+// G1 concurrent marking is in progress an SATB (pre-write-)barrier is
+// used to remember the pre-value of any store. Initializing
+// stores will not need this barrier, so we need not worry about
+// compensating for the missing pre-barrier here. Turning now
+// to the post-barrier, we note that G1 needs an RS update barrier
+// which simply enqueues a (sequence of) dirty cards which may
+// optionally be refined by the concurrent update threads. Note
+// that this barrier need only be applied to a non-young write,
+// but, as in CMS, because of the presence of concurrent refinement
+// (much like CMS' precleaning), it must strictly follow the oop-store.
+// Thus, using the same protocol for maintaining the intended
+// invariants turns out, serendipitously, to be the same for all
+// three collectors/heap types above.
+//
+// For each future collector, this should be reexamined with
+// that specific collector in mind.
+oop CollectedHeap::defer_store_barrier(JavaThread* thread, oop new_obj) {
+  // If a previous card-mark was deferred, flush it now.
+  flush_deferred_store_barrier(thread);
+  if (can_elide_initializing_store_barrier(new_obj)) {
+    // The deferred_card_mark region should be empty
+    // following the flush above.
+    assert(thread->deferred_card_mark().is_empty(), "Error");
+  } else {
+    // Remember info for the newly deferred store barrier
+    MemRegion deferred = MemRegion((HeapWord*)new_obj, new_obj->size());
+    assert(!deferred.is_empty(), "Error");
+    thread->set_deferred_card_mark(deferred);
+  }
+  return new_obj;
+}
+
 size_t CollectedHeap::filler_array_hdr_size() {
   return size_t(arrayOopDesc::header_size(T_INT));
 }
@@ -225,16 +308,6 @@
   fill_with_object_impl(start, words);
 }
 
-oop CollectedHeap::new_store_barrier(oop new_obj) {
-  // %%% This needs refactoring. (It was imported from the server compiler.)
-  guarantee(can_elide_tlab_store_barriers(), "store barrier elision not supported");
-  BarrierSet* bs = this->barrier_set();
-  assert(bs->has_write_region_opt(), "Barrier set does not have write_region");
-  int new_size = new_obj->size();
-  bs->write_region(MemRegion((HeapWord*)new_obj, new_size));
-  return new_obj;
-}
-
 HeapWord* CollectedHeap::allocate_new_tlab(size_t size) {
   guarantee(false, "thread-local allocation buffers not supported");
   return NULL;
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_interface/collectedHeap.hpp
--- a/src/share/vm/gc_interface/collectedHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -400,9 +400,14 @@
     guarantee(false, "thread-local allocation buffers not supported");
     return 0;
   }
+
   // Can a compiler initialize a new object without store barriers?
   // This permission only extends from the creation of a new object
-  // via a TLAB up to the first subsequent safepoint.
+  // via a TLAB up to the first subsequent safepoint. If such permission
+  // is granted for this heap type, the compiler promises to call
+  // defer_store_barrier() below on any slow path allocation of
+  // a new object for which such initializing store barriers will
+  // have been elided.
   virtual bool can_elide_tlab_store_barriers() const = 0;
 
   // If a compiler is eliding store barriers for TLAB-allocated objects,
@@ -410,8 +415,19 @@
   // an object allocated anywhere. The compiler's runtime support
   // promises to call this function on such a slow-path-allocated
   // object before performing initializations that have elided
-  // store barriers.  Returns new_obj, or maybe a safer copy thereof.
-  virtual oop new_store_barrier(oop new_obj);
+  // store barriers. Returns new_obj, or maybe a safer copy thereof.
+  virtual oop defer_store_barrier(JavaThread* thread, oop new_obj);
+
+  // Answers whether an initializing store to a new object currently
+  // allocated at the given address doesn't need a (deferred) store
+  // barrier. Returns "true" if it doesn't need an initializing
+  // store barrier; answers "false" if it does.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) = 0;
+
+  // If the CollectedHeap was asked to defer a store barrier above,
+  // this informs it to flush such a deferred store barrier to the
+  // remembered set.
+  virtual void flush_deferred_store_barrier(JavaThread* thread);
 
   // Can a compiler elide a store barrier when it writes
   // a permanent oop into the heap? Applies when the compiler
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/memory/genCollectedHeap.hpp
--- a/src/share/vm/memory/genCollectedHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/memory/genCollectedHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -260,6 +260,17 @@
     return true;
   }
 
+  // We don't need barriers for stores to objects in the
+  // young gen and, a fortiori, for initializing stores to
+  // objects therein. This applies to {DefNew,ParNew}+{Tenured,CMS}
+  // only and may need to be re-examined in case other
+  // kinds of collectors are implemented in the future.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) {
+    assert(UseParNewGC || UseSerialGC || UseConcMarkSweepGC,
+           "Check can_elide_initializing_store_barrier() for this collector");
+    return is_in_youngest((void*)new_obj);
+  }
+
   // Can a compiler elide a store barrier when it writes
   // a permanent oop into the heap? Applies when the compiler
   // is storing x to the heap, where x->is_perm() is true.
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/graphKit.cpp
--- a/src/share/vm/opto/graphKit.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/graphKit.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -3183,6 +3183,15 @@
     return;
   }
 
+  if (use_ReduceInitialCardMarks()
+      && obj == just_allocated_object(control())) {
+    // We can skip marks on a freshly-allocated object in Eden.
+    // Keep this code in sync with maybe_defer_card_mark() in runtime.cpp.
+    // That routine informs GC to take appropriate compensating steps
+    // so as to make this card-mark elision safe.
+    return;
+  }
+
   if (!use_precise) {
     // All card marks for a (non-array) instance are in one place:
     adr = obj;
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/library_call.cpp
--- a/src/share/vm/opto/library_call.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/library_call.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -4060,13 +4060,13 @@
       result_mem ->set_req(_objArray_path, reset_memory());
     }
   }
 
-  // We can dispense with card marks if we know the allocation
-  // comes out of eden (TLAB)... In fact, ReduceInitialCardMarks
-  // causes the non-eden paths to simulate a fresh allocation,
-  // insofar that no further card marks are required to initialize
-  // the object.
-  // Otherwise, there are no card marks to worry about.
+  // (We can dispense with card marks if we know the allocation
+  // comes out of eden (TLAB)... In fact, ReduceInitialCardMarks
+  // causes the non-eden paths to take compensating steps to
+  // simulate a fresh allocation, so that no further
+  // card marks are required in compiled code to initialize
+  // the object.)
 
   if (!stopped()) {
     copy_to_clone(obj, alloc_obj, obj_size, true, false);
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/runtime.cpp
--- a/src/share/vm/opto/runtime.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/runtime.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -143,18 +143,20 @@
 // We failed the fast-path allocation. Now we need to do a scavenge or GC
 // and try allocation again.
 
-void OptoRuntime::do_eager_card_mark(JavaThread* thread) {
+void OptoRuntime::maybe_defer_card_mark(JavaThread* thread) {
   // After any safepoint, just before going back to compiled code,
-  // we perform a card mark. This lets the compiled code omit
-  // card marks for initialization of new objects.
-  // Keep this code consistent with GraphKit::store_barrier.
+  // we inform the GC that we will be doing initializing writes to
+  // this object in the future without emitting card-marks, so
+  // GC may take any compensating steps.
+  // NOTE: Keep this code consistent with GraphKit::store_barrier.
 
   oop new_obj = thread->vm_result();
   if (new_obj == NULL) return;
 
   assert(Universe::heap()->can_elide_tlab_store_barriers(),
          "compiler must check this first");
-  new_obj = Universe::heap()->new_store_barrier(new_obj);
+  // GC may decide to give back a safer copy of new_obj.
+  new_obj = Universe::heap()->defer_store_barrier(thread, new_obj);
   thread->set_vm_result(new_obj);
 }
 
@@ -197,8 +199,8 @@
   JRT_BLOCK_END;
 
   if (GraphKit::use_ReduceInitialCardMarks()) {
-    // do them now so we don't have to do them on the fast path
-    do_eager_card_mark(thread);
+    // inform GC that we won't do card marks for initializing writes.
+    maybe_defer_card_mark(thread);
   }
 JRT_END
 
@@ -236,8 +238,8 @@
   JRT_BLOCK_END;
 
   if (GraphKit::use_ReduceInitialCardMarks()) {
-    // do them now so we don't have to do them on the fast path
-    do_eager_card_mark(thread);
+    // inform GC that we won't do card marks for initializing writes.
+    maybe_defer_card_mark(thread);
   }
 JRT_END
 
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/runtime.hpp
--- a/src/share/vm/opto/runtime.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/runtime.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -133,8 +133,8 @@
   // Allocate storage for a objArray or typeArray
   static void new_array_C(klassOopDesc* array_klass, int len, JavaThread *thread);
 
-  // Post-allocation step for implementing ReduceInitialCardMarks:
-  static void do_eager_card_mark(JavaThread* thread);
+  // Post-slow-path-allocation step for implementing ReduceInitialCardMarks:
+  static void maybe_defer_card_mark(JavaThread* thread);
 
   // Allocate storage for a multi-dimensional arrays
   // Note: needs to be fixed for arbitrary number of dimensions
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/runtime/thread.cpp
--- a/src/share/vm/runtime/thread.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/runtime/thread.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -1212,6 +1212,7 @@
 {
   initialize();
   _is_attaching = is_attaching;
+  assert(_deferred_card_mark.is_empty(), "Default MemRegion ctor");
 }
 
 bool JavaThread::reguard_stack(address cur_sp) {
@@ -2317,6 +2318,10 @@
 
 
 void JavaThread::oops_do(OopClosure* f) {
+  // Flush deferred store-barriers, if any, associated with
+  // initializing stores done by this JavaThread in the current epoch.
+  Universe::heap()->flush_deferred_store_barrier(this);
+
   // The ThreadProfiler oops_do is done from FlatProfiler::oops_do
   // since there may be more than one thread using each ThreadProfiler.
 
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/runtime/thread.hpp
--- a/src/share/vm/runtime/thread.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/runtime/thread.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -683,8 +683,13 @@
   methodOop     _callee_target;
 
   // Oop results of VM runtime calls
-  oop           _vm_result;    // Used to pass back an oop result into Java code, GC-preserved
-  oop           _vm_result_2;  // Used to pass back an oop result into Java code, GC-preserved
+  oop           _vm_result;           // Used to pass back an oop result into Java code, GC-preserved
+  oop           _vm_result_2;         // Used to pass back an oop result into Java code, GC-preserved
+
+  // See ReduceInitialCardMarks: this holds the precise space interval of
+  // the most recent slow path allocation for which compiled code has
+  // elided card-marks for performance along the fast-path.
+  MemRegion     _deferred_card_mark;
 
   MonitorChunk* _monitor_chunks;      // Contains the off stack monitors
                                       // allocated during deoptimization
@@ -1090,6 +1095,9 @@
 
   oop  vm_result_2() const             { return _vm_result_2; }
   void set_vm_result_2 (oop x)         { _vm_result_2 = x; }
+  MemRegion deferred_card_mark() const { return _deferred_card_mark; }
+  void set_deferred_card_mark(MemRegion mr) { _deferred_card_mark = mr; }
+
   // Exception handling for compiled methods
   oop exception_oop() const            { return _exception_oop; }
   int exception_stack_size() const     { return _exception_stack_size; }
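
A minimal, standalone C++ sketch of the deferral protocol that the collectedHeap.cpp comments above describe: each thread remembers, in a MemRegion, the single most recent slow-path allocation whose initializing card-marks were elided, and that region is card-marked at the next slow-path allocation by the thread or at the next safepoint, strictly after the initializing stores. The Thread, Heap, CardTable and MemRegion types below are simplified stand-ins rather than the HotSpot classes, and the young-gen test is a hypothetical placeholder that treats every object as old so a deferral is always recorded.

// Standalone illustration of the deferred card-mark protocol; not HotSpot code.
#include <cstddef>
#include <cstdio>

class MemRegion {
 public:
  MemRegion() : _start(NULL), _size(0) {}
  MemRegion(const char* start, std::size_t size) : _start(start), _size(size) {}
  bool is_empty() const { return _size == 0; }
  const char* start() const { return _start; }
  std::size_t size() const { return _size; }
 private:
  const char* _start;
  std::size_t _size;
};

class CardTable {
 public:
  // Stand-in for BarrierSet::write_region(): dirty all cards covering mr.
  void write_region(const MemRegion& mr) {
    std::printf("card-marking %zu bytes at %p\n", mr.size(), (const void*)mr.start());
  }
};

struct Thread {
  MemRegion deferred_card_mark;   // analogue of JavaThread::_deferred_card_mark
};

class Heap {
 public:
  // Placeholder for the constant-time young-gen test the comments require.
  bool is_in_young(const char* /*obj*/) const { return false; }

  // Analogue of CollectedHeap::flush_deferred_store_barrier(): card-mark the
  // remembered interval, if any. By the time this runs the object is fully
  // initialized, so the mark strictly follows the initializing stores, which
  // is the property CMS precleaning and G1 concurrent refinement need.
  void flush_deferred_store_barrier(Thread* t) {
    if (!t->deferred_card_mark.is_empty()) {
      _ct.write_region(t->deferred_card_mark);
      t->deferred_card_mark = MemRegion();   // clear the deferral
    }
  }

  // Analogue of CollectedHeap::defer_store_barrier(): called on each
  // slow-path allocation whose initializing card-marks the compiler elided.
  char* defer_store_barrier(Thread* t, char* new_obj, std::size_t size) {
    flush_deferred_store_barrier(t);         // flush any earlier deferral first
    if (!is_in_young(new_obj)) {
      // Non-young storage: remember it for a later, post-initialization mark.
      t->deferred_card_mark = MemRegion(new_obj, size);
    }
    return new_obj;
  }

 private:
  CardTable _ct;
};

int main() {
  Heap heap;
  Thread thread;
  static char obj_a[32], obj_b[32];

  heap.defer_store_barrier(&thread, obj_a, sizeof(obj_a));
  // ... compiled code initializes obj_a without emitting card marks ...
  heap.defer_store_barrier(&thread, obj_b, sizeof(obj_b));  // flushes obj_a's mark
  // ... the next safepoint (cf. JavaThread::oops_do above) flushes obj_b's mark ...
  heap.flush_deferred_store_barrier(&thread);
  return 0;
}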