# HG changeset patch
# User ysr
# Date 1255683946 25200
# Node ID aa001a20bd617448556286bc9fde8d32a6221861
# Parent 6bdfda9a712077ebfb9c3b5ba4f0659be5c0791c
6888898: CMS: ReduceInitialCardMarks unsafe in the presence of cms precleaning
6889757: G1: enable card mark elision for initializing writes from compiled code (ReduceInitialCardMarks)
Summary: Defer the (compiler-elided) card-mark upon a slow-path allocation until after the store and before the next subsequent safepoint; G1 now answers yes to can_elide_tlab_store_barriers().
Reviewed-by: jcoomes, kvn, never

diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -992,11 +992,39 @@
 
   // Can a compiler initialize a new object without store barriers?
   // This permission only extends from the creation of a new object
-  // via a TLAB up to the first subsequent safepoint.
+  // via a TLAB up to the first subsequent safepoint. If such permission
+  // is granted for this heap type, the compiler promises to call
+  // defer_store_barrier() below on any slow path allocation of
+  // a new object for which such initializing store barriers will
+  // have been elided. G1, like CMS, allows this, but should be
+  // ready to provide a compensating write barrier as necessary
+  // if that storage came out of a non-young region. The efficiency
+  // of this implementation depends crucially on being able to
+  // answer very efficiently in constant time whether a piece of
+  // storage in the heap comes from a young region or not.
+  // See ReduceInitialCardMarks.
   virtual bool can_elide_tlab_store_barriers() const {
-    // Since G1's TLAB's may, on occasion, come from non-young regions
-    // as well. (Is there a flag controlling that? XXX)
-    return false;
+    return true;
+  }
+
+  bool is_in_young(oop obj) {
+    HeapRegion* hr = heap_region_containing(obj);
+    return hr != NULL && hr->is_young();
+  }
+
+  // We don't need barriers for initializing stores to objects
+  // in the young gen: for the SATB pre-barrier, there is no
+  // pre-value that needs to be remembered; for the remembered-set
+  // update logging post-barrier, we don't maintain remembered set
+  // information for young gen objects. Note that non-generational
+  // G1 does not have any "young" objects, should not elide
+  // the rs logging barrier and so should always answer false below.
+  // However, non-generational G1 (-XX:-G1Gen) appears to have
+  // bit-rotted so was not tested below.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) {
+    assert(G1Gen || !is_in_young(new_obj),
+           "Non-generational G1 should never return true below");
+    return is_in_young(new_obj);
   }
 
   // Can a compiler elide a store barrier when it writes
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -314,41 +314,6 @@
   return false;
 }
 
-// Static method
-bool ParallelScavengeHeap::is_in_young(oop* p) {
-  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
-  assert(heap->kind() == CollectedHeap::ParallelScavengeHeap,
-         "Must be ParallelScavengeHeap");
-
-  PSYoungGen* young_gen = heap->young_gen();
-
-  if (young_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  return false;
-}
-
-// Static method
-bool ParallelScavengeHeap::is_in_old_or_perm(oop* p) {
-  ParallelScavengeHeap* heap = (ParallelScavengeHeap*)Universe::heap();
-  assert(heap->kind() == CollectedHeap::ParallelScavengeHeap,
-         "Must be ParallelScavengeHeap");
-
-  PSOldGen* old_gen = heap->old_gen();
-  PSPermGen* perm_gen = heap->perm_gen();
-
-  if (old_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  if (perm_gen->is_in_reserved(p)) {
-    return true;
-  }
-
-  return false;
-}
-
 // There are two levels of allocation policy here.
 //
 // When an allocation request fails, the requesting thread must invoke a VM
@@ -764,6 +729,13 @@
   CollectedHeap::resize_all_tlabs();
 }
 
+bool ParallelScavengeHeap::can_elide_initializing_store_barrier(oop new_obj) {
+  // We don't need barriers for stores to objects in the
+  // young gen and, a fortiori, for initializing stores to
+  // objects therein.
+  return is_in_young(new_obj);
+}
+
 // This method is used by System.gc() and JVMTI.
 void ParallelScavengeHeap::collect(GCCause::Cause cause) {
   assert(!Heap_lock->owned_by_self(),
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -129,8 +129,8 @@
     return perm_gen()->is_in(p);
   }
 
-  static bool is_in_young(oop *p);        // reserved part
-  static bool is_in_old_or_perm(oop *p);  // reserved part
+  inline bool is_in_young(oop p);         // reserved part
+  inline bool is_in_old_or_perm(oop p);   // reserved part
 
   // Memory allocation. "gc_time_limit_was_exceeded" will
   // be set to true if the adaptive size policy determine that
@@ -191,6 +191,10 @@
     return true;
   }
 
+  // Return true if we don't need a store barrier for
+  // initializing stores to an object at this address.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj);
+
   // Can a compiler elide a store barrier when it writes
   // a permanent oop into the heap? Applies when the compiler
   // is storing x to the heap, where x->is_perm() is true.
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.inline.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -41,3 +41,11 @@
     PSMarkSweep::invoke(maximum_compaction);
   }
 }
+
+inline bool ParallelScavengeHeap::is_in_young(oop p) {
+  return young_gen()->is_in_reserved(p);
+}
+
+inline bool ParallelScavengeHeap::is_in_old_or_perm(oop p) {
+  return old_gen()->is_in_reserved(p) || perm_gen()->is_in_reserved(p);
+}
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_interface/collectedHeap.cpp
--- a/src/share/vm/gc_interface/collectedHeap.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -137,6 +137,89 @@
   return obj;
 }
 
+void CollectedHeap::flush_deferred_store_barrier(JavaThread* thread) {
+  MemRegion deferred = thread->deferred_card_mark();
+  if (!deferred.is_empty()) {
+    {
+      // Verify that the storage points to a parsable object in heap
+      DEBUG_ONLY(oop old_obj = oop(deferred.start());)
+      assert(is_in(old_obj), "Not in allocated heap");
+      assert(!can_elide_initializing_store_barrier(old_obj),
+             "Else should have been filtered in defer_store_barrier()");
+      assert(!is_in_permanent(old_obj), "Sanity: not expected");
+      assert(old_obj->is_oop(true), "Not an oop");
+      assert(old_obj->is_parsable(), "Will not be concurrently parsable");
+      assert(deferred.word_size() == (size_t)(old_obj->size()),
+             "Mismatch: multiple objects?");
+    }
+    BarrierSet* bs = barrier_set();
+    assert(bs->has_write_region_opt(), "No write_region() on BarrierSet");
+    bs->write_region(deferred);
+    // "Clear" the deferred_card_mark field
+    thread->set_deferred_card_mark(MemRegion());
+  }
+  assert(thread->deferred_card_mark().is_empty(), "invariant");
+}
+
+// Helper for ReduceInitialCardMarks. For performance,
+// compiled code may elide card-marks for initializing stores
+// to a newly allocated object along the fast-path. We
+// compensate for such elided card-marks as follows:
+// (a) Generational, non-concurrent collectors, such as
+// GenCollectedHeap(ParNew,DefNew,Tenured) and
+// ParallelScavengeHeap(ParallelGC, ParallelOldGC)
+// need the card-mark if and only if the region is
+// in the old gen, and do not care if the card-mark
+// succeeds or precedes the initializing stores themselves,
+// so long as the card-mark is completed before the next
+// scavenge. For all these cases, we can do a card mark
+// at the point at which we do a slow path allocation
+// in the old gen. For uniformity, however, we end
+// up using the same scheme (see below) for all three
+// cases (deferring the card-mark appropriately).
+// (b) GenCollectedHeap(ConcurrentMarkSweepGeneration) requires
+// in addition that the card-mark for an old gen allocated
+// object strictly follow any associated initializing stores.
+// In these cases, the memRegion remembered below is
+// used to card-mark the entire region either just before the next
+// slow-path allocation by this thread or just before the next scavenge or
+// CMS-associated safepoint, whichever of these events happens first.
+// (The implicit assumption is that the object has been fully
+// initialized by this point, a fact that we assert when doing the
+// card-mark.)
+// (c) G1CollectedHeap(G1) uses two kinds of write barriers. When a
+// G1 concurrent marking is in progress an SATB (pre-write-)barrier is
+// used to remember the pre-value of any store. Initializing
+// stores will not need this barrier, so we need not worry about
+// compensating for the missing pre-barrier here. Turning now
+// to the post-barrier, we note that G1 needs an RS update barrier
+// which simply enqueues a (sequence of) dirty cards which may
+// optionally be refined by the concurrent update threads. Note
+// that this barrier need only be applied to a non-young write,
+// but, as in CMS, because of the presence of concurrent refinement
+// (much like CMS' precleaning), it must strictly follow the oop-store.
+// Thus, using the same protocol for maintaining the intended
+// invariants turns out, serendipitously, to be the same for all
+// three collectors/heap types above.
+//
+// For each future collector, this should be reexamined with
+// that specific collector in mind.
+oop CollectedHeap::defer_store_barrier(JavaThread* thread, oop new_obj) {
+  // If a previous card-mark was deferred, flush it now.
+  flush_deferred_store_barrier(thread);
+  if (can_elide_initializing_store_barrier(new_obj)) {
+    // The deferred_card_mark region should be empty
+    // following the flush above.
+    assert(thread->deferred_card_mark().is_empty(), "Error");
+  } else {
+    // Remember info for the newly deferred store barrier
+    MemRegion deferred = MemRegion((HeapWord*)new_obj, new_obj->size());
+    assert(!deferred.is_empty(), "Error");
+    thread->set_deferred_card_mark(deferred);
+  }
+  return new_obj;
+}
+
 size_t CollectedHeap::filler_array_hdr_size() {
   return size_t(arrayOopDesc::header_size(T_INT));
 }
@@ -225,16 +308,6 @@
   fill_with_object_impl(start, words);
 }
 
-oop CollectedHeap::new_store_barrier(oop new_obj) {
-  // %%% This needs refactoring. (It was imported from the server compiler.)
-  guarantee(can_elide_tlab_store_barriers(), "store barrier elision not supported");
-  BarrierSet* bs = this->barrier_set();
-  assert(bs->has_write_region_opt(), "Barrier set does not have write_region");
-  int new_size = new_obj->size();
-  bs->write_region(MemRegion((HeapWord*)new_obj, new_size));
-  return new_obj;
-}
-
 HeapWord* CollectedHeap::allocate_new_tlab(size_t size) {
   guarantee(false, "thread-local allocation buffers not supported");
   return NULL;
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/gc_interface/collectedHeap.hpp
--- a/src/share/vm/gc_interface/collectedHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -400,9 +400,14 @@
     guarantee(false, "thread-local allocation buffers not supported");
     return 0;
   }
+
   // Can a compiler initialize a new object without store barriers?
   // This permission only extends from the creation of a new object
-  // via a TLAB up to the first subsequent safepoint.
+  // via a TLAB up to the first subsequent safepoint. If such permission
+  // is granted for this heap type, the compiler promises to call
+  // defer_store_barrier() below on any slow path allocation of
+  // a new object for which such initializing store barriers will
+  // have been elided.
   virtual bool can_elide_tlab_store_barriers() const = 0;
 
   // If a compiler is eliding store barriers for TLAB-allocated objects,
@@ -410,8 +415,19 @@
   // an object allocated anywhere. The compiler's runtime support
   // promises to call this function on such a slow-path-allocated
   // object before performing initializations that have elided
-  // store barriers.  Returns new_obj, or maybe a safer copy thereof.
-  virtual oop new_store_barrier(oop new_obj);
+  // store barriers. Returns new_obj, or maybe a safer copy thereof.
+  virtual oop defer_store_barrier(JavaThread* thread, oop new_obj);
+
+  // Answers whether an initializing store to a new object currently
+  // allocated at the given address doesn't need a (deferred) store
+  // barrier. Returns "true" if it doesn't need an initializing
+  // store barrier; answers "false" if it does.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) = 0;
+
+  // If the CollectedHeap was asked to defer a store barrier above,
+  // this informs it to flush such a deferred store barrier to the
+  // remembered set.
+  virtual void flush_deferred_store_barrier(JavaThread* thread);
 
   // Can a compiler elide a store barrier when it writes
   // a permanent oop into the heap? Applies when the compiler
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/memory/genCollectedHeap.hpp
--- a/src/share/vm/memory/genCollectedHeap.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/memory/genCollectedHeap.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -260,6 +260,17 @@
     return true;
   }
 
+  // We don't need barriers for stores to objects in the
+  // young gen and, a fortiori, for initializing stores to
+  // objects therein. This applies to {DefNew,ParNew}+{Tenured,CMS}
+  // only and may need to be re-examined in case other
+  // kinds of collectors are implemented in the future.
+  virtual bool can_elide_initializing_store_barrier(oop new_obj) {
+    assert(UseParNewGC || UseSerialGC || UseConcMarkSweepGC,
+           "Check can_elide_initializing_store_barrier() for this collector");
+    return is_in_youngest((void*)new_obj);
+  }
+
   // Can a compiler elide a store barrier when it writes
   // a permanent oop into the heap? Applies when the compiler
   // is storing x to the heap, where x->is_perm() is true.
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/graphKit.cpp
--- a/src/share/vm/opto/graphKit.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/graphKit.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -3183,6 +3183,15 @@
     return;
   }
 
+  if (use_ReduceInitialCardMarks()
+      && obj == just_allocated_object(control())) {
+    // We can skip marks on a freshly-allocated object in Eden.
+    // Keep this code in sync with maybe_defer_card_mark() in runtime.cpp.
+    // That routine informs GC to take appropriate compensating steps
+    // so as to make this card-mark elision safe.
+    return;
+  }
+
   if (!use_precise) {
     // All card marks for a (non-array) instance are in one place:
     adr = obj;
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/library_call.cpp
--- a/src/share/vm/opto/library_call.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/library_call.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -4060,13 +4060,13 @@
       result_mem ->set_req(_objArray_path, reset_memory());
     }
   }
 
-  // We can dispense with card marks if we know the allocation
-  // comes out of eden (TLAB)... In fact, ReduceInitialCardMarks
-  // causes the non-eden paths to simulate a fresh allocation,
-  // insofar that no further card marks are required to initialize
-  // the object.
-  // Otherwise, there are no card marks to worry about.
+  // (We can dispense with card marks if we know the allocation
+  // comes out of eden (TLAB)... In fact, ReduceInitialCardMarks
+  // causes the non-eden paths to take compensating steps to
+  // simulate a fresh allocation, so that no further
+  // card marks are required in compiled code to initialize
+  // the object.)
 
   if (!stopped()) {
     copy_to_clone(obj, alloc_obj, obj_size, true, false);
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/runtime.cpp
--- a/src/share/vm/opto/runtime.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/runtime.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -143,18 +143,20 @@
 // We failed the fast-path allocation. Now we need to do a scavenge or GC
 // and try allocation again.
 
-void OptoRuntime::do_eager_card_mark(JavaThread* thread) {
+void OptoRuntime::maybe_defer_card_mark(JavaThread* thread) {
   // After any safepoint, just before going back to compiled code,
-  // we perform a card mark. This lets the compiled code omit
-  // card marks for initialization of new objects.
-  // Keep this code consistent with GraphKit::store_barrier.
+  // we inform the GC that we will be doing initializing writes to
+  // this object in the future without emitting card-marks, so
+  // GC may take any compensating steps.
+  // NOTE: Keep this code consistent with GraphKit::store_barrier.
 
   oop new_obj = thread->vm_result();
   if (new_obj == NULL) return;
 
   assert(Universe::heap()->can_elide_tlab_store_barriers(),
          "compiler must check this first");
-  new_obj = Universe::heap()->new_store_barrier(new_obj);
+  // GC may decide to give back a safer copy of new_obj.
+  new_obj = Universe::heap()->defer_store_barrier(thread, new_obj);
   thread->set_vm_result(new_obj);
 }
 
@@ -197,8 +199,8 @@
   JRT_BLOCK_END;
 
   if (GraphKit::use_ReduceInitialCardMarks()) {
-    // do them now so we don't have to do them on the fast path
-    do_eager_card_mark(thread);
+    // inform GC that we won't do card marks for initializing writes.
+    maybe_defer_card_mark(thread);
   }
 JRT_END
 
@@ -236,8 +238,8 @@
   JRT_BLOCK_END;
 
   if (GraphKit::use_ReduceInitialCardMarks()) {
-    // do them now so we don't have to do them on the fast path
-    do_eager_card_mark(thread);
+    // inform GC that we won't do card marks for initializing writes.
+    maybe_defer_card_mark(thread);
   }
 JRT_END
 
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/opto/runtime.hpp
--- a/src/share/vm/opto/runtime.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/opto/runtime.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -133,8 +133,8 @@
   // Allocate storage for a objArray or typeArray
   static void new_array_C(klassOopDesc* array_klass, int len, JavaThread *thread);
 
-  // Post-allocation step for implementing ReduceInitialCardMarks:
-  static void do_eager_card_mark(JavaThread* thread);
+  // Post-slow-path-allocation step for implementing ReduceInitialCardMarks:
+  static void maybe_defer_card_mark(JavaThread* thread);
 
   // Allocate storage for a multi-dimensional arrays
   // Note: needs to be fixed for arbitrary number of dimensions
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/runtime/thread.cpp
--- a/src/share/vm/runtime/thread.cpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/runtime/thread.cpp Fri Oct 16 02:05:46 2009 -0700
@@ -1212,6 +1212,7 @@
 {
   initialize();
   _is_attaching = is_attaching;
+  assert(_deferred_card_mark.is_empty(), "Default MemRegion ctor");
 }
 
 bool JavaThread::reguard_stack(address cur_sp) {
@@ -2317,6 +2318,10 @@
 
 
 void JavaThread::oops_do(OopClosure* f) {
+  // Flush deferred store-barriers, if any, associated with
+  // initializing stores done by this JavaThread in the current epoch.
+  Universe::heap()->flush_deferred_store_barrier(this);
+
   // The ThreadProfiler oops_do is done from FlatProfiler::oops_do
   // since there may be more than one thread using each ThreadProfiler.
 
diff -r 6bdfda9a7120 -r aa001a20bd61 src/share/vm/runtime/thread.hpp
--- a/src/share/vm/runtime/thread.hpp Fri Oct 02 14:16:39 2009 -0700
+++ b/src/share/vm/runtime/thread.hpp Fri Oct 16 02:05:46 2009 -0700
@@ -683,8 +683,13 @@
   methodOop     _callee_target;
 
   // Oop results of VM runtime calls
-  oop           _vm_result;    // Used to pass back an oop result into Java code, GC-preserved
-  oop           _vm_result_2;  // Used to pass back an oop result into Java code, GC-preserved
+  oop           _vm_result;           // Used to pass back an oop result into Java code, GC-preserved
+  oop           _vm_result_2;         // Used to pass back an oop result into Java code, GC-preserved
+
+  // See ReduceInitialCardMarks: this holds the precise space interval of
+  // the most recent slow path allocation for which compiled code has
+  // elided card-marks for performance along the fast-path.
+  MemRegion     _deferred_card_mark;
 
   MonitorChunk* _monitor_chunks;      // Contains the off stack monitors
                                       // allocated during deoptimization
@@ -1090,6 +1095,9 @@
 
   oop  vm_result_2() const             { return _vm_result_2; }
   void set_vm_result_2 (oop x)         { _vm_result_2 = x; }
+  MemRegion deferred_card_mark() const { return _deferred_card_mark; }
+  void set_deferred_card_mark(MemRegion mr) { _deferred_card_mark = mr; }
+
   // Exception handling for compiled methods
   oop exception_oop() const            { return _exception_oop; }
   int exception_stack_size() const     { return _exception_stack_size; }
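
A minimal, standalone C++ sketch of the deferral protocol that the collectedHeap.cpp comments above describe: each thread remembers, in a MemRegion, the single most recent slow-path allocation whose initializing card-marks were elided, and that region is card-marked at the next slow-path allocation by the thread or at the next safepoint, strictly after the initializing stores. The Thread, Heap, CardTable and MemRegion types below are simplified stand-ins rather than the HotSpot classes, and the young-gen test is a hypothetical placeholder that treats every object as old so a deferral is always recorded.

// Standalone illustration of the deferred card-mark protocol; not HotSpot code.
#include <cstddef>
#include <cstdio>

class MemRegion {
 public:
  MemRegion() : _start(NULL), _size(0) {}
  MemRegion(const char* start, std::size_t size) : _start(start), _size(size) {}
  bool is_empty() const { return _size == 0; }
  const char* start() const { return _start; }
  std::size_t size() const { return _size; }
 private:
  const char* _start;
  std::size_t _size;
};

class CardTable {
 public:
  // Stand-in for BarrierSet::write_region(): dirty all cards covering mr.
  void write_region(const MemRegion& mr) {
    std::printf("card-marking %zu bytes at %p\n", mr.size(), (const void*)mr.start());
  }
};

struct Thread {
  MemRegion deferred_card_mark;   // analogue of JavaThread::_deferred_card_mark
};

class Heap {
 public:
  // Placeholder for the constant-time young-gen test the comments require.
  bool is_in_young(const char* /*obj*/) const { return false; }

  // Analogue of CollectedHeap::flush_deferred_store_barrier(): card-mark the
  // remembered interval, if any. By the time this runs the object is fully
  // initialized, so the mark strictly follows the initializing stores, which
  // is the property CMS precleaning and G1 concurrent refinement need.
  void flush_deferred_store_barrier(Thread* t) {
    if (!t->deferred_card_mark.is_empty()) {
      _ct.write_region(t->deferred_card_mark);
      t->deferred_card_mark = MemRegion();   // clear the deferral
    }
  }

  // Analogue of CollectedHeap::defer_store_barrier(): called on each
  // slow-path allocation whose initializing card-marks the compiler elided.
  char* defer_store_barrier(Thread* t, char* new_obj, std::size_t size) {
    flush_deferred_store_barrier(t);         // flush any earlier deferral first
    if (!is_in_young(new_obj)) {
      // Non-young storage: remember it for a later, post-initialization mark.
      t->deferred_card_mark = MemRegion(new_obj, size);
    }
    return new_obj;
  }

 private:
  CardTable _ct;
};

int main() {
  Heap heap;
  Thread thread;
  static char obj_a[32], obj_b[32];

  heap.defer_store_barrier(&thread, obj_a, sizeof(obj_a));
  // ... compiled code initializes obj_a without emitting card marks ...
  heap.defer_store_barrier(&thread, obj_b, sizeof(obj_b));  // flushes obj_a's mark
  // ... the next safepoint (cf. JavaThread::oops_do above) flushes obj_b's mark ...
  heap.flush_deferred_store_barrier(&thread);
  return 0;
}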