changeset 529:640db98269d8

6819891: ParNew: Fix work queue overflow code to deal correctly with +UseCompressedOops Summary: When using compressed oops, rather than chaining the overflowed grey objects' pre-images through their klass words, we use GC-worker thread-local overflow stacks. Reviewed-by: jcoomes, jmasa
author ysr
date Tue, 24 Mar 2009 18:35:17 -0700
parents 06a41dd72256
children 3564d34d36be
files src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp src/share/vm/gc_implementation/parNew/parNewGeneration.cpp src/share/vm/gc_implementation/parNew/parNewGeneration.hpp src/share/vm/runtime/globals.hpp
diffstat 4 files changed, 206 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Tue Mar 24 16:20:20 2009 -0700
+++ b/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Tue Mar 24 18:35:17 2009 -0700
@@ -3837,7 +3837,7 @@
   MutexLockerEx ml(ovflw_stk->par_lock(),
                    Mutex::_no_safepoint_check_flag);
   // Grab up to 1/4 the size of the work queue
-  size_t num = MIN2((size_t)work_q->max_elems()/4,
+  size_t num = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
                     (size_t)ParGCDesiredObjsFromOverflowList);
   num = MIN2(num, ovflw_stk->length());
   for (int i = (int) num; i > 0; i--) {
@@ -5194,13 +5194,12 @@
   NOT_PRODUCT(int num_steals = 0;)
   oop obj_to_scan;
   CMSBitMap* bm = &(_collector->_markBitMap);
-  size_t num_from_overflow_list =
-           MIN2((size_t)work_q->max_elems()/4,
-                (size_t)ParGCDesiredObjsFromOverflowList);
 
   while (true) {
     // Completely finish any left over work from (an) earlier round(s)
     cl->trim_queue(0);
+    size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
+                                         (size_t)ParGCDesiredObjsFromOverflowList);
     // Now check if there's any work in the overflow list
     if (_collector->par_take_from_overflow_list(num_from_overflow_list,
                                                 work_q)) {
@@ -5612,13 +5611,12 @@
   OopTaskQueue* work_q = work_queue(i);
   NOT_PRODUCT(int num_steals = 0;)
   oop obj_to_scan;
-  size_t num_from_overflow_list =
-           MIN2((size_t)work_q->max_elems()/4,
-                (size_t)ParGCDesiredObjsFromOverflowList);
 
   while (true) {
     // Completely finish any left over work from (an) earlier round(s)
     drain->trim_queue(0);
+    size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
+                                         (size_t)ParGCDesiredObjsFromOverflowList);
     // Now check if there's any work in the overflow list
     if (_collector->par_take_from_overflow_list(num_from_overflow_list,
                                                 work_q)) {
@@ -8908,7 +8906,7 @@
 // Transfer some number of overflown objects to usual marking
 // stack. Return true if some objects were transferred.
 bool MarkRefsIntoAndScanClosure::take_from_overflow_list() {
-  size_t num = MIN2((size_t)_mark_stack->capacity()/4,
+  size_t num = MIN2((size_t)(_mark_stack->capacity() - _mark_stack->length())/4,
                     (size_t)ParGCDesiredObjsFromOverflowList);
 
   bool res = _collector->take_from_overflow_list(num, _mark_stack);
--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Tue Mar 24 16:20:20 2009 -0700
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Tue Mar 24 18:35:17 2009 -0700
@@ -36,7 +36,7 @@
                                        ObjToScanQueueSet* work_queue_set_,
                                        size_t desired_plab_sz_,
                                        ParallelTaskTerminator& term_) :
-  _to_space(to_space_), _old_gen(old_gen_), _thread_num(thread_num_),
+  _to_space(to_space_), _old_gen(old_gen_), _young_gen(gen_), _thread_num(thread_num_),
   _work_queue(work_queue_set_->queue(thread_num_)), _to_space_full(false),
   _ageTable(false), // false ==> not the global age table, no perf data.
   _to_space_alloc_buffer(desired_plab_sz_),
@@ -57,6 +57,11 @@
   _start = os::elapsedTime();
   _old_gen_closure.set_generation(old_gen_);
   _old_gen_root_closure.set_generation(old_gen_);
+  if (UseCompressedOops) {
+    _overflow_stack = new (ResourceObj::C_HEAP) GrowableArray<oop>(512, true);
+  } else {
+    _overflow_stack = NULL;
+  }
 }
 #ifdef _MSC_VER
 #pragma warning( pop )
@@ -81,7 +86,7 @@
   assert(old->is_objArray(), "must be obj array");
   assert(old->is_forwarded(), "must be forwarded");
   assert(Universe::heap()->is_in_reserved(old), "must be in heap.");
-  assert(!_old_gen->is_in(old), "must be in young generation.");
+  assert(!old_gen()->is_in(old), "must be in young generation.");
 
   objArrayOop obj = objArrayOop(old->forwardee());
   // Process ParGCArrayScanChunk elements now
@@ -119,26 +124,68 @@
 
 void ParScanThreadState::trim_queues(int max_size) {
   ObjToScanQueue* queue = work_queue();
-  while (queue->size() > (juint)max_size) {
-    oop obj_to_scan;
-    if (queue->pop_local(obj_to_scan)) {
-      note_pop();
-
-      if ((HeapWord *)obj_to_scan < young_old_boundary()) {
-        if (obj_to_scan->is_objArray() &&
-            obj_to_scan->is_forwarded() &&
-            obj_to_scan->forwardee() != obj_to_scan) {
-          scan_partial_array_and_push_remainder(obj_to_scan);
+  do {
+    while (queue->size() > (juint)max_size) {
+      oop obj_to_scan;
+      if (queue->pop_local(obj_to_scan)) {
+        note_pop();
+        if ((HeapWord *)obj_to_scan < young_old_boundary()) {
+          if (obj_to_scan->is_objArray() &&
+              obj_to_scan->is_forwarded() &&
+              obj_to_scan->forwardee() != obj_to_scan) {
+            scan_partial_array_and_push_remainder(obj_to_scan);
+          } else {
+            // object is in to_space
+            obj_to_scan->oop_iterate(&_to_space_closure);
+          }
         } else {
-          // object is in to_space
-          obj_to_scan->oop_iterate(&_to_space_closure);
+          // object is in old generation
+          obj_to_scan->oop_iterate(&_old_gen_closure);
         }
-      } else {
-        // object is in old generation
-        obj_to_scan->oop_iterate(&_old_gen_closure);
       }
     }
+    // For the  case of compressed oops, we have a private, non-shared
+    // overflow stack, so we eagerly drain it so as to more evenly
+    // distribute load early. Note: this may be good to do in
+    // general rather than delay for the final stealing phase.
+    // If applicable, we'll transfer a set of objects over to our
+    // work queue, allowing them to be stolen and draining our
+    // private overflow stack.
+  } while (ParGCTrimOverflow && young_gen()->take_from_overflow_list(this));
+}
+
+bool ParScanThreadState::take_from_overflow_stack() {
+  assert(UseCompressedOops, "Else should not call");
+  assert(young_gen()->overflow_list() == NULL, "Error");
+  ObjToScanQueue* queue = work_queue();
+  GrowableArray<oop>* of_stack = overflow_stack();
+  uint num_overflow_elems = of_stack->length();
+  uint num_take_elems     = MIN2(MIN2((queue->max_elems() - queue->size())/4,
+                                      (juint)ParGCDesiredObjsFromOverflowList),
+                                 num_overflow_elems);
+  // Transfer the most recent num_take_elems from the overflow
+  // stack to our work queue.
+  for (size_t i = 0; i != num_take_elems; i++) {
+    oop cur = of_stack->pop();
+    oop obj_to_push = cur->forwardee();
+    assert(Universe::heap()->is_in_reserved(cur), "Should be in heap");
+    assert(!old_gen()->is_in_reserved(cur), "Should be in young gen");
+    assert(Universe::heap()->is_in_reserved(obj_to_push), "Should be in heap");
+    if (should_be_partially_scanned(obj_to_push, cur)) {
+      assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned");
+      obj_to_push = cur;
+    }
+    bool ok = queue->push(obj_to_push);
+    assert(ok, "Should have succeeded");
   }
+  assert(young_gen()->overflow_list() == NULL, "Error");
+  return num_take_elems > 0;  // was something transferred?
+}
+
+void ParScanThreadState::push_on_overflow_stack(oop p) {
+  assert(UseCompressedOops, "Else should not call");
+  overflow_stack()->push(p);
+  assert(young_gen()->overflow_list() == NULL, "Error");
 }
 
 HeapWord* ParScanThreadState::alloc_in_to_space_slow(size_t word_sz) {
@@ -456,6 +503,7 @@
   _is_alive_closure(this),
   _plab_stats(YoungPLABSize, PLABWeight)
 {
+  NOT_PRODUCT(_overflow_counter = ParGCWorkQueueOverflowInterval;)
   _task_queues = new ObjToScanQueueSet(ParallelGCThreads);
   guarantee(_task_queues != NULL, "task_queues allocation failure.");
 
@@ -993,12 +1041,19 @@
              "push forwarded object");
     }
     // Push it on one of the queues of to-be-scanned objects.
-    if (!par_scan_state->work_queue()->push(obj_to_push)) {
+    bool simulate_overflow = false;
+    NOT_PRODUCT(
+      if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) {
+        // simulate a stack overflow
+        simulate_overflow = true;
+      }
+    )
+    if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) {
       // Add stats for overflow pushes.
       if (Verbose && PrintGCDetails) {
         gclog_or_tty->print("queue overflow!\n");
       }
-      push_on_overflow_list(old);
+      push_on_overflow_list(old, par_scan_state);
       par_scan_state->note_overflow_push();
     }
     par_scan_state->note_push();
@@ -1110,9 +1165,16 @@
              "push forwarded object");
     }
     // Push it on one of the queues of to-be-scanned objects.
-    if (!par_scan_state->work_queue()->push(obj_to_push)) {
+    bool simulate_overflow = false;
+    NOT_PRODUCT(
+      if (ParGCWorkQueueOverflowALot && should_simulate_overflow()) {
+        // simulate a stack overflow
+        simulate_overflow = true;
+      }
+    )
+    if (simulate_overflow || !par_scan_state->work_queue()->push(obj_to_push)) {
       // Add stats for overflow pushes.
-      push_on_overflow_list(old);
+      push_on_overflow_list(old, par_scan_state);
       par_scan_state->note_overflow_push();
     }
     par_scan_state->note_push();
@@ -1135,33 +1197,91 @@
   return forward_ptr;
 }
 
-void ParNewGeneration::push_on_overflow_list(oop from_space_obj) {
-  oop cur_overflow_list = _overflow_list;
-  // if the object has been forwarded to itself, then we cannot
-  // use the klass pointer for the linked list.  Instead we have
-  // to allocate an oopDesc in the C-Heap and use that for the linked list.
-  if (from_space_obj->forwardee() == from_space_obj) {
-    oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1);
-    listhead->forward_to(from_space_obj);
-    from_space_obj = listhead;
+#ifndef PRODUCT
+// It's OK to call this multi-threaded;  the worst thing
+// that can happen is that we'll get a bunch of closely
+// spaced simulated oveflows, but that's OK, in fact
+// probably good as it would exercise the overflow code
+// under contention.
+bool ParNewGeneration::should_simulate_overflow() {
+  if (_overflow_counter-- <= 0) { // just being defensive
+    _overflow_counter = ParGCWorkQueueOverflowInterval;
+    return true;
+  } else {
+    return false;
   }
-  while (true) {
-    from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
-    oop observed_overflow_list =
-      (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
-    if (observed_overflow_list == cur_overflow_list) break;
-    // Otherwise...
-    cur_overflow_list = observed_overflow_list;
+}
+#endif
+
+// In case we are using compressed oops, we need to be careful.
+// If the object being pushed is an object array, then its length
+// field keeps track of the "grey boundary" at which the next
+// incremental scan will be done (see ParGCArrayScanChunk).
+// When using compressed oops, this length field is kept in the
+// lower 32 bits of the erstwhile klass word and cannot be used
+// for the overflow chaining pointer (OCP below). As such the OCP
+// would itself need to be compressed into the top 32-bits in this
+// case. Unfortunately, see below, in the event that we have a
+// promotion failure, the node to be pushed on the list can be
+// outside of the Java heap, so the heap-based pointer compression
+// would not work (we would have potential aliasing between C-heap
+// and Java-heap pointers). For this reason, when using compressed
+// oops, we simply use a worker-thread-local, non-shared overflow
+// list in the form of a growable array, with a slightly different
+// overflow stack draining strategy. If/when we start using fat
+// stacks here, we can go back to using (fat) pointer chains
+// (although some performance comparisons would be useful since
+// single global lists have their own performance disadvantages
+// as we were made painfully aware not long ago, see 6786503).
+void ParNewGeneration::push_on_overflow_list(oop from_space_obj,
+                                             ParScanThreadState* par_scan_state) {
+  assert(is_in_reserved(from_space_obj), "Should be from this generation");
+  if (UseCompressedOops) {
+    // In the case of compressed oops, we use a private, not-shared
+    // overflow stack.
+    par_scan_state->push_on_overflow_stack(from_space_obj);
+  } else {
+    assert(par_scan_state->overflow_stack() == NULL, "Error");
+    oop cur_overflow_list = _overflow_list;
+    // if the object has been forwarded to itself, then we cannot
+    // use the klass pointer for the linked list.  Instead we have
+    // to allocate an oopDesc in the C-Heap and use that for the linked list.
+    if (from_space_obj->forwardee() == from_space_obj) {
+      oopDesc* listhead = NEW_C_HEAP_ARRAY(oopDesc, 1);
+      listhead->forward_to(from_space_obj);
+      from_space_obj = listhead;
+    }
+    while (true) {
+      from_space_obj->set_klass_to_list_ptr(cur_overflow_list);
+      oop observed_overflow_list =
+        (oop)Atomic::cmpxchg_ptr(from_space_obj, &_overflow_list, cur_overflow_list);
+      if (observed_overflow_list == cur_overflow_list) break;
+      // Otherwise...
+      cur_overflow_list = observed_overflow_list;
+    }
   }
 }
 
 bool
 ParNewGeneration::take_from_overflow_list(ParScanThreadState* par_scan_state) {
+  bool res;
+
+  if (UseCompressedOops) {
+    res = par_scan_state->take_from_overflow_stack();
+  } else {
+    res = take_from_overflow_list_work(par_scan_state);
+  }
+  return res;
+}
+
+bool
+ParNewGeneration::take_from_overflow_list_work(ParScanThreadState* par_scan_state) {
   ObjToScanQueue* work_q = par_scan_state->work_queue();
   // How many to take?
-  int objsFromOverflow = MIN2(work_q->max_elems()/4,
+  int objsFromOverflow = MIN2((work_q->max_elems() - work_q->size())/4,
                               (juint)ParGCDesiredObjsFromOverflowList);
 
+  assert(par_scan_state->overflow_stack() == NULL, "Error");
   if (_overflow_list == NULL) return false;
 
   // Otherwise, there was something there; try claiming the list.
@@ -1207,11 +1327,22 @@
     oop obj_to_push = cur->forwardee();
     oop next        = oop(cur->klass_or_null());
     cur->set_klass(obj_to_push->klass());
-    if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) {
-      obj_to_push = cur;
+    // This may be an array object that is self-forwarded. In that case, the list pointer
+    // space, cur, is not in the Java heap, but rather in the C-heap and should be freed.
+    if (!is_in_reserved(cur)) {
+      // Temporary: change this to an assert. This is to mitigate risk for a change
+      // that has not been tested heavily (except via non-product stress options).
+      guarantee(!Universe::heap()->is_in_reserved(cur), "Can't be elsewhere in the heap");
+      // This can become a scaling bottleneck when there is work queue overflow coincident
+      // with promotion failure.
+      oopDesc* f = cur;
+      FREE_C_HEAP_ARRAY(oopDesc, f);
+    } else if (par_scan_state->should_be_partially_scanned(obj_to_push, cur)) {
       assert(arrayOop(cur)->length() == 0, "entire array remaining to be scanned");
+      obj_to_push = cur;
     }
-    work_q->push(obj_to_push);
+    bool ok = work_q->push(obj_to_push);
+    assert(ok, "Should have succeeded");
     cur = next;
     n++;
   }
--- a/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Tue Mar 24 16:20:20 2009 -0700
+++ b/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Tue Mar 24 18:35:17 2009 -0700
@@ -55,6 +55,7 @@
   friend class ParScanThreadStateSet;
  private:
   ObjToScanQueue *_work_queue;
+  GrowableArray<oop>* _overflow_stack;
 
   ParGCAllocBuffer _to_space_alloc_buffer;
 
@@ -79,6 +80,9 @@
   Space* _to_space;
   Space* to_space() { return _to_space; }
 
+  ParNewGeneration* _young_gen;
+  ParNewGeneration* young_gen() const { return _young_gen; }
+
   Generation* _old_gen;
   Generation* old_gen() { return _old_gen; }
 
@@ -134,6 +138,11 @@
   // Decrease queue size below "max_size".
   void trim_queues(int max_size);
 
+  // Private overflow stack usage
+  GrowableArray<oop>* overflow_stack() { return _overflow_stack; }
+  bool take_from_overflow_stack();
+  void push_on_overflow_stack(oop p);
+
   // Is new_obj a candidate for scan_partial_array_and_push_remainder method.
   inline bool should_be_partially_scanned(oop new_obj, oop old_obj) const;
 
@@ -372,13 +381,21 @@
   oop copy_to_survivor_space_with_undo(ParScanThreadState* par_scan_state,
                              oop obj, size_t obj_sz, markOop m);
 
+  // in support of testing overflow code
+  NOT_PRODUCT(int _overflow_counter;)
+  NOT_PRODUCT(bool should_simulate_overflow();)
+
+  // Accessor for overflow list
+  oop overflow_list() { return _overflow_list; }
+
   // Push the given (from-space) object on the global overflow list.
-  void push_on_overflow_list(oop from_space_obj);
+  void push_on_overflow_list(oop from_space_obj, ParScanThreadState* par_scan_state);
 
   // If the global overflow list is non-empty, move some tasks from it
   // onto "work_q" (which must be empty).  No more than 1/4 of the
   // max_elems of "work_q" are moved.
   bool take_from_overflow_list(ParScanThreadState* par_scan_state);
+  bool take_from_overflow_list_work(ParScanThreadState* par_scan_state);
 
   // The task queues to be used by parallel GC threads.
   ObjToScanQueueSet* task_queues() {
--- a/src/share/vm/runtime/globals.hpp	Tue Mar 24 16:20:20 2009 -0700
+++ b/src/share/vm/runtime/globals.hpp	Tue Mar 24 18:35:17 2009 -0700
@@ -1294,6 +1294,16 @@
   product(intx, ParGCArrayScanChunk, 50,                                    \
           "Scan a subset and push remainder, if array is bigger than this") \
                                                                             \
+  product(bool, ParGCTrimOverflow, true,                                    \
+          "Eagerly trim the overflow lists (useful for UseCompressedOops")  \
+                                                                            \
+  notproduct(bool, ParGCWorkQueueOverflowALot, false,                       \
+          "Whether we should simulate work queue overflow in ParNew")       \
+                                                                            \
+  notproduct(uintx, ParGCWorkQueueOverflowInterval, 1000,                   \
+          "An `interval' counter that determines how frequently"            \
+          " we simulate overflow; a smaller number increases frequency")    \
+                                                                            \
   product(intx, ParGCDesiredObjsFromOverflowList, 20,                       \
           "The desired number of objects to claim from the overflow list")  \
                                                                             \