changeset 9661:ad916ca3715b

8040162: Avoid reallocating PLABs between GC phases in G1
Summary: Allocate G1ParScanThreadState at the start of GC and only deallocate it at the end of GC. This automatically keeps the current PLAB intact, instead of flushing it (and losing lots of memory) on every reallocation.
Reviewed-by: david, mgerdin
author tschatzl
date Thu, 20 Aug 2015 15:17:43 +0200
parents 2bcdd5016317
children 57093b085a8f
files src/share/vm/gc/g1/g1CollectedHeap.cpp src/share/vm/gc/g1/g1CollectedHeap.hpp src/share/vm/gc/g1/g1CollectedHeap_ext.cpp src/share/vm/gc/g1/g1OopClosures.cpp src/share/vm/gc/g1/g1OopClosures.hpp src/share/vm/gc/g1/g1ParScanThreadState.cpp src/share/vm/gc/g1/g1ParScanThreadState.hpp
diffstat 7 files changed, 199 insertions(+), 148 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Aug 20 15:17:43 2015 +0200
@@ -4382,6 +4382,13 @@
 }
 
 class G1ParEvacuateFollowersClosure : public VoidClosure {
+private:
+  double _start_term;
+  double _term_time;
+  size_t _term_attempts;
+
+  void start_term_time() { _term_attempts++; _start_term = os::elapsedTime(); }
+  void end_term_time() { _term_time += os::elapsedTime() - _start_term; }
 protected:
   G1CollectedHeap*              _g1h;
   G1ParScanThreadState*         _par_scan_state;
@@ -4398,19 +4405,23 @@
                                 RefToScanQueueSet* queues,
                                 ParallelTaskTerminator* terminator)
     : _g1h(g1h), _par_scan_state(par_scan_state),
-      _queues(queues), _terminator(terminator) {}
+      _queues(queues), _terminator(terminator),
+      _start_term(0.0), _term_time(0.0), _term_attempts(0) {}
 
   void do_void();
 
+  double term_time() const { return _term_time; }
+  size_t term_attempts() const { return _term_attempts; }
+
 private:
   inline bool offer_termination();
 };
 
 bool G1ParEvacuateFollowersClosure::offer_termination() {
   G1ParScanThreadState* const pss = par_scan_state();
-  pss->start_term_time();
+  start_term_time();
   const bool res = terminator()->offer_termination();
-  pss->end_term_time();
+  end_term_time();
   return res;
 }
 
@@ -4451,15 +4462,17 @@
 class G1ParTask : public AbstractGangTask {
 protected:
   G1CollectedHeap*       _g1h;
-  RefToScanQueueSet      *_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet*     _queues;
   G1RootProcessor*       _root_processor;
   ParallelTaskTerminator _terminator;
   uint _n_workers;
 
 public:
-  G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
+  G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
     : AbstractGangTask("G1 collection"),
       _g1h(g1h),
+      _pss(per_thread_states),
       _queues(task_queues),
       _root_processor(root_processor),
       _terminator(n_workers, _queues),
@@ -4506,7 +4519,8 @@
   void work(uint worker_id) {
     if (worker_id >= _n_workers) return;  // no work needed this round
 
-    _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::GCWorkerStart, worker_id, os::elapsedTime());
+    double start_sec = os::elapsedTime();
+    _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::GCWorkerStart, worker_id, start_sec);
 
     {
       ResourceMark rm;
@@ -4514,23 +4528,24 @@
 
       ReferenceProcessor*             rp = _g1h->ref_processor_stw();
 
-      G1ParScanThreadState            pss(_g1h, worker_id, rp);
+      G1ParScanThreadState*           pss = _pss[worker_id];
+      pss->set_ref_processor(rp);
 
       bool only_young = _g1h->collector_state()->gcs_are_young();
 
       // Non-IM young GC.
-      G1ParCopyClosure<G1BarrierNone, G1MarkNone>             scan_only_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkNone>             scan_only_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkNone>                                scan_only_cld_cl(&scan_only_root_cl,
                                                                                only_young, // Only process dirty klasses.
                                                                                false);     // No need to claim CLDs.
       // IM young GC.
       //    Strong roots closures.
-      G1ParCopyClosure<G1BarrierNone, G1MarkFromRoot>         scan_mark_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkFromRoot>         scan_mark_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkFromRoot>                            scan_mark_cld_cl(&scan_mark_root_cl,
                                                                                false, // Process all klasses.
                                                                                true); // Need to claim CLDs.
       //    Weak roots closures.
-      G1ParCopyClosure<G1BarrierNone, G1MarkPromotedFromRoot> scan_mark_weak_root_cl(_g1h, &pss, rp);
+      G1ParCopyClosure<G1BarrierNone, G1MarkPromotedFromRoot> scan_mark_weak_root_cl(_g1h, pss, rp);
       G1CLDClosure<G1MarkPromotedFromRoot>                    scan_mark_weak_cld_cl(&scan_mark_weak_root_cl,
                                                                                     false, // Process all klasses.
                                                                                     true); // Need to claim CLDs.
@@ -4561,8 +4576,7 @@
         weak_cld_cl    = &scan_only_cld_cl;
       }
 
-      pss.start_strong_roots();
-
+      double start_strong_roots_sec = os::elapsedTime();
       _root_processor->evacuate_roots(strong_root_cl,
                                       weak_root_cl,
                                       strong_cld_cl,
@@ -4570,32 +4584,49 @@
                                       trace_metadata,
                                       worker_id);
 
-      G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss);
+      G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, pss);
       _root_processor->scan_remembered_sets(&push_heap_rs_cl,
                                             weak_root_cl,
                                             worker_id);
-      pss.end_strong_roots();
-
+      double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec;
+
+      double term_sec = 0.0;
+      size_t evac_term_attempts = 0;
       {
         double start = os::elapsedTime();
-        G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, &_terminator);
+        G1ParEvacuateFollowersClosure evac(_g1h, pss, _queues, &_terminator);
         evac.do_void();
+
+        evac_term_attempts = evac.term_attempts();
+        term_sec = evac.term_time();
         double elapsed_sec = os::elapsedTime() - start;
-        double term_sec = pss.term_time();
         _g1h->g1_policy()->phase_times()->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec);
         _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec);
-        _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, pss.term_attempts());
+        _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts);
       }
-      _g1h->g1_policy()->record_thread_age_table(pss.age_table());
-      _g1h->update_surviving_young_words(pss.surviving_young_words()+1);
+
+      // Flush any statistics.
+      _g1h->g1_policy()->record_thread_age_table(pss->age_table());
+      _g1h->update_surviving_young_words(pss->surviving_young_words());
+
+      assert(pss->queue_is_empty(), "should be empty");
 
       if (PrintTerminationStats) {
         MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
-        pss.print_termination_stats();
+        size_t lab_waste;
+        size_t lab_undo_waste;
+        pss->waste(lab_waste, lab_undo_waste);
+        _g1h->print_termination_stats(gclog_or_tty,
+                                      worker_id,
+                                      (os::elapsedTime() - start_sec) * 1000.0,   /* elapsed time */
+                                      strong_roots_sec * 1000.0,                  /* strong roots time */
+                                      term_sec * 1000.0,                          /* evac term time */
+                                      evac_term_attempts,                         /* evac term attempts */
+                                      lab_waste,                                  /* alloc buffer waste */
+                                      lab_undo_waste                              /* undo waste */
+                                      );
       }
 
-      assert(pss.queue_is_empty(), "should be empty");
-
       // Close the inner scope so that the ResourceMark and HandleMark
       // destructors are executed here and are included as part of the
       // "GC Worker Time".
@@ -4604,6 +4635,31 @@
   }
 };
 
+void G1CollectedHeap::print_termination_stats_hdr(outputStream* const st) {
+  st->print_raw_cr("GC Termination Stats");
+  st->print_raw_cr("     elapsed  --strong roots-- -------termination------- ------waste (KiB)------");
+  st->print_raw_cr("thr     ms        ms      %        ms      %    attempts  total   alloc    undo");
+  st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------");
+}
+
+void G1CollectedHeap::print_termination_stats(outputStream* const st,
+                                              uint worker_id,
+                                              double elapsed_ms,
+                                              double strong_roots_ms,
+                                              double term_ms,
+                                              size_t term_attempts,
+                                              size_t alloc_buffer_waste,
+                                              size_t undo_waste) const {
+  st->print_cr("%3d %9.2f %9.2f %6.2f "
+               "%9.2f %6.2f " SIZE_FORMAT_W(8) " "
+               SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7),
+               worker_id, elapsed_ms, strong_roots_ms, strong_roots_ms * 100 / elapsed_ms,
+               term_ms, term_ms * 100 / elapsed_ms, term_attempts,
+               (alloc_buffer_waste + undo_waste) * HeapWordSize / K,
+               alloc_buffer_waste * HeapWordSize / K,
+               undo_waste * HeapWordSize / K);
+}
+
 class G1StringSymbolTableUnlinkTask : public AbstractGangTask {
 private:
   BoolObjectClosure* _is_alive;
@@ -5132,17 +5188,20 @@
 
 class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor {
 private:
-  G1CollectedHeap*   _g1h;
-  RefToScanQueueSet* _queues;
-  WorkGang*          _workers;
-  uint               _active_workers;
+  G1CollectedHeap*        _g1h;
+  G1ParScanThreadState**  _pss;
+  RefToScanQueueSet*      _queues;
+  WorkGang*               _workers;
+  uint                    _active_workers;
 
 public:
   G1STWRefProcTaskExecutor(G1CollectedHeap* g1h,
+                           G1ParScanThreadState** per_thread_states,
                            WorkGang* workers,
                            RefToScanQueueSet *task_queues,
                            uint n_workers) :
     _g1h(g1h),
+    _pss(per_thread_states),
     _queues(task_queues),
     _workers(workers),
     _active_workers(n_workers)
@@ -5161,17 +5220,20 @@
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   ProcessTask&     _proc_task;
   G1CollectedHeap* _g1h;
-  RefToScanQueueSet *_task_queues;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet* _task_queues;
   ParallelTaskTerminator* _terminator;
 
 public:
   G1STWRefProcTaskProxy(ProcessTask& proc_task,
-                     G1CollectedHeap* g1h,
-                     RefToScanQueueSet *task_queues,
-                     ParallelTaskTerminator* terminator) :
+                        G1CollectedHeap* g1h,
+                        G1ParScanThreadState** per_thread_states,
+                        RefToScanQueueSet *task_queues,
+                        ParallelTaskTerminator* terminator) :
     AbstractGangTask("Process reference objects in parallel"),
     _proc_task(proc_task),
     _g1h(g1h),
+    _pss(per_thread_states),
     _task_queues(task_queues),
     _terminator(terminator)
   {}
@@ -5183,11 +5245,12 @@
 
     G1STWIsAliveClosure is_alive(_g1h);
 
-    G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-
-    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
-
-    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanThreadState*           pss = _pss[worker_id];
+    pss->set_ref_processor(NULL);
+
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, pss, NULL);
+
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL);
 
     OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5197,10 +5260,10 @@
     }
 
     // Keep alive closure.
-    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss);
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss);
 
     // Complete GC closure
-    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator);
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _task_queues, _terminator);
 
     // Call the reference processing task's work routine.
     _proc_task.work(worker_id, is_alive, keep_alive, drain_queue);
@@ -5219,7 +5282,7 @@
   assert(_workers != NULL, "Need parallel worker threads.");
 
   ParallelTaskTerminator terminator(_active_workers, _queues);
-  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator);
+  G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _pss, _queues, &terminator);
 
   _workers->run_task(&proc_task_proxy);
 }
@@ -5261,15 +5324,17 @@
 
 class G1ParPreserveCMReferentsTask: public AbstractGangTask {
 protected:
-  G1CollectedHeap* _g1h;
-  RefToScanQueueSet      *_queues;
+  G1CollectedHeap*       _g1h;
+  G1ParScanThreadState** _pss;
+  RefToScanQueueSet*     _queues;
   ParallelTaskTerminator _terminator;
   uint _n_workers;
 
 public:
-  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, uint workers, RefToScanQueueSet *task_queues) :
+  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, int workers, RefToScanQueueSet *task_queues) :
     AbstractGangTask("ParPreserveCMReferents"),
     _g1h(g1h),
+    _pss(per_thread_states),
     _queues(task_queues),
     _terminator(workers, _queues),
     _n_workers(workers)
@@ -5279,12 +5344,13 @@
     ResourceMark rm;
     HandleMark   hm;
 
-    G1ParScanThreadState            pss(_g1h, worker_id, NULL);
-    assert(pss.queue_is_empty(), "both queue and overflow should be empty");
-
-    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, &pss, NULL);
-
-    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL);
+    G1ParScanThreadState*          pss = _pss[worker_id];
+    pss->set_ref_processor(NULL);
+    assert(pss->queue_is_empty(), "both queue and overflow should be empty");
+
+    G1ParScanExtRootClosure        only_copy_non_heap_cl(_g1h, pss, NULL);
+
+    G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL);
 
     OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5298,7 +5364,7 @@
 
     // Copying keep alive closure. Applied to referent objects that need
     // to be copied.
-    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss);
+    G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss);
 
     ReferenceProcessor* rp = _g1h->ref_processor_cm();
 
@@ -5331,15 +5397,15 @@
     }
 
     // Drain the queue - which may cause stealing
-    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _queues, &_terminator);
+    G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _queues, &_terminator);
     drain_queue.do_void();
     // Allocation buffers were retired at the end of G1ParEvacuateFollowersClosure
-    assert(pss.queue_is_empty(), "should be");
+    assert(pss->queue_is_empty(), "should be");
   }
 };
 
 // Weak Reference processing during an evacuation pause (part 1).
-void G1CollectedHeap::process_discovered_references() {
+void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_thread_states) {
   double ref_proc_start = os::elapsedTime();
 
   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5369,6 +5435,7 @@
   uint no_of_gc_workers = workers()->active_workers();
 
   G1ParPreserveCMReferentsTask keep_cm_referents(this,
+                                                 per_thread_states,
                                                  no_of_gc_workers,
                                                  _task_queues);
 
@@ -5383,16 +5450,17 @@
   // JNI refs.
 
   // Use only a single queue for this PSS.
-  G1ParScanThreadState            pss(this, 0, NULL);
-  assert(pss.queue_is_empty(), "pre-condition");
+  G1ParScanThreadState*           pss = per_thread_states[0];
+  pss->set_ref_processor(NULL);
+  assert(pss->queue_is_empty(), "pre-condition");
 
   // We do not embed a reference processor in the copying/scanning
   // closures while we're actually processing the discovered
   // reference objects.
 
-  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, &pss, NULL);
-
-  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, &pss, NULL);
+  G1ParScanExtRootClosure        only_copy_non_heap_cl(this, pss, NULL);
+
+  G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, pss, NULL);
 
   OopClosure*                    copy_non_heap_cl = &only_copy_non_heap_cl;
 
@@ -5402,10 +5470,10 @@
   }
 
   // Keep alive closure.
-  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, &pss);
+  G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, pss);
 
   // Serial Complete GC closure
-  G1STWDrainQueueClosure drain_queue(this, &pss);
+  G1STWDrainQueueClosure drain_queue(this, pss);
 
   // Setup the soft refs policy...
   rp->setup_policy(false);
@@ -5424,7 +5492,7 @@
     assert(rp->num_q() == no_of_gc_workers, "sanity");
     assert(no_of_gc_workers <= rp->max_num_q(), "sanity");
 
-    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, no_of_gc_workers);
+    G1STWRefProcTaskExecutor par_task_executor(this, per_thread_states, workers(), _task_queues, no_of_gc_workers);
     stats = rp->process_discovered_references(&is_alive,
                                               &keep_alive,
                                               &drain_queue,
@@ -5436,14 +5504,14 @@
   _gc_tracer_stw->report_gc_reference_stats(stats);
 
   // We have completed copying any necessary live referent objects.
-  assert(pss.queue_is_empty(), "both queue and overflow should be empty");
+  assert(pss->queue_is_empty(), "both queue and overflow should be empty");
 
   double ref_proc_time = os::elapsedTime() - ref_proc_start;
   g1_policy()->phase_times()->record_ref_proc_time(ref_proc_time * 1000.0);
 }
 
 // Weak Reference processing during an evacuation pause (part 2).
-void G1CollectedHeap::enqueue_discovered_references() {
+void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_thread_states) {
   double ref_enq_start = os::elapsedTime();
 
   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5462,7 +5530,7 @@
     assert(rp->num_q() == n_workers, "sanity");
     assert(n_workers <= rp->max_num_q(), "sanity");
 
-    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, n_workers);
+    G1STWRefProcTaskExecutor par_task_executor(this, per_thread_states, workers(), _task_queues, n_workers);
     rp->enqueue_discovered_references(&par_task_executor);
   }
 
@@ -5498,9 +5566,14 @@
   double start_par_time_sec = os::elapsedTime();
   double end_par_time_sec;
 
+  G1ParScanThreadState** per_thread_states = NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC);
+  for (uint i = 0; i < n_workers; i++) {
+    per_thread_states[i] = new_par_scan_state(i);
+  }
+
   {
     G1RootProcessor root_processor(this, n_workers);
-    G1ParTask g1_par_task(this, _task_queues, &root_processor, n_workers);
+    G1ParTask g1_par_task(this, per_thread_states, _task_queues, &root_processor, n_workers);
     // InitialMark needs claim bits to keep track of the marked-through CLDs.
     if (collector_state()->during_initial_mark_pause()) {
       ClassLoaderDataGraph::clear_claimed_marks();
@@ -5508,7 +5581,7 @@
 
     // The individual threads will set their evac-failure closures.
     if (PrintTerminationStats) {
-      G1ParScanThreadState::print_termination_stats_hdr();
+      print_termination_stats_hdr(gclog_or_tty);
     }
 
     workers()->run_task(&g1_par_task);
@@ -5535,7 +5608,7 @@
   // as we may have to copy some 'reachable' referent
   // objects (and their reachable sub-graphs) that were
   // not copied during the pause.
-  process_discovered_references();
+  process_discovered_references(per_thread_states);
 
   if (G1StringDedup::is_enabled()) {
     double fixup_start = os::elapsedTime();
@@ -5551,6 +5624,12 @@
   _allocator->release_gc_alloc_regions(evacuation_info);
   g1_rem_set()->cleanup_after_oops_into_collection_set_do();
 
+  for (uint i = 0; i < n_workers; i++) {
+    G1ParScanThreadState* pss = per_thread_states[i];
+    delete pss;
+  }
+  FREE_C_HEAP_ARRAY(G1ParScanThreadState*, per_thread_states);
+
   record_obj_copy_mem_stats();
 
   // Reset and re-enable the hot card cache.
@@ -5577,7 +5656,7 @@
   // will log these updates (and dirty their associated
   // cards). We need these updates logged to update any
   // RSets.
-  enqueue_discovered_references();
+  enqueue_discovered_references(per_thread_states);
 
   redirty_logged_cards();
   COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
--- a/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Aug 20 15:17:43 2015 +0200
@@ -55,6 +55,7 @@
 class HRRSCleanupTask;
 class GenerationSpec;
 class OopsInHeapRegionClosure;
+class G1ParScanThreadState;
 class G1KlassScanClosure;
 class G1ParScanThreadState;
 class ObjectClosure;
@@ -583,11 +584,11 @@
 
   // Process any reference objects discovered during
   // an incremental evacuation pause.
-  void process_discovered_references();
+  void process_discovered_references(G1ParScanThreadState** per_thread_states);
 
   // Enqueue any remaining discovered references
   // after processing.
-  void enqueue_discovered_references();
+  void enqueue_discovered_references(G1ParScanThreadState** per_thread_states);
 
 public:
   WorkGang* workers() const { return _workers; }
@@ -682,6 +683,9 @@
   // Allocates a new heap region instance.
   HeapRegion* new_heap_region(uint hrs_index, MemRegion mr);
 
+  // Allocates a new per thread par scan state for the given thread id.
+  G1ParScanThreadState* new_par_scan_state(uint worker_id);
+
   // Allocate the highest free region in the reserved heap. This will commit
   // regions as necessary.
   HeapRegion* alloc_highest_free_region();
@@ -791,6 +795,17 @@
   // Actually do the work of evacuating the collection set.
   void evacuate_collection_set(EvacuationInfo& evacuation_info);
 
+  // Print the header for the per-thread termination statistics.
+  static void print_termination_stats_hdr(outputStream* const st);
+  // Print actual per-thread termination statistics.
+  void print_termination_stats(outputStream* const st,
+                               uint worker_id,
+                               double elapsed_ms,
+                               double strong_roots_ms,
+                               double term_ms,
+                               size_t term_attempts,
+                               size_t alloc_buffer_waste,
+                               size_t undo_waste) const;
   // Update object copying statistics.
   void record_obj_copy_mem_stats();
 
--- a/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp	Thu Aug 20 15:17:43 2015 +0200
@@ -24,6 +24,7 @@
 
 #include "precompiled.hpp"
 #include "gc/g1/g1CollectedHeap.hpp"
+#include "gc/g1/g1ParScanThreadState.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 
 bool G1CollectedHeap::copy_allocation_context_stats(const jint* contexts,
@@ -37,3 +38,7 @@
                                              MemRegion mr) {
   return new HeapRegion(hrs_index, bot_shared(), mr);
 }
+
+G1ParScanThreadState* G1CollectedHeap::new_par_scan_state(uint worker_id) {
+  return new G1ParScanThreadState(this, worker_id);
+}
--- a/src/share/vm/gc/g1/g1OopClosures.cpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1OopClosures.cpp	Thu Aug 20 15:17:43 2015 +0200
@@ -32,7 +32,11 @@
 
 G1ParCopyHelper::G1ParCopyHelper(G1CollectedHeap* g1,  G1ParScanThreadState* par_scan_state) :
   G1ParClosureSuper(g1, par_scan_state), _scanned_klass(NULL),
-  _cm(_g1->concurrent_mark()) {}
+  _cm(_g1->concurrent_mark()) { }
+
+G1ParCopyHelper::G1ParCopyHelper(G1CollectedHeap* g1) :
+  G1ParClosureSuper(g1), _scanned_klass(NULL),
+  _cm(_g1->concurrent_mark()) { }
 
 G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1) :
   _g1(g1), _par_scan_state(NULL), _worker_id(UINT_MAX) { }
--- a/src/share/vm/gc/g1/g1OopClosures.hpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1OopClosures.hpp	Thu Aug 20 15:17:43 2015 +0200
@@ -76,15 +76,13 @@
 
 class G1ParScanClosure : public G1ParClosureSuper {
 public:
-  G1ParScanClosure(G1CollectedHeap* g1, ReferenceProcessor* rp) :
-    G1ParClosureSuper(g1) {
-    assert(_ref_processor == NULL, "sanity");
-    _ref_processor = rp;
-  }
+  G1ParScanClosure(G1CollectedHeap* g1) : G1ParClosureSuper(g1) { }
 
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(oop* p)          { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p)    { do_oop_nv(p); }
+
+  void set_ref_processor(ReferenceProcessor* ref_processor) { _ref_processor = ref_processor; }
 };
 
 // Add back base class for metadata
@@ -104,6 +102,7 @@
   void mark_forwarded_object(oop from_obj, oop to_obj);
  public:
   G1ParCopyHelper(G1CollectedHeap* g1,  G1ParScanThreadState* par_scan_state);
+  G1ParCopyHelper(G1CollectedHeap* g1);
 
   void set_scanned_klass(Klass* k) { _scanned_klass = k; }
   template <class T> void do_klass_barrier(T* p, oop new_obj);
@@ -132,6 +131,10 @@
     assert(_ref_processor == NULL, "sanity");
   }
 
+  G1ParCopyClosure(G1CollectedHeap* g1) : G1ParCopyHelper(g1) {
+    assert(_ref_processor == NULL, "sanity");
+  }
+
   template <class T> void do_oop_nv(T* p) { do_oop_work(p); }
   virtual void do_oop(oop* p)       { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
--- a/src/share/vm/gc/g1/g1ParScanThreadState.cpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1ParScanThreadState.cpp	Thu Aug 20 15:17:43 2015 +0200
@@ -32,17 +32,17 @@
 #include "oops/oop.inline.hpp"
 #include "runtime/prefetch.inline.hpp"
 
-G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, ReferenceProcessor* rp)
+G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id)
   : _g1h(g1h),
     _refs(g1h->task_queue(worker_id)),
     _dcq(&g1h->dirty_card_queue_set()),
     _ct_bs(g1h->g1_barrier_set()),
     _g1_rem(g1h->g1_rem_set()),
-    _hash_seed(17), _worker_id(worker_id),
-    _term_attempts(0),
+    _hash_seed(17),
+    _worker_id(worker_id),
     _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()),
-    _age_table(false), _scanner(g1h, rp),
-    _strong_roots_time(0), _term_time(0),
+    _age_table(false),
+    _scanner(g1h),
     _old_gen_is_full(false)
 {
   _scanner.set_par_scan_thread_state(this);
@@ -69,8 +69,6 @@
   // need to be moved to the next space.
   _dest[InCSetState::Young]        = InCSetState::Old;
   _dest[InCSetState::Old]          = InCSetState::Old;
-
-  _start = os::elapsedTime();
 }
 
 G1ParScanThreadState::~G1ParScanThreadState() {
@@ -79,28 +77,8 @@
   FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base);
 }
 
-void G1ParScanThreadState::print_termination_stats_hdr(outputStream* const st) {
-  st->print_raw_cr("GC Termination Stats");
-  st->print_raw_cr("     elapsed  --strong roots-- -------termination------- ------waste (KiB)------");
-  st->print_raw_cr("thr     ms        ms      %        ms      %    attempts  total   alloc    undo");
-  st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------");
-}
-
-void G1ParScanThreadState::print_termination_stats(outputStream* const st) const {
-  const double elapsed_ms = elapsed_time() * 1000.0;
-  const double s_roots_ms = strong_roots_time() * 1000.0;
-  const double term_ms    = term_time() * 1000.0;
-  size_t alloc_buffer_waste = 0;
-  size_t undo_waste = 0;
-  _plab_allocator->waste(alloc_buffer_waste, undo_waste);
-  st->print_cr("%3u %9.2f %9.2f %6.2f "
-               "%9.2f %6.2f " SIZE_FORMAT_W(8) " "
-               SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7),
-               _worker_id, elapsed_ms, s_roots_ms, s_roots_ms * 100 / elapsed_ms,
-               term_ms, term_ms * 100 / elapsed_ms, term_attempts(),
-               (alloc_buffer_waste + undo_waste) * HeapWordSize / K,
-               alloc_buffer_waste * HeapWordSize / K,
-               undo_waste * HeapWordSize / K);
+void G1ParScanThreadState::waste(size_t& wasted, size_t& undo_wasted) {
+  _plab_allocator->waste(wasted, undo_wasted);
 }
 
 #ifdef ASSERT
@@ -292,8 +270,7 @@
                                              obj);
     }
 
-    size_t* const surv_young_words = surviving_young_words();
-    surv_young_words[young_index] += word_sz;
+    _surviving_young_words[young_index] += word_sz;
 
     if (obj->is_objArray() && arrayOop(obj)->length() >= ParGCArrayScanChunk) {
       // We keep track of the next start index in the length field of
--- a/src/share/vm/gc/g1/g1ParScanThreadState.hpp	Thu Aug 20 15:17:41 2015 +0200
+++ b/src/share/vm/gc/g1/g1ParScanThreadState.hpp	Thu Aug 20 15:17:43 2015 +0200
@@ -39,13 +39,13 @@
 class HeapRegion;
 class outputStream;
 
-class G1ParScanThreadState : public StackObj {
+class G1ParScanThreadState : public CHeapObj<mtGC> {
  private:
   G1CollectedHeap* _g1h;
   RefToScanQueue*  _refs;
   DirtyCardQueue   _dcq;
   G1SATBCardTableModRefBS* _ct_bs;
-  G1RemSet* _g1_rem;
+  G1RemSet*         _g1_rem;
 
   G1PLABAllocator*  _plab_allocator;
 
@@ -58,14 +58,6 @@
   int  _hash_seed;
   uint _worker_id;
 
-  size_t _term_attempts;
-
-  double _start;
-  double _start_strong_roots;
-  double _strong_roots_time;
-  double _start_term;
-  double _term_time;
-
   // Map from young-age-index (0 == not young, 1 is youngest) to
   // surviving words. base is what we get back from the malloc call
   size_t* _surviving_young_words_base;
@@ -90,9 +82,11 @@
   }
 
  public:
-  G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, ReferenceProcessor* rp);
+  G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id);
   ~G1ParScanThreadState();
 
+  void set_ref_processor(ReferenceProcessor* rp) { _scanner.set_ref_processor(rp); }
+
   ageTable*         age_table()       { return &_age_table;       }
 
 #ifdef ASSERT
@@ -119,40 +113,14 @@
 
   uint worker_id() { return _worker_id; }
 
-  size_t term_attempts() const  { return _term_attempts; }
-  void note_term_attempt() { _term_attempts++; }
-
-  void start_strong_roots() {
-    _start_strong_roots = os::elapsedTime();
-  }
-  void end_strong_roots() {
-    _strong_roots_time += (os::elapsedTime() - _start_strong_roots);
-  }
-  double strong_roots_time() const { return _strong_roots_time; }
-
-  void start_term_time() {
-    note_term_attempt();
-    _start_term = os::elapsedTime();
-  }
-  void end_term_time() {
-    _term_time += (os::elapsedTime() - _start_term);
-  }
-  double term_time() const { return _term_time; }
-
-  double elapsed_time() const {
-    return os::elapsedTime() - _start;
-  }
-
-  // Print the header for the per-thread termination statistics.
-  static void print_termination_stats_hdr(outputStream* const st = gclog_or_tty);
-
-  // Print actual per-thread termination statistics.
-  void print_termination_stats(outputStream* const st = gclog_or_tty) const;
+  // Returns the current amount of waste due to alignment or not being able to fit
+  // objects within LABs and the undo waste.
+  virtual void waste(size_t& wasted, size_t& undo_wasted);
 
   size_t* surviving_young_words() {
-    // We add on to hide entry 0 which accumulates surviving words for
+    // We add one to hide entry 0 which accumulates surviving words for
     // age -1 regions (i.e. non-young ones)
-    return _surviving_young_words;
+    return _surviving_young_words + 1;
   }
 
  private: