changeset 4639:7eabf05bddea hs24-b44

Merge
author amurillo
date Thu, 09 May 2013 06:12:11 -0700
parents e085458ffacf (current diff) 8e59680f4573 (diff)
children f8075a623349
files
diffstat 23 files changed, 793 insertions(+), 547 deletions(-)
line diff
--- a/make/hotspot_version	Wed May 08 16:51:54 2013 -0700
+++ b/make/hotspot_version	Thu May 09 06:12:11 2013 -0700
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=24
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=43
+HS_BUILD_NUMBER=44
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=7
--- a/src/cpu/sparc/vm/templateTable_sparc.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/cpu/sparc/vm/templateTable_sparc.cpp	Thu May 09 06:12:11 2013 -0700
@@ -62,6 +62,13 @@
                                 noreg /* pre_val */,
                                 tmp, true /*preserve_o_regs*/);
 
+        // G1 barrier needs uncompressed oop for region cross check.
+        Register new_val = val;
+        if (UseCompressedOops && val != G0) {
+          new_val = tmp;
+          __ mov(val, new_val);
+        }
+
         if (index == noreg ) {
           assert(Assembler::is_simm13(offset), "fix this code");
           __ store_heap_oop(val, base, offset);
@@ -78,7 +85,7 @@
               __ add(base, index, base);
             }
           }
-          __ g1_write_barrier_post(base, val, tmp);
+          __ g1_write_barrier_post(base, new_val, tmp);
         }
       }
       break;
--- a/src/cpu/x86/vm/templateTable_x86_64.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/cpu/x86/vm/templateTable_x86_64.cpp	Thu May 09 06:12:11 2013 -0700
@@ -156,14 +156,19 @@
         if (val == noreg) {
           __ store_heap_oop_null(Address(rdx, 0));
         } else {
+          // G1 barrier needs uncompressed oop for region cross check.
+          Register new_val = val;
+          if (UseCompressedOops) {
+            new_val = rbx;
+            __ movptr(new_val, val);
+          }
           __ store_heap_oop(Address(rdx, 0), val);
           __ g1_write_barrier_post(rdx /* store_adr */,
-                                   val /* new_val */,
+                                   new_val /* new_val */,
                                    r15_thread /* thread */,
                                    r8 /* tmp */,
                                    rbx /* tmp2 */);
         }
-
       }
       break;
 #endif // SERIALGC
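The reason both interpreter templates above now keep an uncompressed copy of val: the G1 post barrier filters out stores where the updated field and the newly stored value fall in the same heap region, and that comparison only works on full addresses. A minimal sketch of the filter, using an assumed region size and simplified names rather than the HotSpot identifiers:

#include <cstdint>

// Illustrative only: G1 skips card marking when the stored value is NULL or
// when the field and the new value live in the same region. The xor/shift
// comparison below needs real addresses, which is why the templates above
// copy val into a spare register before store_heap_oop() may narrow it.
static const unsigned kLogRegionBytes = 20;  // assumed 1 MB regions for this sketch

inline bool post_barrier_needed(const void* field, const void* new_val) {
  if (new_val == NULL) return false;         // NULL stores never need the barrier
  uintptr_t diff = reinterpret_cast<uintptr_t>(field) ^
                   reinterpret_cast<uintptr_t>(new_val);
  return (diff >> kLogRegionBytes) != 0;     // true only for cross-region stores
}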
--- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Thu May 09 06:12:11 2013 -0700
@@ -669,7 +669,7 @@
   }
 }
 
-void ConcurrentMark::set_phase(uint active_tasks, bool concurrent) {
+void ConcurrentMark::set_concurrency(uint active_tasks) {
   assert(active_tasks <= _max_task_num, "we should not have more");
 
   _active_tasks = active_tasks;
@@ -678,6 +678,10 @@
   _terminator   = ParallelTaskTerminator((int) active_tasks, _task_queues);
   _first_overflow_barrier_sync.set_n_workers((int) active_tasks);
   _second_overflow_barrier_sync.set_n_workers((int) active_tasks);
+}
+
+void ConcurrentMark::set_concurrency_and_phase(uint active_tasks, bool concurrent) {
+  set_concurrency(active_tasks);
 
   _concurrent = concurrent;
   // We propagate this to all tasks, not just the active ones.
@@ -691,7 +695,9 @@
     // false before we start remark. At this point we should also be
     // in a STW phase.
     assert(!concurrent_marking_in_progress(), "invariant");
-    assert(_finger == _heap_end, "only way to get here");
+    assert(_finger == _heap_end,
+           err_msg("only way to get here: _finger: "PTR_FORMAT", _heap_end: "PTR_FORMAT,
+                   _finger, _heap_end));
     update_g1_committed(true);
   }
 }
@@ -859,20 +865,28 @@
     gclog_or_tty->print_cr("[%d] leaving first barrier", task_num);
   }
 
-  // let task 0 do this
-  if (task_num == 0) {
-    // task 0 is responsible for clearing the global data structures
-    // We should be here because of an overflow. During STW we should
-    // not clear the overflow flag since we rely on it being true when
-    // we exit this method to abort the pause and restart concurent
-    // marking.
-    reset_marking_state(concurrent() /* clear_overflow */);
-    force_overflow()->update();
-
-    if (G1Log::fine()) {
-      gclog_or_tty->date_stamp(PrintGCDateStamps);
-      gclog_or_tty->stamp(PrintGCTimeStamps);
-      gclog_or_tty->print_cr("[GC concurrent-mark-reset-for-overflow]");
+  // If we're executing the concurrent phase of marking, reset the marking
+  // state; otherwise the marking state is reset after reference processing,
+  // during the remark pause.
+  // If we reset here as a result of an overflow during the remark we will
+  // see assertion failures from any subsequent set_concurrency_and_phase()
+  // calls.
+  if (concurrent()) {
+    // let task 0 do this
+    if (task_num == 0) {
+      // task 0 is responsible for clearing the global data structures
+      // We should be here because of an overflow. During STW we should
+      // not clear the overflow flag since we rely on it being true when
+      // we exit this method to abort the pause and restart concurrent
+      // marking.
+      reset_marking_state(true /* clear_overflow */);
+      force_overflow()->update();
+
+      if (G1Log::fine()) {
+        gclog_or_tty->date_stamp(PrintGCDateStamps);
+        gclog_or_tty->stamp(PrintGCTimeStamps);
+        gclog_or_tty->print_cr("[GC concurrent-mark-reset-for-overflow]");
+      }
     }
   }
 
@@ -892,7 +906,7 @@
   if (concurrent()) {
     ConcurrentGCThread::stsJoin();
   }
-  // at this point everything should be re-initialised and ready to go
+  // at this point everything should be re-initialized and ready to go
 
   if (verbose_low()) {
     gclog_or_tty->print_cr("[%d] leaving second barrier", task_num);
@@ -950,8 +964,8 @@
         double mark_step_duration_ms = G1ConcMarkStepDurationMillis;
 
         the_task->do_marking_step(mark_step_duration_ms,
-                                  true /* do_stealing    */,
-                                  true /* do_termination */);
+                                  true  /* do_termination */,
+                                  false /* is_serial */);
 
         double end_time_sec = os::elapsedTime();
         double end_vtime_sec = os::elapsedVTime();
@@ -1107,8 +1121,8 @@
 
   uint active_workers = MAX2(1U, parallel_marking_threads());
 
-  // Parallel task terminator is set in "set_phase()"
-  set_phase(active_workers, true /* concurrent */);
+  // Parallel task terminator is set in "set_concurrency_and_phase()"
+  set_concurrency_and_phase(active_workers, true /* concurrent */);
 
   CMConcurrentMarkingTask markingTask(this, cmThread());
   if (use_parallel_marking_threads()) {
@@ -1160,12 +1174,22 @@
   if (has_overflown()) {
     // Oops.  We overflowed.  Restart concurrent marking.
     _restart_for_overflow = true;
+    if (G1TraceMarkStackOverflow) {
+      gclog_or_tty->print_cr("\nRemark led to restart for overflow.");
+    }
+
+    // Verify the heap w.r.t. the previous marking bitmap.
+    if (VerifyDuringGC) {
+      HandleMark hm;  // handle scope
+      gclog_or_tty->print(" VerifyDuringGC:(overflow)");
+      Universe::heap()->prepare_for_verify();
+      Universe::verify(/* silent */ false,
+                       /* option */ VerifyOption_G1UsePrevMarking);
+    }
+
     // Clear the marking state because we will be restarting
     // marking due to overflowing the global mark stack.
     reset_marking_state();
-    if (G1TraceMarkStackOverflow) {
-      gclog_or_tty->print_cr("\nRemark led to restart for overflow.");
-    }
   } else {
     // Aggregate the per-task counting data that we have accumulated
     // while marking.
@@ -2051,7 +2075,8 @@
   assert(tmp_free_list.is_empty(), "post-condition");
 }
 
-// Support closures for reference procssing in G1
+// Supporting Object and Oop closures for reference discovery
+// and processing during marking
 
 bool G1CMIsAliveClosure::do_object_b(oop obj) {
   HeapWord* addr = (HeapWord*)obj;
@@ -2059,74 +2084,30 @@
          (!_g1->is_in_g1_reserved(addr) || !_g1->is_obj_ill(obj));
 }
 
-class G1CMKeepAliveClosure: public OopClosure {
-  G1CollectedHeap* _g1;
-  ConcurrentMark*  _cm;
+// 'Keep Alive' oop closure used by both serial and parallel reference processing.
+// Uses the CMTask associated with a worker thread (for serial reference
+// processing the CMTask for worker 0 is used) to preserve (mark) and
+// trace referent objects.
+//
+// Using the CMTask and embedded local queues avoids having the worker
+// threads operating on the global mark stack. This reduces the risk
+// of overflowing the stack - which we would rather avoid at this late
+// stage. Also using the tasks' local queues removes the potential
+// of the workers interfering with each other that could occur if
+// operating on the global stack.
+
+class G1CMKeepAliveAndDrainClosure: public OopClosure {
+  ConcurrentMark* _cm;
+  CMTask*         _task;
+  int             _ref_counter_limit;
+  int             _ref_counter;
+  bool            _is_serial;
  public:
-  G1CMKeepAliveClosure(G1CollectedHeap* g1, ConcurrentMark* cm) :
-    _g1(g1), _cm(cm) {
-    assert(Thread::current()->is_VM_thread(), "otherwise fix worker id");
-  }
-
-  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
-  virtual void do_oop(      oop* p) { do_oop_work(p); }
-
-  template <class T> void do_oop_work(T* p) {
-    oop obj = oopDesc::load_decode_heap_oop(p);
-    HeapWord* addr = (HeapWord*)obj;
-
-    if (_cm->verbose_high()) {
-      gclog_or_tty->print_cr("\t[0] we're looking at location "
-                             "*"PTR_FORMAT" = "PTR_FORMAT,
-                             p, (void*) obj);
-    }
-
-    if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) {
-      _cm->mark_and_count(obj);
-      _cm->mark_stack_push(obj);
-    }
-  }
-};
-
-class G1CMDrainMarkingStackClosure: public VoidClosure {
-  ConcurrentMark*               _cm;
-  CMMarkStack*                  _markStack;
-  G1CMKeepAliveClosure*         _oopClosure;
- public:
-  G1CMDrainMarkingStackClosure(ConcurrentMark* cm, CMMarkStack* markStack,
-                               G1CMKeepAliveClosure* oopClosure) :
-    _cm(cm),
-    _markStack(markStack),
-    _oopClosure(oopClosure) { }
-
-  void do_void() {
-    _markStack->drain((OopClosure*)_oopClosure, _cm->nextMarkBitMap(), false);
-  }
-};
-
-// 'Keep Alive' closure used by parallel reference processing.
-// An instance of this closure is used in the parallel reference processing
-// code rather than an instance of G1CMKeepAliveClosure. We could have used
-// the G1CMKeepAliveClosure as it is MT-safe. Also reference objects are
-// placed on to discovered ref lists once so we can mark and push with no
-// need to check whether the object has already been marked. Using the
-// G1CMKeepAliveClosure would mean, however, having all the worker threads
-// operating on the global mark stack. This means that an individual
-// worker would be doing lock-free pushes while it processes its own
-// discovered ref list followed by drain call. If the discovered ref lists
-// are unbalanced then this could cause interference with the other
-// workers. Using a CMTask (and its embedded local data structures)
-// avoids that potential interference.
-class G1CMParKeepAliveAndDrainClosure: public OopClosure {
-  ConcurrentMark*  _cm;
-  CMTask*          _task;
-  int              _ref_counter_limit;
-  int              _ref_counter;
- public:
-  G1CMParKeepAliveAndDrainClosure(ConcurrentMark* cm, CMTask* task) :
-    _cm(cm), _task(task),
+  G1CMKeepAliveAndDrainClosure(ConcurrentMark* cm, CMTask* task, bool is_serial) :
+    _cm(cm), _task(task), _is_serial(is_serial),
     _ref_counter_limit(G1RefProcDrainInterval) {
     assert(_ref_counter_limit > 0, "sanity");
+    assert(!_is_serial || _task->task_id() == 0, "only task 0 for serial code");
     _ref_counter = _ref_counter_limit;
   }
 
@@ -2146,23 +2127,27 @@
       _ref_counter--;
 
       if (_ref_counter == 0) {
-        // We have dealt with _ref_counter_limit references, pushing them and objects
-        // reachable from them on to the local stack (and possibly the global stack).
-        // Call do_marking_step() to process these entries. We call the routine in a
-        // loop, which we'll exit if there's nothing more to do (i.e. we're done
-        // with the entries that we've pushed as a result of the deal_with_reference
-        // calls above) or we overflow.
-        // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag
-        // while there may still be some work to do. (See the comment at the
-        // beginning of CMTask::do_marking_step() for those conditions - one of which
-        // is reaching the specified time target.) It is only when
-        // CMTask::do_marking_step() returns without setting the has_aborted() flag
-        // that the marking has completed.
+        // We have dealt with _ref_counter_limit references, pushing them
+        // and objects reachable from them on to the local stack (and
+        // possibly the global stack). Call CMTask::do_marking_step() to
+        // process these entries.
+        //
+        // We call CMTask::do_marking_step() in a loop, which we'll exit if
+        // there's nothing more to do (i.e. we're done with the entries that
+        // were pushed as a result of the CMTask::deal_with_reference() calls
+        // above) or we overflow.
+        //
+        // Note: CMTask::do_marking_step() can set the CMTask::has_aborted()
+        // flag while there may still be some work to do. (See the comment at
+        // the beginning of CMTask::do_marking_step() for those conditions -
+        // one of which is reaching the specified time target.) It is only
+        // when CMTask::do_marking_step() returns without setting the
+        // has_aborted() flag that the marking step has completed.
         do {
           double mark_step_duration_ms = G1ConcMarkStepDurationMillis;
           _task->do_marking_step(mark_step_duration_ms,
-                                 false /* do_stealing    */,
-                                 false /* do_termination */);
+                                 false      /* do_termination */,
+                                 _is_serial);
         } while (_task->has_aborted() && !_cm->has_overflown());
         _ref_counter = _ref_counter_limit;
       }
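Condensed into one place, the batching scheme described above looks roughly like this (a reading aid only; the logging and asserts of the real closure are omitted):

// Sketch of G1CMKeepAliveAndDrainClosure::do_oop_work(): mark and trace each
// referent through the per-worker CMTask, and every _ref_counter_limit
// (G1RefProcDrainInterval) references drain the task's local queues so the
// global mark stack is left alone.
template <class T> void do_oop_work(T* p) {
  oop obj = oopDesc::load_decode_heap_oop(p);
  _task->deal_with_reference(obj);          // push obj and whatever it reaches
  if (--_ref_counter == 0) {
    do {
      _task->do_marking_step(G1ConcMarkStepDurationMillis,
                             false /* do_termination */,
                             _is_serial);
    } while (_task->has_aborted() && !_cm->has_overflown());
    _ref_counter = _ref_counter_limit;      // start the next batch
  }
}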
@@ -2174,36 +2159,50 @@
   }
 };
 
-class G1CMParDrainMarkingStackClosure: public VoidClosure {
+// 'Drain' oop closure used by both serial and parallel reference processing.
+// Uses the CMTask associated with a given worker thread (for serial
+// reference processing the CMTask for worker 0 is used). Calls the
+// do_marking_step routine, with an unbelievably large timeout value,
+// to drain the marking data structures of the remaining entries
+// added by the 'keep alive' oop closure above.
+
+class G1CMDrainMarkingStackClosure: public VoidClosure {
   ConcurrentMark* _cm;
-  CMTask* _task;
+  CMTask*         _task;
+  bool            _is_serial;
  public:
-  G1CMParDrainMarkingStackClosure(ConcurrentMark* cm, CMTask* task) :
-    _cm(cm), _task(task) { }
+  G1CMDrainMarkingStackClosure(ConcurrentMark* cm, CMTask* task, bool is_serial) :
+    _cm(cm), _task(task), _is_serial(is_serial) {
+    assert(!_is_serial || _task->task_id() == 0, "only task 0 for serial code");
+  }
 
   void do_void() {
     do {
       if (_cm->verbose_high()) {
-        gclog_or_tty->print_cr("\t[%d] Drain: Calling do marking_step",
-                               _task->task_id());
+        gclog_or_tty->print_cr("\t[%d] Drain: Calling do_marking_step - serial: %s",
+                               _task->task_id(), BOOL_TO_STR(_is_serial));
       }
 
-      // We call CMTask::do_marking_step() to completely drain the local and
-      // global marking stacks. The routine is called in a loop, which we'll
-      // exit if there's nothing more to do (i.e. we'completely drained the
-      // entries that were pushed as a result of applying the
-      // G1CMParKeepAliveAndDrainClosure to the entries on the discovered ref
-      // lists above) or we overflow the global marking stack.
-      // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag
-      // while there may still be some work to do. (See the comment at the
-      // beginning of CMTask::do_marking_step() for those conditions - one of which
-      // is reaching the specified time target.) It is only when
-      // CMTask::do_marking_step() returns without setting the has_aborted() flag
-      // that the marking has completed.
+      // We call CMTask::do_marking_step() to completely drain the local
+      // and global marking stacks of entries pushed by the 'keep alive'
+      // oop closure (an instance of G1CMKeepAliveAndDrainClosure above).
+      //
+      // CMTask::do_marking_step() is called in a loop, which we'll exit
+      // if there's nothing more to do (i.e. we've completely drained the
+      // entries that were pushed as a result of applying the 'keep alive'
+      // closure to the entries on the discovered ref lists) or we overflow
+      // the global marking stack.
+      //
+      // Note: CMTask::do_marking_step() can set the CMTask::has_aborted()
+      // flag while there may still be some work to do. (See the comment at
+      // the beginning of CMTask::do_marking_step() for those conditions -
+      // one of which is reaching the specified time target.) It is only
+      // when CMTask::do_marking_step() returns without setting the
+      // has_aborted() flag that the marking step has completed.
 
       _task->do_marking_step(1000000000.0 /* something very large */,
-                             true /* do_stealing    */,
-                             true /* do_termination */);
+                             true         /* do_termination */,
+                             _is_serial);
     } while (_task->has_aborted() && !_cm->has_overflown());
   }
 };
@@ -2242,13 +2241,16 @@
                      G1CollectedHeap* g1h,
                      ConcurrentMark* cm) :
     AbstractGangTask("Process reference objects in parallel"),
-    _proc_task(proc_task), _g1h(g1h), _cm(cm) { }
+    _proc_task(proc_task), _g1h(g1h), _cm(cm) {
+    ReferenceProcessor* rp = _g1h->ref_processor_cm();
+    assert(rp->processing_is_mt(), "shouldn't be here otherwise");
+  }
 
   virtual void work(uint worker_id) {
-    CMTask* marking_task = _cm->task(worker_id);
+    CMTask* task = _cm->task(worker_id);
     G1CMIsAliveClosure g1_is_alive(_g1h);
-    G1CMParKeepAliveAndDrainClosure g1_par_keep_alive(_cm, marking_task);
-    G1CMParDrainMarkingStackClosure g1_par_drain(_cm, marking_task);
+    G1CMKeepAliveAndDrainClosure g1_par_keep_alive(_cm, task, false /* is_serial */);
+    G1CMDrainMarkingStackClosure g1_par_drain(_cm, task, false /* is_serial */);
 
     _proc_task.work(worker_id, g1_is_alive, g1_par_keep_alive, g1_par_drain);
   }
@@ -2256,12 +2258,15 @@
 
 void G1CMRefProcTaskExecutor::execute(ProcessTask& proc_task) {
   assert(_workers != NULL, "Need parallel worker threads.");
+  assert(_g1h->ref_processor_cm()->processing_is_mt(), "processing is not MT");
 
   G1CMRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm);
 
-  // We need to reset the phase for each task execution so that
-  // the termination protocol of CMTask::do_marking_step works.
-  _cm->set_phase(_active_workers, false /* concurrent */);
+  // We need to reset the concurrency level before each
+  // proxy task execution, so that the termination protocol
+  // and overflow handling in CMTask::do_marking_step() knows
+  // how many workers to wait for.
+  _cm->set_concurrency(_active_workers);
   _g1h->set_par_threads(_active_workers);
   _workers->run_task(&proc_task_proxy);
   _g1h->set_par_threads(0);
@@ -2283,15 +2288,33 @@
 
 void G1CMRefProcTaskExecutor::execute(EnqueueTask& enq_task) {
   assert(_workers != NULL, "Need parallel worker threads.");
+  assert(_g1h->ref_processor_cm()->processing_is_mt(), "processing is not MT");
 
   G1CMRefEnqueueTaskProxy enq_task_proxy(enq_task);
 
+  // Not strictly necessary but...
+  //
+  // We need to reset the concurrency level before each
+  // proxy task execution, so that the termination protocol
+  // and overflow handling in CMTask::do_marking_step() knows
+  // how many workers to wait for.
+  _cm->set_concurrency(_active_workers);
   _g1h->set_par_threads(_active_workers);
   _workers->run_task(&enq_task_proxy);
   _g1h->set_par_threads(0);
 }
 
 void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
+  if (has_overflown()) {
+    // Skip processing the discovered references if we have
+    // overflown the global marking stack. Reference objects
+    // only get discovered once so it is OK to not
+    // de-populate the discovered reference lists. We could have,
+    // but the only benefit would be that, when marking restarts,
+    // fewer reference objects are discovered.
+    return;
+  }
+
   ResourceMark rm;
   HandleMark   hm;
 
@@ -2313,64 +2336,77 @@
     // See the comment in G1CollectedHeap::ref_processing_init()
     // about how reference processing currently works in G1.
 
-    // Process weak references.
+    // Set the soft reference policy
     rp->setup_policy(clear_all_soft_refs);
     assert(_markStack.isEmpty(), "mark stack should be empty");
 
-    G1CMKeepAliveClosure g1_keep_alive(g1h, this);
-    G1CMDrainMarkingStackClosure
-      g1_drain_mark_stack(this, &_markStack, &g1_keep_alive);
-
-    // We use the work gang from the G1CollectedHeap and we utilize all
-    // the worker threads.
-    uint active_workers = g1h->workers() ? g1h->workers()->active_workers() : 1U;
+    // Instances of the 'Keep Alive' and 'Complete GC' closures used
+    // in serial reference processing. Note these closures are also
+    // used for serially processing (by the the current thread) the
+    // JNI references during parallel reference processing.
+    //
+    // These closures do not need to synchronize with the worker
+    // threads involved in parallel reference processing as these
+    // instances are executed serially by the current thread (e.g.
+    // reference processing is not multi-threaded and is thus
+    // performed by the current thread instead of a gang worker).
+    //
+    // The gang tasks involved in parallel reference processing create
+    // their own instances of these closures, which do their own
+    // synchronization among themselves.
+    G1CMKeepAliveAndDrainClosure g1_keep_alive(this, task(0), true /* is_serial */);
+    G1CMDrainMarkingStackClosure g1_drain_mark_stack(this, task(0), true /* is_serial */);
+
+    // We need at least one active thread. If reference processing
+    // is not multi-threaded we use the current (VMThread) thread,
+    // otherwise we use the work gang from the G1CollectedHeap and
+    // we utilize all the worker threads we can.
+    bool processing_is_mt = rp->processing_is_mt() && g1h->workers() != NULL;
+    uint active_workers = (processing_is_mt ? g1h->workers()->active_workers() : 1U);
     active_workers = MAX2(MIN2(active_workers, _max_task_num), 1U);
 
+    // Parallel processing task executor.
     G1CMRefProcTaskExecutor par_task_executor(g1h, this,
                                               g1h->workers(), active_workers);
+    AbstractRefProcTaskExecutor* executor = (processing_is_mt ? &par_task_executor : NULL);
 
     ReferenceProcessorStats stats;
-    if (rp->processing_is_mt()) {
-      // Set the degree of MT here.  If the discovery is done MT, there
-      // may have been a different number of threads doing the discovery
-      // and a different number of discovered lists may have Ref objects.
-      // That is OK as long as the Reference lists are balanced (see
-      // balance_all_queues() and balance_queues()).
-      rp->set_active_mt_degree(active_workers);
-
-      stats = rp->process_discovered_references(&g1_is_alive,
-                                      &g1_keep_alive,
-                                      &g1_drain_mark_stack,
-                                      &par_task_executor,
-                                      g1h->gc_timer_cm());
-
-      // The work routines of the parallel keep_alive and drain_marking_stack
-      // will set the has_overflown flag if we overflow the global marking
-      // stack.
-    } else {
-      stats = rp->process_discovered_references(&g1_is_alive,
-                                        &g1_keep_alive,
-                                        &g1_drain_mark_stack,
-                                        NULL,
-                                        g1h->gc_timer_cm());
-    }
+
+    // Set the concurrency level. The phase was already set prior to
+    // executing the remark task.
+    set_concurrency(active_workers);
+
+    // Set the degree of MT processing here.  If the discovery was done MT,
+    // the number of threads involved during discovery could differ from
+    // the number of active workers.  This is OK as long as the discovered
+    // Reference lists are balanced (see balance_all_queues() and balance_queues()).
+    rp->set_active_mt_degree(active_workers);
+
+    // Process the weak references.
+    stats = rp->process_discovered_references(&g1_is_alive,
+                                              &g1_keep_alive,
+                                              &g1_drain_mark_stack,
+                                              executor,
+                                              g1h->gc_timer_cm());
+
+    // The do_oop work routines of the keep_alive and drain_marking_stack
+    // oop closures will set the has_overflown flag if we overflow the
+    // global marking stack.
 
     g1h->gc_tracer_cm()->report_gc_reference_stats(stats);
 
     assert(_markStack.overflow() || _markStack.isEmpty(),
             "mark stack should be empty (unless it overflowed)");
+
     if (_markStack.overflow()) {
-      // Should have been done already when we tried to push an
+      // This should have been done already when we tried to push an
       // entry on to the global mark stack. But let's do it again.
       set_has_overflown();
     }
 
-    if (rp->processing_is_mt()) {
-      assert(rp->num_q() == active_workers, "why not");
-      rp->enqueue_discovered_references(&par_task_executor);
-    } else {
-      rp->enqueue_discovered_references();
-    }
+    assert(rp->num_q() == active_workers, "why not");
+
+    rp->enqueue_discovered_references(executor);
 
     rp->verify_no_references_recorded();
     assert(!rp->discovery_enabled(), "Post condition");
@@ -2390,8 +2426,8 @@
 
 class CMRemarkTask: public AbstractGangTask {
 private:
-  ConcurrentMark *_cm;
-
+  ConcurrentMark* _cm;
+  bool            _is_serial;
 public:
   void work(uint worker_id) {
     // Since all available tasks are actually started, we should
@@ -2401,8 +2437,8 @@
       task->record_start_time();
       do {
         task->do_marking_step(1000000000.0 /* something very large */,
-                              true /* do_stealing    */,
-                              true /* do_termination */);
+                              true         /* do_termination       */,
+                              _is_serial);
       } while (task->has_aborted() && !_cm->has_overflown());
       // If we overflow, then we do not want to restart. We instead
       // want to abort remark and do concurrent marking again.
@@ -2410,8 +2446,8 @@
     }
   }
 
-  CMRemarkTask(ConcurrentMark* cm, int active_workers) :
-    AbstractGangTask("Par Remark"), _cm(cm) {
+  CMRemarkTask(ConcurrentMark* cm, int active_workers, bool is_serial) :
+    AbstractGangTask("Par Remark"), _cm(cm), _is_serial(is_serial) {
     _cm->terminator()->reset_for_reuse(active_workers);
   }
 };
@@ -2432,30 +2468,40 @@
       active_workers = (uint) ParallelGCThreads;
       g1h->workers()->set_active_workers(active_workers);
     }
-    set_phase(active_workers, false /* concurrent */);
+    set_concurrency_and_phase(active_workers, false /* concurrent */);
     // Leave _parallel_marking_threads at it's
     // value originally calculated in the ConcurrentMark
     // constructor and pass values of the active workers
     // through the gang in the task.
 
-    CMRemarkTask remarkTask(this, active_workers);
+    CMRemarkTask remarkTask(this, active_workers, false /* is_serial */);
+    // We will start all available threads, even if we decide that the
+    // active_workers will be fewer. The extra ones will just bail out
+    // immediately.
     g1h->set_par_threads(active_workers);
     g1h->workers()->run_task(&remarkTask);
     g1h->set_par_threads(0);
   } else {
     G1CollectedHeap::StrongRootsScope srs(g1h);
-    // this is remark, so we'll use up all available threads
     uint active_workers = 1;
-    set_phase(active_workers, false /* concurrent */);
-
-    CMRemarkTask remarkTask(this, active_workers);
-    // We will start all available threads, even if we decide that the
-    // active_workers will be fewer. The extra ones will just bail out
-    // immediately.
+    set_concurrency_and_phase(active_workers, false /* concurrent */);
+
+    // Note - if there's no work gang then the VMThread will be
+    // the thread to execute the remark - serially. We have
+    // to pass true for the is_serial parameter so that
+    // CMTask::do_marking_step() doesn't enter the sync
+    // barriers in the event of an overflow. Doing so will
+    // cause an assert that the current thread is not a
+    // concurrent GC thread.
+    CMRemarkTask remarkTask(this, active_workers, true /* is_serial */);
     remarkTask.work(0);
   }
   SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
-  guarantee(satb_mq_set.completed_buffers_num() == 0, "invariant");
+  guarantee(has_overflown() ||
+            satb_mq_set.completed_buffers_num() == 0,
+            err_msg("Invariant: has_overflown = %s, num buffers = %d",
+                    BOOL_TO_STR(has_overflown()),
+                    satb_mq_set.completed_buffers_num()));
 
   print_stats();
 
@@ -3776,8 +3822,8 @@
 
 /*****************************************************************************
 
-    The do_marking_step(time_target_ms) method is the building block
-    of the parallel marking framework. It can be called in parallel
+    The do_marking_step(time_target_ms, ...) method is the building
+    block of the parallel marking framework. It can be called in parallel
     with other invocations of do_marking_step() on different tasks
     (but only one per task, obviously) and concurrently with the
     mutator threads, or during remark, hence it eliminates the need
@@ -3787,7 +3833,7 @@
     pauses too, since do_marking_step() ensures that it aborts before
     it needs to yield.
 
-    The data structures that is uses to do marking work are the
+    The data structures that it uses to do marking work are the
     following:
 
       (1) Marking Bitmap. If there are gray objects that appear only
@@ -3836,7 +3882,7 @@
       (2) When a global overflow (on the global stack) has been
       triggered. Before the task aborts, it will actually sync up with
       the other tasks to ensure that all the marking data structures
-      (local queues, stacks, fingers etc.)  are re-initialised so that
+      (local queues, stacks, fingers etc.)  are re-initialized so that
       when do_marking_step() completes, the marking phase can
       immediately restart.
 
@@ -3873,11 +3919,25 @@
     place, it was natural to piggy-back all the other conditions on it
     too and not constantly check them throughout the code.
 
+    If do_termination is true then do_marking_step will enter its
+    termination protocol.
+
+    The value of is_serial must be true when do_marking_step is being
+    called serially (i.e. by the VMThread) and do_marking_step should
+    skip any synchronization in the termination and overflow code.
+    Examples include the serial remark code and the serial reference
+    processing closures.
+
+    The value of is_serial must be false when do_marking_step is
+    being called by any of the worker threads in a work gang.
+    Examples include the concurrent marking code (CMMarkingTask),
+    the MT remark code, and the MT reference processing closures.
+
  *****************************************************************************/
 
 void CMTask::do_marking_step(double time_target_ms,
-                             bool do_stealing,
-                             bool do_termination) {
+                             bool do_termination,
+                             bool is_serial) {
   assert(time_target_ms >= 1.0, "minimum granularity is 1ms");
   assert(concurrent() == _cm->concurrent(), "they should be the same");
 
@@ -3898,6 +3958,12 @@
   _start_time_ms = os::elapsedVTime() * 1000.0;
   statsOnly( _interval_start_time_ms = _start_time_ms );
 
+  // If do_stealing is true then do_marking_step will attempt to
+  // steal work from the other CMTasks. It only makes sense to
+  // enable stealing when the termination protocol is enabled
+  // and do_marking_step() is not being called serially.
+  bool do_stealing = do_termination && !is_serial;
+
   double diff_prediction_ms =
     g1_policy->get_new_prediction(&_marking_step_diffs_ms);
   _time_target_ms = time_target_ms - diff_prediction_ms;
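Taken together, the callers in this changeset use the two remaining flags as follows; the sketch below (illustrative, not HotSpot source) restates the combinations and the stealing rule derived above:

// Illustrative summary of the reworked do_marking_step() protocol.
struct MarkingStepFlags {
  bool do_termination;  // take part in the termination protocol at the end
  bool is_serial;       // executed by the VMThread; skip the sync barriers
  bool do_stealing() const { return do_termination && !is_serial; }
};

// Combinations used in this changeset:
//   CMConcurrentMarkingTask             {true,  false}      -> stealing enabled
//   CMRemarkTask via the work gang      {true,  false}      -> stealing enabled
//   CMRemarkTask run by the VMThread    {true,  true}       -> no stealing, no barriers
//   G1CMKeepAliveAndDrainClosure        {false, is_serial}  -> never steals
//   G1CMDrainMarkingStackClosure        {true,  is_serial}  -> steals only when parallel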
@@ -4138,10 +4204,12 @@
     }
 
     _termination_start_time_ms = os::elapsedVTime() * 1000.0;
+
     // The CMTask class also extends the TerminatorTerminator class,
     // hence its should_exit_termination() method will also decide
     // whether to exit the termination protocol or not.
-    bool finished = _cm->terminator()->offer_termination(this);
+    bool finished = (is_serial ||
+                     _cm->terminator()->offer_termination(this));
     double termination_end_time_ms = os::elapsedVTime() * 1000.0;
     _termination_time_ms +=
       termination_end_time_ms - _termination_start_time_ms;
@@ -4221,20 +4289,28 @@
         gclog_or_tty->print_cr("[%d] detected overflow", _task_id);
       }
 
-      _cm->enter_first_sync_barrier(_task_id);
-      // When we exit this sync barrier we know that all tasks have
-      // stopped doing marking work. So, it's now safe to
-      // re-initialise our data structures. At the end of this method,
-      // task 0 will clear the global data structures.
+      if (!is_serial) {
+        // We only need to enter the sync barrier if being called
+        // from a parallel context
+        _cm->enter_first_sync_barrier(_task_id);
+
+        // When we exit this sync barrier we know that all tasks have
+        // stopped doing marking work. So, it's now safe to
+        // re-initialise our data structures. At the end of this method,
+        // task 0 will clear the global data structures.
+      }
 
       statsOnly( ++_aborted_overflow );
 
       // We clear the local state of this task...
       clear_region_fields();
 
-      // ...and enter the second barrier.
-      _cm->enter_second_sync_barrier(_task_id);
-      // At this point everything has bee re-initialised and we're
+      if (!is_serial) {
+        // ...and enter the second barrier.
+        _cm->enter_second_sync_barrier(_task_id);
+      }
+      // At this point, if we're during the concurrent phase of
+      // marking, everything has been re-initialized and we're
       // ready to restart.
     }
 
--- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Thu May 09 06:12:11 2013 -0700
@@ -354,8 +354,8 @@
   friend class CalcLiveObjectsClosure;
   friend class G1CMRefProcTaskProxy;
   friend class G1CMRefProcTaskExecutor;
-  friend class G1CMParKeepAliveAndDrainClosure;
-  friend class G1CMParDrainMarkingStackClosure;
+  friend class G1CMKeepAliveAndDrainClosure;
+  friend class G1CMDrainMarkingStackClosure;
 
 protected:
   ConcurrentMarkThread* _cmThread;   // the thread doing the work
@@ -470,9 +470,12 @@
   // structures are initialised to a sensible and predictable state.
   void set_non_marking_state();
 
+  // Called to indicate how many threads are currently active.
+  void set_concurrency(uint active_tasks);
+
   // It should be called to indicate which phase we're in (concurrent
   // mark or remark) and how many threads are currently active.
-  void set_phase(uint active_tasks, bool concurrent);
+  void set_concurrency_and_phase(uint active_tasks, bool concurrent);
 
   // prints all gathered CM-related statistics
   void print_stats();
@@ -1117,7 +1120,9 @@
   // trying not to exceed the given duration. However, it might exit
   // prematurely, according to some conditions (i.e. SATB buffers are
   // available for processing).
-  void do_marking_step(double target_ms, bool do_stealing, bool do_termination);
+  void do_marking_step(double target_ms,
+                       bool do_termination,
+                       bool is_serial);
 
   // These two calls start and stop the timer
   void record_start_time() {
--- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Thu May 09 06:12:11 2013 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1313,232 +1313,238 @@
     gclog_or_tty->date_stamp(G1Log::fine() && PrintGCDateStamps);
     TraceCPUTime tcpu(G1Log::finer(), true, gclog_or_tty);
 
-    GCTraceTime t(GCCauseString("Full GC", gc_cause()), G1Log::fine(), true, NULL);
-    TraceCollectorStats tcs(g1mm()->full_collection_counters());
-    TraceMemoryManagerStats tms(true /* fullGC */, gc_cause());
-
-    double start = os::elapsedTime();
-    g1_policy()->record_full_collection_start();
-
-    // Note: When we have a more flexible GC logging framework that
-    // allows us to add optional attributes to a GC log record we
-    // could consider timing and reporting how long we wait in the
-    // following two methods.
-    wait_while_free_regions_coming();
-    // If we start the compaction before the CM threads finish
-    // scanning the root regions we might trip them over as we'll
-    // be moving objects / updating references. So let's wait until
-    // they are done. By telling them to abort, they should complete
-    // early.
-    _cm->root_regions()->abort();
-    _cm->root_regions()->wait_until_scan_finished();
-    append_secondary_free_list_if_not_empty_with_lock();
-
-    gc_prologue(true);
-    increment_total_collections(true /* full gc */);
-    increment_old_marking_cycles_started();
-
-    size_t g1h_prev_used = used();
-    assert(used() == recalculate_used(), "Should be equal");
-
-    verify_before_gc();
-
-    pre_full_gc_dump(gc_timer);
-
-    COMPILER2_PRESENT(DerivedPointerTable::clear());
-
-    // Disable discovery and empty the discovered lists
-    // for the CM ref processor.
-    ref_processor_cm()->disable_discovery();
-    ref_processor_cm()->abandon_partial_discovery();
-    ref_processor_cm()->verify_no_references_recorded();
-
-    // Abandon current iterations of concurrent marking and concurrent
-    // refinement, if any are in progress. We have to do this before
-    // wait_until_scan_finished() below.
-    concurrent_mark()->abort();
-
-    // Make sure we'll choose a new allocation region afterwards.
-    release_mutator_alloc_region();
-    abandon_gc_alloc_regions();
-    g1_rem_set()->cleanupHRRS();
-
-    // We should call this after we retire any currently active alloc
-    // regions so that all the ALLOC / RETIRE events are generated
-    // before the start GC event.
-    _hr_printer.start_gc(true /* full */, (size_t) total_collections());
-
-    // We may have added regions to the current incremental collection
-    // set between the last GC or pause and now. We need to clear the
-    // incremental collection set and then start rebuilding it afresh
-    // after this full GC.
-    abandon_collection_set(g1_policy()->inc_cset_head());
-    g1_policy()->clear_incremental_cset();
-    g1_policy()->stop_incremental_cset_building();
-
-    tear_down_region_sets(false /* free_list_only */);
-    g1_policy()->set_gcs_are_young(true);
-
-    // See the comments in g1CollectedHeap.hpp and
-    // G1CollectedHeap::ref_processing_init() about
-    // how reference processing currently works in G1.
-
-    // Temporarily make discovery by the STW ref processor single threaded (non-MT).
-    ReferenceProcessorMTDiscoveryMutator stw_rp_disc_ser(ref_processor_stw(), false);
-
-    // Temporarily clear the STW ref processor's _is_alive_non_header field.
-    ReferenceProcessorIsAliveMutator stw_rp_is_alive_null(ref_processor_stw(), NULL);
-
-    ref_processor_stw()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
-    ref_processor_stw()->setup_policy(do_clear_all_soft_refs);
-
-    // Do collection work
     {
-      HandleMark hm;  // Discard invalid handles created during gc
-      G1MarkSweep::invoke_at_safepoint(ref_processor_stw(), do_clear_all_soft_refs);
-    }
-
-    assert(free_regions() == 0, "we should not have added any free regions");
-    rebuild_region_sets(false /* free_list_only */);
-
-    // Enqueue any discovered reference objects that have
-    // not been removed from the discovered lists.
-    ref_processor_stw()->enqueue_discovered_references();
-
-    COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
-
-    MemoryService::track_memory_usage();
-
-    verify_after_gc();
-
-    assert(!ref_processor_stw()->discovery_enabled(), "Postcondition");
-    ref_processor_stw()->verify_no_references_recorded();
-
-    // Note: since we've just done a full GC, concurrent
-    // marking is no longer active. Therefore we need not
-    // re-enable reference discovery for the CM ref processor.
-    // That will be done at the start of the next marking cycle.
-    assert(!ref_processor_cm()->discovery_enabled(), "Postcondition");
-    ref_processor_cm()->verify_no_references_recorded();
-
-    reset_gc_time_stamp();
-    // Since everything potentially moved, we will clear all remembered
-    // sets, and clear all cards.  Later we will rebuild remembered
-    // sets. We will also reset the GC time stamps of the regions.
-    clear_rsets_post_compaction();
-    check_gc_time_stamps();
-
-    // Resize the heap if necessary.
-    resize_if_necessary_after_full_collection(explicit_gc ? 0 : word_size);
-
-    if (_hr_printer.is_active()) {
-      // We should do this after we potentially resize the heap so
-      // that all the COMMIT / UNCOMMIT events are generated before
-      // the end GC event.
-
-      print_hrs_post_compaction();
-      _hr_printer.end_gc(true /* full */, (size_t) total_collections());
-    }
-
-    if (_cg1r->use_cache()) {
-      _cg1r->clear_and_record_card_counts();
-      _cg1r->clear_hot_cache();
-    }
-
-    // Rebuild remembered sets of all regions.
-    if (G1CollectedHeap::use_parallel_gc_threads()) {
-      uint n_workers =
-        AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
-                                       workers()->active_workers(),
-                                       Threads::number_of_non_daemon_threads());
-      assert(UseDynamicNumberOfGCThreads ||
-             n_workers == workers()->total_workers(),
-             "If not dynamic should be using all the  workers");
-      workers()->set_active_workers(n_workers);
-      // Set parallel threads in the heap (_n_par_threads) only
-      // before a parallel phase and always reset it to 0 after
-      // the phase so that the number of parallel threads does
-      // no get carried forward to a serial phase where there
-      // may be code that is "possibly_parallel".
-      set_par_threads(n_workers);
-
-      ParRebuildRSTask rebuild_rs_task(this);
-      assert(check_heap_region_claim_values(
-             HeapRegion::InitialClaimValue), "sanity check");
-      assert(UseDynamicNumberOfGCThreads ||
-             workers()->active_workers() == workers()->total_workers(),
-        "Unless dynamic should use total workers");
-      // Use the most recent number of  active workers
-      assert(workers()->active_workers() > 0,
-        "Active workers not properly set");
-      set_par_threads(workers()->active_workers());
-      workers()->run_task(&rebuild_rs_task);
-      set_par_threads(0);
-      assert(check_heap_region_claim_values(
-             HeapRegion::RebuildRSClaimValue), "sanity check");
-      reset_heap_region_claim_values();
-    } else {
-      RebuildRSOutOfRegionClosure rebuild_rs(this);
-      heap_region_iterate(&rebuild_rs);
-    }
-
-    if (G1Log::fine()) {
-      print_size_transition(gclog_or_tty, g1h_prev_used, used(), capacity());
-    }
-
-    if (true) { // FIXME
-      // Ask the permanent generation to adjust size for full collections
-      perm()->compute_new_size();
-    }
-
-    // Start a new incremental collection set for the next pause
-    assert(g1_policy()->collection_set() == NULL, "must be");
-    g1_policy()->start_incremental_cset_building();
-
-    // Clear the _cset_fast_test bitmap in anticipation of adding
-    // regions to the incremental collection set for the next
-    // evacuation pause.
-    clear_cset_fast_test();
-
-    init_mutator_alloc_region();
-
-    double end = os::elapsedTime();
-    g1_policy()->record_full_collection_end();
+      GCTraceTime t(GCCauseString("Full GC", gc_cause()), G1Log::fine(), true, NULL);
+      TraceCollectorStats tcs(g1mm()->full_collection_counters());
+      TraceMemoryManagerStats tms(true /* fullGC */, gc_cause());
+
+      double start = os::elapsedTime();
+      g1_policy()->record_full_collection_start();
+
+      // Note: When we have a more flexible GC logging framework that
+      // allows us to add optional attributes to a GC log record we
+      // could consider timing and reporting how long we wait in the
+      // following two methods.
+      wait_while_free_regions_coming();
+      // If we start the compaction before the CM threads finish
+      // scanning the root regions we might trip them over as we'll
+      // be moving objects / updating references. So let's wait until
+      // they are done. By telling them to abort, they should complete
+      // early.
+      _cm->root_regions()->abort();
+      _cm->root_regions()->wait_until_scan_finished();
+      append_secondary_free_list_if_not_empty_with_lock();
+
+      gc_prologue(true);
+      increment_total_collections(true /* full gc */);
+      increment_old_marking_cycles_started();
+
+      assert(used() == recalculate_used(), "Should be equal");
+
+      verify_before_gc();
+
+      pre_full_gc_dump(gc_timer);
+
+      COMPILER2_PRESENT(DerivedPointerTable::clear());
+
+      // Disable discovery and empty the discovered lists
+      // for the CM ref processor.
+      ref_processor_cm()->disable_discovery();
+      ref_processor_cm()->abandon_partial_discovery();
+      ref_processor_cm()->verify_no_references_recorded();
+
+      // Abandon current iterations of concurrent marking and concurrent
+      // refinement, if any are in progress. We have to do this before
+      // wait_until_scan_finished() below.
+      concurrent_mark()->abort();
+
+      // Make sure we'll choose a new allocation region afterwards.
+      release_mutator_alloc_region();
+      abandon_gc_alloc_regions();
+      g1_rem_set()->cleanupHRRS();
+
+      // We should call this after we retire any currently active alloc
+      // regions so that all the ALLOC / RETIRE events are generated
+      // before the start GC event.
+      _hr_printer.start_gc(true /* full */, (size_t) total_collections());
+
+      // We may have added regions to the current incremental collection
+      // set between the last GC or pause and now. We need to clear the
+      // incremental collection set and then start rebuilding it afresh
+      // after this full GC.
+      abandon_collection_set(g1_policy()->inc_cset_head());
+      g1_policy()->clear_incremental_cset();
+      g1_policy()->stop_incremental_cset_building();
+
+      tear_down_region_sets(false /* free_list_only */);
+      g1_policy()->set_gcs_are_young(true);
+
+      // See the comments in g1CollectedHeap.hpp and
+      // G1CollectedHeap::ref_processing_init() about
+      // how reference processing currently works in G1.
+
+      // Temporarily make discovery by the STW ref processor single threaded (non-MT).
+      ReferenceProcessorMTDiscoveryMutator stw_rp_disc_ser(ref_processor_stw(), false);
+
+      // Temporarily clear the STW ref processor's _is_alive_non_header field.
+      ReferenceProcessorIsAliveMutator stw_rp_is_alive_null(ref_processor_stw(), NULL);
+
+      ref_processor_stw()->enable_discovery(true /*verify_disabled*/, true /*verify_no_refs*/);
+      ref_processor_stw()->setup_policy(do_clear_all_soft_refs);
+
+      // Do collection work
+      {
+        HandleMark hm;  // Discard invalid handles created during gc
+        G1MarkSweep::invoke_at_safepoint(ref_processor_stw(), do_clear_all_soft_refs);
+      }
+
+      assert(free_regions() == 0, "we should not have added any free regions");
+      rebuild_region_sets(false /* free_list_only */);
+
+      // Enqueue any discovered reference objects that have
+      // not been removed from the discovered lists.
+      ref_processor_stw()->enqueue_discovered_references();
+
+      COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
+
+      MemoryService::track_memory_usage();
+
+      verify_after_gc();
+
+      assert(!ref_processor_stw()->discovery_enabled(), "Postcondition");
+      ref_processor_stw()->verify_no_references_recorded();
+
+      // Note: since we've just done a full GC, concurrent
+      // marking is no longer active. Therefore we need not
+      // re-enable reference discovery for the CM ref processor.
+      // That will be done at the start of the next marking cycle.
+      assert(!ref_processor_cm()->discovery_enabled(), "Postcondition");
+      ref_processor_cm()->verify_no_references_recorded();
+
+      reset_gc_time_stamp();
+      // Since everything potentially moved, we will clear all remembered
+      // sets, and clear all cards.  Later we will rebuild remembered
+      // sets. We will also reset the GC time stamps of the regions.
+      clear_rsets_post_compaction();
+      check_gc_time_stamps();
+
+      // Resize the heap if necessary.
+      resize_if_necessary_after_full_collection(explicit_gc ? 0 : word_size);
+
+      if (_hr_printer.is_active()) {
+        // We should do this after we potentially resize the heap so
+        // that all the COMMIT / UNCOMMIT events are generated before
+        // the end GC event.
+
+        print_hrs_post_compaction();
+        _hr_printer.end_gc(true /* full */, (size_t) total_collections());
+      }
+
+      if (_cg1r->use_cache()) {
+        _cg1r->clear_and_record_card_counts();
+        _cg1r->clear_hot_cache();
+      }
+
+      // Rebuild remembered sets of all regions.
+      if (G1CollectedHeap::use_parallel_gc_threads()) {
+        uint n_workers =
+          AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
+                                                  workers()->active_workers(),
+                                                  Threads::number_of_non_daemon_threads());
+        assert(UseDynamicNumberOfGCThreads ||
+               n_workers == workers()->total_workers(),
+               "If not dynamic should be using all the  workers");
+        workers()->set_active_workers(n_workers);
+        // Set parallel threads in the heap (_n_par_threads) only
+        // before a parallel phase and always reset it to 0 after
+        // the phase so that the number of parallel threads does
+        // no get carried forward to a serial phase where there
+        // may be code that is "possibly_parallel".
+        set_par_threads(n_workers);
+
+        ParRebuildRSTask rebuild_rs_task(this);
+        assert(check_heap_region_claim_values(
+               HeapRegion::InitialClaimValue), "sanity check");
+        assert(UseDynamicNumberOfGCThreads ||
+               workers()->active_workers() == workers()->total_workers(),
+               "Unless dynamic should use total workers");
+        // Use the most recent number of  active workers
+        assert(workers()->active_workers() > 0,
+               "Active workers not properly set");
+        set_par_threads(workers()->active_workers());
+        workers()->run_task(&rebuild_rs_task);
+        set_par_threads(0);
+        assert(check_heap_region_claim_values(
+               HeapRegion::RebuildRSClaimValue), "sanity check");
+        reset_heap_region_claim_values();
+      } else {
+        RebuildRSOutOfRegionClosure rebuild_rs(this);
+        heap_region_iterate(&rebuild_rs);
+      }
+
+      if (true) { // FIXME
+        // Ask the permanent generation to adjust size for full collections
+        perm()->compute_new_size();
+      }
 
 #ifdef TRACESPINNING
-    ParallelTaskTerminator::print_termination_counts();
+      ParallelTaskTerminator::print_termination_counts();
 #endif
 
-    gc_epilogue(true);
-
-    // Discard all rset updates
-    JavaThread::dirty_card_queue_set().abandon_logs();
-    assert(!G1DeferredRSUpdate
-           || (G1DeferredRSUpdate && (dirty_card_queue_set().completed_buffers_num() == 0)), "Should not be any");
-
-    _young_list->reset_sampled_info();
-    // At this point there should be no regions in the
-    // entire heap tagged as young.
-    assert( check_young_list_empty(true /* check_heap */),
-      "young list should be empty at this point");
-
-    // Update the number of full collections that have been completed.
-    increment_old_marking_cycles_completed(false /* concurrent */);
-
-    _hrs.verify_optional();
-    verify_region_sets_optional();
+      // Discard all rset updates
+      JavaThread::dirty_card_queue_set().abandon_logs();
+      assert(!G1DeferredRSUpdate
+             || (G1DeferredRSUpdate &&
+                (dirty_card_queue_set().completed_buffers_num() == 0)), "Should not be any");
+
+      _young_list->reset_sampled_info();
+      // At this point there should be no regions in the
+      // entire heap tagged as young.
+      assert(check_young_list_empty(true /* check_heap */),
+             "young list should be empty at this point");
+
+      // Update the number of full collections that have been completed.
+      increment_old_marking_cycles_completed(false /* concurrent */);
+
+      _hrs.verify_optional();
+      verify_region_sets_optional();
+
+      // Start a new incremental collection set for the next pause
+      assert(g1_policy()->collection_set() == NULL, "must be");
+      g1_policy()->start_incremental_cset_building();
+
+      // Clear the _cset_fast_test bitmap in anticipation of adding
+      // regions to the incremental collection set for the next
+      // evacuation pause.
+      clear_cset_fast_test();
+
+      init_mutator_alloc_region();
+
+      double end = os::elapsedTime();
+      g1_policy()->record_full_collection_end();
+
+      if (G1Log::fine()) {
+        g1_policy()->print_heap_transition();
+      }
+
+      // We must call G1MonitoringSupport::update_sizes() in the same scoping level
+      // as an active TraceMemoryManagerStats object (i.e. before the destructor for the
+      // TraceMemoryManagerStats is called) so that the G1 memory pools are updated
+      // before any GC notifications are raised.
+      g1mm()->update_sizes();
+
+      gc_epilogue(true);
+    }
+
+    if (G1Log::finer()) {
+      g1_policy()->print_detailed_heap_transition();
+    }
 
     print_heap_after_gc();
     trace_heap_after_gc(gc_tracer);
 
-    // We must call G1MonitoringSupport::update_sizes() in the same scoping level
-    // as an active TraceMemoryManagerStats object (i.e. before the destructor for the
-    // TraceMemoryManagerStats is called) so that the G1 memory pools are updated
-    // before any GC notifications are raised.
-    g1mm()->update_sizes();
-  }
-
-  post_full_gc_dump(gc_timer);
+    post_full_gc_dump(gc_timer);
+  }
 
   gc_timer->register_gc_end(os::elapsed_counter());
 
@@ -2018,8 +2024,12 @@
 
   // Since max_byte_size is aligned to the size of a heap region (checked
   // above), we also need to align the perm gen size as it might not be.
-  const size_t total_reserved = max_byte_size +
-                                align_size_up(pgs->max_size(), HeapRegion::GrainBytes);
+  size_t total_reserved = 0;
+
+  total_reserved = add_and_check_overflow(total_reserved, max_byte_size);
+  size_t pg_max_size = (size_t) align_size_up(pgs->max_size(), HeapRegion::GrainBytes);
+  total_reserved = add_and_check_overflow(total_reserved, pg_max_size);
+
   Universe::check_alignment(total_reserved, HeapRegion::GrainBytes, "g1 heap and perm");
 
   char* addr = Universe::preferred_heap_base(total_reserved, Universe::UnscaledNarrowOop);
@@ -3357,12 +3367,12 @@
 
 void G1CollectedHeap::verify(bool silent,
                              VerifyOption vo) {
-  if (SafepointSynchronize::is_at_safepoint() || ! UseTLAB) {
+  if (SafepointSynchronize::is_at_safepoint()) {
     if (!silent) { gclog_or_tty->print("Roots (excluding permgen) "); }
     VerifyRootsClosure rootsCl(vo);
 
     assert(Thread::current()->is_VM_thread(),
-      "Expected to be executed serially by the VM thread at this point");
+           "Expected to be executed serially by the VM thread at this point");
 
     CodeBlobToOopClosure blobsCl(&rootsCl, /*do_marking=*/ false);
 
@@ -3455,7 +3465,8 @@
     }
     guarantee(!failures, "there should not have been any failures");
   } else {
-    if (!silent) gclog_or_tty->print("(SKIPPING roots, heapRegions, remset) ");
+    if (!silent)
+      gclog_or_tty->print("(SKIPPING roots, heapRegionSets, heapRegions, remset) ");
   }
 }
 
@@ -3917,7 +3928,6 @@
         // The elapsed time induced by the start time below deliberately elides
         // the possible verification above.
         double sample_start_time_sec = os::elapsedTime();
-        size_t start_used_bytes = used();
 
 #if YOUNG_LIST_VERBOSE
         gclog_or_tty->print_cr("\nBefore recording pause start.\nYoung_list:");
@@ -3925,8 +3935,7 @@
         g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
 #endif // YOUNG_LIST_VERBOSE
 
-        g1_policy()->record_collection_pause_start(sample_start_time_sec,
-                                                   start_used_bytes);
+        g1_policy()->record_collection_pause_start(sample_start_time_sec);
 
         double scan_wait_start = os::elapsedTime();
         // We have to wait until the CM threads finish scanning the
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Thu May 09 06:12:11 2013 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -406,7 +406,6 @@
   }
   _free_regions_at_end_of_collection = _g1->free_regions();
   update_young_list_target_length();
-  _prev_eden_capacity = _young_list_target_length * HeapRegion::GrainBytes;
 
   // We may immediately start allocating regions and placing them on the
   // collection set list. Initialize the per-collection set info
@@ -746,6 +745,7 @@
 
 void G1CollectorPolicy::record_full_collection_start() {
   _full_collection_start_sec = os::elapsedTime();
+  record_heap_size_info_at_start();
   // Release the future to-space so that it is available for compaction into.
   _g1->set_full_collection();
 }
@@ -788,8 +788,7 @@
   _stop_world_start = os::elapsedTime();
 }
 
-void G1CollectorPolicy::record_collection_pause_start(double start_time_sec,
-                                                      size_t start_used) {
+void G1CollectorPolicy::record_collection_pause_start(double start_time_sec) {
   // We only need to do this here as the policy will only be applied
   // to the GC we're about to start, so there is no point in calculating this
   // every time we calculate / recalculate the target young length.
@@ -803,19 +802,14 @@
   _trace_gen0_time_data.record_start_collection(s_w_t_ms);
   _stop_world_start = 0.0;
 
+  record_heap_size_info_at_start();
+
   phase_times()->record_cur_collection_start_sec(start_time_sec);
-  _cur_collection_pause_used_at_start_bytes = start_used;
-  _cur_collection_pause_used_regions_at_start = _g1->used_regions();
   _pending_cards = _g1->pending_card_num();
 
   _collection_set_bytes_used_before = 0;
   _bytes_copied_during_gc = 0;
 
-  YoungList* young_list = _g1->young_list();
-  _eden_bytes_before_gc = young_list->eden_used_bytes();
-  _survivor_bytes_before_gc = young_list->survivor_used_bytes();
-  _capacity_before_gc = _g1->capacity();
-
   _last_gc_was_young = false;
 
   // do that for any other surv rate groups
@@ -1156,6 +1150,21 @@
   byte_size_in_proper_unit((double)(bytes)),                    \
   proper_unit_for_byte_size((bytes))
 
+void G1CollectorPolicy::record_heap_size_info_at_start() {
+  YoungList* young_list = _g1->young_list();
+  _eden_bytes_before_gc = young_list->eden_used_bytes();
+  _survivor_bytes_before_gc = young_list->survivor_used_bytes();
+  _capacity_before_gc = _g1->capacity();
+
+  _cur_collection_pause_used_at_start_bytes = _g1->used();
+  _cur_collection_pause_used_regions_at_start = _g1->used_regions();
+
+  size_t eden_capacity_before_gc =
+         (_young_list_target_length * HeapRegion::GrainBytes) - _survivor_bytes_before_gc;
+
+  _prev_eden_capacity = eden_capacity_before_gc;
+}
+
 void G1CollectorPolicy::print_heap_transition() {
   _g1->print_size_transition(gclog_or_tty,
     _cur_collection_pause_used_at_start_bytes, _g1->used(), _g1->capacity());
@@ -1186,8 +1195,6 @@
       EXT_SIZE_PARAMS(_capacity_before_gc),
       EXT_SIZE_PARAMS(used),
       EXT_SIZE_PARAMS(capacity));
-
-    _prev_eden_capacity = eden_capacity;
 }
 
 void G1CollectorPolicy::adjust_concurrent_refinement(double update_rs_time,
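
The new record_heap_size_info_at_start() derives the eden capacity from the young-list
target length minus the survivor bytes already in use. A standalone sketch of that
arithmetic with purely hypothetical numbers (1 MB regions, a 64-region young target,
8 MB of survivors; none of these values come from the changeset):

    #include <cstddef>
    #include <cstdio>

    int main() {
      // Hypothetical inputs, chosen only for illustration.
      const size_t grain_bytes              = 1 * 1024 * 1024;   // stand-in for HeapRegion::GrainBytes
      const size_t young_list_target_length = 64;                // regions
      const size_t survivor_bytes_before_gc = 8 * 1024 * 1024;

      // Mirrors record_heap_size_info_at_start(): eden capacity is the young
      // target (in bytes) minus the space already taken by survivor regions.
      size_t eden_capacity_before_gc =
          (young_list_target_length * grain_bytes) - survivor_bytes_before_gc;

      printf("eden capacity before GC: %zu MB\n",
             eden_capacity_before_gc / (1024 * 1024));           // prints 56 MB
      return 0;
    }
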
--- a/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp	Thu May 09 06:12:11 2013 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -672,34 +672,36 @@
 
   bool need_to_start_conc_mark(const char* source, size_t alloc_word_size = 0);
 
-  // Update the heuristic info to record a collection pause of the given
-  // start time, where the given number of bytes were used at the start.
-  // This may involve changing the desired size of a collection set.
+  // Record the start and end of an evacuation pause.
+  void record_collection_pause_start(double start_time_sec);
+  void record_collection_pause_end(double pause_time_ms, EvacuationInfo& evacuation_info);
 
-  void record_stop_world_start();
-
-  void record_collection_pause_start(double start_time_sec, size_t start_used);
+  // Record the start and end of a full collection.
+  void record_full_collection_start();
+  void record_full_collection_end();
 
   // Must currently be called while the world is stopped.
-  void record_concurrent_mark_init_end(double
-                                           mark_init_elapsed_time_ms);
+  void record_concurrent_mark_init_end(double mark_init_elapsed_time_ms);
 
+  // Record start and end of remark.
   void record_concurrent_mark_remark_start();
   void record_concurrent_mark_remark_end();
 
+  // Record start, end, and completion of cleanup.
   void record_concurrent_mark_cleanup_start();
   void record_concurrent_mark_cleanup_end(int no_of_gc_threads);
   void record_concurrent_mark_cleanup_completed();
 
-  void record_concurrent_pause();
+  // Records heap size information for reporting in
+  // print_detailed_heap_transition().
+  void record_heap_size_info_at_start();
 
-  void record_collection_pause_end(double pause_time, EvacuationInfo& evacuation_info);
+  // Print the heap sizing transition (at two levels of detail).
   void print_heap_transition();
   void print_detailed_heap_transition();
 
-  // Record the fact that a full collection occurred.
-  void record_full_collection_start();
-  void record_full_collection_end();
+  void record_stop_world_start();
+  void record_concurrent_pause();
 
   // Record how much space we copied during a GC. This is typically
   // called when a GC alloc region is being retired.
--- a/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/parallelScavenge/parallelScavengeHeap.cpp	Thu May 09 06:12:11 2013 -0700
@@ -132,7 +132,12 @@
                   og_min_size, og_max_size,
                   yg_min_size, yg_max_size);
 
-  const size_t total_reserved = pg_max_size + og_max_size + yg_max_size;
+  size_t total_reserved = 0;
+
+  total_reserved = add_and_check_overflow(total_reserved, pg_max_size);
+  total_reserved = add_and_check_overflow(total_reserved, og_max_size);
+  total_reserved = add_and_check_overflow(total_reserved, yg_max_size);
+
   char* addr = Universe::preferred_heap_base(total_reserved, Universe::UnscaledNarrowOop);
 
   // The main part of the heap (old gen + young gen) can often use a larger page
--- a/src/share/vm/gc_implementation/shared/gcTrace.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_implementation/shared/gcTrace.cpp	Thu May 09 06:12:11 2013 -0700
@@ -97,8 +97,16 @@
   ObjectCountEventSenderClosure(GCTracer* gc_tracer) : _gc_tracer(gc_tracer) {}
  private:
   void do_cinfo(KlassInfoEntry* entry) {
-    _gc_tracer->send_object_count_after_gc_event(entry->klass(), entry->count(),
-                                                 entry->words() * BytesPerWord);
+    if (is_visible_klass(entry->klass())) {
+      _gc_tracer->send_object_count_after_gc_event(entry->klass(), entry->count(),
+                                                   entry->words() * BytesPerWord);
+    }
+  }
+
+  // Do not expose internal, implementation-specific classes
+  bool is_visible_klass(klassOop k) {
+    return k->klass_part()->oop_is_instance() ||
+           (k->klass_part()->oop_is_array() && k != Universe::systemObjArrayKlassObj());
   }
 };
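
The is_visible_klass() filter keeps object-count events limited to klasses a Java program
can actually observe: instance klasses, plus arrays other than the VM-internal system
object array. A HotSpot-free sketch of the same filtering idea, using an invented Entry
type rather than KlassInfoEntry:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct Entry {               // hypothetical stand-in for KlassInfoEntry
      std::string name;
      bool is_instance;
      bool is_array;
      bool is_vm_internal;       // e.g. the system object array klass
      long count;
    };

    static bool is_visible(const Entry& e) {
      return e.is_instance || (e.is_array && !e.is_vm_internal);
    }

    int main() {
      std::vector<Entry> entries = {
        {"java.lang.String",      true,  false, false, 120},
        {"[Ljava.lang.Object;",   false, true,  false,  40},
        {"<system object array>", false, true,  true,    3},  // filtered out
      };
      for (const Entry& e : entries) {
        if (is_visible(e)) {
          printf("%s: %ld instances\n", e.name.c_str(), e.count);
        }
      }
      return 0;
    }
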
 
--- a/src/share/vm/gc_interface/collectedHeap.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.cpp	Thu May 09 06:12:11 2013 -0700
@@ -56,6 +56,9 @@
 
 size_t CollectedHeap::_filler_array_max_size = 0;
 
+const char* CollectedHeap::OverflowMessage
+  = "The size of the object heap + perm gen exceeds the maximum representable size";
+
 template <>
 void EventLogBase<GCMessage>::print(outputStream* st, GCMessage& m) {
   st->print_cr("GC heap %s", m.is_before ? "before" : "after");
@@ -180,6 +183,26 @@
 #endif
 }
 
+size_t CollectedHeap::add_and_check_overflow(size_t total, size_t size) {
+  assert(size >= 0, "must be");
+  size_t result = total + size;
+  if (result < size) {
+    // We must have overflowed
+    vm_exit_during_initialization(CollectedHeap::OverflowMessage);
+  }
+  return result;
+}
+
+size_t CollectedHeap::round_up_and_check_overflow(size_t total, size_t size) {
+  assert(size >= 0, "must be");
+  size_t result = round_to(total, size);
+  if (result < size) {
+    // We must have overflowed
+    vm_exit_during_initialization(CollectedHeap::OverflowMessage);
+  }
+  return result;
+}
+
 #ifndef PRODUCT
 void CollectedHeap::check_for_bad_heap_word_value(HeapWord* addr, size_t size) {
   if (CheckMemoryInitialization && ZapUnusedHeapArea) {
--- a/src/share/vm/gc_interface/collectedHeap.hpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/gc_interface/collectedHeap.hpp	Thu May 09 06:12:11 2013 -0700
@@ -92,6 +92,8 @@
   // Used for filler objects (static, but initialized in ctor).
   static size_t _filler_array_max_size;
 
+  const static char* OverflowMessage;
+
   GCHeapLog* _gc_heap_log;
 
   // Used in support of ReduceInitialCardMarks; only consulted if COMPILER2 is being used
@@ -134,6 +136,16 @@
   // Reinitialize tlabs before resuming mutators.
   virtual void resize_all_tlabs();
 
+  // Returns the sum of total and size if the sum does not overflow;
+  // otherwise, calls vm_exit_during_initialization().
+  // The overflow check compares the sum against size, which is assumed to be non-zero.
+  size_t add_and_check_overflow(size_t total, size_t size);
+
+  // Rounds total up to a multiple of size and returns the result if it does not overflow;
+  // otherwise, calls vm_exit_during_initialization().
+  // The overflow check compares the rounded-up result against size, which is assumed to be non-zero.
+  size_t round_up_and_check_overflow(size_t total, size_t size);
+
   // Allocate from the current thread's TLAB, with broken-out slow path.
   inline static HeapWord* allocate_from_tlab(KlassHandle klass, Thread* thread, size_t size);
   static HeapWord* allocate_from_tlab_slow(KlassHandle klass, Thread* thread, size_t size);
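
The two helpers declared above rely on unsigned wraparound: if total + size overflows a
size_t, the sum comes out smaller than size. A minimal standalone sketch (plain C++ with
invented names checked_add and checked_round_up; the round-up variant here checks against
the running total rather than against the alignment, which is a simplification of the
HotSpot code):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // Unsigned overflow check: if total + size wrapped, the sum is smaller than size.
    static size_t checked_add(size_t total, size_t size) {
      size_t result = total + size;
      if (result < size) {
        fprintf(stderr, "object heap + perm gen size overflows size_t\n");
        exit(1);
      }
      return result;
    }

    // Round total up to a multiple of alignment (a power of two) and detect wraparound.
    static size_t checked_round_up(size_t total, size_t alignment) {
      size_t result = (total + alignment - 1) & ~(alignment - 1);
      if (result < total) {
        fprintf(stderr, "rounded heap size overflows size_t\n");
        exit(1);
      }
      return result;
    }

    int main() {
      size_t reserved = 0;
      reserved = checked_add(reserved, 3u * 1024 * 1024 * 1024);  // e.g. -Xmx3072m
      reserved = checked_add(reserved, 1u * 1024 * 1024 * 1024);  // e.g. -XX:MaxPermSize=1024m
      // With a 32-bit size_t the two adds above already wrap and the program exits;
      // on 64-bit the total is then rounded up to a large-page boundary.
      reserved = checked_round_up(reserved, 4 * 1024 * 1024);
      printf("total reserved: %zu bytes\n", reserved);
      return 0;
    }
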
--- a/src/share/vm/memory/genCollectedHeap.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/memory/genCollectedHeap.cpp	Thu May 09 06:12:11 2013 -0700
@@ -201,9 +201,6 @@
                                  size_t* _total_reserved,
                                  int* _n_covered_regions,
                                  ReservedSpace* heap_rs){
-  const char overflow_msg[] = "The size of the object heap + VM data exceeds "
-    "the maximum representable size";
-
   // Now figure out the total size.
   size_t total_reserved = 0;
   int n_covered_regions = 0;
@@ -211,41 +208,29 @@
       os::large_page_size() : os::vm_page_size();
 
   for (int i = 0; i < _n_gens; i++) {
-    total_reserved += _gen_specs[i]->max_size();
-    if (total_reserved < _gen_specs[i]->max_size()) {
-      vm_exit_during_initialization(overflow_msg);
-    }
+    total_reserved = add_and_check_overflow(total_reserved, _gen_specs[i]->max_size());
     n_covered_regions += _gen_specs[i]->n_covered_regions();
   }
+
   assert(total_reserved % pageSize == 0,
          err_msg("Gen size; total_reserved=" SIZE_FORMAT ", pageSize="
                  SIZE_FORMAT, total_reserved, pageSize));
-  total_reserved += perm_gen_spec->max_size();
+  total_reserved = add_and_check_overflow(total_reserved, perm_gen_spec->max_size());
   assert(total_reserved % pageSize == 0,
          err_msg("Perm size; total_reserved=" SIZE_FORMAT ", pageSize="
                  SIZE_FORMAT ", perm gen max=" SIZE_FORMAT, total_reserved,
                  pageSize, perm_gen_spec->max_size()));
 
-  if (total_reserved < perm_gen_spec->max_size()) {
-    vm_exit_during_initialization(overflow_msg);
-  }
   n_covered_regions += perm_gen_spec->n_covered_regions();
 
   // Add the size of the data area which shares the same reserved area
   // as the heap, but which is not actually part of the heap.
-  size_t s = perm_gen_spec->misc_data_size() + perm_gen_spec->misc_code_size();
-
-  total_reserved += s;
-  if (total_reserved < s) {
-    vm_exit_during_initialization(overflow_msg);
-  }
+  size_t misc = perm_gen_spec->misc_data_size() + perm_gen_spec->misc_code_size();
+  total_reserved = add_and_check_overflow(total_reserved, misc);
 
   if (UseLargePages) {
     assert(total_reserved != 0, "total_reserved cannot be 0");
-    total_reserved = round_to(total_reserved, os::large_page_size());
-    if (total_reserved < os::large_page_size()) {
-      vm_exit_during_initialization(overflow_msg);
-    }
+    total_reserved = round_up_and_check_overflow(total_reserved, os::large_page_size());
   }
 
   // Calculate the address at which the heap must reside in order for
--- a/src/share/vm/memory/universe.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/memory/universe.cpp	Thu May 09 06:12:11 2013 -0700
@@ -1033,15 +1033,6 @@
 void universe2_init() {
   EXCEPTION_MARK;
   Universe::genesis(CATCH);
-  // Although we'd like to verify here that the state of the heap
-  // is good, we can't because the main thread has not yet added
-  // itself to the threads list (so, using current interfaces
-  // we can't "fill" its TLAB), unless TLABs are disabled.
-  if (VerifyBeforeGC && !UseTLAB &&
-      Universe::heap()->total_collections() >= VerifyGCStartAt) {
-     Universe::heap()->prepare_for_verify();
-     Universe::verify();   // make sure we're starting with a clean slate
-  }
 }
 
 
--- a/src/share/vm/opto/graphKit.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/opto/graphKit.cpp	Thu May 09 06:12:11 2013 -0700
@@ -3570,7 +3570,8 @@
 
   Node* no_ctrl = NULL;
   Node* no_base = __ top();
-  Node* zero = __ ConI(0);
+  Node* zero  = __ ConI(0);
+  Node* zeroX = __ ConX(0);
 
   float likely  = PROB_LIKELY(0.999);
   float unlikely  = PROB_UNLIKELY(0.999);
@@ -3596,7 +3597,9 @@
 
   // if (!marking)
   __ if_then(marking, BoolTest::ne, zero); {
-    Node* index   = __ load(__ ctrl(), index_adr, TypeInt::INT, T_INT, Compile::AliasIdxRaw);
+    BasicType index_bt = TypeX_X->basic_type();
+    assert(sizeof(size_t) == type2aelembytes(index_bt), "Loading G1 PtrQueue::_index with wrong size.");
+    Node* index   = __ load(__ ctrl(), index_adr, TypeX_X, index_bt, Compile::AliasIdxRaw);
 
     if (do_load) {
       // load original value
@@ -3609,22 +3612,16 @@
       Node* buffer  = __ load(__ ctrl(), buffer_adr, TypeRawPtr::NOTNULL, T_ADDRESS, Compile::AliasIdxRaw);
 
       // is the queue for this thread full?
-      __ if_then(index, BoolTest::ne, zero, likely); {
+      __ if_then(index, BoolTest::ne, zeroX, likely); {
 
         // decrement the index
-        Node* next_index = __ SubI(index,  __ ConI(sizeof(intptr_t)));
-        Node* next_indexX = next_index;
-#ifdef _LP64
-        // We could refine the type for what it's worth
-        // const TypeLong* lidxtype = TypeLong::make(CONST64(0), get_size_from_queue);
-        next_indexX = _gvn.transform( new (C) ConvI2LNode(next_index, TypeLong::make(0, max_jlong, Type::WidenMax)) );
-#endif
+        Node* next_index = _gvn.transform(new (C) SubXNode(index, __ ConX(sizeof(intptr_t))));
 
         // Now get the buffer location we will log the previous value into and store it
-        Node *log_addr = __ AddP(no_base, buffer, next_indexX);
+        Node *log_addr = __ AddP(no_base, buffer, next_index);
         __ store(__ ctrl(), log_addr, pre_val, T_OBJECT, Compile::AliasIdxRaw);
         // update the index
-        __ store(__ ctrl(), index_adr, next_index, T_INT, Compile::AliasIdxRaw);
+        __ store(__ ctrl(), index_adr, next_index, index_bt, Compile::AliasIdxRaw);
 
       } __ else_(); {
 
@@ -3651,26 +3648,21 @@
                             Node* buffer,
                             const TypeFunc* tf) {
 
-  Node* zero = __ ConI(0);
+  Node* zero  = __ ConI(0);
+  Node* zeroX = __ ConX(0);
   Node* no_base = __ top();
   BasicType card_bt = T_BYTE;
   // Smash zero into card. MUST BE ORDERED WRT TO STORE
   __ storeCM(__ ctrl(), card_adr, zero, oop_store, oop_alias_idx, card_bt, Compile::AliasIdxRaw);
 
   //  Now do the queue work
-  __ if_then(index, BoolTest::ne, zero); {
-
-    Node* next_index = __ SubI(index, __ ConI(sizeof(intptr_t)));
-    Node* next_indexX = next_index;
-#ifdef _LP64
-    // We could refine the type for what it's worth
-    // const TypeLong* lidxtype = TypeLong::make(CONST64(0), get_size_from_queue);
-    next_indexX = _gvn.transform( new (C) ConvI2LNode(next_index, TypeLong::make(0, max_jlong, Type::WidenMax)) );
-#endif // _LP64
-    Node* log_addr = __ AddP(no_base, buffer, next_indexX);
+  __ if_then(index, BoolTest::ne, zeroX); {
+
+    Node* next_index = _gvn.transform(new (C) SubXNode(index, __ ConX(sizeof(intptr_t))));
+    Node* log_addr = __ AddP(no_base, buffer, next_index);
 
     __ store(__ ctrl(), log_addr, card_adr, T_ADDRESS, Compile::AliasIdxRaw);
-    __ store(__ ctrl(), index_adr, next_index, T_INT, Compile::AliasIdxRaw);
+    __ store(__ ctrl(), index_adr, next_index, TypeX_X->basic_type(), Compile::AliasIdxRaw);
 
   } __ else_(); {
     __ make_leaf_call(tf, CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), "g1_wb_post", card_adr, __ thread());
@@ -3731,7 +3723,7 @@
   // Now some values
   // Use ctrl to avoid hoisting these values past a safepoint, which could
   // potentially reset these fields in the JavaThread.
-  Node* index  = __ load(__ ctrl(), index_adr, TypeInt::INT, T_INT, Compile::AliasIdxRaw);
+  Node* index  = __ load(__ ctrl(), index_adr, TypeX_X, TypeX_X->basic_type(), Compile::AliasIdxRaw);
   Node* buffer = __ load(__ ctrl(), buffer_adr, TypeRawPtr::NOTNULL, T_ADDRESS, Compile::AliasIdxRaw);
 
   // Convert the store obj pointer to an int prior to doing math on it
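
The graphKit changes load and update PtrQueue::_index with a pointer-sized (X) operation
instead of a 32-bit T_INT one, since the field is a size_t byte offset that counts down
by sizeof(intptr_t) per logged entry. A HotSpot-free sketch of that invariant, using a
hypothetical PtrQueueLike struct rather than the VM's PtrQueue:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct PtrQueueLike {   // hypothetical stand-in, not HotSpot's PtrQueue
      void** _buf;          // log buffer
      size_t _index;        // byte offset of the next free slot, counts down to 0
    };

    int main() {
      void* slots[4] = {};
      PtrQueueLike q = { slots, sizeof(slots) };   // index starts at the buffer size

      // Enqueue one entry, mirroring "next_index = index - sizeof(intptr_t)".
      if (q._index != 0) {
        size_t next_index = q._index - sizeof(intptr_t);
        void** slot = reinterpret_cast<void**>(
            reinterpret_cast<char*>(q._buf) + next_index);
        *slot = reinterpret_cast<void*>(0x1234);   // the logged value
        q._index = next_index;   // must be a full-width (size_t) store on LP64
      }

      printf("sizeof(size_t) = %zu, sizeof(int) = %zu, index = %zu\n",
             sizeof(size_t), sizeof(int), q._index);
      return 0;
    }
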
--- a/src/share/vm/runtime/init.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/runtime/init.cpp	Thu May 09 06:12:11 2013 -0700
@@ -128,15 +128,6 @@
   javaClasses_init();   // must happen after vtable initialization
   stubRoutines_init2(); // note: StubRoutines need 2-phase init
 
-  // Although we'd like to, we can't easily do a heap verify
-  // here because the main thread isn't yet a JavaThread, so
-  // its TLAB may not be made parseable from the usual interfaces.
-  if (VerifyBeforeGC && !UseTLAB &&
-      Universe::heap()->total_collections() >= VerifyGCStartAt) {
-    Universe::heap()->prepare_for_verify();
-    Universe::verify();   // make sure we're starting with a clean slate
-  }
-
   // All the flags that get adjusted by VM_Version_init and os::init_2
   // have been set so dump the flags now.
   if (PrintFlagsFinal) {
--- a/src/share/vm/runtime/thread.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/runtime/thread.cpp	Thu May 09 06:12:11 2013 -0700
@@ -3383,12 +3383,6 @@
   // real raw monitor. VM is setup enough here for raw monitor enter.
   JvmtiExport::transition_pending_onload_raw_monitors();
 
-  if (VerifyBeforeGC &&
-      Universe::heap()->total_collections() >= VerifyGCStartAt) {
-    Universe::heap()->prepare_for_verify();
-    Universe::verify();   // make sure we're starting with a clean slate
-  }
-
   // Fully start NMT
   MemTracker::start();
 
@@ -3412,6 +3406,11 @@
   }
 
   assert (Universe::is_fully_initialized(), "not initialized");
+  if (VerifyBeforeGC && VerifyGCStartAt == 0) {
+    Universe::heap()->prepare_for_verify();
+    Universe::verify();   // make sure we're starting with a clean slate
+  }
+
   EXCEPTION_MARK;
 
   // At this point, the Universe is initialized, but we have not executed
--- a/src/share/vm/services/memBaseline.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/services/memBaseline.cpp	Thu May 09 06:12:11 2013 -0700
@@ -23,9 +23,12 @@
  */
 #include "precompiled.hpp"
 #include "memory/allocation.hpp"
+#include "runtime/safepoint.hpp"
+#include "runtime/thread.hpp"
 #include "services/memBaseline.hpp"
 #include "services/memTracker.hpp"
 
+
 MemType2Name MemBaseline::MemType2NameMap[NUMBER_OF_MEMORY_TYPE] = {
   {mtJavaHeap,   "Java Heap"},
   {mtClass,      "Class"},
@@ -150,6 +153,15 @@
   return true;
 }
 
+// Check whether a safepoint is in progress; if so, block the thread
+// for the duration of the safepoint.
+void MemBaseline::check_safepoint(JavaThread* thr) {
+  if (SafepointSynchronize::is_synchronizing()) {
+    // grab and drop the SR_lock to honor the safepoint protocol
+    MutexLocker ml(thr->SR_lock());
+  }
+}
+
 // baseline mmap'd memory records, generate overall summary and summaries by
 // memory types
 bool MemBaseline::baseline_vm_summary(const MemPointerArray* vm_records) {
@@ -307,7 +319,7 @@
         committed_rec->pc() != vm_ptr->pc()) {
         if (!_vm_map->append(vm_ptr)) {
           return false;
-  }
+        }
         committed_rec = (VMMemRegionEx*)_vm_map->at(_vm_map->length() - 1);
     } else {
         committed_rec->expand_region(vm_ptr->addr(), vm_ptr->size());
@@ -345,16 +357,27 @@
 
 // baseline a snapshot. If summary_only = false, memory usages aggregated by
 // callsites are also baselined.
+// The call can be lengthy, especially when detailed tracking info is
+// requested, so the method checks for safepoints explicitly.
 bool MemBaseline::baseline(MemSnapshot& snapshot, bool summary_only) {
-  MutexLockerEx snapshot_locker(snapshot._lock, true);
+  Thread* THREAD = Thread::current();
+  assert(THREAD->is_Java_thread(), "must be a JavaThread");
+  MutexLocker snapshot_locker(snapshot._lock);
   reset();
-  _baselined = baseline_malloc_summary(snapshot._alloc_ptrs) &&
-               baseline_vm_summary(snapshot._vm_ptrs);
+  _baselined = baseline_malloc_summary(snapshot._alloc_ptrs);
+  if (_baselined) {
+    check_safepoint((JavaThread*)THREAD);
+    _baselined = baseline_vm_summary(snapshot._vm_ptrs);
+  }
   _number_of_classes = snapshot.number_of_classes();
 
   if (!summary_only && MemTracker::track_callsite() && _baselined) {
-    _baselined =  baseline_malloc_details(snapshot._alloc_ptrs) &&
-      baseline_vm_details(snapshot._vm_ptrs);
+    check_safepoint((JavaThread*)THREAD);
+    _baselined =  baseline_malloc_details(snapshot._alloc_ptrs);
+    if (_baselined) {
+      check_safepoint((JavaThread*)THREAD);
+      _baselined =  baseline_vm_details(snapshot._vm_ptrs);
+    }
   }
   return _baselined;
 }
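
check_safepoint() lets the lengthy baseline operation cooperate with a pending safepoint
by briefly taking and dropping the thread's SR_lock between phases. A rough, HotSpot-free
analogy built on std::thread and std::mutex (invented names; this is not the VM's
safepoint protocol): a worker polls a flag between chunks of work and parks on a lock the
coordinator holds, so the coordinator never has to wait for the whole job:

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <mutex>
    #include <thread>

    std::atomic<bool> synchronizing{false};
    std::mutex pause_lock;                 // plays the role of the SR_lock here

    void check_pause_point() {
      if (synchronizing.load()) {
        std::lock_guard<std::mutex> lg(pause_lock);  // blocks until the coordinator releases it
      }
    }

    int main() {
      std::thread worker([] {
        for (int chunk = 0; chunk < 5; ++chunk) {
          std::this_thread::sleep_for(std::chrono::milliseconds(50));  // a chunk of baselining
          check_pause_point();                                         // cooperate between chunks
          printf("finished chunk %d\n", chunk);
        }
      });

      std::this_thread::sleep_for(std::chrono::milliseconds(120));
      {
        std::lock_guard<std::mutex> lg(pause_lock);  // "safepoint": hold the lock...
        synchronizing = true;
        std::this_thread::sleep_for(std::chrono::milliseconds(100));   // ...do coordinated work
        synchronizing = false;
      }                                              // release: the worker resumes
      worker.join();
      return 0;
    }
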
--- a/src/share/vm/services/memBaseline.hpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/services/memBaseline.hpp	Thu May 09 06:12:11 2013 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -330,6 +330,9 @@
   // should not use copy constructor
   MemBaseline(MemBaseline& copy) { ShouldNotReachHere(); }
 
+  // check for a pending safepoint and block for its duration
+  static inline void check_safepoint(JavaThread* thr);
+
  public:
   // create a memory baseline
   MemBaseline();
--- a/src/share/vm/services/memTracker.cpp	Wed May 08 16:51:54 2013 -0700
+++ b/src/share/vm/services/memTracker.cpp	Thu May 09 06:12:11 2013 -0700
@@ -573,7 +573,7 @@
 
 // baseline current memory snapshot
 bool MemTracker::baseline() {
-  MutexLockerEx lock(_query_lock, true);
+  MutexLocker lock(_query_lock);
   MemSnapshot* snapshot = get_snapshot();
   if (snapshot != NULL) {
     return _baseline.baseline(*snapshot, false);
@@ -584,7 +584,7 @@
 // print memory usage from current snapshot
 bool MemTracker::print_memory_usage(BaselineOutputer& out, size_t unit, bool summary_only) {
   MemBaseline  baseline;
-  MutexLockerEx lock(_query_lock, true);
+  MutexLocker  lock(_query_lock);
   MemSnapshot* snapshot = get_snapshot();
   if (snapshot != NULL && baseline.baseline(*snapshot, summary_only)) {
     BaselineReporter reporter(out, unit);
@@ -597,7 +597,7 @@
 // Whitebox API for blocking until the current generation of NMT data has been merged
 bool MemTracker::wbtest_wait_for_data_merge() {
   // NMT can't be shutdown while we're holding _query_lock
-  MutexLockerEx lock(_query_lock, true);
+  MutexLocker lock(_query_lock);
   assert(_worker_thread != NULL, "Invalid query");
   // the generation at query time, so NMT will spin till this generation is processed
   unsigned long generation_at_query_time = SequenceGenerator::current_generation();
@@ -641,7 +641,7 @@
 
 // compare memory usage between current snapshot and baseline
 bool MemTracker::compare_memory_usage(BaselineOutputer& out, size_t unit, bool summary_only) {
-  MutexLockerEx lock(_query_lock, true);
+  MutexLocker lock(_query_lock);
   if (_baseline.baselined()) {
     MemBaseline baseline;
     MemSnapshot* snapshot = get_snapshot();
--- a/test/TEST.ROOT	Wed May 08 16:51:54 2013 -0700
+++ b/test/TEST.ROOT	Thu May 09 06:12:11 2013 -0700
@@ -28,4 +28,4 @@
 # DO NOT EDIT without first contacting hotspot-regtest@sun.com
 
 # The list of keywords supported in this test suite
-keys=cte_test jcmd nmt regression
+keys=cte_test jcmd nmt regression gc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/gc/TestVerifyBeforeGCDuringStartup.java	Thu May 09 06:12:11 2013 -0700
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/* @test TestVerifyBeforeGCDuringStartup.java
+ * @key gc
+ * @bug 8010463
+ * @summary Simple test run with -XX:+VerifyBeforeGC -XX:-UseTLAB to verify 8010463
+ * @library /testlibrary
+ */
+
+import com.oracle.java.testlibrary.OutputAnalyzer;
+import com.oracle.java.testlibrary.ProcessTools;
+
+public class TestVerifyBeforeGCDuringStartup {
+  public static void main(String args[]) throws Exception {
+    ProcessBuilder pb =
+      ProcessTools.createJavaProcessBuilder(System.getProperty("test.vm.opts"),
+                                            "-XX:-UseTLAB",
+                                            "-XX:+UnlockDiagnosticVMOptions",
+                                            "-XX:+VerifyBeforeGC", "-version");
+    OutputAnalyzer output = new OutputAnalyzer(pb.start());
+    output.shouldContain("[Verifying");
+    output.shouldHaveExitValue(0);
+  }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/gc/init/TestHandleExceedingProcessSizeLimitIn32BitBuilds.java	Thu May 09 06:12:11 2013 -0700
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/* @test TestHandleExceedingProcessSizeLimitIn32BitBuilds.java
+ * @key gc
+ * @bug 6761744
+ * @summary Test run with "-Xmx3072m -XX:MaxPermSize=1024m" to correctly handle VM error (if any)
+ * @library /testlibrary
+ */
+
+import com.oracle.java.testlibrary.OutputAnalyzer;
+import com.oracle.java.testlibrary.ProcessTools;
+import java.util.ArrayList;
+import java.util.Collections;
+
+public class TestHandleExceedingProcessSizeLimitIn32BitBuilds {
+  public static void main(String args[]) throws Exception {
+    ArrayList<String> vmOpts = new ArrayList<>();
+    String testVmOptsStr = System.getProperty("test.java.opts", "");
+    if (!testVmOptsStr.isEmpty()) {
+      String[] testVmOpts = testVmOptsStr.split(" ");
+      Collections.addAll(vmOpts, testVmOpts);
+    }
+    Collections.addAll(vmOpts, new String[] {"-Xmx3072m", "-XX:MaxPermSize=1024m", "-version"});
+
+    ProcessBuilder pb
+      = ProcessTools.createJavaProcessBuilder(vmOpts.toArray(new String[vmOpts.size()]));
+    OutputAnalyzer output = new OutputAnalyzer(pb.start());
+
+    String dataModel = System.getProperty("sun.arch.data.model");
+    if (dataModel.equals("32")) {
+      output.shouldContain("The size of the object heap + perm gen exceeds the maximum representable size");
+      if (output.getExitValue() == 0) {
+        throw new RuntimeException("Not expected to get exit value 0");
+      }
+    }
+  }
+}
\ No newline at end of file