changeset 2608:842b840e67db

7046558: G1: concurrent marking optimizations Summary: Some optimizations to improve the concurrent marking phase: specialize the main oop closure, make sure a few methods in the fast path are properly inlined, a few more bits and pieces, and some cosmetic fixes. Reviewed-by: stefank, johnc
author tonyp
date Tue, 14 Jun 2011 10:33:43 -0400
parents 74cd10898bea
children 6747fd0512e0
files src/share/vm/gc_implementation/g1/concurrentMark.cpp src/share/vm/gc_implementation/g1/concurrentMark.hpp src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp src/share/vm/gc_implementation/g1/g1OopClosures.hpp src/share/vm/gc_implementation/g1/g1OopClosures.inline.hpp src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp src/share/vm/utilities/bitMap.hpp
diffstat 7 files changed, 298 insertions(+), 212 deletions(-) [+]
line wrap: on
line diff
--- a/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Mon Jun 13 13:48:18 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Tue Jun 14 10:33:43 2011 -0400
@@ -24,10 +24,11 @@
 
 #include "precompiled.hpp"
 #include "classfile/symbolTable.hpp"
-#include "gc_implementation/g1/concurrentMark.hpp"
+#include "gc_implementation/g1/concurrentMark.inline.hpp"
 #include "gc_implementation/g1/concurrentMarkThread.inline.hpp"
 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
 #include "gc_implementation/g1/g1CollectorPolicy.hpp"
+#include "gc_implementation/g1/g1OopClosures.inline.hpp"
 #include "gc_implementation/g1/g1RemSet.hpp"
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
 #include "gc_implementation/g1/heapRegionSeq.inline.hpp"
@@ -2546,34 +2547,42 @@
 };
 
 void ConcurrentMark::deal_with_reference(oop obj) {
-  if (verbose_high())
+  if (verbose_high()) {
     gclog_or_tty->print_cr("[global] we're dealing with reference "PTR_FORMAT,
                            (void*) obj);
-
+  }
 
   HeapWord* objAddr = (HeapWord*) obj;
   assert(obj->is_oop_or_null(true /* ignore mark word */), "Error");
   if (_g1h->is_in_g1_reserved(objAddr)) {
-    assert(obj != NULL, "is_in_g1_reserved should ensure this");
-    HeapRegion* hr = _g1h->heap_region_containing(obj);
-    if (_g1h->is_obj_ill(obj, hr)) {
-      if (verbose_high())
-        gclog_or_tty->print_cr("[global] "PTR_FORMAT" is not considered "
-                               "marked", (void*) obj);
-
-      // we need to mark it first
-      if (_nextMarkBitMap->parMark(objAddr)) {
-        // No OrderAccess:store_load() is needed. It is implicit in the
-        // CAS done in parMark(objAddr) above
-        HeapWord* finger = _finger;
-        if (objAddr < finger) {
-          if (verbose_high())
-            gclog_or_tty->print_cr("[global] below the global finger "
-                                   "("PTR_FORMAT"), pushing it", finger);
-          if (!mark_stack_push(obj)) {
-            if (verbose_low())
-              gclog_or_tty->print_cr("[global] global stack overflow during "
-                                     "deal_with_reference");
+    assert(obj != NULL, "null check is implicit");
+    if (!_nextMarkBitMap->isMarked(objAddr)) {
+      // Only get the containing region if the object is not marked on the
+      // bitmap (otherwise, it's a waste of time since we won't do
+      // anything with it).
+      HeapRegion* hr = _g1h->heap_region_containing_raw(obj);
+      if (!hr->obj_allocated_since_next_marking(obj)) {
+        if (verbose_high()) {
+          gclog_or_tty->print_cr("[global] "PTR_FORMAT" is not considered "
+                                 "marked", (void*) obj);
+        }
+
+        // we need to mark it first
+        if (_nextMarkBitMap->parMark(objAddr)) {
+          // No OrderAccess:store_load() is needed. It is implicit in the
+          // CAS done in parMark(objAddr) above
+          HeapWord* finger = _finger;
+          if (objAddr < finger) {
+            if (verbose_high()) {
+              gclog_or_tty->print_cr("[global] below the global finger "
+                                     "("PTR_FORMAT"), pushing it", finger);
+            }
+            if (!mark_stack_push(obj)) {
+              if (verbose_low()) {
+                gclog_or_tty->print_cr("[global] global stack overflow during "
+                                       "deal_with_reference");
+              }
+            }
           }
         }
       }
@@ -2631,21 +2640,43 @@
   while (finger < _heap_end) {
     assert(_g1h->is_in_g1_reserved(finger), "invariant");
 
-    // is the gap between reading the finger and doing the CAS too long?
-
-    HeapRegion* curr_region   = _g1h->heap_region_containing(finger);
+    // Note on how this code handles humongous regions. In the
+    // normal case the finger will reach the start of a "starts
+    // humongous" (SH) region. Its end will either be the end of the
+    // last "continues humongous" (CH) region in the sequence, or the
+    // standard end of the SH region (if the SH is the only region in
+    // the sequence). That way claim_region() will skip over the CH
+    // regions. However, there is a subtle race between a CM thread
+    // executing this method and a mutator thread doing a humongous
+    // object allocation. The two are not mutually exclusive as the CM
+    // thread does not need to hold the Heap_lock when it gets
+    // here. So there is a chance that claim_region() will come across
+    // a free region that's in the progress of becoming a SH or a CH
+    // region. In the former case, it will either
+    //   a) Miss the update to the region's end, in which case it will
+    //      visit every subsequent CH region, will find their bitmaps
+    //      empty, and do nothing, or
+    //   b) Will observe the update of the region's end (in which case
+    //      it will skip the subsequent CH regions).
+    // If it comes across a region that suddenly becomes CH, the
+    // scenario will be similar to b). So, the race between
+    // claim_region() and a humongous object allocation might force us
+    // to do a bit of unnecessary work (due to some unnecessary bitmap
+    // iterations) but it should not introduce and correctness issues.
+    HeapRegion* curr_region   = _g1h->heap_region_containing_raw(finger);
     HeapWord*   bottom        = curr_region->bottom();
     HeapWord*   end           = curr_region->end();
     HeapWord*   limit         = curr_region->next_top_at_mark_start();
 
-    if (verbose_low())
+    if (verbose_low()) {
       gclog_or_tty->print_cr("[%d] curr_region = "PTR_FORMAT" "
                              "["PTR_FORMAT", "PTR_FORMAT"), "
                              "limit = "PTR_FORMAT,
                              task_num, curr_region, bottom, end, limit);
-
-    HeapWord* res =
-      (HeapWord*) Atomic::cmpxchg_ptr(end, &_finger, finger);
+    }
+
+    // Is the gap between reading the finger and doing the CAS too long?
+    HeapWord* res = (HeapWord*) Atomic::cmpxchg_ptr(end, &_finger, finger);
     if (res == finger) {
       // we succeeded
 
@@ -3191,6 +3222,22 @@
 }
 #endif
 
+void CMTask::scan_object(oop obj) {
+  assert(_nextMarkBitMap->isMarked((HeapWord*) obj), "invariant");
+
+  if (_cm->verbose_high()) {
+    gclog_or_tty->print_cr("[%d] we're scanning object "PTR_FORMAT,
+                           _task_id, (void*) obj);
+  }
+
+  size_t obj_size = obj->size();
+  _words_scanned += obj_size;
+
+  obj->oop_iterate(_cm_oop_closure);
+  statsOnly( ++_objs_scanned );
+  check_limits();
+}
+
 // Closure for iteration over bitmaps
 class CMBitMapClosure : public BitMapClosure {
 private:
@@ -3254,43 +3301,17 @@
   CMObjectClosure(CMTask* task) : _task(task) { }
 };
 
-// Closure for iterating over object fields
-class CMOopClosure : public OopClosure {
-private:
-  G1CollectedHeap*   _g1h;
-  ConcurrentMark*    _cm;
-  CMTask*            _task;
-
-public:
-  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
-  virtual void do_oop(      oop* p) { do_oop_work(p); }
-
-  template <class T> void do_oop_work(T* p) {
-    assert( _g1h->is_in_g1_reserved((HeapWord*) p), "invariant");
-    assert(!_g1h->is_on_master_free_list(
-                    _g1h->heap_region_containing((HeapWord*) p)), "invariant");
-
-    oop obj = oopDesc::load_decode_heap_oop(p);
-    if (_cm->verbose_high())
-      gclog_or_tty->print_cr("[%d] we're looking at location "
-                             "*"PTR_FORMAT" = "PTR_FORMAT,
-                             _task->task_id(), p, (void*) obj);
-    _task->deal_with_reference(obj);
+G1CMOopClosure::G1CMOopClosure(G1CollectedHeap* g1h,
+                               ConcurrentMark* cm,
+                               CMTask* task)
+  : _g1h(g1h), _cm(cm), _task(task) {
+  assert(_ref_processor == NULL, "should be initialized to NULL");
+
+  if (G1UseConcMarkReferenceProcessing) {
+    _ref_processor = g1h->ref_processor();
+    assert(_ref_processor != NULL, "should not be NULL");
   }
-
-  CMOopClosure(G1CollectedHeap* g1h,
-               ConcurrentMark* cm,
-               CMTask* task)
-    : _g1h(g1h), _cm(cm), _task(task)
-  {
-    assert(_ref_processor == NULL, "should be initialized to NULL");
-
-    if (G1UseConcMarkReferenceProcessing) {
-      _ref_processor = g1h->ref_processor();
-      assert(_ref_processor != NULL, "should not be NULL");
-    }
-  }
-};
+}
 
 void CMTask::setup_for_region(HeapRegion* hr) {
   // Separated the asserts so that we know which one fires.
@@ -3362,6 +3383,15 @@
   _region_finger = NULL;
 }
 
+void CMTask::set_cm_oop_closure(G1CMOopClosure* cm_oop_closure) {
+  if (cm_oop_closure == NULL) {
+    assert(_cm_oop_closure != NULL, "invariant");
+  } else {
+    assert(_cm_oop_closure == NULL, "invariant");
+  }
+  _cm_oop_closure = cm_oop_closure;
+}
+
 void CMTask::reset(CMBitMap* nextMarkBitMap) {
   guarantee(nextMarkBitMap != NULL, "invariant");
 
@@ -3411,118 +3441,6 @@
   return !_cm->mark_stack_empty() || has_aborted();
 }
 
-// This determines whether the method below will check both the local
-// and global fingers when determining whether to push on the stack a
-// gray object (value 1) or whether it will only check the global one
-// (value 0). The tradeoffs are that the former will be a bit more
-// accurate and possibly push less on the stack, but it might also be
-// a little bit slower.
-
-#define _CHECK_BOTH_FINGERS_      1
-
-void CMTask::deal_with_reference(oop obj) {
-  if (_cm->verbose_high())
-    gclog_or_tty->print_cr("[%d] we're dealing with reference = "PTR_FORMAT,
-                           _task_id, (void*) obj);
-
-  ++_refs_reached;
-
-  HeapWord* objAddr = (HeapWord*) obj;
-  assert(obj->is_oop_or_null(true /* ignore mark word */), "Error");
-  if (_g1h->is_in_g1_reserved(objAddr)) {
-    assert(obj != NULL, "is_in_g1_reserved should ensure this");
-    HeapRegion* hr =  _g1h->heap_region_containing(obj);
-    if (_g1h->is_obj_ill(obj, hr)) {
-      if (_cm->verbose_high())
-        gclog_or_tty->print_cr("[%d] "PTR_FORMAT" is not considered marked",
-                               _task_id, (void*) obj);
-
-      // we need to mark it first
-      if (_nextMarkBitMap->parMark(objAddr)) {
-        // No OrderAccess:store_load() is needed. It is implicit in the
-        // CAS done in parMark(objAddr) above
-        HeapWord* global_finger = _cm->finger();
-
-#if _CHECK_BOTH_FINGERS_
-        // we will check both the local and global fingers
-
-        if (_finger != NULL && objAddr < _finger) {
-          if (_cm->verbose_high())
-            gclog_or_tty->print_cr("[%d] below the local finger ("PTR_FORMAT"), "
-                                   "pushing it", _task_id, _finger);
-          push(obj);
-        } else if (_curr_region != NULL && objAddr < _region_limit) {
-          // do nothing
-        } else if (objAddr < global_finger) {
-          // Notice that the global finger might be moving forward
-          // concurrently. This is not a problem. In the worst case, we
-          // mark the object while it is above the global finger and, by
-          // the time we read the global finger, it has moved forward
-          // passed this object. In this case, the object will probably
-          // be visited when a task is scanning the region and will also
-          // be pushed on the stack. So, some duplicate work, but no
-          // correctness problems.
-
-          if (_cm->verbose_high())
-            gclog_or_tty->print_cr("[%d] below the global finger "
-                                   "("PTR_FORMAT"), pushing it",
-                                   _task_id, global_finger);
-          push(obj);
-        } else {
-          // do nothing
-        }
-#else // _CHECK_BOTH_FINGERS_
-        // we will only check the global finger
-
-        if (objAddr < global_finger) {
-          // see long comment above
-
-          if (_cm->verbose_high())
-            gclog_or_tty->print_cr("[%d] below the global finger "
-                                   "("PTR_FORMAT"), pushing it",
-                                   _task_id, global_finger);
-          push(obj);
-        }
-#endif // _CHECK_BOTH_FINGERS_
-      }
-    }
-  }
-}
-
-void CMTask::push(oop obj) {
-  HeapWord* objAddr = (HeapWord*) obj;
-  assert(_g1h->is_in_g1_reserved(objAddr), "invariant");
-  assert(!_g1h->is_on_master_free_list(
-              _g1h->heap_region_containing((HeapWord*) objAddr)), "invariant");
-  assert(!_g1h->is_obj_ill(obj), "invariant");
-  assert(_nextMarkBitMap->isMarked(objAddr), "invariant");
-
-  if (_cm->verbose_high())
-    gclog_or_tty->print_cr("[%d] pushing "PTR_FORMAT, _task_id, (void*) obj);
-
-  if (!_task_queue->push(obj)) {
-    // The local task queue looks full. We need to push some entries
-    // to the global stack.
-
-    if (_cm->verbose_medium())
-      gclog_or_tty->print_cr("[%d] task queue overflow, "
-                             "moving entries to the global stack",
-                             _task_id);
-    move_entries_to_global_stack();
-
-    // this should succeed since, even if we overflow the global
-    // stack, we should have definitely removed some entries from the
-    // local queue. So, there must be space on it.
-    bool success = _task_queue->push(obj);
-    assert(success, "invariant");
-  }
-
-  statsOnly( int tmp_size = _task_queue->size();
-             if (tmp_size > _local_max_size)
-               _local_max_size = tmp_size;
-             ++_local_pushes );
-}
-
 void CMTask::reached_limit() {
   assert(_words_scanned >= _words_scanned_limit ||
          _refs_reached >= _refs_reached_limit ,
@@ -4158,8 +4076,8 @@
   // eventually called from this method, so it is OK to allocate these
   // statically.
   CMBitMapClosure bitmap_closure(this, _cm, _nextMarkBitMap);
-  CMOopClosure    oop_closure(_g1h, _cm, this);
-  set_oop_closure(&oop_closure);
+  G1CMOopClosure  cm_oop_closure(_g1h, _cm, this);
+  set_cm_oop_closure(&cm_oop_closure);
 
   if (_cm->has_overflown()) {
     // This can happen if the region stack or the mark stack overflows
@@ -4435,7 +4353,7 @@
   // Mainly for debugging purposes to make sure that a pointer to the
   // closure which was statically allocated in this frame doesn't
   // escape it by accident.
-  set_oop_closure(NULL);
+  set_cm_oop_closure(NULL);
   double end_time_ms = os::elapsedVTime() * 1000.0;
   double elapsed_time_ms = end_time_ms - _start_time_ms;
   // Update the step history.
@@ -4510,7 +4428,7 @@
     _nextMarkBitMap(NULL), _hash_seed(17),
     _task_queue(task_queue),
     _task_queues(task_queues),
-    _oop_closure(NULL),
+    _cm_oop_closure(NULL),
     _aborted_region(MemRegion()) {
   guarantee(task_queue != NULL, "invariant");
   guarantee(task_queues != NULL, "invariant");
--- a/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Mon Jun 13 13:48:18 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Tue Jun 14 10:33:43 2011 -0400
@@ -131,22 +131,22 @@
   void mark(HeapWord* addr) {
     assert(_bmStartWord <= addr && addr < (_bmStartWord + _bmWordSize),
            "outside underlying space?");
-    _bm.at_put(heapWordToOffset(addr), true);
+    _bm.set_bit(heapWordToOffset(addr));
   }
   void clear(HeapWord* addr) {
     assert(_bmStartWord <= addr && addr < (_bmStartWord + _bmWordSize),
            "outside underlying space?");
-    _bm.at_put(heapWordToOffset(addr), false);
+    _bm.clear_bit(heapWordToOffset(addr));
   }
   bool parMark(HeapWord* addr) {
     assert(_bmStartWord <= addr && addr < (_bmStartWord + _bmWordSize),
            "outside underlying space?");
-    return _bm.par_at_put(heapWordToOffset(addr), true);
+    return _bm.par_set_bit(heapWordToOffset(addr));
   }
   bool parClear(HeapWord* addr) {
     assert(_bmStartWord <= addr && addr < (_bmStartWord + _bmWordSize),
            "outside underlying space?");
-    return _bm.par_at_put(heapWordToOffset(addr), false);
+    return _bm.par_clear_bit(heapWordToOffset(addr));
   }
   void markRange(MemRegion mr);
   void clearAll();
@@ -928,7 +928,7 @@
   double                      _start_time_ms;
 
   // the oop closure used for iterations over oops
-  OopClosure*                 _oop_closure;
+  G1CMOopClosure*             _cm_oop_closure;
 
   // the region this task is scanning, NULL if we're not scanning any
   HeapRegion*                 _curr_region;
@@ -1122,32 +1122,17 @@
   // Clears any recorded partially scanned region
   void clear_aborted_region()   { set_aborted_region(MemRegion()); }
 
-  void set_oop_closure(OopClosure* oop_closure) {
-    _oop_closure = oop_closure;
-  }
+  void set_cm_oop_closure(G1CMOopClosure* cm_oop_closure);
 
   // It grays the object by marking it and, if necessary, pushing it
   // on the local queue
-  void deal_with_reference(oop obj);
+  inline void deal_with_reference(oop obj);
 
   // It scans an object and visits its children.
-  void scan_object(oop obj) {
-    assert(_nextMarkBitMap->isMarked((HeapWord*) obj), "invariant");
-
-    if (_cm->verbose_high())
-      gclog_or_tty->print_cr("[%d] we're scanning object "PTR_FORMAT,
-                             _task_id, (void*) obj);
-
-    size_t obj_size = obj->size();
-    _words_scanned += obj_size;
-
-    obj->oop_iterate(_oop_closure);
-    statsOnly( ++_objs_scanned );
-    check_limits();
-  }
+  void scan_object(oop obj);
 
   // It pushes an object on the local queue.
-  void push(oop obj);
+  inline void push(oop obj);
 
   // These two move entries to/from the global stack.
   void move_entries_to_global_stack();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp	Tue Jun 14 10:33:43 2011 -0400
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_IMPLEMENTATION_G1_CONCURRENTMARK_INLINE_HPP
+#define SHARE_VM_GC_IMPLEMENTATION_G1_CONCURRENTMARK_INLINE_HPP
+
+#include "gc_implementation/g1/concurrentMark.hpp"
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+
+inline void CMTask::push(oop obj) {
+  HeapWord* objAddr = (HeapWord*) obj;
+  assert(_g1h->is_in_g1_reserved(objAddr), "invariant");
+  assert(!_g1h->is_on_master_free_list(
+              _g1h->heap_region_containing((HeapWord*) objAddr)), "invariant");
+  assert(!_g1h->is_obj_ill(obj), "invariant");
+  assert(_nextMarkBitMap->isMarked(objAddr), "invariant");
+
+  if (_cm->verbose_high()) {
+    gclog_or_tty->print_cr("[%d] pushing "PTR_FORMAT, _task_id, (void*) obj);
+  }
+
+  if (!_task_queue->push(obj)) {
+    // The local task queue looks full. We need to push some entries
+    // to the global stack.
+
+    if (_cm->verbose_medium()) {
+      gclog_or_tty->print_cr("[%d] task queue overflow, "
+                             "moving entries to the global stack",
+                             _task_id);
+    }
+    move_entries_to_global_stack();
+
+    // this should succeed since, even if we overflow the global
+    // stack, we should have definitely removed some entries from the
+    // local queue. So, there must be space on it.
+    bool success = _task_queue->push(obj);
+    assert(success, "invariant");
+  }
+
+  statsOnly( int tmp_size = _task_queue->size();
+             if (tmp_size > _local_max_size)
+               _local_max_size = tmp_size;
+             ++_local_pushes );
+}
+
+// This determines whether the method below will check both the local
+// and global fingers when determining whether to push on the stack a
+// gray object (value 1) or whether it will only check the global one
+// (value 0). The tradeoffs are that the former will be a bit more
+// accurate and possibly push less on the stack, but it might also be
+// a little bit slower.
+
+#define _CHECK_BOTH_FINGERS_      1
+
+inline void CMTask::deal_with_reference(oop obj) {
+  if (_cm->verbose_high()) {
+    gclog_or_tty->print_cr("[%d] we're dealing with reference = "PTR_FORMAT,
+                           _task_id, (void*) obj);
+  }
+
+  ++_refs_reached;
+
+  HeapWord* objAddr = (HeapWord*) obj;
+  assert(obj->is_oop_or_null(true /* ignore mark word */), "Error");
+ if (_g1h->is_in_g1_reserved(objAddr)) {
+    assert(obj != NULL, "null check is implicit");
+    if (!_nextMarkBitMap->isMarked(objAddr)) {
+      // Only get the containing region if the object is not marked on the
+      // bitmap (otherwise, it's a waste of time since we won't do
+      // anything with it).
+      HeapRegion* hr = _g1h->heap_region_containing_raw(obj);
+      if (!hr->obj_allocated_since_next_marking(obj)) {
+        if (_cm->verbose_high()) {
+          gclog_or_tty->print_cr("[%d] "PTR_FORMAT" is not considered marked",
+                                 _task_id, (void*) obj);
+        }
+
+        // we need to mark it first
+        if (_nextMarkBitMap->parMark(objAddr)) {
+          // No OrderAccess:store_load() is needed. It is implicit in the
+          // CAS done in parMark(objAddr) above
+          HeapWord* global_finger = _cm->finger();
+
+#if _CHECK_BOTH_FINGERS_
+          // we will check both the local and global fingers
+
+          if (_finger != NULL && objAddr < _finger) {
+            if (_cm->verbose_high()) {
+              gclog_or_tty->print_cr("[%d] below the local finger ("PTR_FORMAT"), "
+                                     "pushing it", _task_id, _finger);
+            }
+            push(obj);
+          } else if (_curr_region != NULL && objAddr < _region_limit) {
+            // do nothing
+          } else if (objAddr < global_finger) {
+            // Notice that the global finger might be moving forward
+            // concurrently. This is not a problem. In the worst case, we
+            // mark the object while it is above the global finger and, by
+            // the time we read the global finger, it has moved forward
+            // passed this object. In this case, the object will probably
+            // be visited when a task is scanning the region and will also
+            // be pushed on the stack. So, some duplicate work, but no
+            // correctness problems.
+
+            if (_cm->verbose_high()) {
+              gclog_or_tty->print_cr("[%d] below the global finger "
+                                     "("PTR_FORMAT"), pushing it",
+                                     _task_id, global_finger);
+            }
+            push(obj);
+          } else {
+            // do nothing
+          }
+#else // _CHECK_BOTH_FINGERS_
+          // we will only check the global finger
+
+          if (objAddr < global_finger) {
+            // see long comment above
+
+            if (_cm->verbose_high()) {
+              gclog_or_tty->print_cr("[%d] below the global finger "
+                                     "("PTR_FORMAT"), pushing it",
+                                     _task_id, global_finger);
+            }
+            push(obj);
+          }
+#endif // _CHECK_BOTH_FINGERS_
+        }
+      }
+    }
+  }
+}
+
+#endif // SHARE_VM_GC_IMPLEMENTATION_G1_CONCURRENTMARK_INLINE_HPP
--- a/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Mon Jun 13 13:48:18 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Tue Jun 14 10:33:43 2011 -0400
@@ -33,6 +33,7 @@
 class CMBitMap;
 class CMMarkStack;
 class G1ParScanThreadState;
+class CMTask;
 
 // A class that scans oops in a given heap region (much as OopsInGenClosure
 // scans oops in a generation.)
@@ -176,4 +177,16 @@
   int out_of_region() { return _out_of_region; }
 };
 
+// Closure for iterating over object fields during concurrent marking
+class G1CMOopClosure : public OopClosure {
+  G1CollectedHeap*   _g1h;
+  ConcurrentMark*    _cm;
+  CMTask*            _task;
+public:
+  G1CMOopClosure(G1CollectedHeap* g1h, ConcurrentMark* cm, CMTask* task);
+  template <class T> void do_oop_nv(T* p);
+  virtual void do_oop(      oop* p) { do_oop_nv(p); }
+  virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
+};
+
 #endif // SHARE_VM_GC_IMPLEMENTATION_G1_G1OOPCLOSURES_HPP
--- a/src/share/vm/gc_implementation/g1/g1OopClosures.inline.hpp	Mon Jun 13 13:48:18 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1OopClosures.inline.hpp	Tue Jun 14 10:33:43 2011 -0400
@@ -25,7 +25,7 @@
 #ifndef SHARE_VM_GC_IMPLEMENTATION_G1_G1OOPCLOSURES_INLINE_HPP
 #define SHARE_VM_GC_IMPLEMENTATION_G1_G1OOPCLOSURES_INLINE_HPP
 
-#include "gc_implementation/g1/concurrentMark.hpp"
+#include "gc_implementation/g1/concurrentMark.inline.hpp"
 #include "gc_implementation/g1/g1CollectedHeap.hpp"
 #include "gc_implementation/g1/g1OopClosures.hpp"
 #include "gc_implementation/g1/g1RemSet.hpp"
@@ -108,5 +108,18 @@
   }
 }
 
+template <class T> inline void G1CMOopClosure::do_oop_nv(T* p) {
+  assert(_g1h->is_in_g1_reserved((HeapWord*) p), "invariant");
+  assert(!_g1h->is_on_master_free_list(
+                    _g1h->heap_region_containing((HeapWord*) p)), "invariant");
+
+  oop obj = oopDesc::load_decode_heap_oop(p);
+  if (_cm->verbose_high()) {
+    gclog_or_tty->print_cr("[%d] we're looking at location "
+                           "*"PTR_FORMAT" = "PTR_FORMAT,
+                           _task->task_id(), p, (void*) obj);
+  }
+  _task->deal_with_reference(obj);
+}
 
 #endif // SHARE_VM_GC_IMPLEMENTATION_G1_G1OOPCLOSURES_INLINE_HPP
--- a/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp	Mon Jun 13 13:48:18 2011 +0200
+++ b/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp	Tue Jun 14 10:33:43 2011 -0400
@@ -45,6 +45,7 @@
 
 class FilterIntoCSClosure;
 class FilterOutOfRegionClosure;
+class G1CMOopClosure;
 
 #ifdef FURTHER_SPECIALIZED_OOP_OOP_ITERATE_CLOSURES
 #error "FURTHER_SPECIALIZED_OOP_OOP_ITERATE_CLOSURES already defined."
@@ -55,7 +56,8 @@
       f(G1ParScanClosure,_nv)                           \
       f(G1ParPushHeapRSClosure,_nv)                     \
       f(FilterIntoCSClosure,_nv)                        \
-      f(FilterOutOfRegionClosure,_nv)
+      f(FilterOutOfRegionClosure,_nv)                   \
+      f(G1CMOopClosure,_nv)
 
 #ifdef FURTHER_SPECIALIZED_SINCE_SAVE_MARKS_CLOSURES
 #error "FURTHER_SPECIALIZED_SINCE_SAVE_MARKS_CLOSURES already defined."
--- a/src/share/vm/utilities/bitMap.hpp	Mon Jun 13 13:48:18 2011 +0200
+++ b/src/share/vm/utilities/bitMap.hpp	Tue Jun 14 10:33:43 2011 -0400
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -161,11 +161,11 @@
 
   // Set or clear the specified bit.
   inline void set_bit(idx_t bit);
-  void clear_bit(idx_t bit);
+  inline void clear_bit(idx_t bit);
 
   // Atomically set or clear the specified bit.
-  bool par_set_bit(idx_t bit);
-  bool par_clear_bit(idx_t bit);
+  inline bool par_set_bit(idx_t bit);
+  inline bool par_clear_bit(idx_t bit);
 
   // Put the given value at the given offset. The parallel version
   // will CAS the value into the bitmap and is quite a bit slower.