# HG changeset patch # User johnc # Date 1331589540 25200 # Node ID de5748cca21143f8674198a918477393ce986f5c # Parent f5fba31ac5ce18f62544857f4a502f7a4a040b9f 7147724: G1: hang in SurrogateLockerThread::manipulatePLL Summary: Attempting to initiate a marking cycle when allocating a humongous object can, if a marking cycle is successfully initiated by another thread, result in the allocating thread spinning until the marking cycle is complete. Eliminate a deadlock between the main ConcurrentMarkThread, the SurrogateLocker thread, the VM thread, and a mutator thread waiting on the SecondaryFreeList_lock (while free regions are going to become available) by not manipulating the pending list lock during the prologue and epilogue of the cleanup pause. Reviewed-by: brutisso, jcoomes, tonyp diff -r f5fba31ac5ce -r de5748cca211 src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp --- a/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp Sun Mar 25 19:55:03 2012 -0700 +++ b/src/share/vm/gc_implementation/g1/concurrentMarkThread.cpp Mon Mar 12 14:59:00 2012 -0700 @@ -155,7 +155,7 @@ CMCheckpointRootsFinalClosure final_cl(_cm); sprintf(verbose_str, "GC remark"); - VM_CGC_Operation op(&final_cl, verbose_str); + VM_CGC_Operation op(&final_cl, verbose_str, true /* needs_pll */); VMThread::execute(&op); } if (cm()->restart_for_overflow() && @@ -189,7 +189,7 @@ CMCleanUp cl_cl(_cm); sprintf(verbose_str, "GC cleanup"); - VM_CGC_Operation op(&cl_cl, verbose_str); + VM_CGC_Operation op(&cl_cl, verbose_str, false /* needs_pll */); VMThread::execute(&op); } else { // We don't want to update the marking status if a GC pause diff -r f5fba31ac5ce -r de5748cca211 src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp --- a/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Sun Mar 25 19:55:03 2012 -0700 +++ b/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Mon Mar 12 14:59:00 2012 -0700 @@ -993,7 +993,7 @@ // iteration (after taking the Heap_lock). 
result = _mutator_alloc_region.attempt_allocation(word_size, false /* bot_updates */); - if (result != NULL ){ + if (result != NULL) { return result; } @@ -2437,20 +2437,22 @@ true, /* should_initiate_conc_mark */ g1_policy()->max_pause_time_ms(), cause); + VMThread::execute(&op); if (!op.pause_succeeded()) { - // Another GC got scheduled and prevented us from scheduling - // the initial-mark GC. It's unlikely that the GC that - // pre-empted us was also an initial-mark GC. So, we'll retry - // the initial-mark GC. - if (full_gc_count_before == total_full_collections()) { - retry_gc = true; + retry_gc = op.should_retry_gc(); } else { // A Full GC happened while we were trying to schedule the // initial-mark GC. No point in starting a new cycle given // that the whole heap was collected anyway. } + + if (retry_gc) { + if (GC_locker::is_active_and_needs_gc()) { + GC_locker::stall_until_clear(); + } + } } } else { if (cause == GCCause::_gc_locker diff -r f5fba31ac5ce -r de5748cca211 src/share/vm/gc_implementation/g1/vm_operations_g1.cpp --- a/src/share/vm/gc_implementation/g1/vm_operations_g1.cpp Sun Mar 25 19:55:03 2012 -0700 +++ b/src/share/vm/gc_implementation/g1/vm_operations_g1.cpp Mon Mar 12 14:59:00 2012 -0700 @@ -34,7 +34,8 @@ VM_G1CollectForAllocation::VM_G1CollectForAllocation( unsigned int gc_count_before, size_t word_size) - : VM_G1OperationWithAllocRequest(gc_count_before, word_size) { + : VM_G1OperationWithAllocRequest(gc_count_before, word_size, + GCCause::_allocation_failure) { guarantee(word_size > 0, "an allocation should always be requested"); } @@ -57,9 +58,10 @@ bool should_initiate_conc_mark, double target_pause_time_ms, GCCause::Cause gc_cause) - : VM_G1OperationWithAllocRequest(gc_count_before, word_size), + : VM_G1OperationWithAllocRequest(gc_count_before, word_size, gc_cause), _should_initiate_conc_mark(should_initiate_conc_mark), _target_pause_time_ms(target_pause_time_ms), + _should_retry_gc(false), _full_collections_completed_before(0) { 
guarantee(target_pause_time_ms > 0.0, err_msg("target_pause_time_ms = %1.6lf should be positive", @@ -70,6 +72,22 @@ _gc_cause = gc_cause; } +bool VM_G1IncCollectionPause::doit_prologue() { + bool res = VM_GC_Operation::doit_prologue(); + if (!res) { + if (_should_initiate_conc_mark) { + // The prologue can fail for a couple of reasons. The first is that another GC + // got scheduled and prevented the scheduling of the initial mark GC. The + // second is that the GC locker may be active and the heap can't be expanded. + // In both cases we want to retry the GC so that the initial mark pause is + // actually scheduled. In the second case, however, we should stall + // until the GC locker is no longer active and then retry the initial mark GC. + _should_retry_gc = true; + } + } + return res; +} + void VM_G1IncCollectionPause::doit() { G1CollectedHeap* g1h = G1CollectedHeap::heap(); assert(!_should_initiate_conc_mark || @@ -106,11 +124,25 @@ // next GC pause to be an initial mark; it returns false if a // marking cycle is already in progress. // - // If a marking cycle is already in progress just return and skip - // the pause - the requesting thread should block in doit_epilogue - // until the marking cycle is complete. + // If a marking cycle is already in progress just return and skip the + // pause below - if the reason for requesting this initial mark pause + // was due to a System.gc() then the requesting thread should block in + // doit_epilogue() until the marking cycle is complete. + // + // If this initial mark pause was requested as part of a humongous + // allocation then we know that the marking cycle must just have + // been started by another thread (possibly also allocating a humongous + // object) as there was no active marking cycle when the requesting + // thread checked before calling collect() in + // attempt_allocation_humongous(). 
Retrying the GC, in this case, + // will cause the requesting thread to spin inside collect() until the + // just started marking cycle is complete - which may be a while. So + // we do NOT retry the GC. if (!res) { - assert(_word_size == 0, "ExplicitGCInvokesConcurrent shouldn't be allocating"); + assert(_word_size == 0, "Concurrent Full GC/Humongous Object IM shouldn't be allocating"); + if (_gc_cause != GCCause::_g1_humongous_allocation) { + _should_retry_gc = true; + } return; } } @@ -123,6 +155,13 @@ true /* expect_null_cur_alloc_region */); } else { assert(_result == NULL, "invariant"); + if (!_pause_succeeded) { + // Another possible reason for the pause to not be successful + // is that, again, the GC locker is active (and has become active + // since the prologue was executed). In this case we should retry + // the pause after waiting for the GC locker to become inactive. + _should_retry_gc = true; + } } } @@ -168,6 +207,7 @@ } void VM_CGC_Operation::acquire_pending_list_lock() { + assert(_needs_pll, "don't call this otherwise"); // The caller may block while communicating // with the SLT thread in order to acquire/release the PLL. ConcurrentMarkThread::slt()-> @@ -175,6 +215,7 @@ } void VM_CGC_Operation::release_and_notify_pending_list_lock() { + assert(_needs_pll, "don't call this otherwise"); // The caller may block while communicating // with the SLT thread in order to acquire/release the PLL. 
ConcurrentMarkThread::slt()-> @@ -198,7 +239,9 @@ bool VM_CGC_Operation::doit_prologue() { // Note the relative order of the locks must match that in // VM_GC_Operation::doit_prologue() or deadlocks can occur - acquire_pending_list_lock(); + if (_needs_pll) { + acquire_pending_list_lock(); + } Heap_lock->lock(); SharedHeap::heap()->_thread_holds_heap_lock_for_gc = true; @@ -210,5 +253,7 @@ // VM_GC_Operation::doit_epilogue() SharedHeap::heap()->_thread_holds_heap_lock_for_gc = false; Heap_lock->unlock(); - release_and_notify_pending_list_lock(); + if (_needs_pll) { + release_and_notify_pending_list_lock(); + } } diff -r f5fba31ac5ce -r de5748cca211 src/share/vm/gc_implementation/g1/vm_operations_g1.hpp --- a/src/share/vm/gc_implementation/g1/vm_operations_g1.hpp Sun Mar 25 19:55:03 2012 -0700 +++ b/src/share/vm/gc_implementation/g1/vm_operations_g1.hpp Mon Mar 12 14:59:00 2012 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -43,8 +43,9 @@ public: VM_G1OperationWithAllocRequest(unsigned int gc_count_before, - size_t word_size) - : VM_GC_Operation(gc_count_before, GCCause::_allocation_failure), + size_t word_size, + GCCause::Cause gc_cause) + : VM_GC_Operation(gc_count_before, gc_cause), _word_size(word_size), _result(NULL), _pause_succeeded(false) { } HeapWord* result() { return _result; } bool pause_succeeded() { return _pause_succeeded; } @@ -77,6 +78,7 @@ class VM_G1IncCollectionPause: public VM_G1OperationWithAllocRequest { private: bool _should_initiate_conc_mark; + bool _should_retry_gc; double _target_pause_time_ms; unsigned int _full_collections_completed_before; public: @@ -86,11 +88,13 @@ double target_pause_time_ms, GCCause::Cause gc_cause); virtual VMOp_Type type() const { return VMOp_G1IncCollectionPause; } + virtual bool doit_prologue(); virtual void doit(); virtual void doit_epilogue(); virtual const char* name() const { return "garbage-first incremental collection pause"; } + bool should_retry_gc() const { return _should_retry_gc; } }; // Concurrent GC stop-the-world operations such as remark and cleanup; @@ -98,6 +102,7 @@ class VM_CGC_Operation: public VM_Operation { VoidClosure* _cl; const char* _printGCMessage; + bool _needs_pll; protected: // java.lang.ref.Reference support @@ -105,8 +110,8 @@ void release_and_notify_pending_list_lock(); public: - VM_CGC_Operation(VoidClosure* cl, const char *printGCMsg) - : _cl(cl), _printGCMessage(printGCMsg) { } + VM_CGC_Operation(VoidClosure* cl, const char *printGCMsg, bool needs_pll) + : _cl(cl), _printGCMessage(printGCMsg), _needs_pll(needs_pll) { } virtual VMOp_Type type() const { return VMOp_CGC_Operation; } virtual void doit(); virtual bool doit_prologue();