changeset 8568:3ccdd3482827

Backport optimization of volatile puts/gets and CAS to use ldar/stlr
author adinn
date Thu, 08 Oct 2015 11:06:07 -0400
parents f9720c487762
children acaa2f3db91b
files src/cpu/aarch64/vm/aarch64.ad src/cpu/aarch64/vm/globals_aarch64.hpp src/cpu/aarch64/vm/macroAssembler_aarch64.cpp src/cpu/aarch64/vm/macroAssembler_aarch64.hpp src/cpu/aarch64/vm/vm_version_aarch64.cpp src/share/vm/opto/graphKit.cpp
diffstat 6 files changed, 2388 insertions(+), 125 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad	Mon Sep 21 10:36:36 2015 -0400
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Oct 08 11:06:07 2015 -0400
@@ -962,10 +962,1854 @@
   }
 };
 
+  // graph traversal helpers
+
+  MemBarNode *parent_membar(const Node *n);
+  MemBarNode *child_membar(const MemBarNode *n);
+  bool leading_membar(const MemBarNode *barrier);
+
+  bool is_card_mark_membar(const MemBarNode *barrier);
+  bool is_CAS(int opcode);
+
+  MemBarNode *leading_to_normal(MemBarNode *leading);
+  MemBarNode *normal_to_leading(const MemBarNode *barrier);
+  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
+  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);
+  MemBarNode *trailing_to_leading(const MemBarNode *trailing);
+
+  // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
+
+  bool unnecessary_acquire(const Node *barrier);
+  bool needs_acquiring_load(const Node *load);
+
+  // predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
+  bool unnecessary_release(const Node *barrier);
+  bool unnecessary_volatile(const Node *barrier);
+  bool needs_releasing_store(const Node *store);
+
+  // predicate controlling translation of CompareAndSwapX
+  bool needs_acquiring_load_exclusive(const Node *load);
+
+  // predicate controlling translation of StoreCM
+  bool unnecessary_storestore(const Node *storecm);
 %}
 
 source %{
 
+  // Optimization of volatile gets and puts
+  // --------------------------------------
+  //
+  // AArch64 has ldar<x> and stlr<x> instructions which we can safely
+  // use to implement volatile reads and writes. For a volatile read
+  // we simply need
+  //
+  //   ldar<x>
+  //
+  // and for a volatile write we need
+  //
+  //   stlr<x>
+  // 
+  // Alternatively, we can implement them by pairing a normal
+  // load/store with a memory barrier. For a volatile read we need
+  // 
+  //   ldr<x>
+  //   dmb ishld
+  //
+  // for a volatile write
+  //
+  //   dmb ish
+  //   str<x>
+  //   dmb ish
+  //
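+  // As an illustration (not part of this file), a Java volatile
+  // field access pair such as
+  //
+  //   class Box {
+  //     volatile long val;
+  //     long get()       { return val; }  // may compile to ldar
+  //     void set(long v) { val = v; }     // may compile to stlr
+  //   }
+  //
+  // can be compiled to the ldar<x>/stlr<x> forms above when the
+  // signature checks described below succeed, or to the
+  // ldr<x>/str<x> plus dmb forms otherwise.
+  //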
+  // We can also use ldaxr and stlxr to implement compare and swap (CAS)
+  // sequences. These are normally translated to an instruction
+  // sequence like the following
+  //
+  //   dmb      ish
+  // retry:
+  //   ldxr<x>   rval, raddr
+  //   cmp       rval, rold
+  //   b.ne done
+  //   stlxr<x>  rval, rnew, raddr
+  //   cbnz      rval, retry
+  // done:
+  //   cset      r0, eq
+  //   dmb ishld
+  //
+  // Note that the exclusive store is already using an stlxr
+  // instruction. That is required to ensure visibility to other
+  // threads of the exclusive write (assuming it succeeds) before that
+  // of any subsequent writes.
+  //
+  // The following instruction sequence is an improvement on the above
+  //
+  // retry:
+  //   ldaxr<x>  rval, raddr
+  //   cmp       rval, rold
+  //   b.ne done
+  //   stlxr<x>  rval, rnew, raddr
+  //   cbnz      rval, retry
+  // done:
+  //   cset      r0, eq
+  //
+  // We don't need the leading dmb ish since the stlxr guarantees
+  // visibility of prior writes in the case that the swap is
+  // successful. Crucially we don't have to worry about the case where
+  // the swap is not successful since no valid program should be
+  // relying on visibility of prior changes by the attempting thread
+  // in the case where the CAS fails.
+  //
+  // Similarly, we don't need the trailing dmb ishld if we substitute
+  // an ldaxr instruction since that will provide all the guarantees we
+  // require regarding observation of changes made by other threads
+  // before any change to the CAS address observed by the load.
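+  //
+  // As an illustration (the names UNSAFE and VAL_OFFSET are
+  // hypothetical stand-ins for the usual statically initialised
+  // Unsafe instance and field offset), an inlined unsafe CAS such as
+  //
+  //   boolean cas(Box b, long expect, long update) {
+  //     return UNSAFE.compareAndSwapLong(b, VAL_OFFSET, expect, update);
+  //   }
+  //
+  // is the sort of operation we would like to compile to this
+  // shorter ldaxr-based sequence.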
+  //
+  // In order to generate the desired instruction sequence we need to
+  // be able to identify specific 'signature' ideal graph node
+  // sequences which i) occur as a translation of volatile reads,
+  // writes or CAS operations and ii) do not occur through any other
+  // translation or graph transformation. We can then provide
+  // alternative adlc matching rules which translate these node
+  // sequences to the desired machine code sequences. Selection of the
+  // alternative rules can be implemented by predicates which identify
+  // the relevant node sequences.
+  //
+  // The ideal graph generator translates a volatile read to the node
+  // sequence
+  //
+  //   LoadX[mo_acquire]
+  //   MemBarAcquire
+  //
+  // As a special case when using the compressed oops optimization we
+  // may also see this variant
+  //
+  //   LoadN[mo_acquire]
+  //   DecodeN
+  //   MemBarAcquire
+  //
+  // A volatile write is translated to the node sequence
+  //
+  //   MemBarRelease
+  //   StoreX[mo_release] {CardMark}-optional
+  //   MemBarVolatile
+  //
+  // n.b. the above node patterns are generated with a strict
+  // 'signature' configuration of input and output dependencies (see
+  // the predicates below for exact details). The card mark may be as
+  // simple as a few extra nodes or, in a few GC configurations, may
+  // include more complex control flow between the leading and
+  // trailing memory barriers. However, whatever the card mark
+  // configuration these signatures are unique to translated volatile
+  // reads/stores -- they will not appear as a result of any other
+  // bytecode translation or inlining nor as a consequence of
+  // optimizing transforms.
+  //
+  // We also want to catch inlined unsafe volatile gets and puts and
+  // be able to implement them using either ldar<x>/stlr<x> or some
+  // combination of ldr<x>/str<x> and dmb instructions.
+  //
+  // Inlined unsafe volatile puts manifest as a minor variant of the
+  // normal volatile put node sequence containing an extra cpuorder
+  // membar
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //   StoreX[mo_release] {CardMark}-optional
+  //   MemBarVolatile
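+  //
+  // (illustrative, reusing the hypothetical UNSAFE and VAL_OFFSET
+  // from above) such a sequence arises from an inlined call like
+  //
+  //   UNSAFE.putLongVolatile(box, VAL_OFFSET, v);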
+  //
+  // n.b. as an aside, the cpuorder membar is not itself subject to
+  // matching and translation by adlc rules.  However, the rule
+  // predicates need to detect its presence in order to correctly
+  // select the desired adlc rules.
+  //
+  // Inlined unsafe volatile gets manifest as a somewhat different
+  // node sequence to a normal volatile get
+  //
+  //   MemBarCPUOrder
+  //        ||       \\
+  //   MemBarAcquire LoadX[mo_acquire]
+  //        ||
+  //   MemBarCPUOrder
+  //
+  // In this case the acquire membar does not directly depend on the
+  // load. However, we can be sure that the load is generated from an
+  // inlined unsafe volatile get if we see it dependent on this unique
+  // sequence of membar nodes. Similarly, given an acquire membar we
+  // can know that it was added because of an inlined unsafe volatile
+  // get if it is fed and feeds a cpuorder membar and if its feed
+  // membar also feeds an acquiring load.
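+  //
+  // (illustrative, as above) such a sequence arises from an inlined
+  // call like
+  //
+  //   long v = UNSAFE.getLongVolatile(box, VAL_OFFSET);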
+  //
+  // Finally an inlined (Unsafe) CAS operation is translated to the
+  // following ideal graph
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //   CompareAndSwapX {CardMark}-optional
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  // So, where we can identify these volatile read and write
+  // signatures we can choose to plant either of the above two code
+  // sequences. For a volatile read we can simply plant a normal
+  // ldr<x> and translate the MemBarAcquire to a dmb. However, we can
+  // also choose to inhibit translation of the MemBarAcquire and
+  // inhibit planting of the ldr<x>, instead planting an ldar<x>.
+  //
+  // When we recognise a volatile store signature we can choose to
+  // plant a dmb ish as a translation for the MemBarRelease, a
+  // normal str<x> and then a dmb ish for the MemBarVolatile.
+  // Alternatively, we can inhibit translation of the MemBarRelease
+  // and MemBarVolatile and instead plant a simple stlr<x>
+  // instruction.
+  //
+  // When we recognise a CAS signature we can choose to plant a dmb
+  // ish as a translation for the MemBarRelease, the conventional
+  // macro-instruction sequence for the CompareAndSwap node (which
+  // uses ldxr<x>) and then a dmb ishld for the MemBarAcquire.
+  // Alternatively, we can elide generation of the dmb instructions
+  // and plant the alternative CompareAndSwap macro-instruction
+  // sequence (which uses ldaxr<x>).
+  // 
+  // Of course, the above only applies when we see these signature
+  // configurations. We still want to plant dmb instructions in any
+  // other cases where we may see a MemBarAcquire, MemBarRelease or
+  // MemBarVolatile. For example, at the end of a constructor which
+  // writes final/volatile fields we will see a MemBarRelease
+  // instruction and this needs a 'dmb ish' lest we risk the
+  // constructed object being visible without making the
+  // final/volatile field writes visible.
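+  //
+  // (illustrative) e.g. for a constructor such as
+  //
+  //   class Point { final int x; Point(int x) { this.x = x; } }
+  //
+  // the MemBarRelease planted at the end of <init> must still be
+  // translated to a dmb ish.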
+  //
+  // n.b. the translation rules below which rely on detection of the
+  // volatile signatures and insert ldar<x> or stlr<x> are failsafe.
+  // If we see anything other than the signature configurations we
+  // always just translate the loads and stores to ldr<x> and str<x>
+  // and translate acquire, release and volatile membars to the
+  // relevant dmb instructions.
+  //
+
+  // graph traversal helpers used for volatile put/get and CAS
+  // optimization
+
+  // 1) general purpose helpers
+
+  // if node n is linked to a parent MemBarNode by an intervening
+  // Control and Memory ProjNode return the MemBarNode otherwise return
+  // NULL.
+  //
+  // n may only be a Load or a MemBar.
+
+  MemBarNode *parent_membar(const Node *n)
+  {
+    Node *ctl = NULL;
+    Node *mem = NULL;
+    Node *membar = NULL;
+
+    if (n->is_Load()) {
+      ctl = n->lookup(LoadNode::Control);
+      mem = n->lookup(LoadNode::Memory);
+    } else if (n->is_MemBar()) {
+      ctl = n->lookup(TypeFunc::Control);
+      mem = n->lookup(TypeFunc::Memory);
+    } else {
+      return NULL;
+    }
+
+    if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj()) {
+      return NULL;
+    }
+
+    membar = ctl->lookup(0);
+
+    if (!membar || !membar->is_MemBar()) {
+      return NULL;
+    }
+
+    if (mem->lookup(0) != membar) {
+      return NULL;
+    }
+
+    return membar->as_MemBar();
+  }
+
+  // if n is linked to a child MemBarNode by intervening Control and
+  // Memory ProjNodes return the MemBarNode otherwise return NULL.
+
+  MemBarNode *child_membar(const MemBarNode *n)
+  {
+    ProjNode *ctl = n->proj_out(TypeFunc::Control);
+    ProjNode *mem = n->proj_out(TypeFunc::Memory);
+
+    // MemBar needs to have both a Ctl and Mem projection
+    if (! ctl || ! mem)
+      return NULL;
+
+    MemBarNode *child = NULL;
+    Node *x;
+
+    for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
+      x = ctl->fast_out(i);
+      // if we see a membar we keep hold of it. we may also see a new
+      // arena copy of the original but it will appear later
+      if (x->is_MemBar()) {
+	  child = x->as_MemBar();
+	  break;
+      }
+    }
+
+    if (child == NULL) {
+      return NULL;
+    }
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      // if we see the same membar we are done. we may also see a new
+      // arena copy of the original but it will appear later
+      if (x == child) {
+	return child;
+      }
+    }
+    return NULL;
+  }
+
+  // helper predicate used to filter candidates for a leading memory
+  // barrier
+  //
+  // returns true if barrier is a MemBarRelease or a MemBarCPUOrder
+  // whose Ctl and Mem feeds come from a MemBarRelease otherwise false
+
+  bool leading_membar(const MemBarNode *barrier)
+  {
+    int opcode = barrier->Opcode();
+    // if this is a release membar we are ok
+    if (opcode == Op_MemBarRelease) {
+      return true;
+    }
+    // if it's a cpuorder membar . . .
+    if (opcode != Op_MemBarCPUOrder) {
+      return false;
+    }
+    // then the parent has to be a release membar
+    MemBarNode *parent = parent_membar(barrier);
+    if (!parent) {
+      return false;
+    }
+    opcode = parent->Opcode();
+    return opcode == Op_MemBarRelease;
+  }
+ 
+  // 2) card mark detection helper
+
+  // helper predicate which can be used to detect a volatile membar
+  // introduced as part of a conditional card mark sequence either by
+  // G1 or by CMS when UseCondCardMark is true.
+  //
+  // membar can be definitively determined to be part of a card mark
+  // sequence if and only if all the following hold
+  //
+  // i) it is a MemBarVolatile
+  //
+  // ii) either UseG1GC or (UseConcMarkSweepGC && UseCondCardMark) is
+  // true
+  //
+  // iii) the node's Mem projection feeds a StoreCM node.
+  
+  bool is_card_mark_membar(const MemBarNode *barrier)
+  {
+    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark)) {
+      return false;
+    }
+
+    if (barrier->Opcode() != Op_MemBarVolatile) {
+      return false;
+    }
+
+    ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax ; i++) {
+      Node *y = mem->fast_out(i);
+      if (y->Opcode() == Op_StoreCM) {
+	return true;
+      }
+    }
+  
+    return false;
+  }
+
+
+  // 3) helper predicates to traverse volatile put or CAS graphs which
+  // may contain GC barrier subgraphs
+
+  // Preamble
+  // --------
+  //
+  // for volatile writes we can omit generating barriers and employ a
+  // releasing store when we see a node sequence with a
+  // leading MemBarRelease and a trailing MemBarVolatile as follows
+  //
+  //   MemBarRelease
+  //  {      ||      } -- optional
+  //  {MemBarCPUOrder}
+  //         ||     \\
+  //         ||     StoreX[mo_release]
+  //         | \     /
+  //         | MergeMem
+  //         | /
+  //   MemBarVolatile
+  //
+  // where
+  //  || and \\ represent Ctl and Mem feeds via Proj nodes
+  //  | \ and / indicate further routing of the Ctl and Mem feeds
+  // 
+  // this is the graph we see for non-object stores. however, for a
+  // volatile Object store (StoreN/P) we may see other nodes below the
+  // leading membar because of the need for a GC pre- or post-write
+  // barrier.
+  //
+  // with most GC configurations we will see this simple variant which
+  // includes a post-write barrier card mark.
+  //
+  //   MemBarRelease______________________________
+  //         ||    \\               Ctl \        \\
+  //         ||    StoreN/P[mo_release] CastP2X  StoreB/CM
+  //         | \     /                       . . .  /
+  //         | MergeMem
+  //         | /
+  //         ||      /
+  //   MemBarVolatile
+  //
+  // i.e. the leading membar feeds Ctl to a CastP2X (which converts
+  // the object address to an int used to compute the card offset) and
+  // Ctl+Mem to a StoreB node (which does the actual card mark).
+  //
+  // n.b. a StoreCM node will only appear in this configuration when
+  // using CMS. StoreCM differs from a normal card mark write (StoreB)
+  // because it implies a requirement to order visibility of the card
+  // mark (StoreCM) relative to the object put (StoreP/N) using a
+  // StoreStore memory barrier (arguably this ought to be represented
+  // explicitly in the ideal graph but that is not how it works). This
+  // ordering is required for both non-volatile and volatile
+  // puts. Normally that means we need to translate a StoreCM using
+  // the sequence
+  //
+  //   dmb ishst
+  //   strb
+  //
+  // However, in the case of a volatile put if we can recognise this
+  // configuration and plant an stlr for the object write then we can
+  // omit the dmb and just plant an strb since visibility of the stlr
+  // is ordered before visibility of subsequent stores. StoreCM nodes
+  // also arise when using G1 or using CMS with conditional card
+  // marking. In these cases (as we shall see) we don't need to insert
+  // the dmb when translating StoreCM because there is already an
+  // intervening StoreLoad barrier between it and the StoreP/N.
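+  //
+  // (illustrative sketch, register names invented) i.e. for a
+  // volatile oop put this lets us emit
+  //
+  //   stlr<x>  rval, [raddr]     ; releasing object put
+  //   strb     rcard, [rcaddr]   ; card mark, no dmb ishst needed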
+  //
+  // It is also possible to perform the card mark conditionally on it
+  // currently being unmarked in which case the volatile put graph
+  // will look slightly different
+  //
+  //   MemBarRelease____________________________________________
+  //         ||    \\               Ctl \     Ctl \     \\  Mem \
+  //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
+  //         | \     /                              \            |
+  //         | MergeMem                            . . .      StoreB
+  //         | /                                                /
+  //         ||     /
+  //   MemBarVolatile
+  //
+  // It is worth noting at this stage that both the above
+  // configurations can be uniquely identified by checking that the
+  // memory flow includes the following subgraph:
+  //
+  //   MemBarRelease
+  //  {MemBarCPUOrder}
+  //          |  \      . . .
+  //          |  StoreX[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile
+  //
+  // This is referred to as a *normal* subgraph. It can easily be
+  // detected starting from any candidate MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile.
+  //
+  // A simple variation on this normal case occurs for an unsafe CAS
+  // operation. The basic graph for a non-object CAS is
+  //
+  //   MemBarRelease
+  //         ||
+  //   MemBarCPUOrder
+  //         ||     \\   . . .
+  //         ||     CompareAndSwapX
+  //         ||       |
+  //         ||     SCMemProj
+  //         | \     /
+  //         | MergeMem
+  //         | /
+  //   MemBarCPUOrder
+  //         ||
+  //   MemBarAcquire
+  //
+  // The same basic variations on this arrangement (mutatis mutandis)
+  // occur when a card mark is introduced. i.e. we see the same basic
+  // shape but the StoreP/N is replaced with CompareAndSwapP/N and the
+  // tail of the graph is a pair comprising a MemBarCPUOrder +
+  // MemBarAcquire.
+  //
+  // So, in the case of a CAS the normal graph has the variant form
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |   \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |    |
+  //          |   SCMemProj
+  //          |   /  . . .
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  // This graph can also easily be detected starting from any
+  // candidate MemBarRelease, CompareAndSwapX or MemBarAcquire.
+  //
+  // the code below uses two helper predicates, leading_to_normal and
+  // normal_to_leading to identify these normal graphs, one validating
+  // the layout starting from the top membar and searching down and
+  // the other validating the layout starting from the lower membar
+  // and searching up.
+  //
+  // There are two special case GC configurations when a normal graph
+  // may not be generated: when using G1 (which always employs a
+  // conditional card mark); and when using CMS with conditional card
+  // marking configured. These GCs are both concurrent rather than
+  // stop-the-world GCs. So they introduce extra Ctl+Mem flow into the
+  // graph between the leading and trailing membar nodes, in
+  // particular enforcing stronger memory serialisation between the
+  // object put and the corresponding conditional card mark. CMS
+  // employs a post-write GC barrier while G1 employs both a pre- and
+  // post-write GC barrier. Of course the extra nodes may be absent --
+  // they are only inserted for object puts. This significantly
+  // complicates the task of identifying whether a MemBarRelease,
+  // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
+  // when using these GC configurations (see below). It adds similar
+  // complexity to the task of identifying whether a MemBarRelease,
+  // CompareAndSwapX or MemBarAcquire forms part of a CAS.
+  //
+  // In both cases the post-write subtree includes an auxiliary
+  // MemBarVolatile (StoreLoad barrier) separating the object put and
+  // the read of the corresponding card. This poses two additional
+  // problems.
+  //
+  // Firstly, a card mark MemBarVolatile needs to be distinguished
+  // from a normal trailing MemBarVolatile. Resolving this first
+  // problem is straightforward: a card mark MemBarVolatile always
+  // projects a Mem feed to a StoreCM node and that is a unique marker
+  //
+  //      MemBarVolatile (card mark)
+  //       C |    \     . . .
+  //         |   StoreCM   . . .
+  //       . . .
+  //
+  // The second problem is how the code generator is to translate the
+  // card mark barrier. It always needs to be translated to a "dmb
+  // ish" instruction whether or not it occurs as part of a volatile
+  // put. A StoreLoad barrier is needed after the object put to ensure
+  // i) visibility to GC threads of the object put and ii) visibility
+  // to the mutator thread of any card clearing write by a GC
+  // thread. Clearly a normal store (str) will not guarantee this
+  // ordering but neither will a releasing store (stlr). The latter
+  // guarantees that the object put is visible but does not guarantee
+  // that writes by other threads have also been observed.
+  // 
+  // So, returning to the task of translating the object put and the
+  // leading/trailing membar nodes: what do the non-normal node graphs
+  // look like for these 2 special cases? and how can we determine the
+  // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile
+  // in both normal and non-normal cases?
+  //
+  // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
+  // which selects conditional execution based on the value loaded
+  // (LoadB) from the card. Ctl and Mem are fed to the If via an
+  // intervening StoreLoad barrier (MemBarVolatile).
+  //
+  // So, with CMS we may see a node graph for a volatile object store
+  // which looks like this
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder_(leading)__________________
+  //     C |    M \       \\                   C \
+  //       |       \    StoreN/P[mo_release]  CastP2X
+  //       |    Bot \    /
+  //       |       MergeMem
+  //       |         /
+  //      MemBarVolatile (card mark)
+  //     C |  ||    M |
+  //       | LoadB    |
+  //       |   |      |
+  //       | Cmp      |\
+  //       | /        | \
+  //       If         |  \
+  //       | \        |   \
+  // IfFalse  IfTrue  |    \
+  //       \     / \  |     \
+  //        \   / StoreCM    |
+  //         \ /      |      |
+  //        Region   . . .   |
+  //          | \           /
+  //          |  . . .  \  / Bot
+  //          |       MergeMem
+  //          |          |
+  //        MemBarVolatile (trailing)
+  //
+  // The first MergeMem merges the AliasIdxBot Mem slice from the
+  // leading membar and the oopptr Mem slice from the Store into the
+  // card mark membar. The trailing MergeMem merges the AliasIdxBot
+  // Mem slice from the card mark membar and the AliasIdxRaw slice
+  // from the StoreCM into the trailing membar (n.b. the latter
+  // proceeds via a Phi associated with the If region).
+  //
+  // The graph for a CAS varies slightly, the obvious difference being
+  // that the StoreN/P node is replaced by a CompareAndSwapP/N node
+  // and the trailing MemBarVolatile by a MemBarCPUOrder +
+  // MemBarAcquire pair. The other important difference is that the
+  // CompareAndSwap node's SCMemProj is not merged into the card mark
+  // membar - it still feeds the trailing MergeMem. This also means
+  // that the card mark membar receives its Mem feed directly from the
+  // leading membar rather than via a MergeMem.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder__(leading)_________________________
+  //       ||                       \\                 C \
+  //   MemBarVolatile (card mark)  CompareAndSwapN/P  CastP2X
+  //     C |  ||    M |              |
+  //       | LoadB    |       ______/|
+  //       |   |      |      /       |
+  //       | Cmp      |     /      SCMemProj
+  //       | /        |    /         |
+  //       If         |   /         /
+  //       | \        |  /         /
+  // IfFalse  IfTrue  | /         /
+  //       \     / \  |/ prec    /
+  //        \   / StoreCM       /
+  //         \ /      |        /
+  //        Region   . . .    /
+  //          | \            /
+  //          |  . . .  \   / Bot
+  //          |       MergeMem
+  //          |          |
+  //        MemBarCPUOrder
+  //        MemBarAcquire (trailing)
+  //
+  // This has a slightly different memory subgraph to the one seen
+  // previously but the core of it is the same as for the CAS normal
+  // subgraph
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder____
+  //      ||             \      . . .
+  //   MemBarVolatile  CompareAndSwapX  . . .
+  //      |  \            |
+  //        . . .   SCMemProj
+  //          |     /  . . .
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  //
+  // G1 is quite a lot more complicated. The nodes inserted on behalf
+  // of G1 may comprise: a pre-write graph which adds the old value to
+  // the SATB queue; the releasing store itself; and, finally, a
+  // post-write graph which performs a card mark.
+  //
+  // The pre-write graph may be omitted, but only when the put is
+  // writing to a newly allocated (young gen) object and then only if
+  // there is a direct memory chain to the Initialize node for the
+  // object allocation. This will not happen for a volatile put since
+  // any memory chain passes through the leading membar.
+  //
+  // The pre-write graph includes a series of 3 If tests. The outermost
+  // If tests whether SATB is enabled (no else case). The next If tests
+  // whether the old value is non-NULL (no else case). The third tests
+  // whether the SATB queue index is > 0, if so updating the queue. The
+  // else case for this third If calls out to the runtime to allocate a
+  // new queue buffer.
+  //
+  // So with G1 the pre-write and releasing store subgraph looks like
+  // this (the nested Ifs are omitted).
+  //
+  //  MemBarRelease (leading)____________
+  //     C |  ||  M \   M \    M \  M \ . . .
+  //       | LoadB   \  LoadL  LoadN   \
+  //       | /        \                 \
+  //       If         |\                 \
+  //       | \        | \                 \
+  //  IfFalse  IfTrue |  \                 \
+  //       |     |    |   \                 |
+  //       |     If   |   /\                |
+  //       |     |          \               |
+  //       |                 \              |
+  //       |    . . .         \             |
+  //       | /       | /       |            |
+  //      Region  Phi[M]       |            |
+  //       | \       |         |            |
+  //       |  \_____ | ___     |            |
+  //     C | C \     |   C \ M |            |
+  //       | CastP2X | StoreN/P[mo_release] |
+  //       |         |         |            |
+  //     C |       M |       M |          M |
+  //        \        |         |           /
+  //                  . . . 
+  //          (post write subtree elided)
+  //                    . . .
+  //             C \         M /
+  //         MemBarVolatile (trailing)
+  //
+  // n.b. the LoadB in this subgraph is not the card read -- it's a
+  // read of the SATB queue active flag.
+  //
+  // Once again the CAS graph is a minor variant on the above with the
+  // expected substitutions of CompareAndSwapX for StoreN/P and
+  // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile.
+  //
+  // The G1 post-write subtree is also optional, this time when the
+  // new value being written is either null or can be identified as a
+  // newly allocated (young gen) object with no intervening control
+  // flow. The latter cannot happen but the former may, in which case
+  // the card mark membar is omitted and the memory feeds from the
+  // leading membar and the StoreN/P are merged directly into the
+  // trailing membar as per the normal subgraph. So, the only special
+  // case which arises is when the post-write subgraph is generated.
+  //
+  // The kernel of the post-write G1 subgraph is the card mark itself
+  // which includes a card mark memory barrier (MemBarVolatile), a
+  // card test (LoadB), and a conditional update (If feeding a
+  // StoreCM). These nodes are surrounded by a series of nested Ifs
+  // which try to avoid doing the card mark. The top level If skips if
+  // the object reference does not cross regions (i.e. it tests if
+  // (adr ^ val) >> log2(regsize) != 0) -- intra-region references
+  // need not be recorded. The next If, which skips on a NULL value,
+  // may be absent (it is not generated if the type of value is >=
+  // OopPtr::NotNull). The 3rd If skips writes to young regions (by
+  // checking if card_val != young).  n.b. although this test requires
+  // a pre-read of the card it can safely be done before the StoreLoad
+  // barrier. However that does not bypass the need to reread the card
+  // after the barrier.
+  //
+  //                (pre-write subtree elided)
+  //        . . .                  . . .    . . .  . . .
+  //        C |                    M |     M |    M |
+  //       Region                  Phi[M] StoreN    |
+  //          |                     / \      |      |
+  //         / \_______            /   \     |      |
+  //      C / C \      . . .            \    |      |
+  //       If   CastP2X . . .            |   |      |
+  //       / \                           |   |      |
+  //      /   \                          |   |      |
+  // IfFalse IfTrue                      |   |      |
+  //   |       |                         |   |     /|
+  //   |       If                        |   |    / |
+  //   |      / \                        |   |   /  |
+  //   |     /   \                        \  |  /   |
+  //   | IfFalse IfTrue                   MergeMem  |
+  //   |  . . .    / \                       /      |
+  //   |          /   \                     /       |
+  //   |     IfFalse IfTrue                /        |
+  //   |      . . .    |                  /         |
+  //   |               If                /          |
+  //   |               / \              /           |
+  //   |              /   \            /            |
+  //   |         IfFalse IfTrue       /             |
+  //   |           . . .   |         /              |
+  //   |                    \       /               |
+  //   |                     \     /                |
+  //   |             MemBarVolatile__(card mark)    |
+  //   |                ||   C |  M \  M \          |
+  //   |               LoadB   If    |    |         |
+  //   |                      / \    |    |         |
+  //   |                     . . .   |    |         |
+  //   |                          \  |    |        /
+  //   |                        StoreCM   |       /
+  //   |                          . . .   |      /
+  //   |                        _________/      /
+  //   |                       /  _____________/
+  //   |   . . .       . . .  |  /            /
+  //   |    |                 | /   _________/
+  //   |    |               Phi[M] /        /
+  //   |    |                 |   /        /
+  //   |    |                 |  /        /
+  //   |  Region  . . .     Phi[M]  _____/
+  //   |    /                 |    /
+  //   |                      |   /   
+  //   | . . .   . . .        |  /
+  //   | /                    | /
+  // Region           |  |  Phi[M]
+  //   |              |  |  / Bot
+  //    \            MergeMem 
+  //     \            /
+  //     MemBarVolatile
+  //
+  // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice
+  // from the leading membar and the oopptr Mem slice from the Store
+  // into the card mark membar i.e. the memory flow to the card mark
+  // membar still looks like a normal graph.
+  //
+  // The trailing MergeMem merges an AliasIdxBot Mem slice with other
+  // Mem slices (from the StoreCM and other card mark queue stores).
+  // However in this case the AliasIdxBot Mem slice does not come
+  // direct from the card mark membar. It is merged through a series
+  // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow
+  // from the leading membar with the Mem feed from the card mark
+  // membar. Each Phi corresponds to one of the Ifs which may skip
+  // around the card mark membar. So when the If implementing the NULL
+  // value check has been elided the total number of Phis is 2
+  // otherwise it is 3.
+  //
+  // The CAS graph when using G1GC also includes a pre-write subgraph
+  // and an optional post-write subgraph. The same variations are
+  // introduced as for CMS with conditional card marking i.e. the
+  // StoreP/N is swapped for a CompareAndSwapP/N, the trailing
+  // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the
+  // Mem feed from the CompareAndSwapP/N includes a precedence
+  // dependency feed to the StoreCM and a feed via an SCMemProj to the
+  // trailing membar. So, as before the configuration includes the
+  // normal CAS graph as a subgraph of the memory flow.
+  //
+  // So, the upshot is that in all cases the volatile put graph will
+  // include a *normal* memory subgraph between the leading membar and
+  // its child membar, either a volatile put graph (including a
+  // releasing StoreX) or a CAS graph (including a CompareAndSwapX).
+  // When that child is not a card mark membar then it marks the end
+  // of the volatile put or CAS subgraph. If the child is a card mark
+  // membar then the normal subgraph will form part of a volatile put
+  // subgraph if and only if the child feeds an AliasIdxBot Mem feed
+  // to a trailing barrier via a MergeMem. That feed is either direct
+  // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
+  // memory flow (for G1).
+  // 
+  // The predicates controlling generation of instructions for store
+  // and barrier nodes employ a few simple helper functions (described
+  // below) which identify the presence or absence of all these
+  // subgraph configurations and provide a means of traversing from
+  // one node in the subgraph to another.
+
+  // is_CAS(int opcode)
+  //
+  // return true if opcode is one of the possible CompareAndSwapX
+  // values otherwise false.
+
+  bool is_CAS(int opcode)
+  {
+    return (opcode == Op_CompareAndSwapI ||
+	    opcode == Op_CompareAndSwapL ||
+	    opcode == Op_CompareAndSwapN ||
+	    opcode == Op_CompareAndSwapP);
+  }
+
+  // leading_to_normal
+  //
+  // graph traversal helper which detects the normal case Mem feed from
+  // a release membar (or, optionally, its cpuorder child) to a
+  // dependent volatile membar i.e. it ensures that one or other of
+  // the following Mem flow subgraphs is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile {trailing or card mark}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //      |       \      . . .
+  //      |     CompareAndSwapX  . . .
+  //               |
+  //     . . .    SCMemProj
+  //           \   |
+  //      |    MergeMem
+  //      |       /
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
+  //
+  // if the correct configuration is present returns the trailing
+  // membar otherwise NULL.
+  //
+  // the input membar is expected to be either a cpuorder membar or a
+  // release membar. in the latter case it should not have a cpuorder
+  // membar child.
+  //
+  // the returned value may be a card mark or trailing membar
+  //
+
+  MemBarNode *leading_to_normal(MemBarNode *leading)
+  {
+    assert((leading->Opcode() == Op_MemBarRelease ||
+	    leading->Opcode() == Op_MemBarCPUOrder),
+	   "expecting a volatile or cpuroder membar!");
+
+    // check the mem flow
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+
+    if (!mem) {
+      return NULL;
+    }
+
+    Node *x = NULL;
+    StoreNode * st = NULL;
+    LoadStoreNode *cas = NULL;
+    MergeMemNode *mm = NULL;
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_MergeMem()) {
+	// two merge mems is one too many
+	if (mm != NULL) {
+	  return NULL;
+	}
+	mm = x->as_MergeMem();
+      } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	// two releasing stores/CAS nodes is one too many
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	st = x->as_Store();
+      } else if (is_CAS(x->Opcode())) {
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	cas = x->as_LoadStore();
+      }
+    }
+
+    // must have a store or a cas
+    if (!st && !cas) {
+      return NULL;
+    }
+
+    // must have a merge if we also have st
+    if (st && !mm) {
+      return NULL;
+    }
+
+    Node *y = NULL;
+    if (cas) {
+      // look for an SCMemProj
+      for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
+	x = cas->fast_out(i);
+	if (x->is_Proj()) {
+	  y = x;
+	  break;
+	}
+      }
+      if (y == NULL) {
+	return NULL;
+      }
+      // the proj must feed a MergeMem
+      for (DUIterator_Fast imax, i = y->fast_outs(imax); i < imax; i++) {
+	x = y->fast_out(i);
+	if (x->is_MergeMem()) {
+	  mm = x->as_MergeMem();
+	  break;
+	}
+      }
+      if (mm == NULL)
+	return NULL;
+    } else {
+      // ensure the store feeds the existing mergemem;
+      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+	if (st->fast_out(i) == mm) {
+	  y = st;
+	  break;
+	}
+      }
+      if (y == NULL) {
+	return NULL;
+      }
+    }
+
+    MemBarNode *mbar = NULL;
+    // ensure the merge feeds to the expected type of membar
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar()) {
+	int opcode = x->Opcode();
+	if (opcode == Op_MemBarVolatile && st) {
+	  mbar = x->as_MemBar();
+	} else if (cas && opcode == Op_MemBarCPUOrder) {
+	  MemBarNode *y =  x->as_MemBar();
+	  y = child_membar(y);
+	  if (y != NULL && y->Opcode() == Op_MemBarAcquire) {
+	    mbar = y;
+	  }
+	}
+	break;
+      }
+    }
+
+    return mbar;
+  }
+
+  // normal_to_leading
+  //
+  // graph traversal helper which detects the normal case Mem feed
+  // from either a card mark or a trailing membar to a preceding
+  // release membar (optionally its cpuorder child) i.e. it ensures
+  // that one or other of the following Mem flow subgraphs is present.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //          |  \      . . .
+  //          |  StoreN/P[mo_release]  . . .
+  //          |   /
+  //         MergeMem
+  //          |
+  //   MemBarVolatile {card mark or trailing}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //      |       \      . . .
+  //      |     CompareAndSwapX  . . .
+  //               |
+  //     . . .    SCMemProj
+  //           \   |
+  //      |    MergeMem
+  //      |        /
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
+  //
+  // this predicate checks for the same flow as the previous predicate
+  // but starting from the bottom rather than the top.
+  //
+  // if the configuration is present returns the cpuorder membar for
+  // preference or, when absent, the release membar, otherwise NULL.
+  //
+  // n.b. the input membar is expected to be a MemBarVolatile but
+  // need not be a card mark membar.
+
+  MemBarNode *normal_to_leading(const MemBarNode *barrier)
+  {
+    // input must be a volatile membar
+    assert((barrier->Opcode() == Op_MemBarVolatile ||
+	    barrier->Opcode() == Op_MemBarAcquire),
+	   "expecting a volatile or an acquire membar");
+    Node *x;
+    bool is_cas = barrier->Opcode() == Op_MemBarAcquire;
+
+    // if we have an acquire membar then it must be fed via a CPUOrder
+    // membar
+
+    if (is_cas) {
+      // skip to parent barrier which must be a cpuorder
+      x = parent_membar(barrier);
+      if (x == NULL || x->Opcode() != Op_MemBarCPUOrder)
+	return NULL;
+    } else {
+      // start from the supplied barrier
+      x = (Node *)barrier;
+    }
+
+    // the Mem feed to the membar should be a merge
+    x = x->in(TypeFunc::Memory);
+    if (!x->is_MergeMem())
+      return NULL;
+
+    MergeMemNode *mm = x->as_MergeMem();
+
+    if (is_cas) {
+      // the merge should be fed from the CAS via an SCMemProj node
+      x = NULL;
+      for (uint idx = 1; idx < mm->req(); idx++) {
+	if (mm->in(idx)->Opcode() == Op_SCMemProj) {
+	  x = mm->in(idx);
+	  break;
+	}
+      }
+      if (x == NULL) {
+	return NULL;
+      }
+      // check for a CAS feeding this proj
+      x = x->in(0);
+      int opcode = x->Opcode();
+      if (!is_CAS(opcode)) {
+	return NULL;
+      }
+      // the CAS should get its mem feed from the leading membar
+      x = x->in(MemNode::Memory);
+    } else {
+      // the merge should get its Bottom mem feed from the leading membar
+      x = mm->in(Compile::AliasIdxBot);      
+    } 
+
+    // ensure this is a non control projection
+    if (!x->is_Proj() || x->is_CFG()) {
+      return NULL;
+    }
+    // if it is fed by a membar that's the one we want
+    x = x->in(0);
+
+    if (!x->is_MemBar()) {
+      return NULL;
+    }
+
+    MemBarNode *leading = x->as_MemBar();
+    // reject invalid candidates
+    if (!leading_membar(leading)) {
+      return NULL;
+    }
+
+    // ok, we have a leading membar, now for the sanity clauses
+
+    // the leading membar must feed Mem to a releasing store or CAS
+    ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+    StoreNode *st = NULL;
+    LoadStoreNode *cas = NULL;
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+	// two stores or CASes is one too many
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	st = x->as_Store();
+      } else if (is_CAS(x->Opcode())) {
+	if (st != NULL || cas != NULL) {
+	  return NULL;
+	}
+	cas = x->as_LoadStore();
+      }
+    }
+
+    // we must have at least one of a store or a cas
+    if (st == NULL && cas == NULL) {
+      return NULL;
+    }
+
+    if (st == NULL) {
+      // nothing more to check
+      return leading;
+    } else {
+      // we should not have a store if we started from an acquire
+      if (is_cas) {
+	return NULL;
+      }
+
+      // the store should feed the merge we used to get here
+      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+	if (st->fast_out(i) == mm) {
+	  return leading;
+	}
+      }
+    }
+
+    return NULL;
+  }
+
+  // card_mark_to_trailing
+  //
+  // graph traversal helper which detects extra, non-normal Mem feed
+  // from a card mark volatile membar to a trailing membar i.e. it
+  // ensures that one of the following three GC post-write Mem flow
+  // subgraphs is present.
+  //
+  // 1)
+  //     . . .
+  //       |
+  //   MemBarVolatile (card mark)
+  //      |          |     
+  //      |        StoreCM
+  //      |          |
+  //      |        . . .
+  //  Bot |  / 
+  //   MergeMem 
+  //      |
+  //      |
+  //    MemBarVolatile {trailing}
+  //
+  // 2)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    | 
+  //    |\       . . .
+  //    | \        | 
+  //    |  \  MemBarVolatile (card mark) 
+  //    |   \   |     |
+  //     \   \  |   StoreCM    . . .
+  //      \   \ |
+  //       \  Phi
+  //        \ /
+  //        Phi  . . .
+  //     Bot |   /
+  //       MergeMem
+  //         |
+  //    MemBarVolatile {trailing}
+  //
+  //
+  // 3)
+  //   MemBarRelease/CPUOrder (leading)
+  //    |
+  //    |\
+  //    | \
+  //    |  \      . . .
+  //    |   \       |
+  //    |\   \  MemBarVolatile (card mark)
+  //    | \   \   |     |
+  //    |  \   \  |   StoreCM    . . .
+  //    |   \   \ |
+  //     \   \  Phi
+  //      \   \ /  
+  //       \  Phi
+  //        \ /
+  //        Phi  . . .
+  //     Bot |   /
+  //       MergeMem
+  //         |
+  //         |
+  //    MemBarVolatile {trailing}
+  //
+  // configuration 1 is only valid if UseConcMarkSweepGC &&
+  // UseCondCardMark
+  //
+  // configurations 2 and 3 are only valid if UseG1GC.
+  //
+  // if a valid configuration is present returns the trailing membar
+  // otherwise NULL.
+  //
+  // n.b. the supplied membar is expected to be a card mark
+  // MemBarVolatile i.e. the caller must ensure the input node has the
+  // correct opcode and feeds Mem to a StoreCM node
+
+  MemBarNode *card_mark_to_trailing(const MemBarNode *barrier)
+  {
+    // input must be a card mark volatile membar
+    assert(is_card_mark_membar(barrier), "expecting a card mark membar");
+
+    Node *feed = barrier->proj_out(TypeFunc::Memory);
+    Node *x;
+    MergeMemNode *mm = NULL;
+
+    const int MAX_PHIS = 3;	// max phis we will search through
+    int phicount = 0; 		// current search count
+
+    bool retry_feed = true;
+    while (retry_feed) {
+      // see if we have a direct MergeMem feed
+      for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+	x = feed->fast_out(i);
+	// stop at the first MergeMem we find on this feed
+	if (x->is_MergeMem()) {
+	  mm = x->as_MergeMem();
+	  break;
+	}
+      }
+      if (mm) {
+	retry_feed = false;
+      } else if (UseG1GC && phicount++ < MAX_PHIS) {
+	// the barrier may feed indirectly via one or two Phi nodes
+	PhiNode *phi = NULL;
+	for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+	  x = feed->fast_out(i);
+	  // the correct Phi will be merging a Bot memory slice
+	  if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
+	    phi = x->as_Phi();
+	    break;
+	  }
+	}
+	if (!phi) {
+	  return NULL;
+	}
+	// look for another merge below this phi
+	feed = phi;
+      } else {
+	// couldn't find a merge
+	return NULL;
+      }
+    }
+
+    // sanity check this feed turns up as the expected slice
+    assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
+
+    MemBarNode *trailing = NULL;
+    // be sure we have a trailing membar fed by the merge
+    for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+      x = mm->fast_out(i);
+      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+	trailing = x->as_MemBar();
+	break;
+      }
+    }
+
+    return trailing;
+  }
+
+  // trailing_to_card_mark
+  //
+  // graph traversal helper which detects extra, non-normal Mem feed
+  // from a trailing volatile membar to a preceding card mark volatile
+  // membar i.e. it identifies whether one of the three possible extra
+  // GC post-write Mem flow subgraphs is present
+  //
+  // this predicate checks for the same flow as the previous predicate
+  // but starting from the bottom rather than the top.
+  //
+  // if the configuration is present returns the card mark membar
+  // otherwise NULL
+  //
+  // n.b. the supplied membar is expected to be a trailing
+  // MemBarVolatile i.e. the caller must ensure the input node has the
+  // correct opcode
+
+  MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
+  {
+    assert(trailing->Opcode() == Op_MemBarVolatile,
+	   "expecting a volatile membar");
+    assert(!is_card_mark_membar(trailing),
+	   "not expecting a card mark membar");
+
+    // the Mem feed to the membar should be a merge
+    Node *x = trailing->in(TypeFunc::Memory);
+    if (!x->is_MergeMem()) {
+      return NULL;
+    }
+
+    MergeMemNode *mm = x->as_MergeMem();
+
+    x = mm->in(Compile::AliasIdxBot);
+    // with G1 we may possibly see a Phi or two before we see a Memory
+    // Proj from the card mark membar
+
+    const int MAX_PHIS = 3;	// max phis we will search through
+    int phicount = 0; 		// current search count
+
+    bool retry_feed = !x->is_Proj();
+
+    while (retry_feed) {
+      if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) {
+	PhiNode *phi = x->as_Phi();
+	ProjNode *proj = NULL;
+	PhiNode *nextphi = NULL;
+	bool found_leading = false;
+	for (uint i = 1; i < phi->req(); i++) {
+	  x = phi->in(i);
+	  if (x->is_Phi()) {
+	    nextphi = x->as_Phi();
+	  } else if (x->is_Proj()) {
+	    int opcode = x->in(0)->Opcode();
+	    if (opcode == Op_MemBarVolatile) {
+	      proj = x->as_Proj();
+	    } else if (opcode == Op_MemBarRelease ||
+		       opcode == Op_MemBarCPUOrder) {
+	      // probably a leading membar
+	      found_leading = true;
+	    }
+	  }
+	}
+	// if we found a correct looking proj then retry from there
+	// otherwise we must have seen a leading membar and a phi or
+	// this is the wrong config
+	if (proj != NULL) {
+	  x = proj;
+	  retry_feed = false;
+	} else if (found_leading && nextphi != NULL) {
+	  // retry from this phi to check phi2
+	  x = nextphi;
+	} else {
+	  // not what we were looking for
+	  return NULL;
+	}
+      } else {
+	return NULL;
+      }
+    }
+    // the proj has to come from the card mark membar
+    x = x->in(0);
+    if (!x->is_MemBar()) {
+      return NULL;
+    }
+
+    MemBarNode *card_mark_membar = x->as_MemBar();
+
+    if (!is_card_mark_membar(card_mark_membar)) {
+      return NULL;
+    }
+
+    return card_mark_membar;
+  }
+
+  // trailing_to_leading
+  //
+  // graph traversal helper which checks the Mem flow up the graph
+  // from a (non-card mark) trailing membar attempting to locate and
+  // return an associated leading membar. it first looks for a
+  // subgraph in the normal configuration (relying on helper
+  // normal_to_leading). failing that it then looks for one of the
+  // possible post-write card mark subgraphs linking the trailing node
+  // to the card mark membar (relying on helper
+  // trailing_to_card_mark), and then checks that the card mark membar
+  // is fed by a leading membar (once again relying on auxiliary
+  // predicate normal_to_leading).
+  //
+  // if the configuration is valid returns the cpuorder membar for
+  // preference or, when absent, the release membar, otherwise NULL.
+  //
+  // n.b. the input membar is expected to be either a volatile or
+  // acquire membar but in the former case must *not* be a card mark
+  // membar.
+
+  MemBarNode *trailing_to_leading(const MemBarNode *trailing)
+  {
+    assert((trailing->Opcode() == Op_MemBarAcquire ||
+	    trailing->Opcode() == Op_MemBarVolatile),
+	   "expecting an acquire or volatile membar");
+    assert((trailing->Opcode() != Op_MemBarVolatile ||
+	    !is_card_mark_membar(trailing)),
+	   "not expecting a card mark membar");
+
+    MemBarNode *leading = normal_to_leading(trailing);
+
+    if (leading) {
+      return leading;
+    }
+
+    // nothing more to do if this is an acquire
+    if (trailing->Opcode() == Op_MemBarAcquire) {
+      return NULL;
+    }
+
+    MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
+
+    if (!card_mark_membar) {
+      return NULL;
+    }
+
+    return normal_to_leading(card_mark_membar);
+  }
+
+  // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
+
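+// The instruct rules later in this file select between the plain and
+// acquiring/releasing encodings using these predicates, along the
+// lines of the following sketch (rule bodies elided):
+//
+//   predicate(!needs_acquiring_load(n));  // plain ldr<x> rule
+//   predicate(needs_acquiring_load(n));   // ldar<x> rule
+//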
+bool unnecessary_acquire(const Node *barrier)
+{
+  assert(barrier->is_MemBar(), "expecting a membar");
+
+  if (UseBarriersForVolatile) {
+    // we need to plant a dmb
+    return false;
+  }
+
+  // a volatile read derived from bytecode (or also from an inlined
+  // SHA field read via LibraryCallKit::load_field_from_object)
+  // manifests as a LoadX[mo_acquire] followed by an acquire membar
+  // with a bogus read dependency on its preceding load. so in those
+  // cases we will find the load node at the PARMS offset of the
+  // acquire membar.  n.b. there may be an intervening DecodeN node.
+  //
+  // a volatile load derived from an inlined unsafe field access
+  // manifests as a cpuorder membar with Ctl and Mem projections
+  // feeding both an acquire membar and a LoadX[mo_acquire]. The
+  // acquire then feeds another cpuorder membar via Ctl and Mem
+  // projections. The load has no output dependency on these trailing
+  // membars because subsequent nodes inserted into the graph take
+  // their control feed from the final cpuorder membar meaning they
+  // are all ordered after the load.
+
+  Node *x = barrier->lookup(TypeFunc::Parms);
+  if (x) {
+    // we are starting from an acquire and it has a fake dependency
+    //
+    // need to check for
+    //
+    //   LoadX[mo_acquire]
+    //   {  |1   }
+    //   {DecodeN}
+    //      |Parms
+    //   MemBarAcquire*
+    //
+    // where * tags node we were passed
+    // and |k means input k
+    if (x->is_DecodeNarrowPtr()) {
+      x = x->in(1);
+    }
+
+    return (x->is_Load() && x->as_Load()->is_acquire());
+  }
+  
+  // now check for an unsafe volatile get
+
+  // need to check for
+  //
+  //   MemBarCPUOrder
+  //        ||       \\
+  //   MemBarAcquire* LoadX[mo_acquire]
+  //        ||
+  //   MemBarCPUOrder
+  //
+  // where * tags node we were passed
+  // and || or \\ are Ctl+Mem feeds via intermediate Proj Nodes
+
+  // check for a parent MemBarCPUOrder
+  ProjNode *ctl;
+  ProjNode *mem;
+  MemBarNode *parent = parent_membar(barrier);
+  if (!parent || parent->Opcode() != Op_MemBarCPUOrder)
+    return false;
+  ctl = parent->proj_out(TypeFunc::Control);
+  mem = parent->proj_out(TypeFunc::Memory);
+  if (!ctl || !mem) {
+    return false;
+  }
+  // ensure the proj nodes both feed a LoadX[mo_acquire]
+  LoadNode *ld = NULL;
+  for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
+    x = ctl->fast_out(i);
+    // if we see a load we keep hold of it and stop searching
+    if (x->is_Load()) {
+      ld = x->as_Load();
+      break;
+    }
+  }
+  // it must be an acquiring load
+  if (ld && ld->is_acquire()) {
+
+    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+      x = mem->fast_out(i);
+      // if we see the same load we drop it and stop searching
+      if (x == ld) {
+	ld = NULL;
+	break;
+      }
+    }
+    // we must have dropped the load
+    if (ld == NULL) {
+      // check for a child cpuorder membar
+      MemBarNode *child  = child_membar(barrier->as_MemBar());
+      if (child && child->Opcode() == Op_MemBarCPUOrder)
+	return true;
+    }
+  }
+
+  // final option for unnecessary membar is that it is a trailing node
+  // belonging to a CAS
+
+  MemBarNode *leading = trailing_to_leading(barrier->as_MemBar());
+
+  return leading != NULL;
+}
+
+bool needs_acquiring_load(const Node *n)
+{
+  assert(n->is_Load(), "expecting a load");
+  if (UseBarriersForVolatile) {
+    // we use a normal load and a dmb
+    return false;
+  }
+
+  LoadNode *ld = n->as_Load();
+
+  if (!ld->is_acquire()) {
+    return false;
+  }
+
+  // check if this load is feeding an acquire membar
+  //
+  //   LoadX[mo_acquire]
+  //   {  |1   }
+  //   {DecodeN}
+  //      |Parms
+  //   MemBarAcquire*
+  //
+  // where * tags node we were passed
+  // and |k means input k
+
+  Node *start = ld;
+  Node *mbacq = NULL;
+
+  // if we hit a DecodeNarrowPtr we reset the start node and restart
+  // the search through the outputs
+ restart:
+
+  for (DUIterator_Fast imax, i = start->fast_outs(imax); i < imax; i++) {
+    Node *x = start->fast_out(i);
+    if (x->is_MemBar() && x->Opcode() == Op_MemBarAcquire) {
+      mbacq = x;
+    } else if (!mbacq &&
+	       (x->is_DecodeNarrowPtr() ||
+		(x->is_Mach() && x->Opcode() == Op_DecodeN))) {
+      start = x;
+      goto restart;
+    }
+  }
+
+  if (mbacq) {
+    return true;
+  }
+
+  // now check for an unsafe volatile get
+
+  // check if the Ctl and Mem feeds come from a MemBarCPUOrder
+  //
+  //     MemBarCPUOrder
+  //        ||       \\
+  //   MemBarAcquire* LoadX[mo_acquire]
+  //        ||
+  //   MemBarCPUOrder
+
+  MemBarNode *membar;
+
+  membar = parent_membar(ld);
+
+  if (!membar || membar->Opcode() != Op_MemBarCPUOrder) {
+    return false;
+  }
+
+  // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
+
+  membar = child_membar(membar);
+
+  if (!membar || membar->Opcode() != Op_MemBarAcquire) {
+    return false;
+  }
+
+  membar = child_membar(membar);
+  
+  if (!membar || membar->Opcode() != Op_MemBarCPUOrder) {
+    return false;
+  }
+
+  return true;
+}
+
+bool unnecessary_release(const Node *n)
+{
+  assert((n->is_MemBar() &&
+          n->Opcode() == Op_MemBarRelease),
+         "expecting a release membar");
+
+  if (UseBarriersForVolatile) {
+    // we need to plant a dmb
+    return false;
+  }
+
+  // if there is a dependent CPUOrder barrier then use that as the
+  // leading
+
+  MemBarNode *barrier = n->as_MemBar();
+  // check for an intervening cpuorder membar
+  MemBarNode *b = child_membar(barrier);
+  if (b && b->Opcode() == Op_MemBarCPUOrder) {
+    // ok, so start the check from the dependent cpuorder barrier
+    barrier = b;
+  }
+
+  // must start with a normal feed
+  MemBarNode *child_barrier = leading_to_normal(barrier);
+
+  if (!child_barrier) {
+    return false;
+  }
+
+  if (!is_card_mark_membar(child_barrier)) {
+    // this is the trailing membar and we are done
+    return true;
+  }
+
+  // must be sure this card mark feeds a trailing membar
+  MemBarNode *trailing = card_mark_to_trailing(child_barrier);
+  return (trailing != NULL);
+}
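+
+// For example (a sketch of the graph shape assumed above), a
+// volatile field write
+//
+//   x.f = v;   // f declared volatile
+//
+// manifests as
+//
+//   MemBarRelease -> StoreX[mo_release] -> MemBarVolatile
+//
+// linked via Proj nodes; the MemBarRelease (or its dependent
+// CPUOrder membar) is the leading membar passed in here and the
+// MemBarVolatile is the trailing membar located by leading_to_normal,
+// with the card mark membar interposed when a card mark is present.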
+
+bool unnecessary_volatile(const Node *n)
+{
+  // assert n->is_MemBar();
+  if (UseBarriersForVolatile) {
+    // we need to plant a dmb
+    return false;
+  }
+
+  MemBarNode *mbvol = n->as_MemBar();
+
+  // first we check if this is part of a card mark. if so then we have
+  // to generate a StoreLoad barrier
+  
+  if (is_card_mark_membar(mbvol)) {
+      return false;
+  }
+
+  // ok, if it's not a card mark then we still need to check whether
+  // it is the trailing membar of a volatile put graph.
+
+  return (trailing_to_leading(mbvol) != NULL);
+}
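+
+// Note that when this returns false for a card mark membar the
+// matcher falls back to the membar_volatile rule, which emits the
+// required StoreLoad barrier, i.e. (a sketch)
+//
+//   dmb ish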
+
+// predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
+bool needs_releasing_store(const Node *n)
+{
+  // assert n->is_Store();
+  if (UseBarriersForVolatile) {
+    // we use a normal store and dmb combination
+    return false;
+  }
+
+  StoreNode *st = n->as_Store();
+
+  // the store must be marked as releasing
+  if (!st->is_release()) {
+    return false;
+  }
+
+  // the store must be fed by a membar
+
+  Node *x = st->lookup(StoreNode::Memory);
+
+  if (!x || !x->is_Proj()) {
+    return false;
+  }
+
+  ProjNode *proj = x->as_Proj();
+
+  x = proj->lookup(0);
+
+  if (!x || !x->is_MemBar()) {
+    return false;
+  }
+
+  MemBarNode *barrier = x->as_MemBar();
+
+  // if the barrier is a release membar, or a cpuorder membar fed by
+  // a release membar, then we need to check whether that forms part
+  // of a volatile put graph.
+
+  // reject invalid candidates
+  if (!leading_membar(barrier)) {
+    return false;
+  }
+
+  // does this lead a normal subgraph?
+  MemBarNode *mbvol = leading_to_normal(barrier);
+
+  if (!mbvol) {
+    return false;
+  }
+
+  // all done unless this is a card mark
+  if (!is_card_mark_membar(mbvol)) {
+    return true;
+  }
+  
+  // we found a card mark -- just make sure we have a trailing barrier
+
+  return (card_mark_to_trailing(mbvol) != NULL);
+}
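+
+// As with volatile reads, when this predicate holds the store
+// matches one of the storeX_volatile rules below and a volatile long
+// field write, say, compiles to a single (sketch)
+//
+//   stlr <Rt>, [<Rn>]
+//
+// the flanking MemBarRelease and MemBarVolatile having been elided
+// via unnecessary_release and unnecessary_volatile.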
+
+// predicate controlling translation of CAS
+//
+// returns true if CAS needs to use an acquiring load otherwise false
+
+bool needs_acquiring_load_exclusive(const Node *n)
+{
+  assert(is_CAS(n->Opcode()), "expecting a compare and swap");
+  if (UseBarriersForVolatile) {
+    return false;
+  }
+
+  // CAS nodes only ought to turn up in inlined unsafe CAS operations
+#ifdef ASSERT
+  LoadStoreNode *st = n->as_LoadStore();
+
+  // the store must be fed by a membar
+
+  Node *x = st->lookup(StoreNode::Memory);
+
+  assert(x && x->is_Proj(), "CAS not fed by memory proj!");
+
+  ProjNode *proj = x->as_Proj();
+
+  x = proj->lookup(0);
+
+  assert(x && x->is_MemBar(), "CAS not fed by membar!");
+
+  MemBarNode *barrier = x->as_MemBar();
+
+  // the barrier must be a cpuorder membar fed by a release membar
+
+  assert(barrier->Opcode() == Op_MemBarCPUOrder,
+         "CAS not fed by cpuorder membar!");
+
+  MemBarNode *b = parent_membar(barrier);
+  assert(b != NULL && b->Opcode() == Op_MemBarRelease,
+         "CAS not fed by cpuorder+release membar pair!");
+
+  // does this lead a normal subgraph?
+  MemBarNode *mbar = leading_to_normal(barrier);
+
+  assert(mbar != NULL, "CAS not embedded in normal graph!");
+
+  assert(mbar->Opcode() == Op_MemBarAcquire, "trailing membar should be an acquire");
+#endif // ASSERT
+  // so we can just return true here
+  return true;
+}
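+
+// The shape asserted above is what C2 builds when inlining an unsafe
+// CAS intrinsic, e.g. (a sketch; the other CompareAndSwapX forms are
+// analogous)
+//
+//   UNSAFE.compareAndSwapInt(obj, offset, expect, update);
+//
+// which expands to
+//
+//   MemBarRelease -> MemBarCPUOrder -> CompareAndSwapI
+//     -> MemBarCPUOrder -> MemBarAcquire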
+
+// predicate controlling translation of StoreCM
+//
+// returns true if the StoreStore barrier that would otherwise
+// precede the card write can be omitted, otherwise false
+
+bool unnecessary_storestore(const Node *storecm)
+{
+  assert(storecm->Opcode() == Op_StoreCM, "expecting a StoreCM");
+
+  // we only ever need to generate a dmb ishst between an object put
+  // and the associated card mark when we are using CMS without
+  // conditional card marking
+
+  if (!UseConcMarkSweepGC || UseCondCardMark) {
+    return true;
+  }
+
+  // if we are implementing volatile puts using barriers then the
+  // object put is implemented as an str so we must insert the dmb
+  // ishst
+
+  if (UseBarriersForVolatile) {
+    return false;
+  }
+
+  // we can omit the dmb ishst if this StoreCM is part of a volatile
+  // put because in that case the put will be implemented by stlr
+  //
+  // we need to check for a normal subgraph feeding this StoreCM.
+  // that means the StoreCM must be fed Memory from a leading membar,
+  // either a MemBarRelease or its dependent MemBarCPUOrder, and the
+  // leading membar must be part of a normal subgraph
+
+  Node *x = storecm->in(StoreNode::Memory);
+
+  if (!x->is_Proj()) {
+    return false;
+  }
+
+  x = x->in(0);
+
+  if (!x->is_MemBar()) {
+    return false;
+  }
+
+  MemBarNode *leading = x->as_MemBar();
+
+  // reject invalid candidates
+  if (!leading_membar(leading)) {
+    return false;
+  }
+
+  // we can omit the StoreStore if it is the head of a normal subgraph
+  return (leading_to_normal(leading) != NULL);
+}
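+
+// So, for example, with CMS and unconditional card marking a
+// volatile oop put and its card mark compile to (sketch)
+//
+//   stlr rval, [radr]      // object put -- publishes prior writes
+//   strb zr, [rcard]       // StoreCM -- dmb ishst not needed
+//
+// whereas when the put is a plain str the StoreCM matches
+// storeimmCM0_ordered and we emit
+//
+//   str  rval, [radr]
+//   dmb  ishst
+//   strb zr, [rcard]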
+
+
 #define __ _masm.
 
  // advance declarations for helper functions to convert register
@@ -2056,6 +3900,13 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}
 
+  enc_class aarch64_enc_strb0_ordered(memory mem) %{
+    MacroAssembler _masm(&cbuf);
+    __ membar(Assembler::StoreStore);
+    loadStore(_masm, &MacroAssembler::strb, zr, $mem->opcode(),
+               as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   enc_class aarch64_enc_strh(iRegI src, memory mem) %{
     Register src_reg = as_Register($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::strh, src_reg, $mem->opcode(),
@@ -2396,80 +4247,35 @@
 
   enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {      
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxr(rscratch1, addr_reg);
-    __ cmp(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxr(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr);
   %}
 
   enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
     MacroAssembler _masm(&cbuf);
-    Register old_reg = as_Register($oldval$$reg);
-    Register new_reg = as_Register($newval$$reg);
-    Register base = as_Register($mem$$base);
-    Register addr_reg;
-    int index = $mem$$index;
-    int scale = $mem$$scale;
-    int disp = $mem$$disp;
-    if (index == -1) {
-       if (disp != 0) {      
-        __ lea(rscratch2, Address(base, disp));
-        addr_reg = rscratch2;
-      } else {
-        // TODO
-        // should we ever get anything other than this case?
-        addr_reg = base;
-      }
-    } else {
-      Register index_reg = as_Register(index);
-      if (disp == 0) {
-        __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      } else {
-        __ lea(rscratch2, Address(base, disp));
-        __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
-        addr_reg = rscratch2;
-      }
-    }
-    Label retry_load, done;
-    __ bind(retry_load);
-    __ ldxrw(rscratch1, addr_reg);
-    __ cmpw(rscratch1, old_reg);
-    __ br(Assembler::NE, done);
-    __ stlxrw(rscratch1, new_reg, addr_reg);
-    __ cbnzw(rscratch1, retry_load);
-    __ bind(done);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
+  %}
+
+
+  // The only difference between aarch64_enc_cmpxchg and
+  // aarch64_enc_cmpxchg_acq is that we use load-acquire in the
+  // CompareAndSwap sequence to serve as a barrier on acquiring a
+  // lock.
+  enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr);
+  %}
+
+  enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
+    MacroAssembler _masm(&cbuf);
+    guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
+    __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
+               &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
   %}
 
   // auxiliary used for CompareAndSwapX to set result register
@@ -5380,7 +7186,7 @@
 instruct loadB(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadB mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsbw  $dst, $mem\t# byte" %}
@@ -5394,7 +7200,7 @@
 instruct loadB2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadB mem)));
-  // predicate(n->in(1)->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n->in(1)));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsb  $dst, $mem\t# byte" %}
@@ -5408,7 +7214,7 @@
 instruct loadUB(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadUB mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrbw  $dst, $mem\t# byte" %}
@@ -5422,7 +7228,7 @@
 instruct loadUB2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadUB mem)));
-  // predicate(n->in(1)->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n->in(1)));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrb  $dst, $mem\t# byte" %}
@@ -5436,7 +7242,7 @@
 instruct loadS(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadS mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrshw  $dst, $mem\t# short" %}
@@ -5450,7 +7256,7 @@
 instruct loadS2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadS mem)));
-  // predicate(n->in(1)->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n->in(1)));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsh  $dst, $mem\t# short" %}
@@ -5464,7 +7270,7 @@
 instruct loadUS(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadUS mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrh  $dst, $mem\t# short" %}
@@ -5478,7 +7284,7 @@
 instruct loadUS2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadUS mem)));
-  // predicate(n->in(1)->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n->in(1)));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrh  $dst, $mem\t# short" %}
@@ -5492,7 +7298,7 @@
 instruct loadI(iRegINoSp dst, memory mem)
 %{
   match(Set dst (LoadI mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# int" %}
@@ -5506,7 +7312,7 @@
 instruct loadI2L(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (ConvI2L (LoadI mem)));
-  // predicate(n->in(1)->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n->in(1)));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrsw  $dst, $mem\t# int" %}
@@ -5520,7 +7326,7 @@
 instruct loadUI2L(iRegLNoSp dst, memory mem, immL_32bits mask)
 %{
   match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
-  // predicate(n->in(1)->in(1)->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n->in(1)->in(1)->as_Load()));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# int" %}
@@ -5534,7 +7340,7 @@
 instruct loadL(iRegLNoSp dst, memory mem)
 %{
   match(Set dst (LoadL mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldr  $dst, $mem\t# int" %}
@@ -5561,7 +7367,7 @@
 instruct loadP(iRegPNoSp dst, memory mem)
 %{
   match(Set dst (LoadP mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldr  $dst, $mem\t# ptr" %}
@@ -5575,7 +7381,7 @@
 instruct loadN(iRegNNoSp dst, memory mem)
 %{
   match(Set dst (LoadN mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# compressed ptr" %}
@@ -5589,7 +7395,7 @@
 instruct loadKlass(iRegPNoSp dst, memory mem)
 %{
   match(Set dst (LoadKlass mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldr  $dst, $mem\t# class" %}
@@ -5603,7 +7409,7 @@
 instruct loadNKlass(iRegNNoSp dst, memory mem)
 %{
   match(Set dst (LoadNKlass mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrw  $dst, $mem\t# compressed class ptr" %}
@@ -5617,7 +7423,7 @@
 instruct loadF(vRegF dst, memory mem)
 %{
   match(Set dst (LoadF mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrs  $dst, $mem\t# float" %}
@@ -5631,7 +7437,7 @@
 instruct loadD(vRegD dst, memory mem)
 %{
   match(Set dst (LoadD mem));
-  // predicate(n->as_Load()->is_unordered());
+  predicate(!needs_acquiring_load(n));
 
   ins_cost(4 * INSN_COST);
   format %{ "ldrd  $dst, $mem\t# double" %}
@@ -5849,6 +7655,7 @@
 instruct storeimmCM0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreCM mem zero));
+  predicate(unnecessary_storestore(n));
 
   ins_cost(INSN_COST);
   format %{ "strb zr, $mem\t# byte" %}
@@ -5858,11 +7665,26 @@
   ins_pipe(istore_mem);
 %}
 
+// Store CMS card-mark Immediate with an intervening StoreStore
+// barrier, needed when using CMS without conditional card marking
+instruct storeimmCM0_ordered(immI0 zero, memory mem)
+%{
+  match(Set mem (StoreCM mem zero));
+
+  ins_cost(INSN_COST * 2);
+  format %{ "dmb ishst"
+      "\n\tstrb zr, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_strb0_ordered(mem));
+
+  ins_pipe(istore_mem);
+%}
+
 // Store Byte
 instruct storeB(iRegIorL2I src, memory mem)
 %{
   match(Set mem (StoreB mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strb  $src, $mem\t# byte" %}
@@ -5876,7 +7698,7 @@
 instruct storeimmB0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreB mem zero));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strb zr, $mem\t# byte" %}
@@ -5890,7 +7712,7 @@
 instruct storeC(iRegIorL2I src, memory mem)
 %{
   match(Set mem (StoreC mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strh  $src, $mem\t# short" %}
@@ -5903,7 +7725,7 @@
 instruct storeimmC0(immI0 zero, memory mem)
 %{
   match(Set mem (StoreC mem zero));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strh  zr, $mem\t# short" %}
@@ -5918,7 +7740,7 @@
 instruct storeI(iRegIorL2I src, memory mem)
 %{
   match(Set mem(StoreI mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strw  $src, $mem\t# int" %}
@@ -5931,7 +7753,7 @@
 instruct storeimmI0(immI0 zero, memory mem)
 %{
   match(Set mem(StoreI mem zero));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strw  zr, $mem\t# int" %}
@@ -5945,7 +7767,7 @@
 instruct storeL(iRegL src, memory mem)
 %{
   match(Set mem (StoreL mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "str  $src, $mem\t# int" %}
@@ -5959,7 +7781,7 @@
 instruct storeimmL0(immL0 zero, memory mem)
 %{
   match(Set mem (StoreL mem zero));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "str  zr, $mem\t# int" %}
@@ -5973,7 +7795,7 @@
 instruct storeP(iRegP src, memory mem)
 %{
   match(Set mem (StoreP mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "str  $src, $mem\t# ptr" %}
@@ -5987,7 +7809,7 @@
 instruct storeimmP0(immP0 zero, memory mem)
 %{
   match(Set mem (StoreP mem zero));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "str zr, $mem\t# ptr" %}
@@ -6046,7 +7868,7 @@
 instruct storeN(iRegN src, memory mem)
 %{
   match(Set mem (StoreN mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strw  $src, $mem\t# compressed ptr" %}
@@ -6060,9 +7882,8 @@
 %{
   match(Set mem (StoreN mem zero));
   predicate(Universe::narrow_oop_base() == NULL &&
-            Universe::narrow_klass_base() == NULL//  &&
-	    // n->as_Store()->is_unordered()
-	    );
+            Universe::narrow_klass_base() == NULL  &&
+            (!needs_releasing_store(n)));
 
   ins_cost(INSN_COST);
   format %{ "strw  rheapbase, $mem\t# compressed ptr (rheapbase==0)" %}
@@ -6076,7 +7897,7 @@
 instruct storeF(vRegF src, memory mem)
 %{
   match(Set mem (StoreF mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strs  $src, $mem\t# float" %}
@@ -6093,7 +7914,7 @@
 instruct storeD(vRegD src, memory mem)
 %{
   match(Set mem (StoreD mem src));
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
 
   ins_cost(INSN_COST);
   format %{ "strd  $src, $mem\t# double" %}
@@ -6106,7 +7927,7 @@
 // Store Compressed Klass Pointer
 instruct storeNKlass(iRegN src, memory mem)
 %{
-//   predicate(n->as_Store()->is_unordered());
+  predicate(!needs_releasing_store(n));
   match(Set mem (StoreNKlass mem src));
 
   ins_cost(INSN_COST);
@@ -6156,6 +7977,312 @@
   ins_pipe(iload_prefetch);
 %}
 
+//  ---------------- volatile loads and stores ----------------
+
+// Load Byte (8 bit signed)
+instruct loadB_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadB mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarsb  $dst, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_ldarsb(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Byte (8 bit signed) into long
+instruct loadB2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadB mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarsb  $dst, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_ldarsb(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Byte (8 bit unsigned)
+instruct loadUB_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadUB mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarb  $dst, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_ldarb(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Byte (8 bit unsigned) into long
+instruct loadUB2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadUB mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarb  $dst, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_ldarb(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Short (16 bit signed)
+instruct loadS_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadS mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarshw  $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarshw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+instruct loadUS_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadUS mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarhw  $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarhw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Short/Char (16 bit unsigned) into long
+instruct loadUS2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadUS mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarh  $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarh(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Short (16 bit signed) into long
+instruct loadS2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadS mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarh  $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarsh(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Integer (32 bit signed)
+instruct loadI_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadI mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarw  $dst, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_ldarw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Integer (32 bit unsigned) into long
+instruct loadUI2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem, immL_32bits mask)
+%{
+  match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarw  $dst, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_ldarw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Long (64 bit signed)
+instruct loadL_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadL mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldar  $dst, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_ldar(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Pointer
+instruct loadP_volatile(iRegPNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadP mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldar  $dst, $mem\t# ptr" %}
+
+  ins_encode(aarch64_enc_ldar(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Compressed Pointer
+instruct loadN_volatile(iRegNNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadN mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarw  $dst, $mem\t# compressed ptr" %}
+
+  ins_encode(aarch64_enc_ldarw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Float
+instruct loadF_volatile(vRegF dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadF mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldars  $dst, $mem\t# float" %}
+
+  ins_encode( aarch64_enc_fldars(dst, mem) );
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Double
+instruct loadD_volatile(vRegD dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadD mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldard  $dst, $mem\t# double" %}
+
+  ins_encode( aarch64_enc_fldard(dst, mem) );
+
+  ins_pipe(pipe_serial);
+%}
+
+// Store Byte
+instruct storeB_volatile(iRegIorL2I src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreB mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlrb  $src, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_stlrb(src, mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Char/Short
+instruct storeC_volatile(iRegIorL2I src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreC mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlrh  $src, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_stlrh(src, mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Integer
+
+instruct storeI_volatile(iRegIorL2I src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem(StoreI mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlrw  $src, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_stlrw(src, mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Long (64 bit signed)
+instruct storeL_volatile(iRegL src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreL mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlr  $src, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_stlr(src, mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Pointer
+instruct storeP_volatile(iRegP src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreP mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlr  $src, $mem\t# ptr" %}
+
+  ins_encode(aarch64_enc_stlr(src, mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Compressed Pointer
+instruct storeN_volatile(iRegN src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreN mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlrw  $src, $mem\t# compressed ptr" %}
+
+  ins_encode(aarch64_enc_stlrw(src, mem));
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Float
+instruct storeF_volatile(vRegF src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreF mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlrs  $src, $mem\t# float" %}
+
+  ins_encode( aarch64_enc_fstlrs(src, mem) );
+
+  ins_pipe(pipe_class_memory);
+%}
+
+// TODO
+// implement storeImmF0 and storeFImmPacked
+
+// Store Double
+instruct storeD_volatile(vRegD src, /* sync_memory*/indirect mem)
+%{
+  match(Set mem (StoreD mem src));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "stlrd  $src, $mem\t# double" %}
+
+  ins_encode( aarch64_enc_fstlrd(src, mem) );
+
+  ins_pipe(pipe_class_memory);
+%}
+
+//  ---------------- end of volatile loads and stores ----------------
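+
+// n.b. the volatile load and store rules above carry no predicate;
+// they are selected because the corresponding plain loadX/storeX
+// rules are predicated on !needs_acquiring_load(n) or
+// !needs_releasing_store(n), so when one of those predicates fails
+// only the ldar/stlr form can match.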
+
 // ============================================================================
 // BSWAP Instructions
 
@@ -6373,6 +8500,20 @@
   ins_pipe(pipe_serial);
 %}
 
+instruct unnecessary_membar_acquire() %{
+  predicate(unnecessary_acquire(n));
+  match(MemBarAcquire);
+  ins_cost(0);
+
+  format %{ "membar_acquire (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_acquire (elided)");
+  %}
+
+  ins_pipe(pipe_class_empty);
+%}
+
 instruct membar_acquire() %{
   match(MemBarAcquire);
   ins_cost(VOLATILE_REF_COST);
@@ -6380,6 +8521,7 @@
   format %{ "membar_acquire" %}
 
   ins_encode %{
+    __ block_comment("membar_acquire");
     __ membar(Assembler::LoadLoad|Assembler::LoadStore);
   %}
 
@@ -6412,6 +8554,19 @@
   ins_pipe(pipe_serial);
 %}
 
+instruct unnecessary_membar_release() %{
+  predicate(unnecessary_release(n));
+  match(MemBarRelease);
+  ins_cost(0);
+
+  format %{ "membar_release (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_release (elided)");
+  %}
+  ins_pipe(pipe_serial);
+%}
+
 instruct membar_release() %{
   match(MemBarRelease);
   ins_cost(VOLATILE_REF_COST);
@@ -6419,6 +8574,7 @@
   format %{ "membar_release" %}
 
   ins_encode %{
+    __ block_comment("membar_release");
     __ membar(Assembler::LoadStore|Assembler::StoreStore);
   %}
   ins_pipe(pipe_serial);
@@ -6449,6 +8605,20 @@
   ins_pipe(pipe_serial);
 %}
 
+instruct unnecessary_membar_volatile() %{
+  predicate(unnecessary_volatile(n));
+  match(MemBarVolatile);
+  ins_cost(0);
+
+  format %{ "membar_volatile (elided)" %}
+
+  ins_encode %{
+    __ block_comment("membar_volatile (elided)");
+  %}
+
+  ins_pipe(pipe_serial);
+%}
+
 instruct membar_volatile() %{
   match(MemBarVolatile);
   ins_cost(VOLATILE_REF_COST*100);
@@ -6456,6 +8626,7 @@
   format %{ "membar_volatile" %}
 
   ins_encode %{
+    __ block_comment("membar_volatile");
     __ membar(Assembler::StoreLoad);
     %}
 
@@ -6751,9 +8922,13 @@
 // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher
 // can't match them
 
+// standard CompareAndSwapX when we are using barriers
+// these have a higher ins_cost than the predicated Acq rules below,
+// so whenever needs_acquiring_load_exclusive holds the cheaper Acq
+// rules are selected in preference
 instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -6771,6 +8946,7 @@
 instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -6788,6 +8964,7 @@
 instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -6805,6 +8982,7 @@
 instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
@@ -6820,6 +8998,85 @@
 %}
 
 
+// alternative CompareAndSwapX when we are eliding barriers
+
+instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchgw_acq $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchg_acq $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchg_acq $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchgw_acq $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+
 instruct get_and_setI(indirect mem, iRegINoSp newv, iRegI prev) %{
   match(Set prev (GetAndSetI mem newv));
   format %{ "atomic_xchgw  $prev, $newv, [$mem]" %}
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Mon Sep 21 10:36:36 2015 -0400
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Oct 08 11:06:07 2015 -0400
@@ -96,8 +96,8 @@
   product(bool, NearCpool, true,                                        \
          "constant pool is close to instructions")                      \
                                                                         \
-  notproduct(bool, UseAcqRelForVolatileFields, false,			\
-	     "Use acquire and release insns for volatile fields")       \
+  product(bool, UseBarriersForVolatile, false,                          \
+          "Use memory barriers to implement volatile accesses")         \
                                                                         \
   product(bool, UseCRC32, false,                                        \
           "Use CRC32 instructions for CRC32 computation")               \
@@ -115,8 +115,8 @@
   product(bool, NearCpool, true,					\
          "constant pool is close to instructions")			\
                                                                         \
-  notproduct(bool, UseAcqRelForVolatileFields, false,			\
-	     "Use acquire and release insns for volatile fields")       \
+  product(bool, UseBarriersForVolatile, false,                          \
+          "Use memory barriers to implement volatile accesses")         \
   product(bool, UseNeon, false,                                         \
           "Use Neon for CRC32 computation")                             \
   product(bool, UseCRC32, false,                                        \
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Sep 21 10:36:36 2015 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Oct 08 11:06:07 2015 -0400
@@ -3811,14 +3811,6 @@
   }
 }
 
-  bool MacroAssembler::use_acq_rel_for_volatile_fields() {
-#ifdef PRODUCT
-    return false;
-#else
-    return UseAcqRelForVolatileFields;
-#endif
-  }
-
 void MacroAssembler::build_frame(int framesize) {
   if (framesize == 0) {
     // Is this even possible?
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Sep 21 10:36:36 2015 -0400
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Oct 08 11:06:07 2015 -0400
@@ -923,6 +923,23 @@
     str(rscratch2, adr);
   }
 
+  // A generic CAS; success or failure is in the EQ flag.
+  template <typename T1, typename T2>
+  void cmpxchg(Register addr, Register expected, Register new_val,
+               T1 load_insn,
+               void (MacroAssembler::*cmp_insn)(Register, Register),
+               T2 store_insn,
+               Register tmp = rscratch1) {
+    Label retry_load, done;
+    bind(retry_load);
+    (this->*load_insn)(tmp, addr);
+    (this->*cmp_insn)(tmp, expected);
+    br(Assembler::NE, done);
+    (this->*store_insn)(tmp, new_val, addr);
+    cbnzw(tmp, retry_load);
+    bind(done);
+  }
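+
+  // For example, the aarch64.ad cmpxchg encodings call this as
+  //
+  //   cmpxchg(addr, oldval, newval,
+  //           &Assembler::ldxr, &MacroAssembler::cmp,
+  //           &Assembler::stlxr);
+  //
+  // leaving EQ set if the swap succeeded and NE if it failed.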
+
   // Calls
 
   // void call(Label& L, relocInfo::relocType rtype);
@@ -1090,9 +1107,6 @@
   address read_polling_page(Register r, address page, relocInfo::relocType rtype);
   address read_polling_page(Register r, relocInfo::relocType rtype);
 
-  // Used by aarch64.ad to control code generation
-  static bool use_acq_rel_for_volatile_fields();
-
   // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
   void update_byte_crc32(Register crc, Register val, Register table);
   void update_word_crc32(Register crc, Register v, Register tmp,
@@ -1186,10 +1200,6 @@
   }
 };
 
-// Used by aarch64.ad to control code generation
-#define treat_as_volatile(MEM_NODE)					\
-  (MacroAssembler::use_acq_rel_for_volatile_fields() ? (MEM_NODE)->is_volatile() : false)
-
 #ifdef ASSERT
 inline bool AbstractAssembler::pd_check_instruction_mark() { return false; }
 #endif
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Sep 21 10:36:36 2015 -0400
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Oct 08 11:06:07 2015 -0400
@@ -224,6 +224,10 @@
     UseMultiplyToLenIntrinsic = true;
   }
 
+  if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) {
+    UseBarriersForVolatile = (_cpuFeatures & CPU_DMB_ATOMICS) != 0;
+  }
+
   if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
     UsePopCountInstruction = true;
   }
--- a/src/share/vm/opto/graphKit.cpp	Mon Sep 21 10:36:36 2015 -0400
+++ b/src/share/vm/opto/graphKit.cpp	Thu Oct 08 11:06:07 2015 -0400
@@ -3803,7 +3803,7 @@
 
   // Smash zero into card
   if( !UseConcMarkSweepGC ) {
-    __ store(__ ctrl(), card_adr, zero, bt, adr_type, MemNode::release);
+    __ store(__ ctrl(), card_adr, zero, bt, adr_type, MemNode::unordered);
   } else {
     // Specialized path for CM store barrier
     __ storeCM(__ ctrl(), card_adr, zero, oop_store, adr_idx, bt, adr_type);