Mercurial: openjdk / aarch64-port / hotspot
changeset 8568:3ccdd3482827
Backport optimization of volatile puts/gets and CAS to use ldar/stlr

author   | adinn
date     | Thu, 08 Oct 2015 11:06:07 -0400
parents  | f9720c487762
children | acaa2f3db91b
files    | src/cpu/aarch64/vm/aarch64.ad src/cpu/aarch64/vm/globals_aarch64.hpp src/cpu/aarch64/vm/macroAssembler_aarch64.cpp src/cpu/aarch64/vm/macroAssembler_aarch64.hpp src/cpu/aarch64/vm/vm_version_aarch64.cpp src/share/vm/opto/graphKit.cpp
diffstat | 6 files changed, 2388 insertions(+), 125 deletions(-)
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad Mon Sep 21 10:36:36 2015 -0400 +++ b/src/cpu/aarch64/vm/aarch64.ad Thu Oct 08 11:06:07 2015 -0400 @@ -962,10 +962,1854 @@ } }; + // graph traversal helpers + + MemBarNode *parent_membar(const Node *n); + MemBarNode *child_membar(const MemBarNode *n); + bool leading_membar(const MemBarNode *barrier); + + bool is_card_mark_membar(const MemBarNode *barrier); + bool is_CAS(int opcode); + + MemBarNode *leading_to_normal(MemBarNode *leading); + MemBarNode *normal_to_leading(const MemBarNode *barrier); + MemBarNode *card_mark_to_trailing(const MemBarNode *barrier); + MemBarNode *trailing_to_card_mark(const MemBarNode *trailing); + MemBarNode *trailing_to_leading(const MemBarNode *trailing); + + // predicates controlling emit of ldr<x>/ldar<x> and associated dmb + + bool unnecessary_acquire(const Node *barrier); + bool needs_acquiring_load(const Node *load); + + // predicates controlling emit of str<x>/stlr<x> and associated dmbs + + bool unnecessary_release(const Node *barrier); + bool unnecessary_volatile(const Node *barrier); + bool needs_releasing_store(const Node *store); + + // predicate controlling translation of CompareAndSwapX + bool needs_acquiring_load_exclusive(const Node *load); + + // predicate controlling translation of StoreCM + bool unnecessary_storestore(const Node *storecm); %} source %{ + // Optimization of volatile gets and puts + // ------------------------------------- + // + // AArch64 has ldar<x> and stlr<x> instructions which we can safely + // use to implement volatile reads and writes. For a volatile read + // we simply need + // + // ldar<x> + // + // and for a volatile write we need + // + // stlr<x> + // + // Alternatively, we can implement them by pairing a normal + // load/store with a memory barrier. For a volatile read we need + // + // ldr<x> + // dmb ishld + // + // for a volatile write + // + // dmb ish + // str<x> + // dmb ish + // + // We can also use ldaxr and stlxr to implement compare and swap (CAS) + // sequences. These are normally translated to an instruction + // sequence like the following + // + // dmb ish + // retry: + // ldxr<x> rval raddr + // cmp rval rold + // b.ne done + // stlxr<x> rval, rnew, raddr + // cbnz rval retry + // done: + // cset r0, eq + // dmb ishld + // + // Note that the exclusive store is already using an stlxr + // instruction. That is required to ensure visibility to other + // threads of the exclusive write (assuming it succeeds) before that + // of any subsequent writes. + // + // The following instruction sequence is an improvement on the above + // + // retry: + // ldaxr<x> rval raddr + // cmp rval rold + // b.ne done + // stlxr<x> rval, rnew, raddr + // cbnz rval retry + // done: + // cset r0, eq + // + // We don't need the leading dmb ish since the stlxr guarantees + // visibility of prior writes in the case that the swap is + // successful. Crucially we don't have to worry about the case where + // the swap is not successful since no valid program should be + // relying on visibility of prior changes by the attempting thread + // in the case where the CAS fails. + // + // Similarly, we don't need the trailing dmb ishld if we substitute + // an ldaxr instruction since that will provide all the guarantees we + // require regarding observation of changes made by other threads + // before any change to the CAS address observed by the load.
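As an aside, the same lowering can be observed with standard C++11 atomics, which AArch64 compilers translate to the ldar/stlr and ldaxr/stlxr forms discussed above; a minimal sketch for comparison (illustrative only, not part of this changeset):

    #include <atomic>

    std::atomic<long> v;

    long volatile_read() {
      return v.load(std::memory_order_acquire);    // compiles to: ldar
    }

    void volatile_write(long x) {
      v.store(x, std::memory_order_release);       // compiles to: stlr
    }

    bool cas(long& expected, long desired) {
      // compiles to an ldaxr/stlxr retry loop with no leading dmb ish
      // and no trailing dmb ishld, i.e. the improved sequence above
      return v.compare_exchange_strong(expected, desired);
    }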
+ // + // In order to generate the desired instruction sequence we need to + // be able to identify specific 'signature' ideal graph node + // sequences which i) occur as a translation of volatile reads or + // writes or CAS operations and ii) do not occur through any other + // translation or graph transformation. We can then provide + // alternative adlc matching rules which translate these node + // sequences to the desired machine code sequences. Selection of the + // alternative rules can be implemented by predicates which identify + // the relevant node sequences. + // + // The ideal graph generator translates a volatile read to the node + // sequence + // + // LoadX[mo_acquire] + // MemBarAcquire + // + // As a special case when using the compressed oops optimization we + // may also see this variant + // + // LoadN[mo_acquire] + // DecodeN + // MemBarAcquire + // + // A volatile write is translated to the node sequence + // + // MemBarRelease + // StoreX[mo_release] {CardMark}-optional + // MemBarVolatile + // + // n.b. the above node patterns are generated with a strict + // 'signature' configuration of input and output dependencies (see + // the predicates below for exact details). The card mark may be as + // simple as a few extra nodes or, in a few GC configurations, may + // include more complex control flow between the leading and + // trailing memory barriers. However, whatever the card mark + // configuration these signatures are unique to translated volatile + // reads/stores -- they will not appear as a result of any other + // bytecode translation or inlining nor as a consequence of + // optimizing transforms. + // + // We also want to catch inlined unsafe volatile gets and puts and + // be able to implement them using either ldar<x>/stlr<x> or some + // combination of ldr<x>/str<x> and dmb instructions. + // + // Inlined unsafe volatile puts manifest as a minor variant of the + // normal volatile put node sequence containing an extra cpuorder + // membar + // + // MemBarRelease + // MemBarCPUOrder + // StoreX[mo_release] {CardMark}-optional + // MemBarVolatile + // + // n.b. as an aside, the cpuorder membar is not itself subject to + // matching and translation by adlc rules. However, the rule + // predicates need to detect its presence in order to correctly + // select the desired adlc rules. + // + // Inlined unsafe volatile gets manifest as a somewhat different + // node sequence to a normal volatile get + // + // MemBarCPUOrder + // || \\ + // MemBarAcquire LoadX[mo_acquire] + // || + // MemBarCPUOrder + // + // In this case the acquire membar does not directly depend on the + // load. However, we can be sure that the load is generated from an + // inlined unsafe volatile get if we see it dependent on this unique + // sequence of membar nodes. Similarly, given an acquire membar we + // can know that it was added because of an inlined unsafe volatile + // get if it is fed and feeds a cpuorder membar and if its feed + // membar also feeds an acquiring load. + // + // Finally an inlined (Unsafe) CAS operation is translated to the + // following ideal graph + // + // MemBarRelease + // MemBarCPUOrder + // CompareAndSwapX {CardMark}-optional + // MemBarCPUOrder + // MemBarAcquire + // + // So, where we can identify these volatile read and write + // signatures we can choose to plant either of the above two code + // sequences. For a volatile read we can simply plant a normal + // ldr<x> and translate the MemBarAcquire to a dmb.
However, we can + // also choose to inhibit translation of the MemBarAcquire and + // inhibit planting of the ldr<x>, instead planting an ldar<x>. + // + // When we recognise a volatile store signature we can choose to + // plant a dmb ish as a translation for the MemBarRelease, a + // normal str<x> and then a dmb ish for the MemBarVolatile. + // Alternatively, we can inhibit translation of the MemBarRelease + // and MemBarVolatile and instead plant a simple stlr<x> + // instruction. + // + // When we recognise a CAS signature we can choose to plant a dmb + // ish as a translation for the MemBarRelease, the conventional + // macro-instruction sequence for the CompareAndSwap node (which + // uses ldxr<x>) and then a dmb ishld for the MemBarAcquire. + // Alternatively, we can elide generation of the dmb instructions + // and plant the alternative CompareAndSwap macro-instruction + // sequence (which uses ldaxr<x>). + // + // Of course, the above only applies when we see these signature + // configurations. We still want to plant dmb instructions in any + // other cases where we may see a MemBarAcquire, MemBarRelease or + // MemBarVolatile. For example, at the end of a constructor which + // writes final/volatile fields we will see a MemBarRelease + // instruction and this needs a 'dmb ish' lest we risk the + // constructed object being visible without making the + // final/volatile field writes visible. + // + // n.b. the translation rules below which rely on detection of the + // volatile signatures and insert ldar<x> or stlr<x> are failsafe. + // If we see anything other than the signature configurations we + // always just translate the loads and stores to ldr<x> and str<x> + // and translate acquire, release and volatile membars to the + // relevant dmb instructions. + // + + // graph traversal helpers used for volatile put/get and CAS + // optimization + + // 1) general purpose helpers + + // if node n is linked to a parent MemBarNode by an intervening + // Control and Memory ProjNode return the MemBarNode otherwise return + // NULL. + // + // n may only be a Load or a MemBar. + + MemBarNode *parent_membar(const Node *n) + { + Node *ctl = NULL; + Node *mem = NULL; + Node *membar = NULL; + + if (n->is_Load()) { + ctl = n->lookup(LoadNode::Control); + mem = n->lookup(LoadNode::Memory); + } else if (n->is_MemBar()) { + ctl = n->lookup(TypeFunc::Control); + mem = n->lookup(TypeFunc::Memory); + } else { + return NULL; + } + + if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj()) { + return NULL; + } + + membar = ctl->lookup(0); + + if (!membar || !membar->is_MemBar()) { + return NULL; + } + + if (mem->lookup(0) != membar) { + return NULL; + } + + return membar->as_MemBar(); + } + + // if n is linked to a child MemBarNode by intervening Control and + // Memory ProjNodes return the MemBarNode otherwise return NULL. + + MemBarNode *child_membar(const MemBarNode *n) + { + ProjNode *ctl = n->proj_out(TypeFunc::Control); + ProjNode *mem = n->proj_out(TypeFunc::Memory); + + // MemBar needs to have both a Ctl and Mem projection + if (! ctl || ! mem) + return NULL; + + MemBarNode *child = NULL; + Node *x; + + for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) { + x = ctl->fast_out(i); + // if we see a membar we keep hold of it.
we may also see a new + // arena copy of the original but it will appear later + if (x->is_MemBar()) { + child = x->as_MemBar(); + break; + } + } + + if (child == NULL) { + return NULL; + } + + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + x = mem->fast_out(i); + // if we see a membar we keep hold of it. we may also see a new + // arena copy of the original but it will appear later + if (x == child) { + return child; + } + } + return NULL; + } + + // helper predicate used to filter candidates for a leading memory + // barrier + // + // returns true if barrier is a MemBarRelease or a MemBarCPUOrder + // whose Ctl and Mem feeds come from a MemBarRelease otherwise false + + bool leading_membar(const MemBarNode *barrier) + { + int opcode = barrier->Opcode(); + // if this is a release membar we are ok + if (opcode == Op_MemBarRelease) { + return true; + } + // if it's a cpuorder membar . . . + if (opcode != Op_MemBarCPUOrder) { + return false; + } + // then the parent has to be a release membar + MemBarNode *parent = parent_membar(barrier); + if (!parent) { + return false; + } + opcode = parent->Opcode(); + return opcode == Op_MemBarRelease; + } + + // 2) card mark detection helper + + // helper predicate which can be used to detect a volatile membar + // introduced as part of a conditional card mark sequence either by + // G1 or by CMS when UseCondCardMark is true. + // + // membar can be definitively determined to be part of a card mark + // sequence if and only if all the following hold + // + // i) it is a MemBarVolatile + // + // ii) either UseG1GC or (UseConcMarkSweepGC && UseCondCardMark) is + // true + // + // iii) the node's Mem projection feeds a StoreCM node. + + bool is_card_mark_membar(const MemBarNode *barrier) + { + if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark)) { + return false; + } + + if (barrier->Opcode() != Op_MemBarVolatile) { + return false; + } + + ProjNode *mem = barrier->proj_out(TypeFunc::Memory); + + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax ; i++) { + Node *y = mem->fast_out(i); + if (y->Opcode() == Op_StoreCM) { + return true; + } + } + + return false; + } + + + // 3) helper predicates to traverse volatile put or CAS graphs which + // may contain GC barrier subgraphs + + // Preamble + // -------- + // + // for volatile writes we can omit generating barriers and employ a + // releasing store when we see a node sequence with a + // leading MemBarRelease and a trailing MemBarVolatile as follows + // + // MemBarRelease + // { || } -- optional + // {MemBarCPUOrder} + // || \\ + // || StoreX[mo_release] + // | \ / + // | MergeMem + // | / + // MemBarVolatile + // + // where + // || and \\ represent Ctl and Mem feeds via Proj nodes + // | \ and / indicate further routing of the Ctl and Mem feeds + // + // this is the graph we see for non-object stores. however, for a + // volatile Object store (StoreN/P) we may see other nodes below the + // leading membar because of the need for a GC pre- or post-write + // barrier. + // + // with most GC configurations we will see this simple variant which + // includes a post-write barrier card mark. + // + // MemBarRelease______________________________ + // || \\ Ctl \ \\ + // || StoreN/P[mo_release] CastP2X StoreB/CM + // | \ / . . . / + // | MergeMem + // | / + // || / + // MemBarVolatile + // + // i.e.
the leading membar feeds Ctl to a CastP2X (which converts + // the object address to an int used to compute the card offset) and + // Ctl+Mem to a StoreB node (which does the actual card mark). + // + // n.b. a StoreCM node will only appear in this configuration when + // using CMS. StoreCM differs from a normal card mark write (StoreB) + // because it implies a requirement to order visibility of the card + // mark (StoreCM) relative to the object put (StoreP/N) using a + // StoreStore memory barrier (arguably this ought to be represented + // explicitly in the ideal graph but that is not how it works). This + // ordering is required for both non-volatile and volatile + // puts. Normally that means we need to translate a StoreCM using + // the sequence + // + // dmb ishst + // strb + // + // However, in the case of a volatile put if we can recognise this + // configuration and plant an stlr for the object write then we can + // omit the dmb and just plant an strb since visibility of the stlr + // is ordered before visibility of subsequent stores. StoreCM nodes + // also arise when using G1 or using CMS with conditional card + // marking. In these cases (as we shall see) we don't need to insert + // the dmb when translating StoreCM because there is already an + // intervening StoreLoad barrier between it and the StoreP/N. + // + // It is also possible to perform the card mark conditionally on it + // currently being unmarked in which case the volatile put graph + // will look slightly different + // + // MemBarRelease____________________________________________ + // || \\ Ctl \ Ctl \ \\ Mem \ + // || StoreN/P[mo_release] CastP2X If LoadB | + // | \ / \ | + // | MergeMem . . . StoreB + // | / / + // || / + // MemBarVolatile + // + // It is worth noting at this stage that both the above + // configurations can be uniquely identified by checking that the + // memory flow includes the following subgraph: + // + // MemBarRelease + // {MemBarCPUOrder} + // | \ . . . + // | StoreX[mo_release] . . . + // | / + // MergeMem + // | + // MemBarVolatile + // + // This is referred to as a *normal* subgraph. It can easily be + // detected starting from any candidate MemBarRelease, + // StoreX[mo_release] or MemBarVolatile. + // + // A simple variation on this normal case occurs for an unsafe CAS + // operation. The basic graph for a non-object CAS is + // + // MemBarRelease + // || + // MemBarCPUOrder + // || \\ . . . + // || CompareAndSwapX + // || | + // || SCMemProj + // | \ / + // | MergeMem + // | / + // MemBarCPUOrder + // || + // MemBarAcquire + // + // The same basic variations on this arrangement (mutatis mutandis) + // occur when a card mark is introduced. i.e. we see the same basic + // shape but the StoreP/N is replaced with CompareAndSwapP/N and the + // tail of the graph is a pair comprising a MemBarCPUOrder + + // MemBarAcquire. + // + // So, in the case of a CAS the normal graph has the variant form + // + // MemBarRelease + // MemBarCPUOrder + // | \ . . . + // | CompareAndSwapX . . . + // | | + // | SCMemProj + // | / . . . + // MergeMem + // | + // MemBarCPUOrder + // MemBarAcquire + // + // This graph can also easily be detected starting from any + // candidate MemBarRelease, CompareAndSwapX or MemBarAcquire.
+ // + // the code below uses two helper predicates, leading_to_normal and + // normal_to_leading to identify these normal graphs, one validating + // the layout starting from the top membar and searching down and + // the other validating the layout starting from the lower membar + // and searching up. + // + // There are two special case GC configurations when a normal graph + // may not be generated: when using G1 (which always employs a + // conditional card mark); and when using CMS with conditional card + // marking configured. These GCs are both concurrent rather than + // stop-the-world GCs. So they introduce extra Ctl+Mem flow into the + // graph between the leading and trailing membar nodes, in + // particular enforcing stronger memory serialisation between the + // object put and the corresponding conditional card mark. CMS + // employs a post-write GC barrier while G1 employs both a pre- and + // post-write GC barrier. Of course the extra nodes may be absent -- + // they are only inserted for object puts. This significantly + // complicates the task of identifying whether a MemBarRelease, + // StoreX[mo_release] or MemBarVolatile forms part of a volatile put + // when using these GC configurations (see below). It adds similar + // complexity to the task of identifying whether a MemBarRelease, + // CompareAndSwapX or MemBarAcquire forms part of a CAS. + // + // In both cases the post-write subtree includes an auxiliary + // MemBarVolatile (StoreLoad barrier) separating the object put and + // the read of the corresponding card. This poses two additional + // problems. + // + // Firstly, a card mark MemBarVolatile needs to be distinguished + // from a normal trailing MemBarVolatile. Resolving this first + // problem is straightforward: a card mark MemBarVolatile always + // projects a Mem feed to a StoreCM node and that is a unique marker + // + // MemBarVolatile (card mark) + // C | \ . . . + // | StoreCM . . . + // . . . + // + // The second problem is how the code generator is to translate the + // card mark barrier. It always needs to be translated to a "dmb + // ish" instruction whether or not it occurs as part of a volatile + // put. A StoreLoad barrier is needed after the object put to ensure + // i) visibility to GC threads of the object put and ii) visibility + // to the mutator thread of any card clearing write by a GC + // thread. Clearly a normal store (str) will not guarantee this + // ordering but neither will a releasing store (stlr). The latter + // guarantees that the object put is visible but does not guarantee + // that writes by other threads have also been observed. + // + // So, returning to the task of translating the object put and the + // leading/trailing membar nodes: what do the non-normal node graphs + // look like for these 2 special cases? and how can we determine the + // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile + // in both normal and non-normal cases? + // + // A CMS GC post-barrier wraps its card write (StoreCM) inside an If + // which selects conditional execution based on the value loaded + // (LoadB) from the card. Ctl and Mem are fed to the If via an + // intervening StoreLoad barrier (MemBarVolatile).
+ // + // So, with CMS we may see a node graph for a volatile object store + // which looks like this + // + // MemBarRelease + // MemBarCPUOrder_(leading)__________________ + // C | M \ \\ C \ + // | \ StoreN/P[mo_release] CastP2X + // | Bot \ / + // | MergeMem + // | / + // MemBarVolatile (card mark) + // C | || M | + // | LoadB | + // | | | + // | Cmp |\ + // | / | \ + // If | \ + // | \ | \ + // IfFalse IfTrue | \ + // \ / \ | \ + // \ / StoreCM | + // \ / | | + // Region . . . | + // | \ / + // | . . . \ / Bot + // | MergeMem + // | | + // MemBarVolatile (trailing) + // + // The first MergeMem merges the AliasIdxBot Mem slice from the + // leading membar and the oopptr Mem slice from the Store into the + // card mark membar. The trailing MergeMem merges the AliasIdxBot + // Mem slice from the card mark membar and the AliasIdxRaw slice + // from the StoreCM into the trailing membar (n.b. the latter + // proceeds via a Phi associated with the If region). + // + // The graph for a CAS varies slightly, the obvious difference being + // that the StoreN/P node is replaced by a CompareAndSwapP/N node + // and the trailing MemBarVolatile by a MemBarCPUOrder + + // MemBarAcquire pair. The other important difference is that the + // CompareAndSwap node's SCMemProj is not merged into the card mark + // membar - it still feeds the trailing MergeMem. This also means + // that the card mark membar receives its Mem feed directly from the + // leading membar rather than via a MergeMem. + // + // MemBarRelease + // MemBarCPUOrder__(leading)_________________________ + // || \\ C \ + // MemBarVolatile (card mark) CompareAndSwapN/P CastP2X + // C | || M | | + // | LoadB | ______/| + // | | | / | + // | Cmp | / SCMemProj + // | / | / | + // If | / / + // | \ | / / + // IfFalse IfTrue | / / + // \ / \ |/ prec / + // \ / StoreCM / + // \ / | / + // Region . . . / + // | \ / + // | . . . \ / Bot + // | MergeMem + // | | + // MemBarCPUOrder + // MemBarAcquire (trailing) + // + // This has a slightly different memory subgraph to the one seen + // previously but the core of it is the same as for the CAS normal + // subgraph + // + // MemBarRelease + // MemBarCPUOrder____ + // || \ . . . + // MemBarVolatile CompareAndSwapX . . . + // | \ | + // . . . SCMemProj + // | / . . . + // MergeMem + // | + // MemBarCPUOrder + // MemBarAcquire + // + // + // G1 is quite a lot more complicated. The nodes inserted on behalf + // of G1 may comprise: a pre-write graph which adds the old value to + // the SATB queue; the releasing store itself; and, finally, a + // post-write graph which performs a card mark. + // + // The pre-write graph may be omitted, but only when the put is + // writing to a newly allocated (young gen) object and then only if + // there is a direct memory chain to the Initialize node for the + // object allocation. This will not happen for a volatile put since + // any memory chain passes through the leading membar. + // + // The pre-write graph includes a series of 3 If tests. The outermost + // If tests whether SATB is enabled (no else case). The next If tests + // whether the old value is non-NULL (no else case). The third tests + // whether the SATB queue index is > 0, if so updating the queue. The + // else case for this third If calls out to the runtime to allocate a + // new queue buffer. + // + // So with G1 the pre-write and releasing store subgraph looks like + // this (the nested Ifs are omitted). + // + // MemBarRelease (leading)____________ + // C | || M \ M \ M \ M \ . . .
+ // | LoadB \ LoadL LoadN \ + // | / \ \ + // If |\ \ + // | \ | \ \ + // IfFalse IfTrue | \ \ + // | | | \ | + // | If | /\ | + // | | \ | + // | \ | + // | . . . \ | + // | / | / | | + // Region Phi[M] | | + // | \ | | | + // | \_____ | ___ | | + // C | C \ | C \ M | | + // | CastP2X | StoreN/P[mo_release] | + // | | | | + // C | M | M | M | + // \ | | / + // . . . + // (post write subtree elided) + // . . . + // C \ M / + // MemBarVolatile (trailing) + // + // n.b. the LoadB in this subgraph is not the card read -- it's a + // read of the SATB queue active flag. + // + // Once again the CAS graph is a minor variant on the above with the + // expected substitutions of CompareAndSwapX for StoreN/P and + // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile. + // + // The G1 post-write subtree is also optional, this time when the + // new value being written is either null or can be identified as a + // newly allocated (young gen) object with no intervening control + // flow. The latter cannot happen but the former may, in which case + // the card mark membar is omitted and the memory feeds from the + // leading membar and the StoreN/P are merged direct into the + // trailing membar as per the normal subgraph. So, the only special + // case which arises is when the post-write subgraph is generated. + // + // The kernel of the post-write G1 subgraph is the card mark itself + // which includes a card mark memory barrier (MemBarVolatile), a + // card test (LoadB), and a conditional update (If feeding a + // StoreCM). These nodes are surrounded by a series of nested Ifs + // which try to avoid doing the card mark. The top level If skips if + // the object reference does not cross regions (i.e. it tests if + // (adr ^ val) >> log2(regsize) != 0) -- intra-region references + // need not be recorded. The next If, which skips on a NULL value, + // may be absent (it is not generated if the type of value is >= + // OopPtr::NotNull). The 3rd If skips writes to young regions (by + // checking if card_val != young). n.b. although this test requires + // a pre-read of the card it can safely be done before the StoreLoad + // barrier. However that does not bypass the need to reread the card + // after the barrier. + // + // (pre-write subtree elided) + // . . . . . . . . . . . . + // C | M | M | M | + // Region Phi[M] StoreN | + // | / \ | | + // / \_______ / \ | | + // C / C \ . . . \ | | + // If CastP2X . . . | | | + // / \ | | | + // / \ | | | + // IfFalse IfTrue | | | + // | | | | /| + // | If | | / | + // | / \ | | / | + // | / \ \ | / | + // | IfFalse IfTrue MergeMem | + // | . . . / \ / | + // | / \ / | + // | IfFalse IfTrue / | + // | . . . | / | + // | If / | + // | / \ / | + // | / \ / | + // | IfFalse IfTrue / | + // | . . . | / | + // | \ / | + // | \ / | + // | MemBarVolatile__(card mark) | + // | || C | M \ M \ | + // | LoadB If | | | + // | / \ | | | + // | . . . | | | + // | \ | | / + // | StoreCM | / + // | . . . | / + // | _________/ / + // | / _____________/ + // | . . . . . . | / / + // | | | / _________/ + // | | Phi[M] / / + // | | | / / + // | | | / / + // | Region . . . Phi[M] _____/ + // | / | / + // | | / + // | . . . . . . | / + // | / | / + // Region | | Phi[M] + // | | | / Bot + // \ MergeMem + // \ / + // MemBarVolatile + // + // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice + // from the leading membar and the oopptr Mem slice from the Store + // into the card mark membar i.e.
the memory flow to the card mark + // membar still looks like a normal graph. + // + // The trailing MergeMem merges an AliasIdxBot Mem slice with other + // Mem slices (from the StoreCM and other card mark queue stores). + // However in this case the AliasIdxBot Mem slice does not come + // direct from the card mark membar. It is merged through a series + // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow + // from the leading membar with the Mem feed from the card mark + // membar. Each Phi corresponds to one of the Ifs which may skip + // around the card mark membar. So when the If implementing the NULL + // value check has been elided the total number of Phis is 2 + // otherwise it is 3. + // + // The CAS graph when using G1GC also includes a pre-write subgraph + // and an optional post-write subgraph. The same variations are + // introduced as for CMS with conditional card marking i.e. the + // StoreP/N is swapped for a CompareAndSwapP/N, the trailing + // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the + // Mem feed from the CompareAndSwapP/N includes a precedence + // dependency feed to the StoreCM and a feed via an SCMemProj to the + // trailing membar. So, as before the configuration includes the + // normal CAS graph as a subgraph of the memory flow. + // + // So, the upshot is that in all cases the volatile put graph will + // include a *normal* memory subgraph between the leading membar and + // its child membar, either a volatile put graph (including a + // releasing StoreX) or a CAS graph (including a CompareAndSwapX). + // When that child is not a card mark membar then it marks the end + // of the volatile put or CAS subgraph. If the child is a card mark + // membar then the normal subgraph will form part of a volatile put + // subgraph if and only if the child feeds an AliasIdxBot Mem feed + // to a trailing barrier via a MergeMem. That feed is either direct + // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier + // memory flow (for G1). + // + // The predicates controlling generation of instructions for store + // and barrier nodes employ a few simple helper functions (described + // below) which identify the presence or absence of all these + // subgraph configurations and provide a means of traversing from + // one node in the subgraph to another. + + // is_CAS(int opcode) + // + // return true if opcode is one of the possible CompareAndSwapX + // values otherwise false. + + bool is_CAS(int opcode) + { + return (opcode == Op_CompareAndSwapI || + opcode == Op_CompareAndSwapL || + opcode == Op_CompareAndSwapN || + opcode == Op_CompareAndSwapP); + } + + // leading_to_normal + // + // graph traversal helper which detects the normal case Mem feed from + // a release membar (or, optionally, its cpuorder child) to a + // dependent volatile membar i.e. it ensures that one or other of + // the following Mem flow subgraphs is present. + // + // MemBarRelease + // MemBarCPUOrder {leading} + // | \ . . . + // | StoreN/P[mo_release] . . . + // | / + // MergeMem + // | + // MemBarVolatile {trailing or card mark} + // + // MemBarRelease + // MemBarCPUOrder {leading} + // | \ . . . + // | CompareAndSwapX . . . + // | + // . . . SCMemProj + // \ | + // | MergeMem + // | / + // MemBarCPUOrder + // MemBarAcquire {trailing} + // + // if the correct configuration is present returns the trailing + // membar otherwise NULL. + // + // the input membar is expected to be either a cpuorder membar or a + // release membar.
in the latter case it should not have a cpuorder membar + // child. + // + // the returned value may be a card mark or trailing membar + // + + MemBarNode *leading_to_normal(MemBarNode *leading) + { + assert((leading->Opcode() == Op_MemBarRelease || + leading->Opcode() == Op_MemBarCPUOrder), + "expecting a volatile or cpuorder membar!"); + + // check the mem flow + ProjNode *mem = leading->proj_out(TypeFunc::Memory); + + if (!mem) { + return NULL; + } + + Node *x = NULL; + StoreNode * st = NULL; + LoadStoreNode *cas = NULL; + MergeMemNode *mm = NULL; + + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + x = mem->fast_out(i); + if (x->is_MergeMem()) { + if (mm != NULL) { + return NULL; + } + // two merge mems is one too many + mm = x->as_MergeMem(); + } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) { + // two releasing stores/CAS nodes is one too many + if (st != NULL || cas != NULL) { + return NULL; + } + st = x->as_Store(); + } else if (is_CAS(x->Opcode())) { + if (st != NULL || cas != NULL) { + return NULL; + } + cas = x->as_LoadStore(); + } + } + + // must have a store or a cas + if (!st && !cas) { + return NULL; + } + + // must have a merge if we also have st + if (st && !mm) { + return NULL; + } + + Node *y = NULL; + if (cas) { + // look for an SCMemProj + for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) { + x = cas->fast_out(i); + if (x->is_Proj()) { + y = x; + break; + } + } + if (y == NULL) { + return NULL; + } + // the proj must feed a MergeMem + for (DUIterator_Fast imax, i = y->fast_outs(imax); i < imax; i++) { + x = y->fast_out(i); + if (x->is_MergeMem()) { + mm = x->as_MergeMem(); + break; + } + } + if (mm == NULL) + return NULL; + } else { + // ensure the store feeds the existing mergemem; + for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) { + if (st->fast_out(i) == mm) { + y = st; + break; + } + } + if (y == NULL) { + return NULL; + } + } + + MemBarNode *mbar = NULL; + // ensure the merge feeds to the expected type of membar + for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { + x = mm->fast_out(i); + if (x->is_MemBar()) { + int opcode = x->Opcode(); + if (opcode == Op_MemBarVolatile && st) { + mbar = x->as_MemBar(); + } else if (cas && opcode == Op_MemBarCPUOrder) { + MemBarNode *y = x->as_MemBar(); + y = child_membar(y); + if (y != NULL && y->Opcode() == Op_MemBarAcquire) { + mbar = y; + } + } + break; + } + } + + return mbar; + } + + // normal_to_leading + // + // graph traversal helper which detects the normal case Mem feed + // from either a card mark or a trailing membar to a preceding + // release membar (optionally its cpuorder child) i.e. it ensures + // that one or other of the following Mem flow subgraphs is present. + // + // MemBarRelease + // MemBarCPUOrder {leading} + // | \ . . . + // | StoreN/P[mo_release] . . . + // | / + // MergeMem + // | + // MemBarVolatile {card mark or trailing} + // + // MemBarRelease + // MemBarCPUOrder {leading} + // | \ . . . + // | CompareAndSwapX . . . + // | + // . . . SCMemProj + // \ | + // | MergeMem + // | / + // MemBarCPUOrder + // MemBarAcquire {trailing} + // + // this predicate checks for the same flow as the previous predicate + // but starting from the bottom rather than the top. + // + // if the configuration is present returns the cpuorder membar for + // preference or when absent the release membar otherwise NULL. + // + // n.b.
the input membar is expected to be a MemBarVolatile but + // need not be a card mark membar. + + MemBarNode *normal_to_leading(const MemBarNode *barrier) + { + // input must be a volatile membar + assert((barrier->Opcode() == Op_MemBarVolatile || + barrier->Opcode() == Op_MemBarAcquire), + "expecting a volatile or an acquire membar"); + Node *x; + bool is_cas = barrier->Opcode() == Op_MemBarAcquire; + + // if we have an acquire membar then it must be fed via a CPUOrder + // membar + + if (is_cas) { + // skip to parent barrier which must be a cpuorder + x = parent_membar(barrier); + if (x->Opcode() != Op_MemBarCPUOrder) + return NULL; + } else { + // start from the supplied barrier + x = (Node *)barrier; + } + + // the Mem feed to the membar should be a merge + x = x->in(TypeFunc::Memory); + if (!x->is_MergeMem()) + return NULL; + + MergeMemNode *mm = x->as_MergeMem(); + + if (is_cas) { + // the merge should be fed from the CAS via an SCMemProj node + x = NULL; + for (uint idx = 1; idx < mm->req(); idx++) { + if (mm->in(idx)->Opcode() == Op_SCMemProj) { + x = mm->in(idx); + break; + } + } + if (x == NULL) { + return NULL; + } + // check for a CAS feeding this proj + x = x->in(0); + int opcode = x->Opcode(); + if (!is_CAS(opcode)) { + return NULL; + } + // the CAS should get its mem feed from the leading membar + x = x->in(MemNode::Memory); + } else { + // the merge should get its Bottom mem feed from the leading membar + x = mm->in(Compile::AliasIdxBot); + } + + // ensure this is a non-control projection + if (!x->is_Proj() || x->is_CFG()) { + return NULL; + } + // if it is fed by a membar that's the one we want + x = x->in(0); + + if (!x->is_MemBar()) { + return NULL; + } + + MemBarNode *leading = x->as_MemBar(); + // reject invalid candidates + if (!leading_membar(leading)) { + return NULL; + } + + // ok, we have a leading membar, now for the sanity clauses + + // the leading membar must feed Mem to a releasing store or CAS + ProjNode *mem = leading->proj_out(TypeFunc::Memory); + StoreNode *st = NULL; + LoadStoreNode *cas = NULL; + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + x = mem->fast_out(i); + if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) { + // two stores or CASes is one too many + if (st != NULL || cas != NULL) { + return NULL; + } + st = x->as_Store(); + } else if (is_CAS(x->Opcode())) { + if (st != NULL || cas != NULL) { + return NULL; + } + cas = x->as_LoadStore(); + } + } + + // we must have one of a store or a cas + if (st == NULL && cas == NULL) { + return NULL; + } + + if (st == NULL) { + // nothing more to check + return leading; + } else { + // we should not have a store if we started from an acquire + if (is_cas) { + return NULL; + } + + // the store should feed the merge we used to get here + for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) { + if (st->fast_out(i) == mm) { + return leading; + } + } + } + + return NULL; + } + + // card_mark_to_trailing + // + // graph traversal helper which detects extra, non-normal Mem feed + // from a card mark volatile membar to a trailing membar i.e. it + // ensures that one of the following three GC post-write Mem flow + // subgraphs is present. + // + // 1) + // . . . + // | + // MemBarVolatile (card mark) + // | | + // | StoreCM + // | | + // | . . . + // Bot | / + // MergeMem + // | + // | + // MemBarVolatile {trailing} + // + // 2) + // MemBarRelease/CPUOrder (leading) + // | + // | + // |\ . . .
+ // | \ | + // | \ MemBarVolatile (card mark) + // | \ | | + // \ \ | StoreCM . . . + // \ \ | + // \ Phi + // \ / + // Phi . . . + // Bot | / + // MergeMem + // | + // MemBarVolatile {trailing} + // + // + // 3) + // MemBarRelease/CPUOrder (leading) + // | + // |\ + // | \ + // | \ . . . + // | \ | + // |\ \ MemBarVolatile (card mark) + // | \ \ | | + // | \ \ | StoreCM . . . + // | \ \ | + // \ \ Phi + // \ \ / + // \ Phi + // \ / + // Phi . . . + // Bot | / + // MergeMem + // | + // | + // MemBarVolatile {trailing} + // + // configuration 1 is only valid if UseConcMarkSweepGC && + // UseCondCardMark + // + // configurations 2 and 3 are only valid if UseG1GC. + // + // if a valid configuration is present returns the trailing membar + // otherwise NULL. + // + // n.b. the supplied membar is expected to be a card mark + // MemBarVolatile i.e. the caller must ensure the input node has the + // correct opcode and feeds Mem to a StoreCM node + + MemBarNode *card_mark_to_trailing(const MemBarNode *barrier) + { + // input must be a card mark volatile membar + assert(is_card_mark_membar(barrier), "expecting a card mark membar"); + + Node *feed = barrier->proj_out(TypeFunc::Memory); + Node *x; + MergeMemNode *mm = NULL; + + const int MAX_PHIS = 3; // max phis we will search through + int phicount = 0; // current search count + + bool retry_feed = true; + while (retry_feed) { + // see if we have a direct MergeMem feed + for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) { + x = feed->fast_out(i); + // the correct Phi will be merging a Bot memory slice + if (x->is_MergeMem()) { + mm = x->as_MergeMem(); + break; + } + } + if (mm) { + retry_feed = false; + } else if (UseG1GC && phicount++ < MAX_PHIS) { + // the barrier may feed indirectly via one or two Phi nodes + PhiNode *phi = NULL; + for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) { + x = feed->fast_out(i); + // the correct Phi will be merging a Bot memory slice + if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) { + phi = x->as_Phi(); + break; + } + } + if (!phi) { + return NULL; + } + // look for another merge below this phi + feed = phi; + } else { + // couldn't find a merge + return NULL; + } + } + + // sanity check this feed turns up as the expected slice + assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge"); + + MemBarNode *trailing = NULL; + // be sure the merge feeds a trailing membar + for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { + x = mm->fast_out(i); + if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) { + trailing = x->as_MemBar(); + break; + } + } + + return trailing; + } + + // trailing_to_card_mark + // + // graph traversal helper which detects extra, non-normal Mem feed + // from a trailing volatile membar to a preceding card mark volatile + // membar i.e. it identifies whether one of the three possible extra + // GC post-write Mem flow subgraphs is present + // + // this predicate checks for the same flow as the previous predicate + // but starting from the bottom rather than the top. + // + // if the configuration is present returns the card mark membar + // otherwise NULL + // + // n.b. the supplied membar is expected to be a trailing + // MemBarVolatile i.e.
the caller must ensure the input node has the + // correct opcode + + MemBarNode *trailing_to_card_mark(const MemBarNode *trailing) + { + assert(trailing->Opcode() == Op_MemBarVolatile, + "expecting a volatile membar"); + assert(!is_card_mark_membar(trailing), + "not expecting a card mark membar"); + + // the Mem feed to the membar should be a merge + Node *x = trailing->in(TypeFunc::Memory); + if (!x->is_MergeMem()) { + return NULL; + } + + MergeMemNode *mm = x->as_MergeMem(); + + x = mm->in(Compile::AliasIdxBot); + // with G1 we may possibly see a Phi or two before we see a Memory + // Proj from the card mark membar + + const int MAX_PHIS = 3; // max phis we will search through + int phicount = 0; // current search count + + bool retry_feed = !x->is_Proj(); + + while (retry_feed) { + if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) { + PhiNode *phi = x->as_Phi(); + ProjNode *proj = NULL; + PhiNode *nextphi = NULL; + bool found_leading = false; + for (uint i = 1; i < phi->req(); i++) { + x = phi->in(i); + if (x->is_Phi()) { + nextphi = x->as_Phi(); + } else if (x->is_Proj()) { + int opcode = x->in(0)->Opcode(); + if (opcode == Op_MemBarVolatile) { + proj = x->as_Proj(); + } else if (opcode == Op_MemBarRelease || + opcode == Op_MemBarCPUOrder) { + // probably a leading membar + found_leading = true; + } + } + } + // if we found a correct looking proj then retry from there + // otherwise we must see a leading and a phi or this is the + // wrong config + if (proj != NULL) { + x = proj; + retry_feed = false; + } else if (found_leading && nextphi != NULL) { + // retry from this phi to check phi2 + x = nextphi; + } else { + // not what we were looking for + return NULL; + } + } else { + return NULL; + } + } + // the proj has to come from the card mark membar + x = x->in(0); + if (!x->is_MemBar()) { + return NULL; + } + + MemBarNode *card_mark_membar = x->as_MemBar(); + + if (!is_card_mark_membar(card_mark_membar)) { + return NULL; + } + + return card_mark_membar; + } + + // trailing_to_leading + // + // graph traversal helper which checks the Mem flow up the graph + // from a (non-card mark) trailing membar attempting to locate and + // return an associated leading membar. it first looks for a + // subgraph in the normal configuration (relying on helper + // normal_to_leading). failing that it then looks for one of the + // possible post-write card mark subgraphs linking the trailing node + // to the card mark membar (relying on helper + // trailing_to_card_mark), and then checks that the card mark membar + // is fed by a leading membar (once again relying on auxiliary + // predicate normal_to_leading). + // + // if the configuration is valid returns the cpuorder membar for + // preference or when absent the release membar otherwise NULL. + // + // n.b. the input membar is expected to be either a volatile or + // acquire membar but in the former case must *not* be a card mark + // membar.
+ + MemBarNode *trailing_to_leading(const MemBarNode *trailing) + { + assert((trailing->Opcode() == Op_MemBarAcquire || + trailing->Opcode() == Op_MemBarVolatile), + "expecting an acquire or volatile membar"); + assert((trailing->Opcode() != Op_MemBarVolatile || + !is_card_mark_membar(trailing)), + "not expecting a card mark membar"); + + MemBarNode *leading = normal_to_leading(trailing); + + if (leading) { + return leading; + } + + // nothing more to do if this is an acquire + if (trailing->Opcode() == Op_MemBarAcquire) { + return NULL; + } + + MemBarNode *card_mark_membar = trailing_to_card_mark(trailing); + + if (!card_mark_membar) { + return NULL; + } + + return normal_to_leading(card_mark_membar); + } + + // predicates controlling emit of ldr<x>/ldar<x> and associated dmb + +bool unnecessary_acquire(const Node *barrier) +{ + assert(barrier->is_MemBar(), "expecting a membar"); + + if (UseBarriersForVolatile) { + // we need to plant a dmb + return false; + } + + // a volatile read derived from bytecode (or also from an inlined + // SHA field read via LibraryCallKit::load_field_from_object) + // manifests as a LoadX[mo_acquire] followed by an acquire membar + // with a bogus read dependency on its preceding load. so in those + // cases we will find the load node at the PARMS offset of the + // acquire membar. n.b. there may be an intervening DecodeN node. + // + // a volatile load derived from an inlined unsafe field access + // manifests as a cpuorder membar with Ctl and Mem projections + // feeding both an acquire membar and a LoadX[mo_acquire]. The + // acquire then feeds another cpuorder membar via Ctl and Mem + // projections. The load has no output dependency on these trailing + // membars because subsequent nodes inserted into the graph take + // their control feed from the final cpuorder membar meaning they + // are all ordered after the load.
+ Node *x = barrier->lookup(TypeFunc::Parms); + if (x) { + // we are starting from an acquire and it has a fake dependency + // + // need to check for + // + // LoadX[mo_acquire] + // { |1 } + // {DecodeN} + // |Parms + // MemBarAcquire* + // + // where * tags node we were passed + // and |k means input k + if (x->is_DecodeNarrowPtr()) { + x = x->in(1); + } + + return (x->is_Load() && x->as_Load()->is_acquire()); + } + + // now check for an unsafe volatile get + + // need to check for + // + // MemBarCPUOrder + // || \\ + // MemBarAcquire* LoadX[mo_acquire] + // || + // MemBarCPUOrder + // + // where * tags node we were passed + // and || or \\ are Ctl+Mem feeds via intermediate Proj Nodes + + // check for a parent MemBarCPUOrder + ProjNode *ctl; + ProjNode *mem; + MemBarNode *parent = parent_membar(barrier); + if (!parent || parent->Opcode() != Op_MemBarCPUOrder) + return false; + ctl = parent->proj_out(TypeFunc::Control); + mem = parent->proj_out(TypeFunc::Memory); + if (!ctl || !mem) { + return false; + } + // ensure the proj nodes both feed a LoadX[mo_acquire] + LoadNode *ld = NULL; + for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) { + x = ctl->fast_out(i); + // if we see a load we keep hold of it and stop searching + if (x->is_Load()) { + ld = x->as_Load(); + break; + } + } + // it must be an acquiring load + if (ld && ld->is_acquire()) { + + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + x = mem->fast_out(i); + // if we see the same load we drop it and stop searching + if (x == ld) { + ld = NULL; + break; + } + } + // we must have dropped the load + if (ld == NULL) { + // check for a child cpuorder membar + MemBarNode *child = child_membar(barrier->as_MemBar()); + if (child && child->Opcode() == Op_MemBarCPUOrder) + return true; + } + } + + // final option for unnecessary membar is that it is a trailing node + // belonging to a CAS + + MemBarNode *leading = trailing_to_leading(barrier->as_MemBar()); + + return leading != NULL; +} + +bool needs_acquiring_load(const Node *n) +{ + assert(n->is_Load(), "expecting a load"); + if (UseBarriersForVolatile) { + // we use a normal load and a dmb + return false; + } + + LoadNode *ld = n->as_Load(); + + if (!ld->is_acquire()) { + return false; + } + + // check if this load is feeding an acquire membar + // + // LoadX[mo_acquire] + // { |1 } + // {DecodeN} + // |Parms + // MemBarAcquire* + // + // where * tags node we were passed + // and |k means input k + + Node *start = ld; + Node *mbacq = NULL; + + // if we hit a DecodeNarrowPtr we reset the start node and restart + // the search through the outputs + restart: + + for (DUIterator_Fast imax, i = start->fast_outs(imax); i < imax; i++) { + Node *x = start->fast_out(i); + if (x->is_MemBar() && x->Opcode() == Op_MemBarAcquire) { + mbacq = x; + } else if (!mbacq && + (x->is_DecodeNarrowPtr() || + (x->is_Mach() && x->Opcode() == Op_DecodeN))) { + start = x; + goto restart; + } + } + + if (mbacq) { + return true; + } + + // now check for an unsafe volatile get + + // check if Ctl and Proj feed comes from a MemBarCPUOrder + // + // MemBarCPUOrder + // || \\ + // MemBarAcquire* LoadX[mo_acquire] + // || + // MemBarCPUOrder + + MemBarNode *membar; + + membar = parent_membar(ld); + + if (!membar || membar->Opcode() != Op_MemBarCPUOrder) { + return false; + } + + // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain + + membar = child_membar(membar); + + if (!membar || membar->Opcode() != Op_MemBarAcquire) { + return false; + } + + membar =
child_membar(membar); + + if (!membar || membar->Opcode() != Op_MemBarCPUOrder) { + return false; + } + + return true; +} + +bool unnecessary_release(const Node *n) +{ + assert((n->is_MemBar() && + n->Opcode() == Op_MemBarRelease), + "expecting a release membar"); + + if (UseBarriersForVolatile) { + // we need to plant a dmb + return false; + } + + // if there is a dependent CPUOrder barrier then use that as the + // leading + + MemBarNode *barrier = n->as_MemBar(); + // check for an intervening cpuorder membar + MemBarNode *b = child_membar(barrier); + if (b && b->Opcode() == Op_MemBarCPUOrder) { + // ok, so start the check from the dependent cpuorder barrier + barrier = b; + } + + // must start with a normal feed + MemBarNode *child_barrier = leading_to_normal(barrier); + + if (!child_barrier) { + return false; + } + + if (!is_card_mark_membar(child_barrier)) { + // this is the trailing membar and we are done + return true; + } + + // must be sure this card mark feeds a trailing membar + MemBarNode *trailing = card_mark_to_trailing(child_barrier); + return (trailing != NULL); +} + +bool unnecessary_volatile(const Node *n) +{ + // assert n->is_MemBar(); + if (UseBarriersForVolatile) { + // we need to plant a dmb + return false; + } + + MemBarNode *mbvol = n->as_MemBar(); + + // first we check if this is part of a card mark. if so then we have + // to generate a StoreLoad barrier + + if (is_card_mark_membar(mbvol)) { + return false; + } + + // ok, if it's not a card mark then we still need to check if it is + // a trailing membar of a volatile put graph. + + return (trailing_to_leading(mbvol) != NULL); +} + +// predicates controlling emit of str<x>/stlr<x> and associated dmbs + +bool needs_releasing_store(const Node *n) +{ + // assert n->is_Store(); + if (UseBarriersForVolatile) { + // we use a normal store and dmb combination + return false; + } + + StoreNode *st = n->as_Store(); + + // the store must be marked as releasing + if (!st->is_release()) { + return false; + } + + // the store must be fed by a membar + + Node *x = st->lookup(StoreNode::Memory); + + if (! x || !x->is_Proj()) { + return false; + } + + ProjNode *proj = x->as_Proj(); + + x = proj->lookup(0); + + if (!x || !x->is_MemBar()) { + return false; + } + + MemBarNode *barrier = x->as_MemBar(); + + // if the barrier is a release membar or a cpuorder membar fed by a + // release membar then we need to check whether that forms part of a + // volatile put graph. + + // reject invalid candidates + if (!leading_membar(barrier)) { + return false; + } + + // does this lead a normal subgraph?
+ MemBarNode *mbvol = leading_to_normal(barrier); + + if (!mbvol) { + return false; + } + + // all done unless this is a card mark + if (!is_card_mark_membar(mbvol)) { + return true; + } + + // we found a card mark -- just make sure we have a trailing barrier + + return (card_mark_to_trailing(mbvol) != NULL); +} + +// predicate controlling translation of CAS +// +// returns true if CAS needs to use an acquiring load otherwise false + +bool needs_acquiring_load_exclusive(const Node *n) +{ + assert(is_CAS(n->Opcode()), "expecting a compare and swap"); + if (UseBarriersForVolatile) { + return false; + } + + // CAS nodes only ought to turn up in inlined unsafe CAS operations +#ifdef ASSERT + LoadStoreNode *st = n->as_LoadStore(); + + // the store must be fed by a membar + + Node *x = st->lookup(StoreNode::Memory); + + assert (x && x->is_Proj(), "CAS not fed by memory proj!"); + + ProjNode *proj = x->as_Proj(); + + x = proj->lookup(0); + + assert (x && x->is_MemBar(), "CAS not fed by membar!"); + + MemBarNode *barrier = x->as_MemBar(); + + // the barrier must be a cpuorder membar fed by a release membar + + assert(barrier->Opcode() == Op_MemBarCPUOrder, + "CAS not fed by cpuorder membar!"); + + MemBarNode *b = parent_membar(barrier); + assert ((b != NULL && b->Opcode() == Op_MemBarRelease), + "CAS not fed by cpuorder+release membar pair!"); + + // does this lead a normal subgraph? + MemBarNode *mbar = leading_to_normal(barrier); + + assert(mbar != NULL, "CAS not embedded in normal graph!"); + + assert(mbar->Opcode() == Op_MemBarAcquire, "trailing membar should be an acquire"); +#endif // ASSERT + // so we can just return true here + return true; +} + +// predicate controlling translation of StoreCM +// +// returns true if the StoreStore preceding the card write can be +// elided otherwise false + +bool unnecessary_storestore(const Node *storecm) +{ + assert(storecm->Opcode() == Op_StoreCM, "expecting a StoreCM"); + + // we only ever need to generate a dmb ishst between an object put + // and the associated card mark when we are using CMS without + // conditional card marking + + if (!UseConcMarkSweepGC || UseCondCardMark) { + return true; + } + + // if we are implementing volatile puts using barriers then the + // object put is implemented as an str so we must insert the dmb ishst + + if (UseBarriersForVolatile) { + return false; + } + + // we can omit the dmb ishst if this StoreCM is part of a volatile + // put because in that case the put will be implemented by stlr + // + // we need to check for a normal subgraph feeding this StoreCM. + // that means the StoreCM must be fed Memory from a leading membar, + // either a MemBarRelease or its dependent MemBarCPUOrder, and the + // leading membar must be part of a normal subgraph + + Node *x = storecm->in(StoreNode::Memory); + + if (!x->is_Proj()) { + return false; + } + + x = x->in(0); + + if (!x->is_MemBar()) { + return false; + } + + MemBarNode *leading = x->as_MemBar(); + + // reject invalid candidates + if (!leading_membar(leading)) { + return false; + } + + // we can omit the StoreStore if it is the head of a normal subgraph + return (leading_to_normal(leading) != NULL); +} + + #define __ _masm.
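The aarch64_enc_cmpxchg* encodings below replace the old open-coded exclusive-load/store retry loops with calls to a MacroAssembler::cmpxchg helper (added in macroAssembler_aarch64.hpp/.cpp, part of this changeset but not shown in this hunk), parameterised over the load-exclusive, compare and store-exclusive instructions. A plausible reconstruction of that helper from the deleted inline code and the new call sites (signature and parameter names assumed, not the verbatim source):

    // Sketch only: the old ldxr/cmp/stlxr retry loop with the three
    // instructions passed in as member-function pointers, so that one
    // routine can serve all four aarch64_enc_cmpxchg* encodings.
    void MacroAssembler::cmpxchg(Register addr, Register expected, Register new_val,
                                 void (Assembler::* load_excl)(Register, Register),
                                 void (MacroAssembler::* compare)(Register, Register),
                                 void (Assembler::* store_excl)(Register, Register, Register)) {
      Label retry_load, done;
      bind(retry_load);
      (this->*load_excl)(rscratch1, addr);            // ldxr or ldaxr
      (this->*compare)(rscratch1, expected);          // cmp or cmpw
      br(Assembler::NE, done);                        // value mismatch: fail
      (this->*store_excl)(rscratch1, new_val, addr);  // stlxr(w), writes 0 on success
      cbnzw(rscratch1, retry_load);                   // lost exclusivity: retry
      bind(done);
    }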
// advance declarations for helper functions to convert register @@ -2056,6 +3900,13 @@ as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); %} + enc_class aarch64_enc_strb0_ordered(memory mem) %{ + MacroAssembler _masm(&cbuf); + __ membar(Assembler::StoreStore); + loadStore(_masm, &MacroAssembler::strb, zr, $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + enc_class aarch64_enc_strh(iRegI src, memory mem) %{ Register src_reg = as_Register($src$$reg); loadStore(MacroAssembler(&cbuf), &MacroAssembler::strh, src_reg, $mem->opcode(), @@ -2396,80 +4247,35 @@ enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{ MacroAssembler _masm(&cbuf); - Register old_reg = as_Register($oldval$$reg); - Register new_reg = as_Register($newval$$reg); - Register base = as_Register($mem$$base); - Register addr_reg; - int index = $mem$$index; - int scale = $mem$$scale; - int disp = $mem$$disp; - if (index == -1) { - if (disp != 0) { - __ lea(rscratch2, Address(base, disp)); - addr_reg = rscratch2; - } else { - // TODO - // should we ever get anything other than this case? - addr_reg = base; - } - } else { - Register index_reg = as_Register(index); - if (disp == 0) { - __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } else { - __ lea(rscratch2, Address(base, disp)); - __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } - } - Label retry_load, done; - __ bind(retry_load); - __ ldxr(rscratch1, addr_reg); - __ cmp(rscratch1, old_reg); - __ br(Assembler::NE, done); - __ stlxr(rscratch1, new_reg, addr_reg); - __ cbnzw(rscratch1, retry_load); - __ bind(done); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr); %} enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{ MacroAssembler _masm(&cbuf); - Register old_reg = as_Register($oldval$$reg); - Register new_reg = as_Register($newval$$reg); - Register base = as_Register($mem$$base); - Register addr_reg; - int index = $mem$$index; - int scale = $mem$$scale; - int disp = $mem$$disp; - if (index == -1) { - if (disp != 0) { - __ lea(rscratch2, Address(base, disp)); - addr_reg = rscratch2; - } else { - // TODO - // should we ever get anything other than this case? - addr_reg = base; - } - } else { - Register index_reg = as_Register(index); - if (disp == 0) { - __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } else { - __ lea(rscratch2, Address(base, disp)); - __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale))); - addr_reg = rscratch2; - } - } - Label retry_load, done; - __ bind(retry_load); - __ ldxrw(rscratch1, addr_reg); - __ cmpw(rscratch1, old_reg); - __ br(Assembler::NE, done); - __ stlxrw(rscratch1, new_reg, addr_reg); - __ cbnzw(rscratch1, retry_load); - __ bind(done); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw); + %} + + + // The only difference between aarch64_enc_cmpxchg and + // aarch64_enc_cmpxchg_acq is that we use load-acquire in the + // CompareAndSwap sequence to serve as a barrier on acquiring a + // lock.
+ enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{ + MacroAssembler _masm(&cbuf); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr); + %} + + enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{ + MacroAssembler _masm(&cbuf); + guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding"); + __ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register, + &Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw); %} // auxiliary used for CompareAndSwapX to set result register @@ -5380,7 +7186,7 @@ instruct loadB(iRegINoSp dst, memory mem) %{ match(Set dst (LoadB mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrsbw $dst, $mem\t# byte" %} @@ -5394,7 +7200,7 @@ instruct loadB2L(iRegLNoSp dst, memory mem) %{ match(Set dst (ConvI2L (LoadB mem))); - // predicate(n->in(1)->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n->in(1))); ins_cost(4 * INSN_COST); format %{ "ldrsb $dst, $mem\t# byte" %} @@ -5408,7 +7214,7 @@ instruct loadUB(iRegINoSp dst, memory mem) %{ match(Set dst (LoadUB mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrbw $dst, $mem\t# byte" %} @@ -5422,7 +7228,7 @@ instruct loadUB2L(iRegLNoSp dst, memory mem) %{ match(Set dst (ConvI2L (LoadUB mem))); - // predicate(n->in(1)->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n->in(1))); ins_cost(4 * INSN_COST); format %{ "ldrb $dst, $mem\t# byte" %} @@ -5436,7 +7242,7 @@ instruct loadS(iRegINoSp dst, memory mem) %{ match(Set dst (LoadS mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrshw $dst, $mem\t# short" %} @@ -5450,7 +7256,7 @@ instruct loadS2L(iRegLNoSp dst, memory mem) %{ match(Set dst (ConvI2L (LoadS mem))); - // predicate(n->in(1)->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n->in(1))); ins_cost(4 * INSN_COST); format %{ "ldrsh $dst, $mem\t# short" %} @@ -5464,7 +7270,7 @@ instruct loadUS(iRegINoSp dst, memory mem) %{ match(Set dst (LoadUS mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrh $dst, $mem\t# short" %} @@ -5478,7 +7284,7 @@ instruct loadUS2L(iRegLNoSp dst, memory mem) %{ match(Set dst (ConvI2L (LoadUS mem))); - // predicate(n->in(1)->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n->in(1))); ins_cost(4 * INSN_COST); format %{ "ldrh $dst, $mem\t# short" %} @@ -5492,7 +7298,7 @@ instruct loadI(iRegINoSp dst, memory mem) %{ match(Set dst (LoadI mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrw $dst, $mem\t# int" %} @@ -5506,7 +7312,7 @@ instruct loadI2L(iRegLNoSp dst, memory mem) %{ match(Set dst (ConvI2L (LoadI mem))); - // predicate(n->in(1)->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n->in(1))); ins_cost(4 * INSN_COST); format %{ "ldrsw $dst, $mem\t# int" %} @@ -5520,7 +7326,7 @@ instruct loadUI2L(iRegLNoSp dst, memory mem, immL_32bits mask) %{ match(Set dst (AndL (ConvI2L (LoadI mem)) mask)); - // predicate(n->in(1)->in(1)->as_Load()->is_unordered()); + 
predicate(!needs_acquiring_load(n->in(1)->in(1)->as_Load())); ins_cost(4 * INSN_COST); format %{ "ldrw $dst, $mem\t# int" %} @@ -5534,7 +7340,7 @@ instruct loadL(iRegLNoSp dst, memory mem) %{ match(Set dst (LoadL mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldr $dst, $mem\t# int" %} @@ -5561,7 +7367,7 @@ instruct loadP(iRegPNoSp dst, memory mem) %{ match(Set dst (LoadP mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldr $dst, $mem\t# ptr" %} @@ -5575,7 +7381,7 @@ instruct loadN(iRegNNoSp dst, memory mem) %{ match(Set dst (LoadN mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrw $dst, $mem\t# compressed ptr" %} @@ -5589,7 +7395,7 @@ instruct loadKlass(iRegPNoSp dst, memory mem) %{ match(Set dst (LoadKlass mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldr $dst, $mem\t# class" %} @@ -5603,7 +7409,7 @@ instruct loadNKlass(iRegNNoSp dst, memory mem) %{ match(Set dst (LoadNKlass mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrw $dst, $mem\t# compressed class ptr" %} @@ -5617,7 +7423,7 @@ instruct loadF(vRegF dst, memory mem) %{ match(Set dst (LoadF mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrs $dst, $mem\t# float" %} @@ -5631,7 +7437,7 @@ instruct loadD(vRegD dst, memory mem) %{ match(Set dst (LoadD mem)); - // predicate(n->as_Load()->is_unordered()); + predicate(!needs_acquiring_load(n)); ins_cost(4 * INSN_COST); format %{ "ldrd $dst, $mem\t# double" %} @@ -5849,6 +7655,7 @@ instruct storeimmCM0(immI0 zero, memory mem) %{ match(Set mem (StoreCM mem zero)); + predicate(unnecessary_storestore(n)); ins_cost(INSN_COST); format %{ "strb zr, $mem\t# byte" %} @@ -5858,11 +7665,26 @@ ins_pipe(istore_mem); %} +// Store CMS card-mark Immediate with intervening StoreStore +// needed when using CMS with no conditional card marking +instruct storeimmCM0_ordered(immI0 zero, memory mem) +%{ + match(Set mem (StoreCM mem zero)); + + ins_cost(INSN_COST * 2); + format %{ "dmb ishst" + "\n\tstrb zr, $mem\t# byte" %} + + ins_encode(aarch64_enc_strb0_ordered(mem)); + + ins_pipe(istore_mem); +%} + // Store Byte instruct storeB(iRegIorL2I src, memory mem) %{ match(Set mem (StoreB mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strb $src, $mem\t# byte" %} @@ -5876,7 +7698,7 @@ instruct storeimmB0(immI0 zero, memory mem) %{ match(Set mem (StoreB mem zero)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strb zr, $mem\t# byte" %} @@ -5890,7 +7712,7 @@ instruct storeC(iRegIorL2I src, memory mem) %{ match(Set mem (StoreC mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strh $src, $mem\t# short" %} @@ -5903,7 +7725,7 @@ instruct storeimmC0(immI0 zero, memory mem) %{ match(Set mem (StoreC mem zero)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strh zr, $mem\t# short" %} @@ -5918,7 +7740,7 @@ instruct storeI(iRegIorL2I src, memory mem) %{ 
match(Set mem(StoreI mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strw $src, $mem\t# int" %} @@ -5931,7 +7753,7 @@ instruct storeimmI0(immI0 zero, memory mem) %{ match(Set mem(StoreI mem zero)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strw zr, $mem\t# int" %} @@ -5945,7 +7767,7 @@ instruct storeL(iRegL src, memory mem) %{ match(Set mem (StoreL mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "str $src, $mem\t# int" %} @@ -5959,7 +7781,7 @@ instruct storeimmL0(immL0 zero, memory mem) %{ match(Set mem (StoreL mem zero)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "str zr, $mem\t# int" %} @@ -5973,7 +7795,7 @@ instruct storeP(iRegP src, memory mem) %{ match(Set mem (StoreP mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "str $src, $mem\t# ptr" %} @@ -5987,7 +7809,7 @@ instruct storeimmP0(immP0 zero, memory mem) %{ match(Set mem (StoreP mem zero)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "str zr, $mem\t# ptr" %} @@ -6046,7 +7868,7 @@ instruct storeN(iRegN src, memory mem) %{ match(Set mem (StoreN mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strw $src, $mem\t# compressed ptr" %} @@ -6060,9 +7882,8 @@ %{ match(Set mem (StoreN mem zero)); predicate(Universe::narrow_oop_base() == NULL && - Universe::narrow_klass_base() == NULL// && - // n->as_Store()->is_unordered() - ); + Universe::narrow_klass_base() == NULL && + (!needs_releasing_store(n))); ins_cost(INSN_COST); format %{ "strw rheapbase, $mem\t# compressed ptr (rheapbase==0)" %} @@ -6076,7 +7897,7 @@ instruct storeF(vRegF src, memory mem) %{ match(Set mem (StoreF mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strs $src, $mem\t# float" %} @@ -6093,7 +7914,7 @@ instruct storeD(vRegD src, memory mem) %{ match(Set mem (StoreD mem src)); -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); format %{ "strd $src, $mem\t# double" %} @@ -6106,7 +7927,7 @@ // Store Compressed Klass Pointer instruct storeNKlass(iRegN src, memory mem) %{ -// predicate(n->as_Store()->is_unordered()); + predicate(!needs_releasing_store(n)); match(Set mem (StoreNKlass mem src)); ins_cost(INSN_COST); @@ -6156,6 +7977,312 @@ ins_pipe(iload_prefetch); %} +// ---------------- volatile loads and stores ---------------- + +// Load Byte (8 bit signed) +instruct loadB_volatile(iRegINoSp dst, /* sync_memory*/indirect mem) +%{ + match(Set dst (LoadB mem)); + + ins_cost(VOLATILE_REF_COST); + format %{ "ldarsb $dst, $mem\t# byte" %} + + ins_encode(aarch64_enc_ldarsb(dst, mem)); + + ins_pipe(pipe_serial); +%} + +// Load Byte (8 bit signed) into long +instruct loadB2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem) +%{ + match(Set dst (ConvI2L (LoadB mem))); + + ins_cost(VOLATILE_REF_COST); + format %{ "ldarsb $dst, $mem\t# byte" %} + + ins_encode(aarch64_enc_ldarsb(dst, mem)); + + ins_pipe(pipe_serial); +%} + +// Load Byte (8 bit unsigned) +instruct loadUB_volatile(iRegINoSp dst, /* 
sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadUB mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarb $dst, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_ldarb(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Byte (8 bit unsigned) into long
+instruct loadUB2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadUB mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarb $dst, $mem\t# byte" %}
+
+  ins_encode(aarch64_enc_ldarb(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Short (16 bit signed)
+instruct loadS_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadS mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarshw $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarshw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+instruct loadUS_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadUS mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarhw $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarhw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Short/Char (16 bit unsigned) into long
+instruct loadUS2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadUS mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarh $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarh(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Short/Char (16 bit signed) into long
+instruct loadS2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (ConvI2L (LoadS mem)));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarsh $dst, $mem\t# short" %}
+
+  ins_encode(aarch64_enc_ldarsh(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Integer (32 bit signed)
+instruct loadI_volatile(iRegINoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadI mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarw $dst, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_ldarw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Integer (32 bit unsigned) into long
+instruct loadUI2L_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem, immL_32bits mask)
+%{
+  match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarw $dst, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_ldarw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Long (64 bit signed)
+instruct loadL_volatile(iRegLNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadL mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldar $dst, $mem\t# int" %}
+
+  ins_encode(aarch64_enc_ldar(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Pointer
+instruct loadP_volatile(iRegPNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadP mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldar $dst, $mem\t# ptr" %}
+
+  ins_encode(aarch64_enc_ldar(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Compressed Pointer
+instruct loadN_volatile(iRegNNoSp dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadN mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldarw $dst, $mem\t# compressed ptr" %}
+
+  ins_encode(aarch64_enc_ldarw(dst, mem));
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Float
+instruct loadF_volatile(vRegF dst, /* sync_memory*/indirect mem)
+%{
+  match(Set dst (LoadF mem));
+
+  ins_cost(VOLATILE_REF_COST);
+  format %{ "ldars $dst, $mem\t# float" %}
+
+  ins_encode( aarch64_enc_fldars(dst, mem) );
+
+  ins_pipe(pipe_serial);
+%}
+
+// Load Double
+instruct loadD_volatile(vRegD dst, /* sync_memory*/indirect
mem) +%{ + match(Set dst (LoadD mem)); + + ins_cost(VOLATILE_REF_COST); + format %{ "ldard $dst, $mem\t# double" %} + + ins_encode( aarch64_enc_fldard(dst, mem) ); + + ins_pipe(pipe_serial); +%} + +// Store Byte +instruct storeB_volatile(iRegIorL2I src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreB mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlrb $src, $mem\t# byte" %} + + ins_encode(aarch64_enc_stlrb(src, mem)); + + ins_pipe(pipe_class_memory); +%} + +// Store Char/Short +instruct storeC_volatile(iRegIorL2I src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreC mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlrh $src, $mem\t# short" %} + + ins_encode(aarch64_enc_stlrh(src, mem)); + + ins_pipe(pipe_class_memory); +%} + +// Store Integer + +instruct storeI_volatile(iRegIorL2I src, /* sync_memory*/indirect mem) +%{ + match(Set mem(StoreI mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlrw $src, $mem\t# int" %} + + ins_encode(aarch64_enc_stlrw(src, mem)); + + ins_pipe(pipe_class_memory); +%} + +// Store Long (64 bit signed) +instruct storeL_volatile(iRegL src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreL mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlr $src, $mem\t# int" %} + + ins_encode(aarch64_enc_stlr(src, mem)); + + ins_pipe(pipe_class_memory); +%} + +// Store Pointer +instruct storeP_volatile(iRegP src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreP mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlr $src, $mem\t# ptr" %} + + ins_encode(aarch64_enc_stlr(src, mem)); + + ins_pipe(pipe_class_memory); +%} + +// Store Compressed Pointer +instruct storeN_volatile(iRegN src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreN mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlrw $src, $mem\t# compressed ptr" %} + + ins_encode(aarch64_enc_stlrw(src, mem)); + + ins_pipe(pipe_class_memory); +%} + +// Store Float +instruct storeF_volatile(vRegF src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreF mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlrs $src, $mem\t# float" %} + + ins_encode( aarch64_enc_fstlrs(src, mem) ); + + ins_pipe(pipe_class_memory); +%} + +// TODO +// implement storeImmF0 and storeFImmPacked + +// Store Double +instruct storeD_volatile(vRegD src, /* sync_memory*/indirect mem) +%{ + match(Set mem (StoreD mem src)); + + ins_cost(VOLATILE_REF_COST); + format %{ "stlrd $src, $mem\t# double" %} + + ins_encode( aarch64_enc_fstlrd(src, mem) ); + + ins_pipe(pipe_class_memory); +%} + +// ---------------- end of volatile loads and stores ---------------- + // ============================================================================ // BSWAP Instructions @@ -6373,6 +8500,20 @@ ins_pipe(pipe_serial); %} +instruct unnecessary_membar_acquire() %{ + predicate(unnecessary_acquire(n)); + match(MemBarAcquire); + ins_cost(0); + + format %{ "membar_acquire (elided)" %} + + ins_encode %{ + __ block_comment("membar_acquire (elided)"); + %} + + ins_pipe(pipe_class_empty); +%} + instruct membar_acquire() %{ match(MemBarAcquire); ins_cost(VOLATILE_REF_COST); @@ -6380,6 +8521,7 @@ format %{ "membar_acquire" %} ins_encode %{ + __ block_comment("membar_acquire"); __ membar(Assembler::LoadLoad|Assembler::LoadStore); %} @@ -6412,6 +8554,19 @@ ins_pipe(pipe_serial); %} +instruct unnecessary_membar_release() %{ + predicate(unnecessary_release(n)); + match(MemBarRelease); + ins_cost(0); + + format %{ "membar_release (elided)" %} + + ins_encode %{ + __ 
block_comment("membar_release (elided)"); + %} + ins_pipe(pipe_serial); +%} + instruct membar_release() %{ match(MemBarRelease); ins_cost(VOLATILE_REF_COST); @@ -6419,6 +8574,7 @@ format %{ "membar_release" %} ins_encode %{ + __ block_comment("membar_release"); __ membar(Assembler::LoadStore|Assembler::StoreStore); %} ins_pipe(pipe_serial); @@ -6449,6 +8605,20 @@ ins_pipe(pipe_serial); %} +instruct unnecessary_membar_volatile() %{ + predicate(unnecessary_volatile(n)); + match(MemBarVolatile); + ins_cost(0); + + format %{ "membar_volatile (elided)" %} + + ins_encode %{ + __ block_comment("membar_volatile (elided)"); + %} + + ins_pipe(pipe_serial); +%} + instruct membar_volatile() %{ match(MemBarVolatile); ins_cost(VOLATILE_REF_COST*100); @@ -6456,6 +8626,7 @@ format %{ "membar_volatile" %} ins_encode %{ + __ block_comment("membar_volatile"); __ membar(Assembler::StoreLoad); %} @@ -6751,9 +8922,13 @@ // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher // can't match them +// standard CompareAndSwapX when we are using barriers +// these have higher priority than the rules selected by a predicate + instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{ match(Set res (CompareAndSwapI mem (Binary oldval newval))); + ins_cost(2 * VOLATILE_REF_COST); effect(KILL cr); @@ -6771,6 +8946,7 @@ instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{ match(Set res (CompareAndSwapL mem (Binary oldval newval))); + ins_cost(2 * VOLATILE_REF_COST); effect(KILL cr); @@ -6788,6 +8964,7 @@ instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{ match(Set res (CompareAndSwapP mem (Binary oldval newval))); + ins_cost(2 * VOLATILE_REF_COST); effect(KILL cr); @@ -6805,6 +8982,7 @@ instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{ match(Set res (CompareAndSwapN mem (Binary oldval newval))); + ins_cost(2 * VOLATILE_REF_COST); effect(KILL cr); @@ -6820,6 +8998,85 @@ %} +// alternative CompareAndSwapX when we are eliding barriers + +instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{ + + predicate(needs_acquiring_load_exclusive(n)); + match(Set res (CompareAndSwapI mem (Binary oldval newval))); + ins_cost(VOLATILE_REF_COST); + + effect(KILL cr); + + format %{ + "cmpxchgw_acq $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval" + "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)" + %} + + ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval), + aarch64_enc_cset_eq(res)); + + ins_pipe(pipe_slow); +%} + +instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{ + + predicate(needs_acquiring_load_exclusive(n)); + match(Set res (CompareAndSwapL mem (Binary oldval newval))); + ins_cost(VOLATILE_REF_COST); + + effect(KILL cr); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval" + "cset $res, EQ\t# $res <-- (EQ ? 
1 : 0)" + %} + + ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval), + aarch64_enc_cset_eq(res)); + + ins_pipe(pipe_slow); +%} + +instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{ + + predicate(needs_acquiring_load_exclusive(n)); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + ins_cost(VOLATILE_REF_COST); + + effect(KILL cr); + + format %{ + "cmpxchg_acq $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval" + "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)" + %} + + ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval), + aarch64_enc_cset_eq(res)); + + ins_pipe(pipe_slow); +%} + +instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{ + + predicate(needs_acquiring_load_exclusive(n)); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + ins_cost(VOLATILE_REF_COST); + + effect(KILL cr); + + format %{ + "cmpxchgw_acq $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval" + "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)" + %} + + ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval), + aarch64_enc_cset_eq(res)); + + ins_pipe(pipe_slow); +%} + + instruct get_and_setI(indirect mem, iRegINoSp newv, iRegI prev) %{ match(Set prev (GetAndSetI mem newv)); format %{ "atomic_xchgw $prev, $newv, [$mem]" %}
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp Mon Sep 21 10:36:36 2015 -0400 +++ b/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Oct 08 11:06:07 2015 -0400 @@ -96,8 +96,8 @@ product(bool, NearCpool, true, \ "constant pool is close to instructions") \ \ - notproduct(bool, UseAcqRelForVolatileFields, false, \ - "Use acquire and release insns for volatile fields") \ + product(bool, UseBarriersForVolatile, false, \ + "Use memory barriers to implement volatile accesses") \ \ product(bool, UseCRC32, false, \ "Use CRC32 instructions for CRC32 computation") \ @@ -115,8 +115,8 @@ product(bool, NearCpool, true, \ "constant pool is close to instructions") \ \ - notproduct(bool, UseAcqRelForVolatileFields, false, \ - "Use acquire and release insns for volatile fields") \ + product(bool, UseBarriersForVolatile, false, \ + "Use memory barriers to implement volatile accesses") \ product(bool, UseNeon, false, \ "Use Neon for CRC32 computation") \ product(bool, UseCRC32, false, \
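The switch from notproduct to product is what allows the predicates in aarch64.ad to test the flag unconditionally: notproduct flags are absent from PRODUCT builds, which is why the old MacroAssembler::use_acq_rel_for_volatile_fields helper (removed in the next file) had to return a hardwired false there. In effect the new rows declare (a sketch of the outcome, not the actual flags-macro expansion):

    // present in every build flavour; settable on the command line with
    // -XX:+UseBarriersForVolatile or -XX:-UseBarriersForVolatile
    bool UseBarriersForVolatile = false;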
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Sep 21 10:36:36 2015 -0400 +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Oct 08 11:06:07 2015 -0400 @@ -3811,14 +3811,6 @@ } } - bool MacroAssembler::use_acq_rel_for_volatile_fields() { -#ifdef PRODUCT - return false; -#else - return UseAcqRelForVolatileFields; -#endif - } - void MacroAssembler::build_frame(int framesize) { if (framesize == 0) { // Is this even possible?
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Sep 21 10:36:36 2015 -0400 +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Oct 08 11:06:07 2015 -0400 @@ -923,6 +923,23 @@ str(rscratch2, adr); } + // A generic CAS; success or failure is in the EQ flag. + template <typename T1, typename T2> + void cmpxchg(Register addr, Register expected, Register new_val, + T1 load_insn, + void (MacroAssembler::*cmp_insn)(Register, Register), + T2 store_insn, + Register tmp = rscratch1) { + Label retry_load, done; + bind(retry_load); + (this->*load_insn)(tmp, addr); + (this->*cmp_insn)(tmp, expected); + br(Assembler::NE, done); + (this->*store_insn)(tmp, new_val, addr); + cbnzw(tmp, retry_load); + bind(done); + } + // Calls // void call(Label& L, relocInfo::relocType rtype); @@ -1090,9 +1107,6 @@ address read_polling_page(Register r, address page, relocInfo::relocType rtype); address read_polling_page(Register r, relocInfo::relocType rtype); - // Used by aarch64.ad to control code generation - static bool use_acq_rel_for_volatile_fields(); - // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic. void update_byte_crc32(Register crc, Register val, Register table); void update_word_crc32(Register crc, Register v, Register tmp, @@ -1186,10 +1200,6 @@ } }; -// Used by aarch64.ad to control code generation -#define treat_as_volatile(MEM_NODE) \ - (MacroAssembler::use_acq_rel_for_volatile_fields() ? (MEM_NODE)->is_volatile() : false) - #ifdef ASSERT inline bool AbstractAssembler::pd_check_instruction_mark() { return false; } #endif
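The aarch64.ad enc_classes shown earlier instantiate this template once per operand width and acquire flavour, along these lines (a sketch; base, oldval and newval stand for the registers taken from the matched $mem, $oldval and $newval operands):

    // word-sized CAS as used by aarch64_enc_cmpxchgw_acq; the non-acquiring
    // variant passes &Assembler::ldxrw in place of &Assembler::ldaxrw
    __ cmpxchg(base, oldval, newval,
               &Assembler::ldaxrw,
               &MacroAssembler::cmpw,
               &Assembler::stlxrw);
    // success or failure is left in the EQ flag; aarch64_enc_cset_eq then
    // materializes it in the result register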
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Mon Sep 21 10:36:36 2015 -0400 +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Oct 08 11:06:07 2015 -0400 @@ -224,6 +224,10 @@ UseMultiplyToLenIntrinsic = true; } + if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) { + UseBarriersForVolatile = (_cpuFeatures & CPU_DMB_ATOMICS) != 0; + } + if (FLAG_IS_DEFAULT(UsePopCountInstruction)) { UsePopCountInstruction = true; }
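FLAG_IS_DEFAULT(UseBarriersForVolatile) is true only when the flag was not set explicitly on the command line, so this ergonomic default never overrides a user's choice. The resulting policy, assuming CPU_DMB_ATOMICS marks cores whose dmb-based sequences are preferable (as the feature name suggests):

    // -XX:+UseBarriersForVolatile  -> always dmb-based volatile code
    // -XX:-UseBarriersForVolatile  -> always ldar/stlr
    // flag left at default         -> ldar/stlr unless the CPU advertises
    //                                 CPU_DMB_ATOMICS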
--- a/src/share/vm/opto/graphKit.cpp Mon Sep 21 10:36:36 2015 -0400 +++ b/src/share/vm/opto/graphKit.cpp Thu Oct 08 11:06:07 2015 -0400 @@ -3803,7 +3803,7 @@ // Smash zero into card if( !UseConcMarkSweepGC ) { - __ store(__ ctrl(), card_adr, zero, bt, adr_type, MemNode::release); + __ store(__ ctrl(), card_adr, zero, bt, adr_type, MemNode::unordered); } else { // Specialized path for CM store barrier __ storeCM(__ ctrl(), card_adr, zero, oop_store, adr_idx, bt, adr_type);
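With the ordering for object puts now supplied by the membar subgraphs recognised above (or by stlr itself), the non-CMS card write no longer needs to request release semantics, so it is downgraded to a plain unordered byte store and matches the ordinary storeB/storeimmB0 rules rather than a releasing stlrb. A sketch of what the card mark then compiles to on AArch64, assuming the usual 512-byte cards (card_shift of 9) and illustrative register names:

    // lsr  x8, xoop_adr, #9      // card index = store address >> card_shift
    // strb wzr, [xbase, x8]      // smash zero (dirty) into the card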