# HG changeset patch # User never # Date 1282955629 25200 # Node ID d64a8c7aa9d51386b51918988af1572b2d900692 # Parent d3b3540db4c3e14e6b61384f5862f2e06e16983e 4809552: Optimize Arrays.fill(...) Reviewed-by: kvn diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/cpu/sparc/vm/stubGenerator_sparc.cpp --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -1588,6 +1588,185 @@ } // + // Generate stub for disjoint short fill. If "aligned" is true, the + // "to" address is assumed to be heapword aligned. + // + // Arguments for generated stub: + // to: O0 + // value: O1 + // count: O2 treated as signed + // + address generate_fill(BasicType t, bool aligned, const char* name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + const Register to = O0; // source array address + const Register value = O1; // fill value + const Register count = O2; // elements count + // O3 is used as a temp register + + assert_clean_int(count, O3); // Make sure 'count' is clean int. + + Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; + Label L_fill_2_bytes, L_fill_4_bytes, L_fill_32_bytes; + + int shift = -1; + switch (t) { + case T_BYTE: + shift = 2; + break; + case T_SHORT: + shift = 1; + break; + case T_INT: + shift = 0; + break; + default: ShouldNotReachHere(); + } + + BLOCK_COMMENT("Entry:"); + + if (t == T_BYTE) { + // Zero extend value + __ and3(value, 0xff, value); + __ sllx(value, 8, O3); + __ or3(value, O3, value); + } + if (t == T_SHORT) { + // Zero extend value + __ sethi(0xffff0000, O3); + __ andn(value, O3, value); + } + if (t == T_BYTE || t == T_SHORT) { + __ sllx(value, 16, O3); + __ or3(value, O3, value); + } + + __ cmp(count, 2<andcc(count, 1<nop(); + __ stb(value, to, 0); + __ inc(to, 1); + __ dec(count, 1); + __ BIND(L_skip_align1); + } + // Two bytes misalignment happens only for byte and short (char) arrays + __ andcc(to, 2, G0); + __ br(Assembler::zero, false, Assembler::pt, L_skip_align2); + __ delayed()->nop(); + __ sth(value, to, 0); + __ inc(to, 2); + __ dec(count, 1 << (shift - 1)); + __ BIND(L_skip_align2); + } +#ifdef _LP64 + if (!aligned) { +#endif + // align to 8 bytes, we know we are 4 byte aligned to start + __ andcc(to, 7, G0); + __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes); + __ delayed()->nop(); + __ stw(value, to, 0); + __ inc(to, 4); + __ dec(count, 1 << shift); + __ BIND(L_fill_32_bytes); +#ifdef _LP64 + } +#endif + + Label L_check_fill_8_bytes; + // Fill 32-byte chunks + __ subcc(count, 8 << shift, count); + __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); + __ delayed()->nop(); + + if (t == T_INT) { + // Zero extend value + __ srl(value, 0, value); + } + if (t == T_BYTE || t == T_SHORT || t == T_INT) { + __ sllx(value, 32, O3); + __ or3(value, O3, value); + } + + Label L_fill_32_bytes_loop; + __ align(16); + __ BIND(L_fill_32_bytes_loop); + + __ stx(value, to, 0); + __ stx(value, to, 8); + __ stx(value, to, 16); + __ stx(value, to, 24); + + __ subcc(count, 8 << shift, count); + __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); + __ delayed()->add(to, 32, to); + + __ BIND(L_check_fill_8_bytes); + __ addcc(count, 8 << shift, count); + __ brx(Assembler::zero, false, Assembler::pn, L_exit); + __ delayed()->subcc(count, 1 << (shift + 1), count); + __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); + __ delayed()->andcc(count, 1<add(to, 8, to); + + // fill trailing 4 bytes + __ andcc(count, 1<andcc(count, 1<<(shift-1), G0); + } else { + __ delayed()->nop(); + } + __ stw(value, to, 0); + if (t == T_BYTE || t == T_SHORT) { + __ inc(to, 4); + // fill trailing 2 bytes + __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches + __ BIND(L_fill_2_bytes); + __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); + __ delayed()->andcc(count, 1, count); + __ sth(value, to, 0); + if (t == T_BYTE) { + __ inc(to, 2); + // fill trailing byte + __ andcc(count, 1, count); // in delay slot of branches + __ BIND(L_fill_byte); + __ brx(Assembler::zero, false, Assembler::pt, L_exit); + __ delayed()->nop(); + __ stb(value, to, 0); + } else { + __ BIND(L_fill_byte); + } + } else { + __ BIND(L_fill_2_bytes); + } + __ BIND(L_exit); + __ retl(); + __ delayed()->mov(G0, O0); // return 0 + return start; + } + + // // Generate stub for conjoint short copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // @@ -2855,6 +3034,13 @@ StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy"); StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy"); + + StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); + StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); + StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); + StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); + StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); } void generate_initial() { diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/cpu/x86/vm/assembler_x86.cpp --- a/src/cpu/x86/vm/assembler_x86.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/cpu/x86/vm/assembler_x86.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -8767,6 +8767,186 @@ bind(DONE); } +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") +void MacroAssembler::generate_fill(BasicType t, bool aligned, + Register to, Register value, Register count, + Register rtmp, XMMRegister xtmp) { + assert_different_registers(to, value, count, rtmp); + Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; + Label L_fill_2_bytes, L_fill_4_bytes; + + int shift = -1; + switch (t) { + case T_BYTE: + shift = 2; + break; + case T_SHORT: + shift = 1; + break; + case T_INT: + shift = 0; + break; + default: ShouldNotReachHere(); + } + + if (t == T_BYTE) { + andl(value, 0xff); + movl(rtmp, value); + shll(rtmp, 8); + orl(value, rtmp); + } + if (t == T_SHORT) { + andl(value, 0xffff); + } + if (t == T_BYTE || t == T_SHORT) { + movl(rtmp, value); + shll(rtmp, 16); + orl(value, rtmp); + } + + cmpl(count, 2<= 2, "supported cpu only" ); + Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; + // Fill 32-byte chunks + movdl(xtmp, value); + pshufd(xtmp, xtmp, 0); + + subl(count, 8 << shift); + jcc(Assembler::less, L_check_fill_8_bytes); + align(16); + + BIND(L_fill_32_bytes_loop); + + if (UseUnalignedLoadStores) { + movdqu(Address(to, 0), xtmp); + movdqu(Address(to, 16), xtmp); + } else { + movq(Address(to, 0), xtmp); + movq(Address(to, 8), xtmp); + movq(Address(to, 16), xtmp); + movq(Address(to, 24), xtmp); + } + + addptr(to, 32); + subl(count, 8 << shift); + jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); + BIND(L_check_fill_8_bytes); + addl(count, 8 << shift); + jccb(Assembler::zero, L_exit); + jmpb(L_fill_8_bytes); + + // + // length is too short, just fill qwords + // + BIND(L_fill_8_bytes_loop); + movq(Address(to, 0), xtmp); + addptr(to, 8); + BIND(L_fill_8_bytes); + subl(count, 1 << (shift + 1)); + jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); + } + } + // fill trailing 4 bytes + BIND(L_fill_4_bytes); + testl(count, 1< Input and output aligned on a HeapWord == 8-byte boundary // ignored @@ -2712,6 +2732,13 @@ StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy"); + StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); + StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); + StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); + StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); + StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); + // We don't generate specialized code for HeapWord-aligned source // arrays, so just use the code we've already generated StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy; diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/includeDB_compiler2 --- a/src/share/vm/includeDB_compiler2 Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/includeDB_compiler2 Fri Aug 27 17:33:49 2010 -0700 @@ -624,6 +624,7 @@ loopTransform.cpp loopnode.hpp loopTransform.cpp mulnode.hpp loopTransform.cpp rootnode.hpp +loopTransform.cpp runtime.hpp loopTransform.cpp subnode.hpp loopUnswitch.cpp allocation.inline.hpp diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/addnode.cpp --- a/src/share/vm/opto/addnode.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/addnode.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -705,6 +705,9 @@ } addr = addr->in(AddPNode::Address); } + if (addr != base) { + return -1; + } return count; } diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/c2_globals.hpp --- a/src/share/vm/opto/c2_globals.hpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/c2_globals.hpp Fri Aug 27 17:33:49 2010 -0700 @@ -157,6 +157,12 @@ develop(bool, TraceLoopPredicate, false, \ "Trace generation of loop predicates") \ \ + product(bool, OptimizeFill, false, \ + "convert fill/copy loops into intrinsic") \ + \ + develop(bool, TraceOptimizeFill, false, \ + "print detailed information about fill conversion") \ + \ develop(bool, OptoCoalesce, true, \ "Use Conservative Copy Coalescing in the Register Allocator") \ \ diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/loopTransform.cpp --- a/src/share/vm/opto/loopTransform.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/loopTransform.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -2049,11 +2049,18 @@ if (cmp->Opcode() != Op_CmpU ) { return false; } - if (cmp->in(2)->Opcode() != Op_LoadRange) { - return false; + Node* range = cmp->in(2); + if (range->Opcode() != Op_LoadRange) { + const TypeInt* tint = phase->_igvn.type(range)->isa_int(); + if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) { + // Allow predication on positive values that aren't LoadRanges. + // This allows optimization of loops where the length of the + // array is a known value and doesn't need to be loaded back + // from the array. + return false; + } } - LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2); - if (!invar.is_invariant(lr)) { // loadRange must be invariant + if (!invar.is_invariant(range)) { return false; } Node *iv = _head->as_CountedLoop()->phi(); @@ -2248,9 +2255,9 @@ const Node* cmp = bol->in(1)->as_Cmp(); Node* idx = cmp->in(1); assert(!invar.is_invariant(idx), "index is variant"); - assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be"); - Node* ld_rng = cmp->in(2); // LoadRangeNode - assert(invar.is_invariant(ld_rng), "load range must be invariant"); + assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be"); + Node* rng = cmp->in(2); + assert(invar.is_invariant(rng), "range must be invariant"); int scale = 1; Node* offset = zero; bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset); @@ -2271,21 +2278,21 @@ // Perform cloning to keep Invariance state correct since the // late schedule will place invariant things in the loop. - ld_rng = invar.clone(ld_rng, ctrl); + rng = invar.clone(rng, ctrl); if (offset && offset != zero) { assert(invar.is_invariant(offset), "offset must be loop invariant"); offset = invar.clone(offset, ctrl); } // Test the lower bound - Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false); + Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false); IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If(); _igvn.hash_delete(lower_bound_iff); lower_bound_iff->set_req(1, lower_bound_bol); if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx); // Test the upper bound - Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true); + Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true); IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If(); _igvn.hash_delete(upper_bound_iff); upper_bound_iff->set_req(1, upper_bound_bol); @@ -2366,3 +2373,348 @@ return hoisted; } + + +// Process all the loops in the loop tree and replace any fill +// patterns with an intrisc version. +bool PhaseIdealLoop::do_intrinsify_fill() { + bool changed = false; + for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { + IdealLoopTree* lpt = iter.current(); + changed |= intrinsify_fill(lpt); + } + return changed; +} + + +// Examine an inner loop looking for a a single store of an invariant +// value in a unit stride loop, +bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value, + Node*& shift, Node*& con) { + const char* msg = NULL; + Node* msg_node = NULL; + + store_value = NULL; + con = NULL; + shift = NULL; + + // Process the loop looking for stores. If there are multiple + // stores or extra control flow give at this point. + CountedLoopNode* head = lpt->_head->as_CountedLoop(); + for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + if (n->outcnt() == 0) continue; // Ignore dead + if (n->is_Store()) { + if (store != NULL) { + msg = "multiple stores"; + break; + } + int opc = n->Opcode(); + if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) { + msg = "oop fills not handled"; + break; + } + Node* value = n->in(MemNode::ValueIn); + if (!lpt->is_invariant(value)) { + msg = "variant store value"; + } + store = n; + store_value = value; + } else if (n->is_If() && n != head->loopexit()) { + msg = "extra control flow"; + msg_node = n; + } + } + + if (store == NULL) { + // No store in loop + return false; + } + + if (msg == NULL && head->stride_con() != 1) { + // could handle negative strides too + if (head->stride_con() < 0) { + msg = "negative stride"; + } else { + msg = "non-unit stride"; + } + } + + if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) { + msg = "can't handle store address"; + msg_node = store->in(MemNode::Address); + } + + // Make sure there is an appropriate fill routine + BasicType t = store->as_Mem()->memory_type(); + const char* fill_name; + if (msg == NULL && + StubRoutines::select_fill_function(t, false, fill_name) == NULL) { + msg = "unsupported store"; + msg_node = store; + } + + if (msg != NULL) { +#ifndef PRODUCT + if (TraceOptimizeFill) { + tty->print_cr("not fill intrinsic candidate: %s", msg); + if (msg_node != NULL) msg_node->dump(); + } +#endif + return false; + } + + // Make sure the address expression can be handled. It should be + // head->phi * elsize + con. head->phi might have a ConvI2L. + Node* elements[4]; + Node* conv = NULL; + int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements)); + for (int e = 0; e < count; e++) { + Node* n = elements[e]; + if (n->is_Con() && con == NULL) { + con = n; + } else if (n->Opcode() == Op_LShiftX && shift == NULL) { + Node* value = n->in(1); +#ifdef _LP64 + if (value->Opcode() == Op_ConvI2L) { + conv = value; + value = value->in(1); + } +#endif + if (value != head->phi()) { + msg = "unhandled shift in address"; + } else { + shift = n; + assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match"); + } + } else if (n->Opcode() == Op_ConvI2L && conv == NULL) { + if (n->in(1) == head->phi()) { + conv = n; + } else { + msg = "unhandled input to ConvI2L"; + } + } else if (n == head->phi()) { + // no shift, check below for allowed cases + } else { + msg = "unhandled node in address"; + msg_node = n; + } + } + + if (count == -1) { + msg = "malformed address expression"; + msg_node = store; + } + + // byte sized items won't have a shift + if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) { + msg = "can't find shift"; + msg_node = store; + } + + if (msg != NULL) { +#ifndef PRODUCT + if (TraceOptimizeFill) { + tty->print_cr("not fill intrinsic: %s", msg); + if (msg_node != NULL) msg_node->dump(); + } +#endif + return false; + } + + // No make sure all the other nodes in the loop can be handled + VectorSet ok(Thread::current()->resource_area()); + + // store related values are ok + ok.set(store->_idx); + ok.set(store->in(MemNode::Memory)->_idx); + + // Loop structure is ok + ok.set(head->_idx); + ok.set(head->loopexit()->_idx); + ok.set(head->phi()->_idx); + ok.set(head->incr()->_idx); + ok.set(head->loopexit()->cmp_node()->_idx); + ok.set(head->loopexit()->in(1)->_idx); + + // Address elements are ok + if (con) ok.set(con->_idx); + if (shift) ok.set(shift->_idx); + if (conv) ok.set(conv->_idx); + + for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + if (n->outcnt() == 0) continue; // Ignore dead + if (ok.test(n->_idx)) continue; + // Backedge projection is ok + if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue; + if (!n->is_AddP()) { + msg = "unhandled node"; + msg_node = n; + break; + } + } + + // Make sure no unexpected values are used outside the loop + for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + // These values can be replaced with other nodes if they are used + // outside the loop. + if (n == store || n == head->loopexit() || n == head->incr()) continue; + for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) { + Node* use = iter.get(); + if (!lpt->_body.contains(use)) { + msg = "node is used outside loop"; + // lpt->_body.dump(); + msg_node = n; + break; + } + } + } + +#ifdef ASSERT + if (TraceOptimizeFill) { + if (msg != NULL) { + tty->print_cr("no fill intrinsic: %s", msg); + if (msg_node != NULL) msg_node->dump(); + } else { + tty->print_cr("fill intrinsic for:"); + } + store->dump(); + if (Verbose) { + lpt->_body.dump(); + } + } +#endif + + return msg == NULL; +} + + + +bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) { + // Only for counted inner loops + if (!lpt->is_counted() || !lpt->is_inner()) { + return false; + } + + // Must have constant stride + CountedLoopNode* head = lpt->_head->as_CountedLoop(); + if (!head->stride_is_con() || !head->is_normal_loop()) { + return false; + } + + // Check that the body only contains a store of a loop invariant + // value that is indexed by the loop phi. + Node* store = NULL; + Node* store_value = NULL; + Node* shift = NULL; + Node* offset = NULL; + if (!match_fill_loop(lpt, store, store_value, shift, offset)) { + return false; + } + + // Now replace the whole loop body by a call to a fill routine that + // covers the same region as the loop. + Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base); + + // Build an expression for the beginning of the copy region + Node* index = head->init_trip(); +#ifdef _LP64 + index = new (C, 2) ConvI2LNode(index); + _igvn.register_new_node_with_optimizer(index); +#endif + if (shift != NULL) { + // byte arrays don't require a shift but others do. + index = new (C, 3) LShiftXNode(index, shift->in(2)); + _igvn.register_new_node_with_optimizer(index); + } + index = new (C, 4) AddPNode(base, base, index); + _igvn.register_new_node_with_optimizer(index); + Node* from = new (C, 4) AddPNode(base, index, offset); + _igvn.register_new_node_with_optimizer(from); + // Compute the number of elements to copy + Node* len = new (C, 3) SubINode(head->limit(), head->init_trip()); + _igvn.register_new_node_with_optimizer(len); + + BasicType t = store->as_Mem()->memory_type(); + bool aligned = false; + if (offset != NULL && head->init_trip()->is_Con()) { + int element_size = type2aelembytes(t); + aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0; + } + + // Build a call to the fill routine + const char* fill_name; + address fill = StubRoutines::select_fill_function(t, aligned, fill_name); + assert(fill != NULL, "what?"); + + // Convert float/double to int/long for fill routines + if (t == T_FLOAT) { + store_value = new (C, 2) MoveF2INode(store_value); + _igvn.register_new_node_with_optimizer(store_value); + } else if (t == T_DOUBLE) { + store_value = new (C, 2) MoveD2LNode(store_value); + _igvn.register_new_node_with_optimizer(store_value); + } + + Node* mem_phi = store->in(MemNode::Memory); + Node* result_ctrl; + Node* result_mem; + const TypeFunc* call_type = OptoRuntime::array_fill_Type(); + int size = call_type->domain()->cnt(); + CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill, + fill_name, TypeAryPtr::get_array_body_type(t)); + call->init_req(TypeFunc::Parms+0, from); + call->init_req(TypeFunc::Parms+1, store_value); + call->init_req(TypeFunc::Parms+2, len); + call->init_req( TypeFunc::Control, head->init_control()); + call->init_req( TypeFunc::I_O , C->top() ) ; // does no i/o + call->init_req( TypeFunc::Memory , mem_phi->in(LoopNode::EntryControl) ); + call->init_req( TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr) ); + call->init_req( TypeFunc::FramePtr, C->start()->proj_out(TypeFunc::FramePtr) ); + _igvn.register_new_node_with_optimizer(call); + result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control); + _igvn.register_new_node_with_optimizer(result_ctrl); + result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory); + _igvn.register_new_node_with_optimizer(result_mem); + + // If this fill is tightly coupled to an allocation and overwrites + // the whole body, allow it to take over the zeroing. + AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this); + if (alloc != NULL && alloc->is_AllocateArray()) { + Node* length = alloc->as_AllocateArray()->Ideal_length(); + if (head->limit() == length && + head->init_trip() == _igvn.intcon(0)) { + if (TraceOptimizeFill) { + tty->print_cr("Eliminated zeroing in allocation"); + } + alloc->maybe_set_complete(&_igvn); + } else { +#ifdef ASSERT + if (TraceOptimizeFill) { + tty->print_cr("filling array but bounds don't match"); + alloc->dump(); + head->init_trip()->dump(); + head->limit()->dump(); + length->dump(); + } +#endif + } + } + + // Redirect the old control and memory edges that are outside the loop. + Node* exit = head->loopexit()->proj_out(0); + _igvn.replace_node(exit, result_ctrl); + _igvn.replace_node(store, result_mem); + // Any uses the increment outside of the loop become the loop limit. + _igvn.replace_node(head->incr(), head->limit()); + + // Disconnect the head from the loop. + for (uint i = 0; i < lpt->_body.size(); i++) { + Node* n = lpt->_body.at(i); + _igvn.replace_node(n, C->top()); + } + + return true; +} diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/loopnode.cpp --- a/src/share/vm/opto/loopnode.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/loopnode.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -1673,6 +1673,12 @@ _ltree_root->_child->loop_predication(this); } + if (OptimizeFill && UseLoopPredicate && C->has_loops() && !C->major_progress()) { + if (do_intrinsify_fill()) { + C->set_major_progress(); + } + } + // Perform iteration-splitting on inner loops. Split iterations to avoid // range checks or one-shot null checks. diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/loopnode.hpp --- a/src/share/vm/opto/loopnode.hpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/loopnode.hpp Fri Aug 27 17:33:49 2010 -0700 @@ -937,6 +937,12 @@ // same block. Split thru the Region. void do_split_if( Node *iff ); + // Conversion of fill/copy patterns into intrisic versions + bool do_intrinsify_fill(); + bool intrinsify_fill(IdealLoopTree* lpt); + bool match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value, + Node*& shift, Node*& offset); + private: // Return a type based on condition control flow const TypeInt* filtered_type( Node *n, Node* n_ctrl); diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/runtime.cpp --- a/src/share/vm/opto/runtime.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/runtime.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -645,6 +645,22 @@ } +const TypeFunc* OptoRuntime::array_fill_Type() { + // create input type (domain) + const Type** fields = TypeTuple::fields(3); + fields[TypeFunc::Parms+0] = TypePtr::NOTNULL; + fields[TypeFunc::Parms+1] = TypeInt::INT; + fields[TypeFunc::Parms+2] = TypeInt::INT; + const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms + 3, fields); + + // create result type + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple *range = TypeTuple::make(TypeFunc::Parms, fields); + + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { // create input type (domain) diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/opto/runtime.hpp --- a/src/share/vm/opto/runtime.hpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/opto/runtime.hpp Fri Aug 27 17:33:49 2010 -0700 @@ -260,6 +260,8 @@ static const TypeFunc* generic_arraycopy_Type(); static const TypeFunc* slow_arraycopy_Type(); // the full routine + static const TypeFunc* array_fill_Type(); + // leaf on stack replacement interpreter accessor types static const TypeFunc* osr_end_Type(); diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/runtime/arguments.cpp --- a/src/share/vm/runtime/arguments.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/runtime/arguments.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -1513,6 +1513,9 @@ if (AggressiveOpts && FLAG_IS_DEFAULT(OptimizeStringConcat)) { FLAG_SET_DEFAULT(OptimizeStringConcat, true); } + if (AggressiveOpts && FLAG_IS_DEFAULT(OptimizeFill)) { + FLAG_SET_DEFAULT(OptimizeFill, true); + } #endif if (AggressiveOpts) { diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/runtime/stubRoutines.cpp --- a/src/share/vm/runtime/stubRoutines.cpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/runtime/stubRoutines.cpp Fri Aug 27 17:33:49 2010 -0700 @@ -97,6 +97,15 @@ address StubRoutines::_unsafe_arraycopy = NULL; address StubRoutines::_generic_arraycopy = NULL; + +address StubRoutines::_jbyte_fill; +address StubRoutines::_jshort_fill; +address StubRoutines::_jint_fill; +address StubRoutines::_arrayof_jbyte_fill; +address StubRoutines::_arrayof_jshort_fill; +address StubRoutines::_arrayof_jint_fill; + + double (* StubRoutines::_intrinsic_log )(double) = NULL; double (* StubRoutines::_intrinsic_log10 )(double) = NULL; double (* StubRoutines::_intrinsic_exp )(double) = NULL; @@ -195,6 +204,46 @@ #undef TEST_ARRAYCOPY +#define TEST_FILL(type) \ + if (_##type##_fill != NULL) { \ + union { \ + double d; \ + type body[96]; \ + } s; \ + \ + int v = 32; \ + for (int offset = -2; offset <= 2; offset++) { \ + for (int i = 0; i < 96; i++) { \ + s.body[i] = 1; \ + } \ + type* start = s.body + 8 + offset; \ + for (int aligned = 0; aligned < 2; aligned++) { \ + if (aligned) { \ + if (((intptr_t)start) % HeapWordSize == 0) { \ + ((void (*)(type*, int, int))StubRoutines::_arrayof_##type##_fill)(start, v, 80); \ + } else { \ + continue; \ + } \ + } else { \ + ((void (*)(type*, int, int))StubRoutines::_##type##_fill)(start, v, 80); \ + } \ + for (int i = 0; i < 96; i++) { \ + if (i < (8 + offset) || i >= (88 + offset)) { \ + assert(s.body[i] == 1, "what?"); \ + } else { \ + assert(s.body[i] == 32, "what?"); \ + } \ + } \ + } \ + } \ + } \ + + TEST_FILL(jbyte); + TEST_FILL(jshort); + TEST_FILL(jint); + +#undef TEST_FILL + #define TEST_COPYRTN(type) \ test_arraycopy_func(CAST_FROM_FN_PTR(address, Copy::conjoint_##type##s_atomic), sizeof(type)); \ test_arraycopy_func(CAST_FROM_FN_PTR(address, Copy::arrayof_conjoint_##type##s), (int)MAX2(sizeof(HeapWord), sizeof(type))) @@ -315,3 +364,39 @@ Copy::arrayof_conjoint_oops(src, dest, count); gen_arraycopy_barrier((oop *) dest, count); JRT_END + + +address StubRoutines::select_fill_function(BasicType t, bool aligned, const char* &name) { +#define RETURN_STUB(xxx_fill) { \ + name = #xxx_fill; \ + return StubRoutines::xxx_fill(); } + + switch (t) { + case T_BYTE: + case T_BOOLEAN: + if (!aligned) RETURN_STUB(jbyte_fill); + RETURN_STUB(arrayof_jbyte_fill); + case T_CHAR: + case T_SHORT: + if (!aligned) RETURN_STUB(jshort_fill); + RETURN_STUB(arrayof_jshort_fill); + case T_INT: + case T_FLOAT: + if (!aligned) RETURN_STUB(jint_fill); + RETURN_STUB(arrayof_jint_fill); + case T_DOUBLE: + case T_LONG: + case T_ARRAY: + case T_OBJECT: + case T_NARROWOOP: + case T_ADDRESS: + // Currently unsupported + return NULL; + + default: + ShouldNotReachHere(); + return NULL; + } + +#undef RETURN_STUB +} diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/runtime/stubRoutines.hpp --- a/src/share/vm/runtime/stubRoutines.hpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/runtime/stubRoutines.hpp Fri Aug 27 17:33:49 2010 -0700 @@ -148,6 +148,13 @@ static address _unsafe_arraycopy; static address _generic_arraycopy; + static address _jbyte_fill; + static address _jshort_fill; + static address _jint_fill; + static address _arrayof_jbyte_fill; + static address _arrayof_jshort_fill; + static address _arrayof_jint_fill; + // These are versions of the java.lang.Math methods which perform // the same operations as the intrinsic version. They are used for // constant folding in the compiler to ensure equivalence. If the @@ -259,6 +266,16 @@ static address unsafe_arraycopy() { return _unsafe_arraycopy; } static address generic_arraycopy() { return _generic_arraycopy; } + static address jbyte_fill() { return _jbyte_fill; } + static address jshort_fill() { return _jshort_fill; } + static address jint_fill() { return _jint_fill; } + static address arrayof_jbyte_fill() { return _arrayof_jbyte_fill; } + static address arrayof_jshort_fill() { return _arrayof_jshort_fill; } + static address arrayof_jint_fill() { return _arrayof_jint_fill; } + + static address select_fill_function(BasicType t, bool aligned, const char* &name); + + static double intrinsic_log(double d) { assert(_intrinsic_log != NULL, "must be defined"); return _intrinsic_log(d); diff -r d3b3540db4c3 -r d64a8c7aa9d5 src/share/vm/utilities/globalDefinitions.hpp --- a/src/share/vm/utilities/globalDefinitions.hpp Fri Sep 24 00:52:04 2010 -0700 +++ b/src/share/vm/utilities/globalDefinitions.hpp Fri Aug 27 17:33:49 2010 -0700 @@ -529,7 +529,7 @@ #ifdef ASSERT extern int type2aelembytes(BasicType t, bool allow_address = false); // asserts #else -inline int type2aelembytes(BasicType t) { return _type2aelembytes[t]; } +inline int type2aelembytes(BasicType t, bool allow_address = false) { return _type2aelembytes[t]; } #endif