Mercurial > hg > release > icedtea7-forest-2.6 > hotspot
changeset 6731:87aadff83304
8155617, PR3741: aarch64: ClearArray does not use DC ZVA
Summary: Implement block zero using DC ZVA
Reviewed-by: aph
Contributed-by: long.chen@linaro.org, edward.nevill@gmail.com
author | enevill |
---|---|
date | Tue, 16 Jul 2019 09:32:04 +0100 |
parents | 6dcb006c9f20 |
children | cbb799cc6c7c |
files | src/cpu/aarch64/vm/aarch64.ad src/cpu/aarch64/vm/assembler_aarch64.cpp src/cpu/aarch64/vm/assembler_aarch64.hpp src/cpu/aarch64/vm/globals_aarch64.hpp src/cpu/aarch64/vm/stubGenerator_aarch64.cpp src/cpu/aarch64/vm/stubRoutines_aarch64.cpp src/cpu/aarch64/vm/stubRoutines_aarch64.hpp src/cpu/aarch64/vm/vm_version_aarch64.cpp src/cpu/aarch64/vm/vm_version_aarch64.hpp |
diffstat | 9 files changed, 247 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/aarch64.ad Tue Jul 16 09:32:04 2019 +0100 @@ -11732,9 +11732,10 @@ ins_pipe(pipe_class_memory); %} -instruct clearArray_imm_reg(immL cnt, iRegP base, Universe dummy, rFlagsReg cr) +instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr) %{ match(Set dummy (ClearArray cnt base)); + effect(USE_KILL base, TEMP tmp); ins_cost(4 * INSN_COST); format %{ "ClearArray $cnt, $base" %}
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp Tue Jul 16 09:32:04 2019 +0100 @@ -5445,24 +5445,35 @@ BLOCK_COMMENT("} string_compare"); } -// base: Address of a buffer to be zeroed, 8 bytes aligned. -// cnt: Count in 8-byte unit. + +// base: Address of a buffer to be zeroed, 8 bytes aligned. +// cnt: Count in HeapWords. +// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit. void MacroAssembler::zero_words(Register base, Register cnt) { - fill_words(base, cnt, zr); -} - -// base: Address of a buffer to be zeroed, 8 bytes aligned. -// cnt: Immediate count in 8-byte unit. + if (UseBlockZeroing) { + block_zero(base, cnt); + } else { + fill_words(base, cnt, zr); + } +} + +// r10 = base: Address of a buffer to be zeroed, 8 bytes aligned. +// cnt: Immediate count in HeapWords. +// r11 = tmp: For use as cnt if we need to call out #define ShortArraySize (18 * BytesPerLong) void MacroAssembler::zero_words(Register base, u_int64_t cnt) { + Register tmp = r11; int i = cnt & 1; // store any odd word to start if (i) str(zr, Address(base)); if (cnt <= ShortArraySize / BytesPerLong) { for (; i < (int)cnt; i += 2) stp(zr, zr, Address(base, i * wordSize)); + } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) { + mov(tmp, cnt); + block_zero(base, tmp, true); } else { const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll int remainder = cnt % (2 * unroll); @@ -5514,24 +5525,96 @@ assert_different_registers(base, cnt, value, rscratch1, rscratch2); - Label entry, loop; - const int unroll = 8; // Number of str instructions we'll unroll - - andr(rscratch1, cnt, unroll - 1); // tmp1 = cnt % unroll - cbz(rscratch1, entry); - sub(cnt, cnt, rscratch1); // cnt -= tmp1 - // base always points to the end of the region we're about to fill + Label fini, skip, entry, loop; + const int unroll = 8; // Number of stp instructions we'll unroll + + cbz(cnt, fini); + tbz(base, 3, skip); + str(value, Address(post(base, 8))); + sub(cnt, cnt, 1); + bind(skip); + + andr(rscratch1, cnt, (unroll-1) * 2); + sub(cnt, cnt, rscratch1); add(base, base, rscratch1, Assembler::LSL, 3); adr(rscratch2, entry); - sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); + sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1); br(rscratch2); + bind(loop); - add(base, base, unroll * 8); - sub(cnt, cnt, unroll); + add(base, base, unroll * 16); for (int i = -unroll; i < 0; i++) - str(value, Address(base, i * 8)); + stp(value, value, Address(base, i * 16)); bind(entry); - cbnz(cnt, loop); + subs(cnt, cnt, unroll * 2); + br(Assembler::GE, loop); + + tbz(cnt, 0, fini); + str(value, Address(post(base, 8))); + bind(fini); +} + +// Use DC ZVA to do fast zeroing. +// base: Address of a buffer to be zeroed, 8 bytes aligned. +// cnt: Count in HeapWords. +// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit. +void MacroAssembler::block_zero(Register base, Register cnt, bool is_large) +{ + Label small; + Label store_pair, loop_store_pair, done; + Label base_aligned; + + assert_different_registers(base, cnt, rscratch1); + guarantee(base == r10 && cnt == r11, "fix register usage"); + + Register tmp = rscratch1; + Register tmp2 = rscratch2; + int zva_length = VM_Version::zva_length(); + + // Ensure ZVA length can be divided by 16. This is required by + // the subsequent operations. + assert (zva_length % 16 == 0, "Unexpected ZVA Length"); + + if (!is_large) cbz(cnt, done); + tbz(base, 3, base_aligned); + str(zr, Address(post(base, 8))); + sub(cnt, cnt, 1); + bind(base_aligned); + + // Ensure count >= zva_length * 2 so that it still deserves a zva after + // alignment. + if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) { + int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); + cmp(cnt, low_limit >> 3); + br(Assembler::LT, small); + } + + far_call(StubRoutines::aarch64::get_zero_longs()); + + bind(small); + + const int unroll = 8; // Number of stp instructions we'll unroll + Label small_loop, small_table_end; + + andr(tmp, cnt, (unroll-1) * 2); + sub(cnt, cnt, tmp); + add(base, base, tmp, Assembler::LSL, 3); + adr(tmp2, small_table_end); + sub(tmp2, tmp2, tmp, Assembler::LSL, 1); + br(tmp2); + + bind(small_loop); + add(base, base, unroll * 16); + for (int i = -unroll; i < 0; i++) + stp(zr, zr, Address(base, i * 16)); + bind(small_table_end); + subs(cnt, cnt, unroll * 2); + br(Assembler::GE, small_loop); + + tbz(cnt, 0, done); + str(zr, Address(post(base, 8))); + + bind(done); } void MacroAssembler::string_equals(Register str1, Register str2,
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp Tue Jul 16 09:32:04 2019 +0100 @@ -1034,12 +1034,28 @@ system(0b00, 0b011, 0b00011, SY, 0b110); } - void dc(Register Rt) { - system(0b01, 0b011, 0b0111, 0b1011, 0b001, Rt); + void sys(int op1, int CRn, int CRm, int op2, + Register rt = (Register)0b11111) { + system(0b01, op1, CRn, CRm, op2, rt); } - void ic(Register Rt) { - system(0b01, 0b011, 0b0111, 0b0101, 0b001, Rt); + // Only implement operations accessible from EL0 or higher, i.e., + // op1 CRn CRm op2 + // IC IVAU 3 7 5 1 + // DC CVAC 3 7 10 1 + // DC CVAU 3 7 11 1 + // DC CIVAC 3 7 14 1 + // DC ZVA 3 7 4 1 + // So only deal with the CRm field. + enum icache_maintenance {IVAU = 0b0101}; + enum dcache_maintenance {CVAC = 0b1010, CVAU = 0b1011, CIVAC = 0b1110, ZVA = 0b100}; + + void dc(dcache_maintenance cm, Register Rt) { + sys(0b011, 0b0111, cm, 0b001, Rt); + } + + void ic(icache_maintenance cm, Register Rt) { + sys(0b011, 0b0111, cm, 0b001, Rt); } // A more convenient access to dmb for our purposes @@ -2774,6 +2790,15 @@ msr(0b011, 0b0100, 0b0100, 0b001, zr); } + // DCZID_EL0: op1 == 011 + // CRn == 0000 + // CRm == 0000 + // op2 == 111 + inline void get_dczid_el0(Register reg) + { + mrs(0b011, 0b0000, 0b0000, 0b111, reg); + } + // idiv variant which deals with MINLONG as dividend and -1 as divisor int corrected_idivl(Register result, Register ra, Register rb, bool want_remainder, Register tmp = rscratch1); @@ -3582,8 +3607,9 @@ void char_arrays_equals(Register ary1, Register ary2, Register result, Register tmp1); void fill_words(Register base, Register cnt, Register value); + void zero_words(Register base, u_int64_t cnt); void zero_words(Register base, Register cnt); - void zero_words(Register base, u_int64_t cnt); + void block_zero(Register base, Register cnt, bool is_large = false); // ISB may be needed because of a safepoint void maybe_isb() { isb(); }
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Jul 16 09:32:04 2019 +0100 @@ -119,6 +119,10 @@ "Use Neon for CRC32 computation") \ product(bool, UseCRC32, false, \ "Use CRC32 instructions for CRC32 computation") \ + product(bool, UseBlockZeroing, true, \ + "Use DC ZVA for block zeroing") \ + product(intx, BlockZeroingLowLimit, 256, \ + "Minimum size in bytes when block zeroing will be used") \ product(bool, TraceTraps, false, "Trace all traps the signal handler") #endif
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Jul 16 09:32:04 2019 +0100 @@ -782,6 +782,48 @@ } } + address generate_zero_longs(Register base, Register cnt) { + Register tmp = rscratch1; + Register tmp2 = rscratch2; + int zva_length = VM_Version::zva_length(); + Label initial_table_end, loop_zva; + Label fini; + + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "zero_longs"); + address start = __ pc(); + + // Base must be 16 byte aligned. If not just return and let caller handle it + __ tst(base, 0x0f); + __ br(Assembler::NE, fini); + // Align base with ZVA length. + __ neg(tmp, base); + __ andr(tmp, tmp, zva_length - 1); + + // tmp: the number of bytes to be filled to align the base with ZVA length. + __ add(base, base, tmp); + __ sub(cnt, cnt, tmp, Assembler::ASR, 3); + __ adr(tmp2, initial_table_end); + __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2); + __ br(tmp2); + + for (int i = -zva_length + 16; i < 0; i += 16) + __ stp(zr, zr, Address(base, i)); + __ bind(initial_table_end); + + __ sub(cnt, cnt, zva_length >> 3); + __ bind(loop_zva); + __ dc(Assembler::ZVA, base); + __ subs(cnt, cnt, zva_length >> 3); + __ add(base, base, zva_length); + __ br(Assembler::GE, loop_zva); + __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA + __ bind(fini); + __ ret(lr); + + return start; + } + typedef enum { copy_forwards = 1, copy_backwards = -1 @@ -1613,7 +1655,9 @@ const Register to = c_rarg0; // source array address const Register value = c_rarg1; // value const Register count = c_rarg2; // elements count - const Register cnt_words = c_rarg3; // temp register + + const Register bz_base = r10; // base for block_zero routine + const Register cnt_words = r11; // temp register __ enter(); @@ -1677,7 +1721,23 @@ __ lsrw(cnt_words, count, 3 - shift); // number of words __ bfi(value, value, 32, 32); // 32 bit -> 64 bit __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); - __ fill_words(to, cnt_words, value); + if (UseBlockZeroing) { + Label non_block_zeroing, rest; + // count >= BlockZeroingLowLimit && value == 0 + __ cmp(cnt_words, BlockZeroingLowLimit >> 3); + __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE); + __ br(Assembler::NE, non_block_zeroing); + __ mov(bz_base, to); + __ block_zero(bz_base, cnt_words, true); + __ mov(to, bz_base); + __ b(rest); + __ bind(non_block_zeroing); + __ fill_words(to, cnt_words, value); + __ bind(rest); + } + else { + __ fill_words(to, cnt_words, value); + } // Remaining count is less than 8 bytes. Fill it by a single store. // Note that the total length is no less than 8 bytes. @@ -1736,6 +1796,8 @@ generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); + StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11); + //*** jbyte // Always need aligned and unaligned versions StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
--- a/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Tue Jul 16 09:32:04 2019 +0100 @@ -55,6 +55,7 @@ address StubRoutines::aarch64::_float_sign_flip = NULL; address StubRoutines::aarch64::_double_sign_mask = NULL; address StubRoutines::aarch64::_double_sign_flip = NULL; +address StubRoutines::aarch64::_zero_longs = NULL; /** * crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
--- a/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Tue Jul 16 09:32:04 2019 +0100 @@ -62,6 +62,8 @@ static address _double_sign_mask; static address _double_sign_flip; + static address _zero_longs; + public: static address get_previous_fp_entry() @@ -114,6 +116,11 @@ return _double_sign_flip; } + static address get_zero_longs() + { + return _zero_longs; + } + private: static juint _crc_table[]; };
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Jul 16 09:32:04 2019 +0100 @@ -68,6 +68,7 @@ int VM_Version::_stepping; int VM_Version::_cpuFeatures; const char* VM_Version::_features_str = ""; +VM_Version::PsrInfo VM_Version::_psr_info = { 0, }; static BufferBlob* stub_blob; static const int stub_size = 550; @@ -92,13 +93,16 @@ __ c_stub_prolog(1, 0, MacroAssembler::ret_type_void); #endif - // void getPsrInfo(VM_Version::CpuidInfo* cpuid_info); + // void getPsrInfo(VM_Version::PsrInfo* psr_info); address entry = __ pc(); - // TODO : redefine fields in CpuidInfo and generate - // code to fill them in + __ enter(); + __ get_dczid_el0(rscratch1); + __ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::dczid_el0_offset()))); + + __ leave(); __ ret(lr); # undef __ @@ -115,6 +119,8 @@ _supports_atomic_getset8 = true; _supports_atomic_getadd8 = true; + getPsrInfo_stub(&_psr_info); + if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize)) @@ -187,6 +193,18 @@ UseCRC32Intrinsics = true; } + if (is_zva_enabled()) { + if (FLAG_IS_DEFAULT(UseBlockZeroing)) { + FLAG_SET_DEFAULT(UseBlockZeroing, true); + } + if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) { + FLAG_SET_DEFAULT(BlockZeroingLowLimit, 4 * VM_Version::zva_length()); + } + } else if (UseBlockZeroing) { + warning("DC ZVA is not available on this CPU"); + FLAG_SET_DEFAULT(UseBlockZeroing, false); + } + // This machine allows unaligned memory accesses if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) { FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
--- a/src/cpu/aarch64/vm/vm_version_aarch64.hpp Tue Apr 12 11:53:44 2016 +0800 +++ b/src/cpu/aarch64/vm/vm_version_aarch64.hpp Tue Jul 16 09:32:04 2019 +0100 @@ -42,6 +42,10 @@ // 0 if this instruction is not available static const char* _features_str; + struct PsrInfo { + uint32_t dczid_el0; + }; + static PsrInfo _psr_info; static void get_processor_features(); public: @@ -85,7 +89,17 @@ static int cpu_variant() { return _variant; } static int cpu_revision() { return _revision; } static int cpu_cpuFeatures() { return _cpuFeatures; } - + static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); } + static bool is_zva_enabled() { + // Check the DZP bit (bit 4) of dczid_el0 is zero + // and block size (bit 0~3) is not zero. + return ((_psr_info.dczid_el0 & 0x10) == 0 && + (_psr_info.dczid_el0 & 0xf) != 0); + } + static int zva_length() { + assert(is_zva_enabled(), "ZVA not available"); + return 4 << (_psr_info.dczid_el0 & 0xf); + } }; #endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP