changeset 6731:87aadff83304

8155617, PR3741: aarch64: ClearArray does not use DC ZVA
Summary: Implement block zero using DC ZVA
Reviewed-by: aph
Contributed-by: long.chen@linaro.org, edward.nevill@gmail.com
author enevill
date Tue, 16 Jul 2019 09:32:04 +0100
parents 6dcb006c9f20
children cbb799cc6c7c
files src/cpu/aarch64/vm/aarch64.ad src/cpu/aarch64/vm/assembler_aarch64.cpp src/cpu/aarch64/vm/assembler_aarch64.hpp src/cpu/aarch64/vm/globals_aarch64.hpp src/cpu/aarch64/vm/stubGenerator_aarch64.cpp src/cpu/aarch64/vm/stubRoutines_aarch64.cpp src/cpu/aarch64/vm/stubRoutines_aarch64.hpp src/cpu/aarch64/vm/vm_version_aarch64.cpp src/cpu/aarch64/vm/vm_version_aarch64.hpp
diffstat 9 files changed, 247 insertions(+), 31 deletions(-)
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/aarch64.ad	Tue Jul 16 09:32:04 2019 +0100
@@ -11732,9 +11732,10 @@
   ins_pipe(pipe_class_memory);
 %}
 
-instruct clearArray_imm_reg(immL cnt, iRegP base, Universe dummy, rFlagsReg cr)
+instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
 %{
   match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL base, TEMP tmp);
 
   ins_cost(4 * INSN_COST);
   format %{ "ClearArray $cnt, $base" %}
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Tue Jul 16 09:32:04 2019 +0100
@@ -5445,24 +5445,35 @@
   BLOCK_COMMENT("} string_compare");
 }
 
-// base:   Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt:    Count in 8-byte unit.
+
+// base:   Address of a buffer to be zeroed, 8-byte aligned.
+// cnt:    Count in HeapWords.
+// Calls out to block_zero when UseBlockZeroing is enabled.
 void MacroAssembler::zero_words(Register base, Register cnt)
 {
-  fill_words(base, cnt, zr);
-}
-
-// base:   Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt:    Immediate count in 8-byte unit.
+  if (UseBlockZeroing) {
+    block_zero(base, cnt);
+  } else {
+    fill_words(base, cnt, zr);
+  }
+}
+
+// r10 = base:   Address of a buffer to be zeroed, 8-byte aligned.
+// cnt:          Immediate count in HeapWords.
+// r11 = tmp:    Used as the cnt register if we need to call out to block_zero.
 #define ShortArraySize (18 * BytesPerLong)
 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
 {
+  Register tmp = r11;
   int i = cnt & 1;  // store any odd word to start
   if (i) str(zr, Address(base));
 
   if (cnt <= ShortArraySize / BytesPerLong) {
     for (; i < (int)cnt; i += 2)
       stp(zr, zr, Address(base, i * wordSize));
+  } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
+    mov(tmp, cnt);
+    block_zero(base, tmp, true);
   } else {
     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
     int remainder = cnt % (2 * unroll);
@@ -5514,24 +5525,96 @@
 
   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
 
-  Label entry, loop;
-  const int unroll = 8; // Number of str instructions we'll unroll
-
-  andr(rscratch1, cnt, unroll - 1);  // tmp1 = cnt % unroll
-  cbz(rscratch1, entry);
-  sub(cnt, cnt, rscratch1);          // cnt -= tmp1
-  // base always points to the end of the region we're about to fill
+  Label fini, skip, entry, loop;
+  const int unroll = 8; // Number of stp instructions we'll unroll
+
+  cbz(cnt, fini);
+  tbz(base, 3, skip);
+  str(value, Address(post(base, 8)));
+  sub(cnt, cnt, 1);
+  bind(skip);
+
+  andr(rscratch1, cnt, (unroll-1) * 2);
+  sub(cnt, cnt, rscratch1);
   add(base, base, rscratch1, Assembler::LSL, 3);
   adr(rscratch2, entry);
-  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
+  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
   br(rscratch2);
+
   bind(loop);
-  add(base, base, unroll * 8);
-  sub(cnt, cnt, unroll);
+  add(base, base, unroll * 16);
   for (int i = -unroll; i < 0; i++)
-    str(value, Address(base, i * 8));
+    stp(value, value, Address(base, i * 16));
   bind(entry);
-  cbnz(cnt, loop);
+  subs(cnt, cnt, unroll * 2);
+  br(Assembler::GE, loop);
+
+  tbz(cnt, 0, fini);
+  str(value, Address(post(base, 8)));
+  bind(fini);
+}
+
+// Use DC ZVA to do fast zeroing.
+// base:   Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt:    Count in HeapWords.
+// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
+void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
+{
+  Label small;
+  Label store_pair, loop_store_pair, done;
+  Label base_aligned;
+
+  assert_different_registers(base, cnt, rscratch1);
+  guarantee(base == r10 && cnt == r11, "fix register usage");
+
+  Register tmp = rscratch1;
+  Register tmp2 = rscratch2;
+  int zva_length = VM_Version::zva_length();
+
+  // Ensure the ZVA length is divisible by 16. This is required by
+  // the subsequent operations.
+  assert (zva_length % 16 == 0, "Unexpected ZVA Length");
+
+  if (!is_large) cbz(cnt, done);
+  tbz(base, 3, base_aligned);
+  str(zr, Address(post(base, 8)));
+  sub(cnt, cnt, 1);
+  bind(base_aligned);
+
+  // Ensure count >= zva_length * 2 so that a DC ZVA is still worthwhile
+  // after the alignment stores below.
+  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
+    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
+    cmp(cnt, low_limit >> 3);
+    br(Assembler::LT, small);
+  }
+
+  far_call(StubRoutines::aarch64::get_zero_longs());
+
+  bind(small);
+
+  const int unroll = 8; // Number of stp instructions we'll unroll
+  Label small_loop, small_table_end;
+
+  andr(tmp, cnt, (unroll-1) * 2);
+  sub(cnt, cnt, tmp);
+  add(base, base, tmp, Assembler::LSL, 3);
+  adr(tmp2, small_table_end);
+  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
+  br(tmp2);
+
+  bind(small_loop);
+  add(base, base, unroll * 16);
+  for (int i = -unroll; i < 0; i++)
+    stp(zr, zr, Address(base, i * 16));
+  bind(small_table_end);
+  subs(cnt, cnt, unroll * 2);
+  br(Assembler::GE, small_loop);
+
+  tbz(cnt, 0, done);
+  str(zr, Address(post(base, 8)));
+
+  bind(done);
 }
 
 void MacroAssembler::string_equals(Register str1, Register str2,
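The rewritten fill loop and the small-count path of block_zero share one trick: compute the remainder of cnt modulo the unroll width, then branch backwards into the middle of the unrolled stp table (the adr/sub/br sequence), so no separate remainder loop is needed. A minimal C++ sketch of the same idea, in Duff's device style; this is illustrative only, not the emitted code:

    #include <cstddef>
    #include <cstdint>

    // Zero 'cnt' 64-bit words at 'p' (8-byte aligned). The switch models the
    // computed br(rscratch2): execution enters the unrolled body at an offset
    // chosen by the remainder, then falls into the full-width loop.
    void zero_words_sketch(uint64_t* p, size_t cnt) {
      if (cnt & 1) *p++ = 0;           // store any odd word first, as zero_words does
      size_t pairs = cnt / 2;          // each stp(zr, zr) covers one pair
      if (pairs == 0) return;
      size_t iters = (pairs + 7) / 8;  // 8 = the unroll factor in the patch
      switch (pairs % 8) {
      case 0: do { p[0] = 0; p[1] = 0; p += 2;
      case 7:      p[0] = 0; p[1] = 0; p += 2;
      case 6:      p[0] = 0; p[1] = 0; p += 2;
      case 5:      p[0] = 0; p[1] = 0; p += 2;
      case 4:      p[0] = 0; p[1] = 0; p += 2;
      case 3:      p[0] = 0; p[1] = 0; p += 2;
      case 2:      p[0] = 0; p[1] = 0; p += 2;
      case 1:      p[0] = 0; p[1] = 0; p += 2;
              } while (--iters > 0);
      }
    }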
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Jul 16 09:32:04 2019 +0100
@@ -1034,12 +1034,28 @@
     system(0b00, 0b011, 0b00011, SY, 0b110);
   }
 
-  void dc(Register Rt) {
-    system(0b01, 0b011, 0b0111, 0b1011, 0b001, Rt);
+  void sys(int op1, int CRn, int CRm, int op2,
+           Register rt = (Register)0b11111) {
+    system(0b01, op1, CRn, CRm, op2, rt);
   }
 
-  void ic(Register Rt) {
-    system(0b01, 0b011, 0b0111, 0b0101, 0b001, Rt);
+  // Only implement operations accessible from EL0 or higher, i.e.,
+  //            op1    CRn    CRm    op2
+  // IC IVAU     3      7      5      1
+  // DC CVAC     3      7      10     1
+  // DC CVAU     3      7      11     1
+  // DC CIVAC    3      7      14     1
+  // DC ZVA      3      7      4      1
+  // So only deal with the CRm field.
+  enum icache_maintenance {IVAU = 0b0101};
+  enum dcache_maintenance {CVAC = 0b1010, CVAU = 0b1011, CIVAC = 0b1110, ZVA = 0b100};
+
+  void dc(dcache_maintenance cm, Register Rt) {
+    sys(0b011, 0b0111, cm, 0b001, Rt);
+  }
+
+  void ic(icache_maintenance cm, Register Rt) {
+    sys(0b011, 0b0111, cm, 0b001, Rt);
   }
 
   // A more convenient access to dmb for our purposes
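For reference, the sys() wrapper above maps directly onto the A64 SYS encoding, SYS #op1, Cn, Cm, #op2, Xt. A hedged sketch that assembles the raw instruction word and checks the DC ZVA case against its documented encoding:

    #include <cstdint>

    // SYS #op1, Cn, Cm, #op2, Xt:
    // 1101 0101 0000 1 | op1(3) | CRn(4) | CRm(4) | op2(3) | Rt(5)
    constexpr uint32_t sys_insn(unsigned op1, unsigned crn, unsigned crm,
                                unsigned op2, unsigned rt) {
      return 0xD5080000u | (op1 << 16) | (crn << 12) | (crm << 8) | (op2 << 5) | rt;
    }

    // "dc zva, x0" is the alias of SYS #3, C7, C4, #1, x0.
    static_assert(sys_insn(3, 7, 4, 1, 0) == 0xD50B7420u, "encoding of dc zva, x0");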
@@ -2774,6 +2790,15 @@
     msr(0b011, 0b0100, 0b0100, 0b001, zr);
   }
 
+  // DCZID_EL0: op1 == 011
+  //            CRn == 0000
+  //            CRm == 0000
+  //            op2 == 111
+  inline void get_dczid_el0(Register reg)
+  {
+    mrs(0b011, 0b0000, 0b0000, 0b111, reg);
+  }
+
   // idiv variant which deals with MINLONG as dividend and -1 as divisor
   int corrected_idivl(Register result, Register ra, Register rb,
                       bool want_remainder, Register tmp = rscratch1);
@@ -3582,8 +3607,9 @@
   void char_arrays_equals(Register ary1, Register ary2,
                           Register result, Register tmp1);
   void fill_words(Register base, Register cnt, Register value);
+  void zero_words(Register base, u_int64_t cnt);
   void zero_words(Register base, Register cnt);
-  void zero_words(Register base, u_int64_t cnt);
+  void block_zero(Register base, Register cnt, bool is_large = false);
 
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Tue Jul 16 09:32:04 2019 +0100
@@ -119,6 +119,10 @@
           "Use Neon for CRC32 computation")                             \
   product(bool, UseCRC32, false,                                        \
           "Use CRC32 instructions for CRC32 computation")               \
+  product(bool, UseBlockZeroing, true,                                  \
+          "Use DC ZVA for block zeroing")                               \
+  product(intx, BlockZeroingLowLimit, 256,                              \
+          "Minimum size in bytes when block zeroing will be used")      \
   product(bool, TraceTraps, false, "Trace all traps the signal handler")
 
 #endif
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Jul 16 09:32:04 2019 +0100
@@ -782,6 +782,48 @@
     }
   }
 
+  address generate_zero_longs(Register base, Register cnt) {
+    Register tmp = rscratch1;
+    Register tmp2 = rscratch2;
+    int zva_length = VM_Version::zva_length();
+    Label initial_table_end, loop_zva;
+    Label fini;
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "zero_longs");
+    address start = __ pc();
+
+    // Base must be 16-byte aligned. If not, just return and let the caller handle it.
+    __ tst(base, 0x0f);
+    __ br(Assembler::NE, fini);
+    // Align base with ZVA length.
+    __ neg(tmp, base);
+    __ andr(tmp, tmp, zva_length - 1);
+
+    // tmp: the number of bytes to be zeroed so that base becomes aligned to the ZVA length.
+    __ add(base, base, tmp);
+    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
+    __ adr(tmp2, initial_table_end);
+    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
+    __ br(tmp2);
+
+    for (int i = -zva_length + 16; i < 0; i += 16)
+      __ stp(zr, zr, Address(base, i));
+    __ bind(initial_table_end);
+
+    __ sub(cnt, cnt, zva_length >> 3);
+    __ bind(loop_zva);
+    __ dc(Assembler::ZVA, base);
+    __ subs(cnt, cnt, zva_length >> 3);
+    __ add(base, base, zva_length);
+    __ br(Assembler::GE, loop_zva);
+    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
+    __ bind(fini);
+    __ ret(lr);
+
+    return start;
+  }
+
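In plain terms the stub does: bail out if base is not 16-byte aligned, store zero pairs up to the next ZVA-block boundary (again branching into an stp table), then issue one DC ZVA per whole block and hand the remainder back to the caller. A hedged user-space C++ sketch of the same flow (GCC/Clang inline asm, AArch64 only; the function names are illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Read DCZID_EL0 (readable at EL0) and return the ZVA block size in bytes,
    // or 0 if DZP (bit 4) says DC ZVA is prohibited.
    static size_t zva_block_bytes() {
      uint64_t dczid;
      asm("mrs %0, dczid_el0" : "=r"(dczid));
      if (dczid & 0x10) return 0;
      return size_t(4) << (dczid & 0xf);
    }

    // Zero 'bytes' bytes at 'base': ordinary stores up to the first block
    // boundary, DC ZVA for whole blocks, ordinary stores for the tail.
    void block_zero_sketch(void* base, size_t bytes) {
      char* p = static_cast<char*>(base);
      const size_t blk = zva_block_bytes();
      if (blk == 0 || bytes < 2 * blk) {    // mirrors the low-limit check
        memset(p, 0, bytes);
        return;
      }
      size_t head = (blk - reinterpret_cast<uintptr_t>(p) % blk) % blk;
      memset(p, 0, head);                   // align p to the ZVA block size
      p += head; bytes -= head;
      while (bytes >= blk) {
        asm volatile("dc zva, %0" : : "r"(p) : "memory");
        p += blk; bytes -= blk;
      }
      memset(p, 0, bytes);                  // tail not covered by DC ZVA
    }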
   typedef enum {
     copy_forwards = 1,
     copy_backwards = -1
@@ -1613,7 +1655,9 @@
     const Register to        = c_rarg0;  // source array address
     const Register value     = c_rarg1;  // value
     const Register count     = c_rarg2;  // elements count
-    const Register cnt_words = c_rarg3; // temp register
+
+    const Register bz_base = r10;        // base for block_zero routine
+    const Register cnt_words = r11;      // temp register
 
     __ enter();
 
@@ -1677,7 +1721,23 @@
     __ lsrw(cnt_words, count, 3 - shift); // number of words
     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
-    __ fill_words(to, cnt_words, value);
+    if (UseBlockZeroing) {
+      Label non_block_zeroing, rest;
+      // count >= BlockZeroingLowLimit && value == 0
+      __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
+      __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
+      __ br(Assembler::NE, non_block_zeroing);
+      __ mov(bz_base, to);
+      __ block_zero(bz_base, cnt_words, true);
+      __ mov(to, bz_base);
+      __ b(rest);
+      __ bind(non_block_zeroing);
+      __ fill_words(to, cnt_words, value);
+      __ bind(rest);
+    }
+    else {
+      __ fill_words(to, cnt_words, value);
+    }
 
     // Remaining count is less than 8 bytes. Fill it by a single store.
     // Note that the total length is no less than 8 bytes.
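The cmp/ccmp pair above folds two tests into a single branch: ccmp compares value against 0 only if the first comparison yielded GE; otherwise it sets the flags to the supplied NZCV immediate (0), which makes the following NE branch taken. As plain C++ the guarded condition is simply (hedged sketch, names illustrative):

    #include <cstdint>

    // Equivalent of: cmp(cnt_words, limit >> 3); ccmp(value, 0, 0, GE); br(NE, slow).
    // The block-zero path is taken only when both conditions hold.
    bool takes_block_zero_path(int64_t cnt_words, uint64_t value, int64_t low_limit) {
      return cnt_words >= (low_limit >> 3) && value == 0;
    }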
@@ -1736,6 +1796,8 @@
     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
 
+    StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
+
     //*** jbyte
     // Always need aligned and unaligned versions
     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
--- a/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Tue Jul 16 09:32:04 2019 +0100
@@ -55,6 +55,7 @@
 address StubRoutines::aarch64::_float_sign_flip = NULL;
 address StubRoutines::aarch64::_double_sign_mask = NULL;
 address StubRoutines::aarch64::_double_sign_flip = NULL;
+address StubRoutines::aarch64::_zero_longs = NULL;
 
 /**
  *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
--- a/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Tue Jul 16 09:32:04 2019 +0100
@@ -62,6 +62,8 @@
   static address _double_sign_mask;
   static address _double_sign_flip;
 
+  static address _zero_longs;
+
  public:
 
   static address get_previous_fp_entry()
@@ -114,6 +116,11 @@
     return _double_sign_flip;
   }
 
+  static address get_zero_longs()
+  {
+    return _zero_longs;
+  }
+
 private:
   static juint    _crc_table[];
 };
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Jul 16 09:32:04 2019 +0100
@@ -68,6 +68,7 @@
 int VM_Version::_stepping;
 int VM_Version::_cpuFeatures;
 const char*           VM_Version::_features_str = "";
+VM_Version::PsrInfo VM_Version::_psr_info   = { 0, };
 
 static BufferBlob* stub_blob;
 static const int stub_size = 550;
@@ -92,13 +93,16 @@
     __ c_stub_prolog(1, 0, MacroAssembler::ret_type_void);
 #endif
 
-    // void getPsrInfo(VM_Version::CpuidInfo* cpuid_info);
+    // void getPsrInfo(VM_Version::PsrInfo* psr_info);
 
     address entry = __ pc();
 
-    // TODO : redefine fields in CpuidInfo and generate
-    // code to fill them in
+    __ enter();
 
+    __ get_dczid_el0(rscratch1);
+    __ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::dczid_el0_offset())));
+
+    __ leave();
     __ ret(lr);
 
 #   undef __
@@ -115,6 +119,8 @@
   _supports_atomic_getset8 = true;
   _supports_atomic_getadd8 = true;
 
+  getPsrInfo_stub(&_psr_info);
+
   if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
     FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
   if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
@@ -187,6 +193,18 @@
     UseCRC32Intrinsics = true;
   }
 
+  if (is_zva_enabled()) {
+    if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
+      FLAG_SET_DEFAULT(UseBlockZeroing, true);
+    }
+    if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
+      FLAG_SET_DEFAULT(BlockZeroingLowLimit, 4 * VM_Version::zva_length());
+    }
+  } else if (UseBlockZeroing) {
+    warning("DC ZVA is not available on this CPU");
+    FLAG_SET_DEFAULT(UseBlockZeroing, false);
+  }
+
   // This machine allows unaligned memory accesses
   if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
     FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
--- a/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Tue Apr 12 11:53:44 2016 +0800
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Tue Jul 16 09:32:04 2019 +0100
@@ -42,6 +42,10 @@
                                // 0 if this instruction is not available
   static const char* _features_str;
 
+  struct PsrInfo {
+    uint32_t dczid_el0;
+  };
+  static PsrInfo _psr_info;
   static void get_processor_features();
 
 public:
@@ -85,7 +89,17 @@
   static int cpu_variant()                    { return _variant; }
   static int cpu_revision()                   { return _revision; }
   static int cpu_cpuFeatures()                { return _cpuFeatures; }
-
+  static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); }
+  static bool is_zva_enabled() {
+    // Check that the DZP bit (bit 4) of dczid_el0 is zero
+    // and that the block size field (bits 0-3) is nonzero.
+    return ((_psr_info.dczid_el0 & 0x10) == 0 &&
+            (_psr_info.dczid_el0 & 0xf) != 0);
+  }
+  static int zva_length() {
+    assert(is_zva_enabled(), "ZVA not available");
+    return 4 << (_psr_info.dczid_el0 & 0xf);
+  }
 };
 
 #endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP
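DCZID_EL0 packs everything zva_length() needs into five bits: bits 0-3 (BS) give log2 of the block size in 4-byte words, and bit 4 (DZP) prohibits DC ZVA when set. A hedged standalone sketch of the decoding, using the value commonly reported by Cortex-A53/A57 class cores:

    #include <cassert>
    #include <cstdint>

    // Same decoding as VM_Version::zva_length(): 4 << BS bytes, BS = bits 0-3.
    static int zva_length_of(uint32_t dczid_el0) {
      assert((dczid_el0 & 0x10) == 0 && (dczid_el0 & 0xf) != 0 && "ZVA not available");
      return 4 << (dczid_el0 & 0xf);
    }

    int main() {
      // Cortex-A53/A57 report DCZID_EL0 == 0x4: 16 words, i.e. a 64-byte block.
      return zva_length_of(0x4) == 64 ? 0 : 1;
    }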