changeset 6733:6bd6d4c7940e

8157841, PR3741: aarch64: prefetch ignores cache line size Summary: fix prefetch to take account of cache line size Reviewed-by: aph Contributed-by: stuart.monteith@linaro.org, edward.nevill@linaro.org
author enevill
date Tue, 16 Jul 2019 10:47:47 +0100
parents cbb799cc6c7c
children 5e713e212064
files src/cpu/aarch64/vm/aarch64.ad src/cpu/aarch64/vm/assembler_aarch64.cpp src/cpu/aarch64/vm/assembler_aarch64.hpp src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp src/cpu/aarch64/vm/vm_version_aarch64.cpp src/cpu/aarch64/vm/vm_version_aarch64.hpp
diffstat 6 files changed, 69 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad	Mon Feb 08 14:14:35 2016 +0000
+++ b/src/cpu/aarch64/vm/aarch64.ad	Tue Jul 16 10:47:47 2019 +0100
@@ -2905,7 +2905,8 @@
     // membar_acquire_lock().
     {
       Label retry_load;
-      __ prfm(Address(oop), PSTL1STRM);
+      if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+        __ prfm(Address(oop), PSTL1STRM);
       __ bind(retry_load);
       __ ldxr(tmp, oop);
       __ cmp(tmp, disp_hdr);
@@ -2956,7 +2957,8 @@
 
       {
 	Label retry_load, fail;
-        __ prfm(Address(tmp), PSTL1STRM);
+        if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+          __ prfm(Address(tmp), PSTL1STRM);
 	__ bind(retry_load);
 	__ ldxr(rscratch1, tmp);
 	__ cmp(disp_hdr, rscratch1);
@@ -3046,7 +3048,8 @@
 
       {
 	Label retry_load;
-        __ prfm(Address(oop), PSTL1STRM);
+        if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+          __ prfm(Address(oop), PSTL1STRM);
 	__ bind(retry_load);
 	__ ldxr(tmp, oop);
 	__ cmp(box, tmp);
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Mon Feb 08 14:14:35 2016 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Tue Jul 16 10:47:47 2019 +0100
@@ -3146,7 +3146,8 @@
 
 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
   Label retry_load;
-  prfm(Address(counter_addr), PSTL1STRM);
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+    prfm(Address(counter_addr), PSTL1STRM);
   bind(retry_load);
   // flush and load exclusive from the memory location
   ldxrw(tmp, counter_addr);
@@ -3572,7 +3573,8 @@
   // addr identifies memory word to compare against/update
   // tmp returns 0/1 for success/failure
   Label retry_load, nope;
-  prfm(Address(addr), PSTL1STRM);
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+    prfm(Address(addr), PSTL1STRM);
   bind(retry_load);
   // flush and load exclusive from the memory location
   // and fail if it is not what we expect
@@ -3600,7 +3602,8 @@
   // addr identifies memory word to compare against/update
   // tmp returns 0/1 for success/failure
   Label retry_load, nope;
-  prfm(Address(addr), PSTL1STRM);
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+    prfm(Address(addr), PSTL1STRM);
   bind(retry_load);
   // flush and load exclusive from the memory location
   // and fail if it is not what we expect
@@ -3635,7 +3638,8 @@
     result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                         \
   Label retry_load;                                                     \
-  prfm(Address(addr), PSTL1STRM);                                       \
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))         \
+    prfm(Address(addr), PSTL1STRM);                                     \
   bind(retry_load);                                                     \
   LDXR(result, addr);                                                   \
   OP(rscratch1, result, incr);                                          \
@@ -3658,7 +3662,8 @@
     result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                         \
   Label retry_load;                                                     \
-  prfm(Address(addr), PSTL1STRM);                                       \
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))         \
+    prfm(Address(addr), PSTL1STRM);                                     \
   bind(retry_load);                                                     \
   LDXR(result, addr);                                                   \
   STXR(rscratch1, newv, addr);                                          \
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Mon Feb 08 14:14:35 2016 +0000
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Jul 16 10:47:47 2019 +0100
@@ -2677,10 +2677,11 @@
     umaddl(Rd, Rn, Rm, zr);
   }
 
-#define WRAP(INSN)                                                \
-  void INSN(Register Rd, Register Rn, Register Rm, Register Ra) { \
-    if (Ra != zr) nop();                                          \
-    Assembler::INSN(Rd, Rn, Rm, Ra);                              \
+#define WRAP(INSN)                                                            \
+  void INSN(Register Rd, Register Rn, Register Rm, Register Ra) {             \
+    if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_A53MAC) && Ra != zr) \
+      nop();                                                                  \
+    Assembler::INSN(Rd, Rn, Rm, Ra);                                          \
   }
 
   WRAP(madd) WRAP(msub) WRAP(maddw) WRAP(msubw)
@@ -2799,6 +2800,15 @@
     mrs(0b011, 0b0000, 0b0000, 0b111, reg);
   }
 
+  // CTR_EL0:   op1 == 011
+  //            CRn == 0000
+  //            CRm == 0000
+  //            op2 == 001
+  inline void get_ctr_el0(Register reg)
+  {
+    mrs(0b011, 0b0000, 0b0000, 0b001, reg);
+  }
+
   // idiv variant which deals with MINLONG as dividend and -1 as divisor
   int corrected_idivl(Register result, Register ra, Register rb,
                       bool want_remainder, Register tmp = rscratch1);
--- a/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Mon Feb 08 14:14:35 2016 +0000
+++ b/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp	Tue Jul 16 10:47:47 2019 +0100
@@ -1512,7 +1512,8 @@
   Label retry_load, nope;
   // flush and load exclusive from the memory location
   // and fail if it is not what we expect
-  __ prfm(Address(addr), PSTL1STRM);
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+    __ prfm(Address(addr), PSTL1STRM);
   __ bind(retry_load);
   __ ldaxrw(rscratch1, addr);
   __ cmpw(rscratch1, cmpval);
@@ -1531,7 +1532,8 @@
   Label retry_load, nope;
   // flush and load exclusive from the memory location
   // and fail if it is not what we expect
-  __ prfm(Address(addr), PSTL1STRM);
+  if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
+    __ prfm(Address(addr), PSTL1STRM);
   __ bind(retry_load);
   __ ldaxr(rscratch1, addr);
   __ cmp(rscratch1, cmpval);
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Feb 08 14:14:35 2016 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Jul 16 10:47:47 2019 +0100
@@ -63,6 +63,7 @@
 
 int VM_Version::_cpu;
 int VM_Version::_model;
+int VM_Version::_model2;
 int VM_Version::_variant;
 int VM_Version::_revision;
 int VM_Version::_stepping;
@@ -102,6 +103,9 @@
     __ get_dczid_el0(rscratch1);
     __ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::dczid_el0_offset())));
 
+    __ get_ctr_el0(rscratch1);
+    __ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::ctr_el0_offset())));
+
     __ leave();
     __ ret(lr);
 
@@ -121,17 +125,20 @@
 
   getPsrInfo_stub(&_psr_info);
 
+  int dcache_line = VM_Version::dcache_line_size();
+
   if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
-    FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+    FLAG_SET_DEFAULT(AllocatePrefetchDistance, 3*dcache_line);
   if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
-    FLAG_SET_DEFAULT(AllocatePrefetchStepSize, 64);
-  FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
-  FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
-  FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
+    FLAG_SET_DEFAULT(AllocatePrefetchStepSize, dcache_line);
+  if (FLAG_IS_DEFAULT(PrefetchScanIntervalInBytes))
+    FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 3*dcache_line);
   if (FLAG_IS_DEFAULT(PrefetchCopyIntervalInBytes))
-    FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
-  if ((PrefetchCopyIntervalInBytes & 7) || (PrefetchCopyIntervalInBytes >= 32768)) {
-    warning("PrefetchCopyIntervalInBytes must be a multiple of 8 and < 32768");
+    FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 3*dcache_line);
+
+  if (PrefetchCopyIntervalInBytes != -1 &&
+       ((PrefetchCopyIntervalInBytes & 7) || (PrefetchCopyIntervalInBytes >= 32768))) {
+    warning("PrefetchCopyIntervalInBytes must be -1, or a multiple of 8 and < 32768");
     PrefetchCopyIntervalInBytes &= ~7;
     if (PrefetchCopyIntervalInBytes >= 32768)
       PrefetchCopyIntervalInBytes = 32760;
@@ -150,6 +157,7 @@
   _features_str = strdup(buf);
   _cpuFeatures = auxv;
 
+  int cpu_lines = 0;
   if (FILE *f = fopen("/proc/cpuinfo", "r")) {
     char buf[128], *p;
     while (fgets(buf, sizeof (buf), f) != NULL) {
@@ -157,9 +165,11 @@
         long v = strtol(p+1, NULL, 0);
         if (strncmp(buf, "CPU implementer", sizeof "CPU implementer" - 1) == 0) {
           _cpu = v;
+          cpu_lines++;
         } else if (strncmp(buf, "CPU variant", sizeof "CPU variant" - 1) == 0) {
           _variant = v;
         } else if (strncmp(buf, "CPU part", sizeof "CPU part" - 1) == 0) {
+          if (_model != v)  _model2 = _model;
           _model = v;
         } else if (strncmp(buf, "CPU revision", sizeof "CPU revision" - 1) == 0) {
           _revision = v;
@@ -170,8 +180,13 @@
   }
 
   // Enable vendor specific features
-  if (_cpu == CPU_CAVIUM) _cpuFeatures |= CPU_DMB_ATOMICS;
-  if (_cpu == CPU_ARM) _cpuFeatures |= CPU_A53MAC;
+  if (_cpu == CPU_CAVIUM && _variant == 0) _cpuFeatures |= CPU_DMB_ATOMICS;
+  if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _cpuFeatures |= CPU_A53MAC;
+  if (_cpu == CPU_ARM && (_model == 0xd07 || _model2 == 0xd07)) _cpuFeatures |= CPU_STXR_PREFETCH;
+  // If an old style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07)
+  // we assume the worst and assume we could be on a big.LITTLE system and have
+  // undisclosed A53 cores to which we could be switched at any stage
+  if (_cpu == CPU_ARM && cpu_lines == 1 && _model == 0xd07) _cpuFeatures |= CPU_A53MAC;
 
   if (FLAG_IS_DEFAULT(UseCRC32)) {
     UseCRC32 = (auxv & HWCAP_CRC32) != 0;
--- a/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Mon Feb 08 14:14:35 2016 +0000
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Tue Jul 16 10:47:47 2019 +0100
@@ -35,6 +35,7 @@
 protected:
   static int _cpu;
   static int _model;
+  static int _model2;
   static int _variant;
   static int _revision;
   static int _stepping;
@@ -44,6 +45,7 @@
 
   struct PsrInfo {
     uint32_t dczid_el0;
+    uint32_t ctr_el0;
   };
   static PsrInfo _psr_info;
   static void get_processor_features();
@@ -79,6 +81,7 @@
     CPU_SHA1         = (1<<5),
     CPU_SHA2         = (1<<6),
     CPU_CRC32        = (1<<7),
+    CPU_STXR_PREFETCH= (1 << 29),
     CPU_A53MAC       = (1 << 30),
     CPU_DMB_ATOMICS  = (1 << 31),
   } cpuFeatureFlags;
@@ -90,6 +93,7 @@
   static int cpu_revision()                   { return _revision; }
   static int cpu_cpuFeatures()                { return _cpuFeatures; }
   static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); }
+  static ByteSize ctr_el0_offset()   { return byte_offset_of(PsrInfo, ctr_el0); }
   static bool is_zva_enabled() {
     // Check the DZP bit (bit 4) of dczid_el0 is zero
     // and block size (bit 0~3) is not zero.
@@ -100,6 +104,12 @@
     assert(is_zva_enabled(), "ZVA not available");
     return 4 << (_psr_info.dczid_el0 & 0xf);
   }
+  static int icache_line_size() {
+    return (1 << (_psr_info.ctr_el0 & 0x0f)) * 4;
+  }
+  static int dcache_line_size() {
+    return (1 << ((_psr_info.ctr_el0 >> 16) & 0x0f)) * 4;
+  }
 };
 
 #endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP