changeset 6729:12eb373fbf8f

8153797, PR3741: aarch64: Add Arrays.fill stub code Reviewed-by: aph Contributed-by: long.chen@linaro.org
author enevill
date Tue, 16 Jul 2019 07:18:49 +0100
parents c98fe53cf02b
children 6dcb006c9f20
files src/cpu/aarch64/vm/aarch64.ad src/cpu/aarch64/vm/assembler_aarch64.cpp src/cpu/aarch64/vm/assembler_aarch64.hpp src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
diffstat 4 files changed, 197 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Jul 16 06:37:47 2019 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad	Tue Jul 16 07:18:49 2019 +0100
@@ -2479,55 +2479,6 @@
     }
   %}
 
-  enc_class aarch64_enc_clear_array_reg_reg(iRegL_R11 cnt, iRegP_R10 base) %{
-    MacroAssembler _masm(&cbuf);
-    Register cnt_reg = as_Register($cnt$$reg);
-    Register base_reg = as_Register($base$$reg);
-    // base is word aligned
-    // cnt is count of words
-
-    Label loop;
-    Label entry;
-
-//  Algorithm:
-//
-//    scratch1 = cnt & 7;
-//    cnt -= scratch1;
-//    p += scratch1;
-//    switch (scratch1) {
-//      do {
-//        cnt -= 8;
-//          p[-8] = 0;
-//        case 7:
-//          p[-7] = 0;
-//        case 6:
-//          p[-6] = 0;
-//          // ...
-//        case 1:
-//          p[-1] = 0;
-//        case 0:
-//          p += 8;
-//      } while (cnt);
-//    }
-
-    const int unroll = 8; // Number of str(zr) instructions we'll unroll
-
-    __ andr(rscratch1, cnt_reg, unroll - 1);  // tmp1 = cnt % unroll
-    __ sub(cnt_reg, cnt_reg, rscratch1);      // cnt -= unroll
-    // base_reg always points to the end of the region we're about to zero
-    __ add(base_reg, base_reg, rscratch1, Assembler::LSL, exact_log2(wordSize));
-    __ adr(rscratch2, entry);
-    __ sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
-    __ br(rscratch2);
-    __ bind(loop);
-    __ sub(cnt_reg, cnt_reg, unroll);
-    for (int i = -unroll; i < 0; i++)
-      __ str(zr, Address(base_reg, i * wordSize));
-    __ bind(entry);
-    __ add(base_reg, base_reg, unroll * wordSize);
-    __ cbnz(cnt_reg, loop);
-  %}
-
   /// mov envcodings
 
   enc_class aarch64_enc_movw_imm(iRegI dst, immI src) %{
@@ -10408,7 +10359,9 @@
   ins_cost(4 * INSN_COST);
   format %{ "ClearArray $cnt, $base" %}
 
-  ins_encode(aarch64_enc_clear_array_reg_reg(cnt, base));
+  ins_encode %{
+    __ zero_words($base$$Register, $cnt$$Register);
+  %}
 
   ins_pipe(pipe_class_memory);
 %}
--- a/src/cpu/aarch64/vm/assembler_aarch64.cpp	Tue Jul 16 06:37:47 2019 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.cpp	Tue Jul 16 07:18:49 2019 +0100
@@ -5445,6 +5445,61 @@
   BLOCK_COMMENT("} string_compare");
 }
 
+// base:   Address of the buffer to be zeroed, 8-byte aligned; points to the end of the buffer on exit.
+// cnt:    Count in 8-byte units; zero on exit. rscratch1 and rscratch2 are clobbered.
+void MacroAssembler::zero_words(Register base, Register cnt)
+{
+  fill_words(base, cnt, zr);  // zeroing is a fill that uses the zero register as the value
+}
+
+// base:   Address of the buffer to be filled, 8-byte aligned.
+// cnt:    Count in 8-byte units; zero on exit.
+// value:  Value stored into each 8-byte word.
+// base will point to the end of the buffer after filling; rscratch1 and rscratch2 are clobbered.
+void MacroAssembler::fill_words(Register base, Register cnt, Register value)
+{
+//  Algorithm:
+//
+//    scratch1 = cnt & 7;
+//    cnt -= scratch1;
+//    p += scratch1;
+//    switch (scratch1) {
+//      do {
+//        p += 8; cnt -= 8;
+//          p[-8] = v;
+//        case 7:
+//          p[-7] = v;
+//        case 6:
+//          p[-6] = v;
+//          // ...
+//        case 1:
+//          p[-1] = v;
+//        case 0:
+//          ;  // nothing to store: p already points just past the filled words
+//      } while (cnt);
+//    }
+
+  assert_different_registers(base, cnt, value, rscratch1, rscratch2);
+
+  Label entry, loop;
+  const int unroll = 8; // Number of str instructions we'll unroll
+
+  andr(rscratch1, cnt, unroll - 1);  // tmp1 = cnt % unroll
+  cbz(rscratch1, entry);             // no partial group: go straight to the loop test
+  sub(cnt, cnt, rscratch1);          // cnt -= tmp1
+  // base always points to the end of the region we're about to fill
+  add(base, base, rscratch1, Assembler::LSL, 3);            // base += tmp1 * 8 bytes
+  adr(rscratch2, entry);
+  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);  // back up tmp1 * 4 bytes from entry (presumably one 4-byte str per word)
+  br(rscratch2);                     // computed jump into the unrolled stores
+  bind(loop);
+  add(base, base, unroll * 8);
+  sub(cnt, cnt, unroll);
+  for (int i = -unroll; i < 0; i++)
+    str(value, Address(base, i * 8));
+  bind(entry);
+  cbnz(cnt, loop);                   // cnt is a multiple of unroll here; repeat until it reaches zero
+}
 
 void MacroAssembler::string_equals(Register str1, Register str2,
                                    Register cnt, Register result,
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Jul 16 06:37:47 2019 +0100
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Jul 16 07:18:49 2019 +0100
@@ -3581,6 +3581,9 @@
                      Register tmp1);
   void char_arrays_equals(Register ary1, Register ary2,
                           Register result, Register tmp1);
+  void fill_words(Register base, Register cnt, Register value);  // store value into cnt 8-byte words starting at base
+  void zero_words(Register base, Register cnt);                  // fill_words with zr as the value
+
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }
 };
--- a/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Jul 16 06:37:47 2019 +0100
+++ b/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Jul 16 07:18:49 2019 +0100
@@ -1594,6 +1594,136 @@
   }
 
 
+  //
+  // Generate stub for array fill (T_BYTE, T_SHORT or T_INT elements). If "aligned"
+  // is true, the "to" address is assumed to be heapword aligned and no alignment code is emitted.
+  //
+  // Arguments for generated stub:
+  //   to:    c_rarg0
+  //   value: c_rarg1
+  //   count: c_rarg2 32-bit element count (NOTE(review): compared with unsigned LO, not as signed)
+  // Returns the entry address of the generated stub.
+  address generate_fill(BasicType t, bool aligned, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+
+    const Register to        = c_rarg0;  // destination array address
+    const Register value     = c_rarg1;  // fill value
+    const Register count     = c_rarg2;  // element count
+    const Register cnt_words = c_rarg3; // temp: count of 8-byte words
+
+    __ enter();
+
+    Label L_fill_elements, L_exit1;  // NOTE(review): this outer L_exit1 is never referenced; the tail-fill block below declares its own
+
+    int shift = -1;  // log2 of the element size in bytes, set per type below
+    switch (t) {
+      case T_BYTE:
+        shift = 0;
+        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
+        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
+        __ br(Assembler::LO, L_fill_elements);  // consumes the cmpw result (bfi is expected to leave the flags untouched)
+        break;
+      case T_SHORT:
+        shift = 1;
+        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
+        __ br(Assembler::LO, L_fill_elements);  // consumes the cmpw result (bfi is expected to leave the flags untouched)
+        break;
+      case T_INT:
+        shift = 2;
+        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+        __ br(Assembler::LO, L_fill_elements);
+        break;
+      default: ShouldNotReachHere();
+    }
+
+    // Align destination address to an 8-byte boundary.
+    Label L_skip_align1, L_skip_align2, L_skip_align4;
+    if (!aligned) {
+      switch (t) {
+        case T_BYTE:
+          // One byte misalignment happens only for byte arrays.
+          __ tbz(to, 0, L_skip_align1);
+          __ strb(value, Address(__ post(to, 1)));
+          __ subw(count, count, 1);
+          __ bind(L_skip_align1);
+          // Fallthrough (at code-generation time: byte stubs also emit the 2- and 4-byte steps)
+        case T_SHORT:
+          // Two bytes misalignment happens only for byte and short (char) arrays.
+          __ tbz(to, 1, L_skip_align2);
+          __ strh(value, Address(__ post(to, 2)));
+          __ subw(count, count, 2 >> shift);
+          __ bind(L_skip_align2);
+          // Fallthrough (at code-generation time: short stubs also emit the 4-byte step)
+        case T_INT:
+          // Align to 8 bytes, we know we are 4 byte aligned to start.
+          __ tbz(to, 2, L_skip_align4);
+          __ strw(value, Address(__ post(to, 4)));
+          __ subw(count, count, 4 >> shift);
+          __ bind(L_skip_align4);
+          break;
+        default: ShouldNotReachHere();
+      }
+    }
+
+    //
+    //  Fill large chunks
+    //
+    __ lsrw(cnt_words, count, 3 - shift); // number of 8-byte words
+    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
+    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);  // count = elements left over after the whole words
+    __ fill_words(to, cnt_words, value);  // leaves "to" at the end of the filled words
+
+    // Remaining count is less than 8 bytes. Fill it by a single store.
+    // Note that the total length is no less than 8 bytes.
+    if (t == T_BYTE || t == T_SHORT) {
+      Label L_exit1;
+      __ cbzw(count, L_exit1);
+      __ add(to, to, count, Assembler::LSL, shift); // points to the end
+      __ str(value, Address(to, -8));    // 8-byte store ending at the end; rewrites some already-filled elements
+      __ bind(L_exit1);
+      __ leave();
+      __ ret(lr);
+    }
+
+    // Handle fills of less than 8 bytes; T_INT stubs also fall through to here for a last element.
+    Label L_fill_2, L_fill_4, L_exit2;
+    __ bind(L_fill_elements);
+    switch (t) {
+      case T_BYTE:
+        __ tbz(count, 0, L_fill_2);
+        __ strb(value, Address(__ post(to, 1)));
+        __ bind(L_fill_2);
+        __ tbz(count, 1, L_fill_4);
+        __ strh(value, Address(__ post(to, 2)));
+        __ bind(L_fill_4);
+        __ tbz(count, 2, L_exit2);
+        __ strw(value, Address(to));
+        break;
+      case T_SHORT:
+        __ tbz(count, 0, L_fill_4);
+        __ strh(value, Address(__ post(to, 2)));
+        __ bind(L_fill_4);
+        __ tbz(count, 1, L_exit2);
+        __ strw(value, Address(to));
+        break;
+      case T_INT:
+        __ cbzw(count, L_exit2);
+        __ strw(value, Address(to));
+        break;
+      default: ShouldNotReachHere();
+    }
+    __ bind(L_exit2);
+    __ leave();
+    __ ret(lr);
+    return start;
+  }
+
   void generate_arraycopy_stubs() {
     address entry;
     address entry_jbyte_arraycopy;
@@ -1682,6 +1812,12 @@
     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                         /*dest_uninitialized*/true);
+    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
+    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
+    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
+    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
+    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
+    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
   }
 
   // Arguments: