changeset 8388:f5800068c61d hs25.60-b16

Merge
author amurillo
date Thu, 14 May 2015 18:22:32 -0700
parents 74ff9caddc22 (current diff) 95dbbc0431d9 (diff)
children 4fdda95243c4
files
diffstat 7 files changed, 233 insertions(+), 34 deletions(-)
--- a/make/hotspot_version	Wed May 13 12:50:06 2015 -0700
+++ b/make/hotspot_version	Thu May 14 18:22:32 2015 -0700
@@ -35,7 +35,7 @@
 
 HS_MAJOR_VER=25
 HS_MINOR_VER=60
-HS_BUILD_NUMBER=15
+HS_BUILD_NUMBER=16
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed May 13 12:50:06 2015 -0700
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu May 14 18:22:32 2015 -0700
@@ -6690,7 +6690,7 @@
     subl(cnt2, stride2);
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
     // clean upper bits of YMM registers
-    vzeroupper();
+    vpxor(vec1, vec1);
 
     // compare wide vectors tail
     bind(COMPARE_WIDE_TAIL);
@@ -6705,7 +6705,7 @@
     // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
     bind(VECTOR_NOT_EQUAL);
     // clean upper bits of YMM registers
-    vzeroupper();
+    vpxor(vec1, vec1);
     lea(str1, Address(str1, result, scale));
     lea(str2, Address(str2, result, scale));
     jmp(COMPARE_16_CHARS);
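
The hunks above and below replace vzeroupper with vpxor when cleaning up after AVX2 code: vzeroupper clears bits 255:128 of every YMM register at once, while vpxor(v, v) zeroes one named register in full, so only the registers the code actually used need to be touched. A minimal sketch of the two semantics in C++ AVX intrinsics, with illustrative function names that are not part of HotSpot (compile with -mavx2):

#include <immintrin.h>

// vzeroupper: clears bits 255:128 of ALL ymm registers while keeping
// the low 128 bits; commonly used to avoid AVX/SSE transition stalls.
static inline void clear_all_upper_halves() {
    _mm256_zeroupper();
}

// vpxor ymm, ymm, ymm: zeroes one specific 256-bit register in full;
// this is what _mm256_setzero_si256() typically compiles to.
static inline __m256i zero_one_register() {
    return _mm256_setzero_si256();
}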
@@ -6964,7 +6964,8 @@
   bind(DONE);
   if (UseAVX >= 2) {
     // clean upper bits of YMM registers
-    vzeroupper();
+    vpxor(vec1, vec1);
+    vpxor(vec2, vec2);
   }
 }
 
@@ -7098,7 +7099,8 @@
 
         BIND(L_check_fill_8_bytes);
         // clean upper bits of YMM registers
-        vzeroupper();
+        movdl(xtmp, value);
+        pshufd(xtmp, xtmp, 0);
       } else {
         // Fill 32-byte chunks
         pshufd(xtmp, xtmp, 0);
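
In the fill stub the scratch register must still hold the broadcast fill value after the AVX section, so instead of being zeroed it is re-materialized: movdl moves the 32-bit value into the XMM register and pshufd with immediate 0 replicates it across all four lanes. A small intrinsics sketch of that broadcast, illustrative rather than HotSpot code:

#include <emmintrin.h>

// movdl + pshufd(xtmp, xtmp, 0): load a 32-bit value into an XMM
// register and replicate it across all four 32-bit lanes.
static inline __m128i broadcast32(int value) {
    __m128i x = _mm_cvtsi32_si128(value);  // movd value -> xmm
    return _mm_shuffle_epi32(x, 0);        // pshufd xmm, xmm, 0
}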
@@ -7261,7 +7263,11 @@
     bind(L_copy_16_chars_exit);
     if (UseAVX >= 2) {
       // clean upper bits of YMM registers
-      vzeroupper();
+      vpxor(tmp2Reg, tmp2Reg);
+      vpxor(tmp3Reg, tmp3Reg);
+      vpxor(tmp4Reg, tmp4Reg);
+      movdl(tmp1Reg, tmp5);
+      pshufd(tmp1Reg, tmp1Reg, 0);
     }
     subptr(len, 8);
     jccb(Assembler::greater, L_copy_8_chars_exit);
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Wed May 13 12:50:06 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu May 14 18:22:32 2015 -0700
@@ -837,7 +837,8 @@
 
     if (UseUnalignedLoadStores && (UseAVX >= 2)) {
       // clean upper bits of YMM registers
-      __ vzeroupper();
+      __ vpxor(xmm0, xmm0);
+      __ vpxor(xmm1, xmm1);
     }
     __ addl(qword_count, 8);
     __ jccb(Assembler::zero, L_exit);
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Wed May 13 12:50:06 2015 -0700
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu May 14 18:22:32 2015 -0700
@@ -1328,7 +1328,8 @@
       __ BIND(L_end);
       if (UseAVX >= 2) {
         // clean upper bits of YMM registers
-        __ vzeroupper();
+        __ vpxor(xmm0, xmm0);
+        __ vpxor(xmm1, xmm1);
       }
     } else {
       // Copy 32-bytes per iteration
@@ -1405,7 +1406,8 @@
       __ BIND(L_end);
       if (UseAVX >= 2) {
         // clean upper bits of YMM registers
-        __ vzeroupper();
+        __ vpxor(xmm0, xmm0);
+        __ vpxor(xmm1, xmm1);
       }
     } else {
       // Copy 32-bytes per iteration
--- a/src/share/vm/opto/superword.cpp	Wed May 13 12:50:06 2015 -0700
+++ b/src/share/vm/opto/superword.cpp	Thu May 14 18:22:32 2015 -0700
@@ -232,6 +232,13 @@
           // if unaligned memory access is not allowed because number of
          // iterations in pre-loop will not be enough to align it.
           create_pack = false;
+        } else {
+          SWPointer p2(best_align_to_mem_ref, this);
+          if (align_to_ref_p.invar() != p2.invar()) {
+            // Do not vectorize memory accesses with different invariants
+            // if unaligned memory accesses are not allowed.
+            create_pack = false;
+          }
         }
       }
     } else {
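
The new else branch rejects packs whose memory references carry different loop-invariant offsets. An illustrative (non-HotSpot) C++ loop of the kind this guards against: the two accesses below differ by inv1 - inv2 bytes, a value unknown at compile time, so a single pre-loop cannot align both and the pack has to be dropped unless unaligned vector accesses are allowed.

// Illustrative only: two references whose index expressions carry
// different loop-invariant terms. Their relative alignment depends on
// inv1 - inv2, so one pre-loop cannot align both accesses.
void copy_with_invariants(char* a, const char* b,
                          int inv1, int inv2, int n) {
    for (int i = 0; i < n; i++) {
        a[i + inv1] = b[i + inv2];
    }
}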
@@ -445,29 +452,57 @@
   int preloop_stride = pre_end->stride_con();
 
   int span = preloop_stride * p.scale_in_bytes();
-
-  // Stride one accesses are alignable.
-  if (ABS(span) == p.memory_size())
+  int mem_size = p.memory_size();
+  int offset   = p.offset_in_bytes();
+  // Stride-one accesses are alignable if the offset is aligned to the memory
+  // operation size. The offset can be unaligned when UseUnalignedAccesses is used.
+  if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
     return true;
-
-  // If initial offset from start of object is computable,
-  // compute alignment within the vector.
+  }
+  // If the initial offset from start of the object is computable,
+  // check if the pre-loop can align the final offset accordingly.
+  //
+  // In other words: Can we find an i such that the offset
+  // after i pre-loop iterations is aligned to vw?
+  //   (init_offset + pre_loop) % vw == 0              (1)
+  // where
+  //   pre_loop = i * span
+  // is the number of bytes added to the offset by i pre-loop iterations.
+  //
+  // For this to hold we need pre_loop to increase init_offset by
+  //   pre_loop = vw - (init_offset % vw)
+  //
+  // This is only possible if pre_loop is divisible by span because each
+  // pre-loop iteration increases the initial offset by 'span' bytes:
+  //   (vw - (init_offset % vw)) % span == 0
+  //
   int vw = vector_width_in_bytes(p.mem());
   assert(vw > 1, "sanity");
-  if (vw % span == 0) {
-    Node* init_nd = pre_end->init_trip();
-    if (init_nd->is_Con() && p.invar() == NULL) {
-      int init = init_nd->bottom_type()->is_int()->get_con();
-
-      int init_offset = init * p.scale_in_bytes() + p.offset_in_bytes();
-      assert(init_offset >= 0, "positive offset from object start");
-
+  Node* init_nd = pre_end->init_trip();
+  if (init_nd->is_Con() && p.invar() == NULL) {
+    int init = init_nd->bottom_type()->is_int()->get_con();
+    int init_offset = init * p.scale_in_bytes() + offset;
+    assert(init_offset >= 0, "positive offset from object start");
+    if (vw % span == 0) {
+      // If vw is a multiple of span, we use formula (1).
       if (span > 0) {
         return (vw - (init_offset % vw)) % span == 0;
       } else {
         assert(span < 0, "nonzero stride * scale");
         return (init_offset % vw) % -span == 0;
       }
+    } else if (span % vw == 0) {
+      // If span is a multiple of vw, we can simplify formula (1) to:
+      //   (init_offset + i * span) % vw == 0
+      //     =>
+      //   (init_offset % vw) + ((i * span) % vw) == 0
+      //     =>
+      //   init_offset % vw == 0
+      //
+      // Because we add a multiple of vw to the initial offset, the final
+      // offset is a multiple of vw if and only if init_offset is a multiple.
+      //
+      return (init_offset % vw) == 0;
     }
   }
   return false;
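
To make formula (1) concrete, here is a standalone re-expression of the vw % span == 0 branch with a worked example. It is a simplified sketch, not the HotSpot function itself; the invariant and memory-size preconditions handled above are assumed to hold.

// Formula (1): can i pre-loop iterations, each advancing the offset by
// 'span' bytes, bring init_offset to a multiple of the vector width vw?
bool alignable(int init_offset, int span, int vw) {
    if (span > 0) {
        return (vw - (init_offset % vw)) % span == 0;
    }
    return (init_offset % vw) % -span == 0;  // span < 0
}

// Example: vw = 32, span = 8, init_offset = 16.
// (32 - (16 % 32)) % 8 == 16 % 8 == 0, so the access is alignable:
// two pre-loop iterations give 16 + 2*8 == 32, a multiple of vw.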
@@ -479,17 +514,23 @@
   SWPointer align_to_ref_p(mem_ref, this);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
+  int elt_size = align_to_ref_p.memory_size();
   int vw       = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
-  int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
-  // At least one iteration is executed in pre-loop by default. As result
-  // several iterations are needed to align memory operations in main-loop even
-  // if offset is 0.
-  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
-  int elt_size = align_to_ref_p.memory_size();
-  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
-         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
-  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  int iv_adjustment;
+  if (scale != 0) {
+    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+    // At least one iteration is executed in the pre-loop by default. As a
+    // result, several iterations are needed to align memory operations in
+    // the main-loop even if offset is 0.
+    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+    assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+           err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+    iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  } else {
+    // This memory op is not dependent on iv (scale == 0)
+    iv_adjustment = 0;
+  }
 
 #ifndef PRODUCT
   if (TraceSuperWord)
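
The restructured computation above can be read as the following standalone sketch (illustrative names, trace output omitted): when scale == 0 the access does not move with the induction variable, so no number of pre-loop iterations changes its alignment and the adjustment is simply zero.

// Sketch of the iv adjustment: how many pre-loop iterations are needed
// before main-loop memory operations become aligned.
int iv_adjustment(int scale, int iv_stride,
                  int offset, int vw, int elt_size) {
    if (scale == 0) return 0;  // access is independent of the iv
    int stride_sign = (scale * iv_stride) > 0 ? 1 : -1;
    int adj_bytes = stride_sign * vw - (offset % vw);
    return adj_bytes / elt_size;
}

// Example: scale = 2, iv_stride = 1, offset = 6, vw = 32, elt_size = 2
// -> adj_bytes = 32 - 6 = 26, iv_adjustment = 13: after 13 iterations
//    the offset has advanced by 26 bytes, from 6 to 32, a multiple of vw.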
@@ -2247,6 +2288,11 @@
   }
   // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant)
   Node* base = adr->in(AddPNode::Base);
+  // The base address should be loop invariant
+  if (!invariant(base)) {
+    assert(!valid(), "base address is loop variant");
+    return;
+  }
   //unsafe reference could not be aligned appropriately without runtime checking
   if (base == NULL || base->bottom_type() == Type::TOP) {
     assert(!valid(), "unsafe access");
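
The new bail-out covers bases that change inside the loop, such as the base-swapping pattern used by the test below. An illustrative C++ analogue, not HotSpot code:

// Illustrative only: 'base' alternates between two arrays inside the
// loop, so the address base is not loop-invariant and SWPointer must
// mark itself invalid rather than compute a bogus alignment.
void alternating_base(char* a, char* b, int n) {
    char* base = a;
    for (int i = 0; i < n; i++) {
        base[i] = 0;
        base = (base == a) ? b : a;  // base is loop-variant
    }
}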
--- a/src/share/vm/opto/superword.hpp	Wed May 13 12:50:06 2015 -0700
+++ b/src/share/vm/opto/superword.hpp	Thu May 14 18:22:32 2015 -0700
@@ -41,7 +41,7 @@
 // Exploiting SuperWord Level Parallelism with
 //   Multimedia Instruction Sets
 // by
-//   Samuel Larsen and Saman Amarasighe
+//   Samuel Larsen and Saman Amarasinghe
 //   MIT Laboratory for Computer Science
 // date
 //   May 2000
@@ -432,7 +432,7 @@
 
   Node* _base;         // NULL if unsafe nonheap reference
   Node* _adr;          // address pointer
-  jint  _scale;        // multipler for iv (in bytes), 0 if no loop iv
+  jint  _scale;        // multiplier for iv (in bytes), 0 if no loop iv
   jint  _offset;       // constant offset (in bytes)
   Node* _invar;        // invariant offset (in bytes), NULL if none
   bool  _negate_invar; // if true then use: (0 - _invar)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java	Thu May 14 18:22:32 2015 -0700
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+import com.oracle.java.testlibrary.*;
+import sun.misc.Unsafe;
+
+/**
+ * @test
+ * @bug 8078497
+ * @summary Tests correct alignment of vectors with loop invariant offset.
+ * @library /testlibrary
+ * @run main TestVectorizationWithInvariant
+ */
+public class TestVectorizationWithInvariant {
+
+    private static Unsafe unsafe;
+    private static final long BYTE_ARRAY_OFFSET;
+    private static final long CHAR_ARRAY_OFFSET;
+
+    static {
+        unsafe = Utils.getUnsafe();
+        BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
+        CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
+    }
+
+    public static void main(String[] args) throws Exception {
+        byte[] byte_array1 = new byte[1000];
+        byte[] byte_array2 = new byte[1000];
+        char[] char_array = new char[1000];
+
+        for (int i = 0; i < 20_000; ++i) {
+            copyByteToChar(byte_array1, byte_array2, char_array, 1);
+            copyCharToByte(char_array, byte_array1, 1);
+            copyCharToByteAligned(char_array, byte_array1);
+            copyCharToByteUnaligned(char_array, byte_array1);
+        }
+    }
+
+    /*
+     * Copy multiple consecutive chars from a given offset in a byte array to a char
+     * array to trigger C2's superword optimization. The offset into the byte array is
+     * independent of the loop induction variable and can be set to an arbitrary value.
+     * It may then not be possible to align both the LoadUS and the StoreC operations.
+     * Therefore, the loop should only be vectorized if unaligned memory accesses are allowed.
+     */
+    public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
+        off = (int) BYTE_ARRAY_OFFSET + (off << 1);
+        byte[] src = src1;
+        for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
+            unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
+            unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
+            unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
+            unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
+            unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
+            unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
+            unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
+
+            // Prevent loop invariant code motion of char read.
+            src = (src == src1) ? src2 : src1;
+        }
+    }
+
+    /*
+     * Copy multiple consecutive chars from a char array to a given offset in a byte array
+     * to trigger C2's superword optimization. Checks for similar problems to 'copyByteToChar'.
+     */
+    public static void copyCharToByte(char[] src, byte[] dst, int off) {
+        off = (int) BYTE_ARRAY_OFFSET + (off << 1);
+        for (int i = 0; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, off + 0, src[i + 0]);
+            unsafe.putChar(dst, off + 2, src[i + 1]);
+            unsafe.putChar(dst, off + 4, src[i + 2]);
+            unsafe.putChar(dst, off + 6, src[i + 3]);
+            unsafe.putChar(dst, off + 8, src[i + 4]);
+            unsafe.putChar(dst, off + 10, src[i + 5]);
+            unsafe.putChar(dst, off + 12, src[i + 6]);
+            unsafe.putChar(dst, off + 14, src[i + 7]);
+        }
+    }
+
+    /*
+     * Variant of copyCharToByte with a constant destination array offset.
+     * The loop should always be vectorized because both the LoadUS and StoreC
+     * operations can be aligned.
+     */
+    public static void copyCharToByteAligned(char[] src, byte[] dst) {
+        final int off = (int) BYTE_ARRAY_OFFSET;
+        for (int i = 8; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, off + 0, src[i + 0]);
+            unsafe.putChar(dst, off + 2, src[i + 1]);
+            unsafe.putChar(dst, off + 4, src[i + 2]);
+            unsafe.putChar(dst, off + 6, src[i + 3]);
+            unsafe.putChar(dst, off + 8, src[i + 4]);
+            unsafe.putChar(dst, off + 10, src[i + 5]);
+            unsafe.putChar(dst, off + 12, src[i + 6]);
+            unsafe.putChar(dst, off + 14, src[i + 7]);
+        }
+    }
+
+    /*
+     * Variant of copyCharToByte with a constant but unaligned destination array offset.
+     * The loop should only be vectorized if unaligned memory operations are allowed,
+     * because the LoadUS and the StoreC operations cannot both be aligned.
+     */
+    public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
+        final int off = (int) BYTE_ARRAY_OFFSET + 2;
+        for (int i = 0; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, off + 0, src[i + 0]);
+            unsafe.putChar(dst, off + 2, src[i + 1]);
+            unsafe.putChar(dst, off + 4, src[i + 2]);
+            unsafe.putChar(dst, off + 6, src[i + 3]);
+            unsafe.putChar(dst, off + 8, src[i + 4]);
+            unsafe.putChar(dst, off + 10, src[i + 5]);
+            unsafe.putChar(dst, off + 12, src[i + 6]);
+            unsafe.putChar(dst, off + 14, src[i + 7]);
+        }
+    }
+}