patches/openjdk/7034464-hugepage.patch @ 2545:2aa0bd4d0654

S7037939, S7034464: Import Andrew Haley's HugePage support patches
Author: Deepak Bhole <dbhole@redhat.com>
Date: Thu, 05 Jan 2012 10:37:17 -0500

# HG changeset patch
# User iveresov
# Date 1303344724 25200
# Node ID 139667d9836ae218ab06a74d5813a5a700a076c9
# Parent  49a67202bc671d23d719c9e14752d80295a9e8f6
7034464: Support transparent large pages on Linux
Summary: Support transparent huge pages on Linux available since 2.6.38
Reviewed-by: iveresov, ysr
Contributed-by: aph@redhat.com
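For context: the change uses two kernel interfaces. An anonymous mmap() with MAP_HUGETLB commits memory backed by explicitly reserved huge pages (the UseHugeTLBFS path in os::commit_memory below), while madvise(MADV_HUGEPAGE) asks the kernel to back an existing, suitably aligned mapping with transparent huge pages (os::realign_memory). The stand-alone C sketch below is illustration only, not part of the patch; it assumes a 2 MB huge page size and reuses the fallback constants the patch defines for old headers.

    /* Illustration only: the two huge-page paths used by the patch.
     * Build with: gcc -o hugedemo hugedemo.c */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    /* Same fallbacks the patch adds for pre-2.6.38 headers. */
    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000
    #endif
    #ifndef MADV_HUGEPAGE
    #define MADV_HUGEPAGE 14
    #endif

    int main(void) {
      size_t len = 2 * 1024 * 1024;   /* one 2 MB huge page on x86 */

      /* Path 1: explicit huge pages, as in os::commit_memory() with
       * UseHugeTLBFS. Fails unless hugetlb pages were reserved first. */
      void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
      if (p == MAP_FAILED)
        perror("mmap(MAP_HUGETLB)");
      else
        munmap(p, len);

      /* Path 2: transparent huge pages, as in os::realign_memory().
       * The mapping is ordinary anonymous memory; the kernel is only
       * advised, so the result is best-effort. */
      void *q = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (q != MAP_FAILED) {
        if (madvise(q, len, MADV_HUGEPAGE) != 0)
          perror("madvise(MADV_HUGEPAGE)");
        memset(q, 0, len);   /* touching the pages lets THP take effect */
        munmap(q, len);
      }
      return 0;
    }

The MAP_HUGETLB mapping fails unless huge pages have been reserved beforehand (for example via /proc/sys/vm/nr_hugepages), whereas the madvise() advice is merely best-effort, which matches the comment the patch adds to os::realign_memory.
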
diff -u -r openjdk.patched/hotspot/src/os/linux/vm/globals_linux.hpp openjdk/hotspot/src/os/linux/vm/globals_linux.hpp
--- openjdk.patched/hotspot/src/os/linux/vm/globals_linux.hpp	2011-03-16 02:30:16.000000000 +0000
+++ openjdk/hotspot/src/os/linux/vm/globals_linux.hpp	2011-05-04 11:57:15.083470027 +0100
@@ -29,13 +29,19 @@
 // Defines Linux specific flags. They are not available on other platforms.
 //
 #define RUNTIME_OS_FLAGS(develop, develop_pd, product, product_pd, diagnostic, notproduct) \
-  product(bool, UseOprofile, false,                                 \
-        "enable support for Oprofile profiler")                     \
-                                                                    \
-  product(bool, UseLinuxPosixThreadCPUClocks, true,                 \
-          "enable fast Linux Posix clocks where available")
-// NB: The default value of UseLinuxPosixThreadCPUClocks may be
-// overridden in Arguments::parse_each_vm_init_arg.
+  product(bool, UseOprofile, false,                                     \
+        "enable support for Oprofile profiler")                         \
+                                                                        \
+  product(bool, UseLinuxPosixThreadCPUClocks, true,                     \
+          "enable fast Linux Posix clocks where available")             \
+/*  NB: The default value of UseLinuxPosixThreadCPUClocks may be        \
+    overridden in Arguments::parse_each_vm_init_arg.  */                \
+                                                                        \
+  product(bool, UseHugeTLBFS, false,                                    \
+          "Use MAP_HUGETLB for large pages")                            \
+                                                                        \
+  product(bool, UseSHM, false,                                          \
+          "Use SYSV shared memory for large pages")
 
 //
 // Defines Linux-specific default values. The flags are available on all
diff -u -r openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp openjdk/hotspot/src/os/linux/vm/os_linux.cpp
--- openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp	2011-05-03 14:32:54.285722656 +0100
+++ openjdk/hotspot/src/os/linux/vm/os_linux.cpp	2011-05-09 14:31:51.358502774 +0100
@@ -2495,16 +2495,40 @@
   return res != (uintptr_t) MAP_FAILED;
 }
 
+// Define MAP_HUGETLB here so we can build HotSpot on old systems.
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+
+// Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
+#ifndef MADV_HUGEPAGE
+#define MADV_HUGEPAGE 14
+#endif
+
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
                        bool exec) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
+    uintptr_t res =
+      (uintptr_t) ::mmap(addr, size, prot,
+                         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
+                         -1, 0);
+    return res != (uintptr_t) MAP_FAILED;
+  }
+
   return commit_memory(addr, size, exec);
 }
 
-void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
+void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    // We don't check the return value: madvise(MADV_HUGEPAGE) may not
+    // be supported or the memory may already be backed by huge pages.
+    ::madvise(addr, bytes, MADV_HUGEPAGE);
+  }
+}
 
 void os::free_memory(char *addr, size_t bytes) {
-  ::mmap(addr, bytes, PROT_READ | PROT_WRITE,
-         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
+  ::madvise(addr, bytes, MADV_DONTNEED);
 }
 
 void os::numa_make_global(char *addr, size_t bytes) {
@@ -2870,7 +2894,16 @@
 static size_t _large_page_size = 0;
 
 bool os::large_page_init() {
-  if (!UseLargePages) return false;
+  if (!UseLargePages) {
+    UseHugeTLBFS = false;
+    UseSHM = false;
+    return false;
+  }
+
+  if (FLAG_IS_DEFAULT(UseHugeTLBFS) && FLAG_IS_DEFAULT(UseSHM)) {
+    // Our user has not expressed a preference, so we'll try both.
+    UseHugeTLBFS = UseSHM = true;
+  }
 
   if (LargePageSizeInBytes) {
     _large_page_size = LargePageSizeInBytes;
@@ -2915,6 +2948,9 @@
     }
   }
 
+  // Print a warning if any large-page-related flag is specified on the command line.
+  bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
+
   const size_t default_page_size = (size_t)Linux::page_size();
   if (_large_page_size > default_page_size) {
     _page_sizes[0] = _large_page_size;
@@ -2922,6 +2958,14 @@
     _page_sizes[2] = 0;
   }
 
+  UseHugeTLBFS = UseHugeTLBFS &&
+                 Linux::hugetlbfs_sanity_check(warn_on_failure, _large_page_size);
+
+  if (UseHugeTLBFS)
+    UseSHM = false;
+
+  UseLargePages = UseHugeTLBFS || UseSHM;
+
   // Large page support is available on 2.6 or newer kernel, some vendors
   // (e.g. Redhat) have backported it to their 2.4 based distributions.
   // We optimistically assume the support is available. If later it turns out
@@ -2936,7 +2980,7 @@
 char* os::reserve_memory_special(size_t bytes, char* req_addr, bool exec) {
   // "exec" is passed in but not used.  Creating the shared image for
   // the code cache doesn't have an SHM_X executable permission to check.
-  assert(UseLargePages, "only for large pages");
+  assert(UseLargePages && UseSHM, "only for SHM large pages");
 
   key_t key = IPC_PRIVATE;
   char *addr;
@@ -3003,19 +3047,19 @@
   return _large_page_size;
 }
 
-// Linux does not support anonymous mmap with large page memory. The only way
-// to reserve large page memory without file backing is through SysV shared
-// memory API. The entire memory region is committed and pinned upfront.
-// Hopefully this will change in the future...
+// HugeTLBFS allows the application to commit large page memory on demand;
+// with SysV SHM the entire memory region must be allocated as shared
+// memory up front.
 bool os::can_commit_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }
 
 bool os::can_execute_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }
 
 // Reserve memory at an arbitrary address, only if that area is
+// Reserve memory at an arbitrary address, only if that area is
 // available (and not reserved for something else).
 
 char* os::attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
@@ -5009,6 +5053,43 @@
 // JSR166
 // -------------------------------------------------------
 
+bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
+  bool result = false;
+  void *p = mmap (NULL, page_size, PROT_READ|PROT_WRITE,
+                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
+                  -1, 0);
+
+  if (p != (void *) -1) {
+    // We don't know if this really is a huge page or not.
+    FILE *fp = fopen("/proc/self/maps", "r");
+    if (fp) {
+      while (!feof(fp)) {
+        char chars[257];
+        long x = 0;
+        if (fgets(chars, sizeof(chars), fp)) {
+          if (sscanf(chars, "%lx-%*lx", &x) == 1
+              && x == (long)p) {
+            if (strstr (chars, "hugepage")) {
+              result = true;
+              break;
+            }
+          }
+        }
+      }
+      fclose(fp);
+    }
+    munmap (p, page_size);
+    if (result)
+      return true;
+  }
+
+  if (warn) {
+    warning("HugeTLBFS is not supported by the operating system.");
+  }
+
+  return result;
+}
+
 /*
  * The solaris and linux implementations of park/unpark are fairly
  * conservative for now, but can be improved. They currently use a
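A successful mmap() with MAP_HUGETLB does not by itself prove that huge pages are in use, because an older kernel can silently ignore the unknown flag; that is why the hugetlbfs_sanity_check() added above re-reads /proc/self/maps and looks for the mapping's start address on a hugepage-backed entry. The fragment below is a stand-alone paraphrase of that check for experimenting outside HotSpot (illustration only; the hard-coded 2 MB page size is an assumption, the patch passes _large_page_size in).

    /* Illustration only: stand-alone paraphrase of
     * os::Linux::hugetlbfs_sanity_check(). */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000
    #endif

    /* Returns 1 if an anonymous MAP_HUGETLB mapping of page_size bytes
     * succeeds and shows up as hugepage-backed in /proc/self/maps. */
    static int hugetlbfs_works(size_t page_size) {
      int result = 0;
      void *p = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0);
      if (p == MAP_FAILED)
        return 0;

      FILE *fp = fopen("/proc/self/maps", "r");
      if (fp != NULL) {
        char line[257];
        while (fgets(line, sizeof(line), fp)) {
          unsigned long start = 0;
          /* Each maps line starts with "start-end ..."; match ours. */
          if (sscanf(line, "%lx-%*lx", &start) == 1 &&
              start == (unsigned long)p &&
              strstr(line, "hugepage") != NULL) {
            result = 1;
            break;
          }
        }
        fclose(fp);
      }
      munmap(p, page_size);
      return result;
    }

    int main(void) {
      printf("HugeTLBFS usable: %s\n",
             hugetlbfs_works(2 * 1024 * 1024) ? "yes" : "no");
      return 0;
    }
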
diff -u -r openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp.orig openjdk/hotspot/src/os/linux/vm/os_linux.cpp.orig
--- openjdk.patched/hotspot/src/os/linux/vm/os_linux.cpp.orig	2011-05-03 14:32:53.972649764 +0100
+++ openjdk/hotspot/src/os/linux/vm/os_linux.cpp.orig	2011-05-04 11:57:15.085470494 +0100
@@ -117,6 +117,10 @@
 # include <inttypes.h>
 # include <sys/ioctl.h>
 
+#if __x86_64__
+#include <asm/vsyscall.h>
+#endif
+
 #define MAX_PATH    (2 * K)
 
 // for timer info max values which include all bits
@@ -2491,16 +2495,40 @@
   return res != (uintptr_t) MAP_FAILED;
 }
 
+// Define MAP_HUGETLB here so we can build HotSpot on old systems.
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000
+#endif
+
+// Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
+#ifndef MADV_HUGEPAGE
+#define MADV_HUGEPAGE 14
+#endif
+
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
                        bool exec) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
+    uintptr_t res =
+      (uintptr_t) ::mmap(addr, size, prot,
+                         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
+                         -1, 0);
+    return res != (uintptr_t) MAP_FAILED;
+  }
+
   return commit_memory(addr, size, exec);
 }
 
-void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
+void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
+  if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
+    // We don't check the return value: madvise(MADV_HUGEPAGE) may not
+    // be supported or the memory may already be backed by huge pages.
+    ::madvise(addr, bytes, MADV_HUGEPAGE);
+  }
+}
 
 void os::free_memory(char *addr, size_t bytes) {
-  ::mmap(addr, bytes, PROT_READ | PROT_WRITE,
-         MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
+  ::madvise(addr, bytes, MADV_DONTNEED);
 }
 
 void os::numa_make_global(char *addr, size_t bytes) {
@@ -2544,6 +2572,21 @@
   return end;
 }
 
+static int sched_getcpu_syscall(void) {
+  unsigned int cpu;
+  int retval = -1;
+
+#if __x86_64__
+  typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
+  vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
+  retval = vgetcpu(&cpu, NULL, NULL);
+#elif __i386__
+  retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
+#endif
+
+  return (retval == -1) ? retval : cpu;
+}
+
 extern "C" void numa_warn(int number, char *where, ...) { }
 extern "C" void numa_error(char *where) { }
 
@@ -2565,6 +2608,10 @@
   set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
                                   dlsym(RTLD_DEFAULT, "sched_getcpu")));
 
+  // If sched_getcpu() is not available in libc, try a direct syscall.
+  if (sched_getcpu() == -1)
+    set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t, (void*)&sched_getcpu_syscall));
+
   if (sched_getcpu() != -1) { // Does it work?
     void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
     if (handle != NULL) {
@@ -2847,7 +2894,16 @@
 static size_t _large_page_size = 0;
 
 bool os::large_page_init() {
-  if (!UseLargePages) return false;
+  if (!UseLargePages) {
+    UseHugeTLBFS = false;
+    UseSHM = false;
+    return false;
+  }
+
+  if (FLAG_IS_DEFAULT(UseHugeTLBFS) && FLAG_IS_DEFAULT(UseSHM)) {
+    // Our user has not expressed a preference, so we'll try both.
+    UseHugeTLBFS = UseSHM = true;
+  }
 
   if (LargePageSizeInBytes) {
     _large_page_size = LargePageSizeInBytes;
@@ -2892,6 +2948,9 @@
     }
   }
 
+  // Print a warning if any large-page-related flag is specified on the command line.
+  bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
+
   const size_t default_page_size = (size_t)Linux::page_size();
   if (_large_page_size > default_page_size) {
     _page_sizes[0] = _large_page_size;
@@ -2899,6 +2958,14 @@
     _page_sizes[2] = 0;
   }
 
+  UseHugeTLBFS = UseHugeTLBFS &&
+                 Linux::hugetlbfs_sanity_check(warn_on_failure, _large_page_size);
+
+  if (UseHugeTLBFS)
+    UseSHM = false;
+
+  UseLargePages = UseHugeTLBFS || UseSHM;
+
   // Large page support is available on 2.6 or newer kernel, some vendors
   // (e.g. Redhat) have backported it to their 2.4 based distributions.
   // We optimistically assume the support is available. If later it turns out
@@ -2913,7 +2980,7 @@
 char* os::reserve_memory_special(size_t bytes, char* req_addr, bool exec) {
   // "exec" is passed in but not used.  Creating the shared image for
   // the code cache doesn't have an SHM_X executable permission to check.
-  assert(UseLargePages, "only for large pages");
+  assert(UseLargePages && UseSHM, "only for SHM large pages");
 
   key_t key = IPC_PRIVATE;
   char *addr;
@@ -2980,19 +3047,19 @@
   return _large_page_size;
 }
 
-// Linux does not support anonymous mmap with large page memory. The only way
-// to reserve large page memory without file backing is through SysV shared
-// memory API. The entire memory region is committed and pinned upfront.
-// Hopefully this will change in the future...
+// HugeTLBFS allows the application to commit large page memory on demand;
+// with SysV SHM the entire memory region must be allocated as shared
+// memory up front.
 bool os::can_commit_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }
 
 bool os::can_execute_large_page_memory() {
-  return false;
+  return UseHugeTLBFS;
 }
 
 // Reserve memory at an arbitrary address, only if that area is
+// Reserve memory at an arbitrary address, only if that area is
 // available (and not reserved for something else).
 
 char* os::attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
@@ -4081,6 +4148,23 @@
         UseNUMA = false;
       }
     }
+    // With SHM large pages we cannot uncommit a page, so there's no way
+    // we can make the adaptive lgrp chunk resizing work. If the user specified
+    // both UseNUMA and UseLargePages (or UseSHM) on the command line, warn and
+    // disable adaptive resizing.
+    if (UseNUMA && UseLargePages && UseSHM) {
+      if (!FLAG_IS_DEFAULT(UseNUMA)) {
+        if (FLAG_IS_DEFAULT(UseLargePages) && FLAG_IS_DEFAULT(UseSHM)) {
+          UseLargePages = false;
+        } else {
+          warning("UseNUMA is not fully compatible with SHM large pages, disabling adaptive resizing");
+          UseAdaptiveSizePolicy = false;
+          UseAdaptiveNUMAChunkSizing = false;
+        }
+      } else {
+        UseNUMA = false;
+      }
+    }
     if (!UseNUMA && ForceNUMA) {
       UseNUMA = true;
     }
@@ -4986,6 +5070,43 @@
 // JSR166
 // -------------------------------------------------------
 
+bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
+  bool result = false;
+  void *p = mmap (NULL, page_size, PROT_READ|PROT_WRITE,
+                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
+                  -1, 0);
+
+  if (p != (void *) -1) {
+    // We don't know if this really is a huge page or not.
+    FILE *fp = fopen("/proc/self/maps", "r");
+    if (fp) {
+      while (!feof(fp)) {
+        char chars[257];
+        long x = 0;
+        if (fgets(chars, sizeof(chars), fp)) {
+          if (sscanf(chars, "%lx-%*lx", &x) == 1
+              && x == (long)p) {
+            if (strstr (chars, "hugepage")) {
+              result = true;
+              break;
+            }
+          }
+        }
+      }
+      fclose(fp);
+    }
+    munmap (p, page_size);
+    if (result)
+      return true;
+  }
+
+  if (warn) {
+    warning("HugeTLBFS is not supported by the operating system.");
+  }
+
+  return result;
+}
+
 /*
  * The solaris and linux implementations of park/unpark are fairly
  * conservative for now, but can be improved. They currently use a
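Besides the huge-page hunks, the os_linux.cpp.orig half of the diff also carries a sched_getcpu() fallback for C libraries that do not export the function: on i386 it issues the getcpu system call directly, and on x86_64 it goes through the vsyscall page. The sketch below is a stand-alone equivalent of the syscall branch (illustration only, not part of the patch; it assumes the libc headers define SYS_getcpu).

    /* Illustration only: direct-syscall fallback for sched_getcpu(),
     * mirroring the i386 branch of sched_getcpu_syscall() above. */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Returns the current CPU number, or -1 if getcpu() is unavailable. */
    static int getcpu_fallback(void) {
      unsigned int cpu = 0;
      /* getcpu(cpu, node, tcache); node and tcache may be NULL. */
      long retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
      return (retval == -1) ? -1 : (int)cpu;
    }

    int main(void) {
      printf("running on CPU %d\n", getcpu_fallback());
      return 0;
    }

The patch only installs its fallback when the sched_getcpu() obtained via dlsym() reports -1, as the hunk around the dlsym(RTLD_DEFAULT, "sched_getcpu") lookup above shows.
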
diff -u -r openjdk.patched/hotspot/src/os/linux/vm/os_linux.hpp openjdk/hotspot/src/os/linux/vm/os_linux.hpp
--- openjdk.patched/hotspot/src/os/linux/vm/os_linux.hpp	2011-03-16 02:30:16.000000000 +0000
+++ openjdk/hotspot/src/os/linux/vm/os_linux.hpp	2011-05-04 11:57:15.088471194 +0100
@@ -86,6 +86,9 @@
 
   static void rebuild_cpu_to_node_map();
   static GrowableArray<int>* cpu_to_node()    { return _cpu_to_node; }
+
+  static bool hugetlbfs_sanity_check(bool warn, size_t page_size);
+
  public:
   static void init_thread_fpu_state();
   static int  get_fpu_control_word();