Mercurial > hg > gc-bench
changeset 0:f8496889e1ac
Initial import.
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pom.xml Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,182 @@ +<!-- +Copyright (c) 2014, Oracle America, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Oracle nor the names of its contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.openjdk</groupId> + <artifactId>gc-bench</artifactId> + <version>1.0</version> + <packaging>jar</packaging> + + <name>JMH benchmark sample: Java</name> + + <!-- + This is the demo/sample template build script for building Java benchmarks with JMH. + Edit as needed. + --> + + <prerequisites> + <maven>3.0</maven> + </prerequisites> + + <dependencies> + <dependency> + <groupId>org.openjdk.jmh</groupId> + <artifactId>jmh-core</artifactId> + <version>${jmh.version}</version> + </dependency> + <dependency> + <groupId>org.openjdk.jol</groupId> + <artifactId>jol-core</artifactId> + <version>0.6</version> + </dependency> + <dependency> + <groupId>org.openjdk.jmh</groupId> + <artifactId>jmh-generator-annprocess</artifactId> + <version>${jmh.version}</version> + <scope>provided</scope> + </dependency> + </dependencies> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + + <!-- + JMH version to use with this project. + --> + <jmh.version>1.17</jmh.version> + + <!-- + Java source/target to use for compilation. + --> + <javac.target>1.8</javac.target> + + <!-- + Name of the benchmark Uber-JAR to generate. 
+ --> + <uberjar.name>benchmarks</uberjar.name> + </properties> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.1</version> + <configuration> + <compilerVersion>${javac.target}</compilerVersion> + <source>${javac.target}</source> + <target>${javac.target}</target> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.2</version> + <executions> + <execution> + <id>uberjar</id> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <finalName>${uberjar.name}</finalName> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>org.openjdk.jmh.Main</mainClass> + </transformer> + </transformers> + </configuration> + </execution> + <execution> + <id>gcbench</id> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <finalName>gcbench</finalName> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>org.openjdk.gcbench.GCBench</mainClass> + </transformer> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + <pluginManagement> + <plugins> + <plugin> + <artifactId>maven-clean-plugin</artifactId> + <version>2.5</version> + </plugin> + <plugin> + <artifactId>maven-deploy-plugin</artifactId> + <version>2.8.1</version> + </plugin> + <plugin> + <artifactId>maven-install-plugin</artifactId> + <version>2.5.1</version> + </plugin> + <plugin> + <artifactId>maven-jar-plugin</artifactId> + <version>2.4</version> + </plugin> + <plugin> + <artifactId>maven-javadoc-plugin</artifactId> + <version>2.9.1</version> + </plugin> + <plugin> + <artifactId>maven-resources-plugin</artifactId> + <version>2.6</version> + </plugin> + <plugin> + 
<artifactId>maven-site-plugin</artifactId> + <version>3.3</version> + </plugin> + <plugin> + <artifactId>maven-source-plugin</artifactId> + <version>2.2.1</version> + </plugin> + <plugin> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.17</version> + </plugin> + </plugins> + </pluginManagement> + </build> + +</project>
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/GCBench.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,300 @@
package org.openjdk.gcbench;

import org.openjdk.gcbench.alloc.ratelimited.Objects;
import org.openjdk.gcbench.alloc.ratelimited.PrimArray;
import org.openjdk.gcbench.alloc.ratelimited.RefArray;
import org.openjdk.gcbench.fragger.ArrayFragger;
import org.openjdk.gcbench.util.Dummy;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.profile.GCProfiler;
import org.openjdk.jmh.profile.SafepointsProfiler;
import org.openjdk.jmh.results.Result;
import org.openjdk.jmh.results.RunResult;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.openjdk.jmh.runner.options.TimeValue;
import org.openjdk.jmh.runner.options.VerboseMode;

import java.io.PrintWriter;
import java.util.Map;

/**
 * Command-line driver that runs the GC benchmarks through the JMH {@link Runner}
 * and prints one table row per configuration to standard output.
 *
 * <p>The driver first calibrates the largest heap the forked JVM can be started
 * with, then runs the currently enabled test groups (see {@link #run()}).
 */
public class GCBench {

    /** Column layout of the table header (six labelled columns). */
    private static final String HEADER_FORMAT = "%-10s %-20s %-40s %-40s %-44s %-44s %n";

    /**
     * Column layout of a result row. The last eight {@code %10s} columns line up
     * under the two 44-character wide header columns (4 x (10 + 1) each).
     */
    private static final String ROW_FORMAT =
            "%-10d %-20d %-40s %-40s %10s %10s %10s %10s %10s %10s %10s %10s %n";

    private final PrintWriter pw;
    private final Options baseOpts;
    private int maxHeapMB;

    /**
     * Creates the driver: output goes to an auto-flushing writer over
     * {@code System.out}, and {@code baseOpts} holds the JMH settings that every
     * measurement run inherits via {@code parent(baseOpts)}.
     */
    public GCBench() {
        pw = new PrintWriter(System.out, true);

        baseOpts = new OptionsBuilder()
                .detectJvmArgs()
                .warmupIterations(1)
                .warmupTime(TimeValue.seconds(1))
                .measurementIterations(3)
                .measurementTime(TimeValue.seconds(1))
                .forks(1)
                .threads(Threads.MAX)
//                .threads(1)
                .addProfiler(GCProfiler.class)
                .addProfiler(SafepointsProfiler.class)
                .verbosity(VerboseMode.SILENT)
                .build();
    }

    /**
     * Entry point; arguments are ignored.
     *
     * @throws RunnerException if a benchmark run fails outside the places where
     *                         failures are handled locally
     */
    public static void main(String... args) throws RunnerException {
        new GCBench().run();
    }

    /** Calibrates the maximum heap size and runs the enabled test groups. */
    private void run() throws RunnerException {
        maxHeapMB = calibrateMaxHeap();

//        runAllocationPressure_Peak();
//        runAllocationPressure_RateLimited();
        runFraggers_RateLimited();
    }

    /** Runs the unthrottled allocation benchmarks over thread counts and heap sizes. */
    private void runAllocationPressure_Peak() throws RunnerException {
        pw.println("=== PEAK ALLOCATION PRESSURE TESTS");
        pw.println();

        pw.println("Allocates the objects in almost completely empty heap. This tests how well the collector" +
                " can withstand peak allocation pressure without taking care of anything else.");
        pw.println();

        pw.println("*** Allocating Object:");
        pw.println();
        doRun_ThreadsHeap(org.openjdk.gcbench.alloc.plain.Objects.class);
        pw.println();

        pw.println("*** Allocating int[]:");
        pw.println();
        doRun_ThreadsHeapSize(org.openjdk.gcbench.alloc.plain.PrimArray.class);
        pw.println();

        pw.println("*** Allocating Object[]:");
        pw.println();
        doRun_ThreadsHeapSize(org.openjdk.gcbench.alloc.plain.RefArray.class);
        pw.println();
    }

    /** Runs the rate-limited allocation benchmarks over a range of target rates. */
    private void runAllocationPressure_RateLimited() throws RunnerException {
        pw.println("=== RATE-LIMITED ALLOCATION PRESSURE TESTS");
        pw.println();

        // The second fragment starts with a space: the original concatenation
        // printed "...allocation rate.This tests...".
        pw.println("Allocates the objects in almost completely empty heap, but with the controlled allocation rate." +
                " This tests when the collector SLA requirements break under dynamic allocation pressure.");
        pw.println();

        pw.println("*** Allocating Object:");
        pw.println();
        doRun_AllocRate(Objects.class);
        pw.println();

        pw.println("*** Allocating int[]:");
        pw.println();
        doRun_AllocRateSize(PrimArray.class);
        pw.println();

        pw.println("*** Allocating Object[]:");
        pw.println();
        doRun_AllocRateSize(RefArray.class);
        pw.println();
    }

    /** Runs the rate-limited fragmentation benchmarks. */
    private void runFraggers_RateLimited() throws RunnerException {
        pw.println("=== RATE-LIMITED FRAGMENTATION TESTS");
        pw.println();

        pw.println("*** Array fragger:");
        pw.println();
        doRun_AllocRate(ArrayFragger.class);
        pw.println();
    }

    /**
     * Probes for the largest {@code -Xmx}/{@code -Xms} value (in MB) with which the
     * {@link Dummy} benchmark still runs.
     *
     * <p>Starting from a 1000 MB base, the increment is doubled (100, 200, 400, ...)
     * until a run throws {@link RunnerException}; the base is then moved to the last
     * successful size and the doubling restarts. The search ends when the very first
     * increment of a round fails.
     *
     * @return the last heap size in MB that ran successfully, or 0 if none did
     */
    private int calibrateMaxHeap() {
        pw.println("=== Calibrating the target heap size");
        pw.println();

        int baseHeapMB = 1000;
        int latestSuccessMB = 0;
        boolean progress;
        boolean failed;
        do {
            progress = false;
            failed = false;
            // The increment is a long so that doubling can never wrap around to a
            // negative or zero value; the loop stops once the candidate would no
            // longer fit into an int.
            for (long incr = 100; baseHeapMB + incr <= Integer.MAX_VALUE; incr *= 2) {
                int heapMB = (int) (baseHeapMB + incr);
                pw.print(heapMB + "? ");
                pw.flush();
                Options opts = new OptionsBuilder()
                        .include(Dummy.class.getCanonicalName())
                        .threads(1)
                        .jvmArgsAppend("-Xmx" + heapMB + "m", "-Xms" + heapMB + "m")
                        .verbosity(VerboseMode.SILENT)
                        .build();
                try {
                    new Runner(opts).runSingle();
                    latestSuccessMB = heapMB;
                    progress = true;
                } catch (RunnerException e) {
                    // A failed run is taken to mean this heap size is not available:
                    // restart the doubling from the last size that worked.
                    baseHeapMB = latestSuccessMB;
                    failed = true;
                    break;
                }
            }
            // If the whole int range was exhausted without a failure there is
            // nothing left to refine, so stop instead of repeating the same round.
        } while (progress && failed);

        pw.println();
        pw.println("Max heap size is " + latestSuccessMB + " Mb");
        pw.println();

        return latestSuccessMB;
    }

    /**
     * Measures the benchmark with the {@code rate} parameter set to
     * {@code Integer.MAX_VALUE} and returns the primary score as the maximum rate.
     *
     * @param benchmark benchmark class to run
     * @param size      value for the {@code size} parameter
     * @return the primary score narrowed to {@code int}
     */
    private int calibrateRate(Class<?> benchmark, int size) throws RunnerException {
        Options opts = new OptionsBuilder()
                .parent(baseOpts)
                .include(benchmark.getName())
                .param("size", String.valueOf(size))
                .param("rate", String.valueOf(Integer.MAX_VALUE))
                .build();

        RunResult result = new Runner(opts).runSingle();
        return (int) result.getPrimaryResult().getScore();
    }

    /** Runs the threads/heap sweep once per {@code size} in 1, 100, 10000, 1000000. */
    private void doRun_ThreadsHeapSize(Class<?> benchmark) {
        for (int size = 1; size <= 1000000; size *= 100) {
            pw.println();
            pw.println("size = " + size);
            pw.println();
            Options opts = new OptionsBuilder()
                    .parent(baseOpts)
                    .param("size", String.valueOf(size))
                    .build();
            doRun_ThreadsHeapX(benchmark, opts);
        }
    }

    /** Runs the threads/heap sweep with the unmodified base options. */
    private void doRun_ThreadsHeap(Class<?> benchmark) {
        doRun_ThreadsHeapX(benchmark, baseOpts);
    }

    /**
     * Sweeps thread counts (1, 2, 4, ... up to the available processors) and heap
     * sizes (in steps of one eighth of the calibrated maximum) and prints one row
     * per combination.
     *
     * @param benchmark benchmark class to run
     * @param baseOpts  options every run in this sweep inherits
     */
    private void doRun_ThreadsHeapX(Class<?> benchmark, Options baseOpts) {
        pw.printf(HEADER_FORMAT,
                "threads",
                "heap, MB",
                "performance",
                "allocation rate",
                "pauses (sum, 99%, 99.9%, 99.99%)",
                "ttsp (sum, 99%, 99.9%, 99.99%)"
        );

        int maxThreads = Runtime.getRuntime().availableProcessors();
        for (int threads = 1; threads <= maxThreads; threads *= 2) {
            pw.println();
            int margin = maxHeapMB / 8;
            int step = maxHeapMB / 8;
            for (int heapMB = margin; heapMB < maxHeapMB - margin; heapMB += step) {
                Options opts = new OptionsBuilder()
                        .parent(baseOpts)
                        .include(benchmark.getName())
                        .threads(threads)
                        .jvmArgsAppend("-Xmx" + heapMB + "m", "-Xms" + heapMB + "m")
                        .build();

                try {
                    printResultRow(threads, heapMB, new Runner(opts).runSingle());
                } catch (RunnerException e) {
                    // Still best-effort (the original comment read "OOME, fail"), but
                    // the failed configuration is now visible in the table instead of
                    // being dropped silently.
                    pw.printf("%-10d %-20d %s%n", threads, heapMB, "FAILED: " + e.getMessage());
                }
            }
        }
    }

    /** Runs the rate sweep once per {@code size} in 1, 100, 10000, 1000000. */
    private void doRun_AllocRateSize(Class<?> benchmark) throws RunnerException {
        for (int size = 1; size <= 1000000; size *= 100) {
            pw.println();
            pw.println("size = " + size);
            pw.println();
            Options opts = new OptionsBuilder()
                    .parent(baseOpts)
                    .param("size", String.valueOf(size))
                    .build();
            doRun_AllocRateX(benchmark, opts);
        }
    }

    /** Runs the rate sweep with the unmodified base options. */
    private void doRun_AllocRate(Class<?> benchmark) throws RunnerException {
        doRun_AllocRateX(benchmark, baseOpts);
    }

    /**
     * For each {@code size} in 1, 100, 10000, 1000000: calibrates the maximum rate,
     * then runs the benchmark at ten evenly spaced target rates up to that maximum
     * and prints one row per run.
     *
     * <p>NOTE(review): this method sets the {@code size} parameter itself, and
     * {@link #doRun_AllocRateSize} also sets it on the options it passes in, so the
     * size sweep appears to be performed twice on that path - confirm which loop is
     * the intended one.
     *
     * @param benchmark benchmark class to run
     * @param baseOpts  options every run in this sweep inherits
     */
    private void doRun_AllocRateX(Class<?> benchmark, Options baseOpts) throws RunnerException {
        pw.printf(HEADER_FORMAT,
                "size",
                "target rate",
                "actual rate",
                "allocation rate",
                "pauses",
                "ttsp"
        );

        for (int size = 1; size <= 1000000; size *= 100) {
            int maxRate = calibrateRate(benchmark, size);

            pw.println();
            // The step is at least 1: with maxRate < 10 the integer division yields 0
            // and the loop would never advance. The counter is a long: when the
            // calibrated score saturates near Integer.MAX_VALUE, an int counter would
            // wrap to a negative rate and keep looping.
            int step = Math.max(1, maxRate / 10);
            for (long rate = step; rate <= maxRate; rate += step) {
                Options opts = new OptionsBuilder()
                        .parent(baseOpts)
                        .include(benchmark.getName())
                        .param("size", String.valueOf(size))
                        .param("rate", String.valueOf(rate))
                        .build();

                printResultRow(size, (int) rate, new Runner(opts).runSingle());
            }
        }
    }

    /**
     * Prints one table row: two leading integer columns, the primary result, and the
     * GC allocation rate plus safepoint pause/TTSP secondary results. A secondary
     * result that is absent from the map is printed as {@code null}.
     */
    private void printResultRow(int first, int second, RunResult result) {
        Result prim = result.getPrimaryResult();
        Map<String, Result> sec = result.getSecondaryResults();

        pw.printf(ROW_FORMAT,
                first,
                second,
                prim,
                sec.get("·gc.alloc.rate"),
                sec.get("·safepoints.pause"),
                sec.get("·safepoints.pause.p0.99"),
                sec.get("·safepoints.pause.p0.999"),
                sec.get("·safepoints.pause.p0.9999"),
                sec.get("·safepoints.ttsp"),
                sec.get("·safepoints.ttsp.p0.99"),
                sec.get("·safepoints.ttsp.p0.999"),
                sec.get("·safepoints.ttsp.p0.9999")
        );
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/alloc/plain/Objects.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,18 @@
package org.openjdk.gcbench.alloc.plain;

import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;

import java.util.concurrent.TimeUnit;

/**
 * Unthrottled allocation benchmark: every invocation allocates one plain
 * {@link Object}. Reported as average time per operation, in nanoseconds.
 */
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@BenchmarkMode(Mode.AverageTime)
public class Objects {

    /**
     * Allocates a fresh object.
     *
     * @return the newly allocated object, handed back to the harness
     */
    @Benchmark
    public Object test() {
        return new Object();
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/alloc/plain/PrimArray.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,21 @@
package org.openjdk.gcbench.alloc.plain;

import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;

import java.util.concurrent.TimeUnit;

/**
 * Unthrottled allocation benchmark: every invocation allocates one
 * {@code int[]} of {@link #size} elements. Reported as average time per
 * operation, in nanoseconds.
 */
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@BenchmarkMode(Mode.AverageTime)
public class PrimArray {

    /** Number of elements in each allocated array; defaults to 1. */
    @Param({"1"})
    int size;

    /**
     * Allocates a fresh primitive array.
     *
     * @return the newly allocated {@code int[]}, handed back to the harness
     */
    @Benchmark
    public Object test() {
        return new int[this.size];
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/alloc/plain/RefArray.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,21 @@
package org.openjdk.gcbench.alloc.plain;

import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;

import java.util.concurrent.TimeUnit;

/**
 * Unthrottled allocation benchmark: every invocation allocates one
 * {@code Object[]} of {@link #size} elements. Reported as average time per
 * operation, in nanoseconds.
 */
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@BenchmarkMode(Mode.AverageTime)
public class RefArray {

    /** Number of elements in each allocated array; defaults to 1. */
    @Param({"1"})
    int size;

    /**
     * Allocates a fresh reference array.
     *
     * @return the newly allocated {@code Object[]}, handed back to the harness
     */
    @Benchmark
    public Object test() {
        return new Object[this.size];
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/alloc/ratelimited/Objects.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,29 @@
package org.openjdk.gcbench.alloc.ratelimited;

import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.util.concurrent.TimeUnit;

/**
 * Allocation benchmark that passes through a {@link MultiTokenBucket} before
 * each allocation of a plain {@link Object}. Reported as throughput, in
 * operations per second.
 */
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.SECONDS)
@BenchmarkMode(Mode.Throughput)
public class Objects {

    /** Value handed to the {@link MultiTokenBucket} constructor; defaults to 1. */
    @Param({"1"})
    int rate;

    private MultiTokenBucket bucket;

    /** Builds the token bucket from the {@link #rate} parameter. */
    @Setup
    public void setup() {
        this.bucket = new MultiTokenBucket(this.rate);
    }

    /**
     * Calls {@code limit()} on the bucket, then allocates.
     * NOTE(review): {@code limit()} is defined outside this file; presumably it
     * throttles callers to the configured rate - confirm.
     *
     * @return the newly allocated object, handed back to the harness
     */
    @Benchmark
    public Object test() {
        this.bucket.limit();
        return new Object();
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/alloc/ratelimited/PrimArray.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,32 @@
package org.openjdk.gcbench.alloc.ratelimited;

import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.util.concurrent.TimeUnit;

/**
 * Allocation benchmark that passes through a {@link MultiTokenBucket} before
 * each allocation of an {@code int[]} of {@link #size} elements. Reported as
 * throughput, in operations per second.
 */
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.SECONDS)
@BenchmarkMode(Mode.Throughput)
public class PrimArray {

    MultiTokenBucket bucket;

    /** Number of elements in each allocated array; defaults to 1. */
    @Param({"1"})
    int size;

    /** Value handed to the {@link MultiTokenBucket} constructor; defaults to 1. */
    @Param({"1"})
    int rate;

    /** Builds the token bucket from the {@link #rate} parameter. */
    @Setup
    public void setup() {
        this.bucket = new MultiTokenBucket(this.rate);
    }

    /**
     * Calls {@code limit()} on the bucket, then allocates.
     * NOTE(review): {@code limit()} is defined outside this file; presumably it
     * throttles callers to the configured rate - confirm.
     *
     * @return the newly allocated {@code int[]}, handed back to the harness
     */
    @Benchmark
    public Object test() {
        this.bucket.limit();
        return new int[this.size];
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/alloc/ratelimited/RefArray.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,33 @@
package org.openjdk.gcbench.alloc.ratelimited;

import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.infra.Blackhole;

import java.util.concurrent.TimeUnit;

/**
 * Allocation benchmark that passes through a {@link MultiTokenBucket} before
 * each allocation of an {@code Object[]} of {@link #size} elements. Reported as
 * throughput, in operations per second.
 */
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.SECONDS)
@BenchmarkMode(Mode.Throughput)
public class RefArray {

    /** Number of elements in each allocated array; defaults to 1. */
    @Param({"1"})
    int size;

    /** Value handed to the {@link MultiTokenBucket} constructor; defaults to 1. */
    @Param({"1"})
    int rate;

    private MultiTokenBucket bucket;

    /** Builds the token bucket from the {@link #rate} parameter. */
    @Setup
    public void setup() {
        this.bucket = new MultiTokenBucket(this.rate);
    }

    /**
     * Calls {@code limit()} on the bucket, then allocates.
     * NOTE(review): {@code limit()} is defined outside this file; presumably it
     * throttles callers to the configured rate - confirm.
     *
     * @return the newly allocated {@code Object[]}, handed back to the harness
     */
    @Benchmark
    public Object test() {
        this.bucket.limit();
        return new Object[this.size];
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/fragger/ArrayFragger.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,62 @@
package org.openjdk.gcbench.fragger;

import org.openjdk.gcbench.util.ratelimit.TokenBucket;
import org.openjdk.jmh.annotations.*;

import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

/**
 * Fragmentation benchmark over a flat array: the setup fills an
 * {@code Object[]} with {@code byte[]} payloads, and every invocation replaces
 * a randomly chosen slot with a freshly allocated payload after passing through
 * a {@link TokenBucket}.
 */
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
//@Fork(value = 1, jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Threads(Threads.MAX)
@State(Scope.Benchmark)
public class ArrayFragger {

    /** Length in bytes of every payload array. */
    @Param({"100"})
    int size;

    /** Target size of the live data set, in MB, used to derive {@link #count}. */
    @Param({"1000"})
    int ldsMB;

    /** Value handed to the {@link TokenBucket} constructor. */
    @Param({"10"})
    int rate;

    Object[] objects;

    int count;

    TokenBucket bucket;

    /**
     * Creates the token bucket, derives the slot count from the live data set
     * size, and fills every slot with a payload.
     */
    @Setup
    public void setup() {
        bucket = new TokenBucket(rate);

        long liveBytes = 1L * ldsMB * 1024 * 1024;
        // Per-slot footprint estimate: 16 + 4 + payload bytes, rounded up to 8.
        // NOTE(review): presumably array header plus the reference slot - confirm.
        int bytesPerSlot = align(16 + 4 + size, 8);
        count = (int) Math.max(1, liveBytes / bytesPerSlot);

        objects = new Object[count];
        int slot = 0;
        while (slot < count) {
            doStore(slot, new byte[size]);
            slot++;
        }
    }

    /**
     * Rounds {@code size} up to the next multiple of {@code align}; a value that
     * is already a multiple is returned unchanged.
     *
     * @param size  value to round
     * @param align alignment granule
     * @return {@code size} when it divides evenly, otherwise
     *         {@code (size / align + 1) * align}
     */
    public static int align(int size, int align) {
        int remainder = size % align;
        return (remainder == 0) ? size : ((size / align) + 1) * align;
    }

    /** Passes through the token bucket, then overwrites one random slot. */
    @Benchmark
    public void test() {
        bucket.limit();
        int victim = ThreadLocalRandom.current().nextInt(count);
        byte[] replacement = new byte[size];
        doStore(victim, replacement);
    }

    /** Stores {@code obj} at {@code idx}; kept out of line via {@code DONT_INLINE}. */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private void doStore(int idx, byte[] obj) {
        objects[idx] = obj;
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/fragger/LinkedListFragger.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,56 @@
package org.openjdk.gcbench.fragger;

import org.openjdk.jmh.annotations.*;

import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

/**
 * Fragmentation benchmark over a {@link LinkedList}: the setup fills the list
 * with {@code byte[]} payloads, and every invocation replaces the element at a
 * random index with a freshly allocated payload.
 */
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(value = 1)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Threads(Threads.MAX)
@State(Scope.Benchmark)
public class LinkedListFragger {

    /** Length in bytes of every payload array. */
    @Param({"1", "10", "100", "1000", "10000"})
    int objSize;

    /** Target size of the live data set, in MB, used to derive {@link #count}. */
    @Param({"10", "100", "1000"})
    int ldsMB;

    List<Object> objects;

    int count;

    /**
     * Derives the element count from the live data set size and fills the list
     * with that many payloads.
     */
    @Setup
    public void setup() {
        count = (int)Math.max(1, (1L * ldsMB * 1024 * 1024) / align(16 + 4 + objSize, 8));
        // Diamond instead of the raw type: the raw "new LinkedList()" was an
        // unchecked conversion to List<Object>.
        objects = new LinkedList<>();
        for (int c = 0; c < count; c++) {
            objects.add(new byte[objSize]);
        }
    }

    /**
     * Rounds {@code size} up to the next multiple of {@code align}; a value that
     * is already a multiple is returned unchanged.
     *
     * @param size  value to round
     * @param align alignment granule
     * @return the rounded value
     */
    public static int align(int size, int align) {
        if ((size % align) == 0) {
            return size;
        } else {
            return ((size / align) + 1) * align;
        }
    }

    /** Replaces the element at a random index with a new payload. */
    @Benchmark
    public void test() {
        doStore(ThreadLocalRandom.current().nextInt(count), new byte[objSize]);
    }

    /**
     * Stores {@code obj} at {@code idx}; kept out of line via {@code DONT_INLINE}.
     * {@link LinkedList#set} has to walk the list to reach the index, so the
     * measured time includes that traversal.
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private void doStore(int idx, byte[] obj) {
        objects.set(idx, obj);
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/fragger/TreeFragger.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,93 @@
package org.openjdk.gcbench.fragger;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

/**
 * Fragmentation benchmark over a binary tree: the setup builds a tree whose
 * nodes each carry a {@code byte[]} payload, and every invocation replaces the
 * payload of the node addressed by a random integer.
 *
 * <p>A node is addressed by the bits of an integer, read from its highest set
 * bit down to bit 0: a set bit descends into {@code left}, a clear bit into
 * {@code right}. Address 0 is the root.
 */
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Threads(Threads.MAX)
@State(Scope.Benchmark)
public class TreeFragger {

    /** Length in bytes of every payload array. */
    @Param({"1", "10", "100", "1000", "10000"})
    int objSize;

    /** Target size of the live data set, in MB, used to derive {@link #count}. */
    @Param({"10", "100", "1000"})
    int ldsMB;

    Node root;

    int count;

    /**
     * Derives the node count from the live data set size and creates every node
     * on the path of each address in {@code [0, count)}.
     */
    @Setup
    public void setup() {
        int nodeBytes = align(12 + 3*4, 8);              // Node
        int arrayBytes = align(12 + 4 + objSize, 8);     // array
        int sizePerCount = nodeBytes + arrayBytes;
        count = (int) Math.max(1, (1L * ldsMB * 1024 * 1024) / sizePerCount);

        root = new Node(new byte[objSize]);

        for (int addr = 0; addr < count; addr++) {
            Node cur = root;
            int m = 31 - Integer.numberOfLeadingZeros(addr);
            while (m >= 0) {
                boolean bitSet = (addr & (1 << m)) != 0;
                if (bitSet) {
                    if (cur.left == null) {
                        cur.left = new Node(new byte[objSize]);
                    }
                    cur = cur.left;
                } else {
                    if (cur.right == null) {
                        cur.right = new Node(new byte[objSize]);
                    }
                    cur = cur.right;
                }
                m--;
            }
        }
        System.out.println("Setup completed");
    }

    /**
     * Rounds {@code size} up to the next multiple of {@code align}; a value that
     * is already a multiple is returned unchanged.
     *
     * @param size  value to round
     * @param align alignment granule
     * @return the rounded value
     */
    public static int align(int size, int align) {
        int remainder = size % align;
        return (remainder == 0) ? size : ((size / align) + 1) * align;
    }

    /** Replaces the payload of a randomly addressed node. */
    @Benchmark
    public void test() {
        int addr = ThreadLocalRandom.current().nextInt(count);
        byte[] replacement = new byte[objSize];
        doStore(addr, replacement);
    }

    /**
     * Walks to the node addressed by {@code addr} and overwrites its payload;
     * kept out of line via {@code DONT_INLINE}.
     */
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private void doStore(int addr, byte[] obj) {
        Node cur = root;
        int m = 31 - Integer.numberOfLeadingZeros(addr);
        while (m >= 0) {
            cur = ((addr & (1 << m)) != 0) ? cur.left : cur.right;
            m--;
        }
        cur.payload = obj;
    }

    /** Tree node: two child links and the payload being replaced. */
    static class Node {
        Node left;
        Node right;
        Object payload;

        public Node(Object payload) {
            this.payload = payload;
        }
    }

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersKnownNew.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,95 @@
package org.openjdk.gcbench.runtime.cmp;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;

/**
 * Measures a reference comparison ({@code ==}) in which one operand is a
 * freshly allocated object, with the new object on either side of the
 * operator. Recorded results and the generated code are in the comment at the
 * end of the class.
 */
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Threads(1)
@State(Scope.Thread)
public class ACmpBarriersKnownNew {

    // The existing object every comparison is made against.
    Object target;

    /** Allocates the comparison target. */
    @Setup
    public void setup() {
        target = new Object();
    }

    /** Compares with the existing object on the left-hand side. */
    @Benchmark
    public void left() {
        doLeft(target);
    }

    /** Compares with the existing object on the right-hand side. */
    @Benchmark
    public void right() {
        doRight(target);
    }

    // DONT_INLINE keeps the comparison in its own compiled method, which is the
    // method shown in the listing below.
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private boolean doLeft(Object t1) {
        return t1 == new Object();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private boolean doRight(Object t2) {
        return new Object() == t2;
    }

    /*
        i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)

        Benchmark                    Mode  Cnt  Score   Error  Units

        # Shenandoah
        ACmpBarriersKnownNew.left    avgt    5  1.983 ± 0.100  ns/op
        ACmpBarriersKnownNew.right   avgt    5  1.977 ± 0.005  ns/op

        # G1
        ACmpBarriersKnownNew.left    avgt    5  2.057 ± 0.019  ns/op
        ACmpBarriersKnownNew.right   avgt    5  2.059 ± 0.022  ns/op

        # Parallel
        ACmpBarriersKnownNew.left    avgt    5  2.057 ± 0.030  ns/op
        ACmpBarriersKnownNew.right   avgt    5  2.060 ± 0.007  ns/op

        The difference is not caused by different compilation of doLeft/doRight methods,
        but rather the additional read barriers in the JMH loop itself. Note that
        additional barrier code *IMPROVES* performance.

        Benchmark                                        Mode  Cnt   Score   Error  Units

        # Shenandoah
        ACmpBarriersKnownNew.left                        avgt   25   1.970 ± 0.004  ns/op
        ACmpBarriersKnownNew.left:CPI                    avgt    5   0.330 ± 0.012   #/op
        ACmpBarriersKnownNew.left:L1-dcache-load-misses  avgt    5   0.010 ± 0.005   #/op
        ACmpBarriersKnownNew.left:L1-dcache-loads        avgt    5  11.095 ± 0.380   #/op  <--- more loads
        ACmpBarriersKnownNew.left:cycles                 avgt    5   7.820 ± 0.088   #/op  <--- yet, less cycles
        ACmpBarriersKnownNew.left:instructions           avgt    5  23.665 ± 0.833   #/op  <--- a few more instructions

        # G1
        ACmpBarriersKnownNew.left                        avgt   25   2.061 ± 0.008  ns/op
        ACmpBarriersKnownNew.left:CPI                    avgt    5   0.363 ± 0.010   #/op
        ACmpBarriersKnownNew.left:L1-dcache-load-misses  avgt    5   0.010 ± 0.012   #/op
        ACmpBarriersKnownNew.left:L1-dcache-loads        avgt    5   9.322 ± 0.153   #/op
        ACmpBarriersKnownNew.left:cycles                 avgt    5   8.115 ± 0.107   #/op
        ACmpBarriersKnownNew.left:instructions           avgt    5  22.331 ± 0.681   #/op

        The generated code for doLeft/doRight in Shenandoah/G1/Parallel is the same:

        [Verified Entry Point]
         12.42%    7.23%  0x00007fbe6953f8c0: sub    $0x18,%rsp
          0.37%    0.40%  0x00007fbe6953f8c7: mov    %rbp,0x10(%rsp)
         11.19%   10.82%  0x00007fbe6953f8cc: xor    %eax,%eax          ; always false
          1.53%    1.76%  0x00007fbe6953f8ce: add    $0x10,%rsp
          0.22%    0.22%  0x00007fbe6953f8d2: pop    %rbp
          0.23%    0.12%  0x00007fbe6953f8d3: test   %eax,0x18b70727(%rip)
         10.71%   14.51%  0x00007fbe6953f8d9: retq

     */

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersKnownNulls.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,77 @@
package org.openjdk.gcbench.runtime.cmp;

import org.openjdk.jmh.annotations.*;

import java.util.concurrent.TimeUnit;

/**
 * Measures a reference comparison ({@code ==}) against the {@code null}
 * literal, with {@code null} on either side of the operator. Recorded results
 * and the generated code are in the comment at the end of the class.
 */
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Threads(1)
@State(Scope.Thread)
public class ACmpBarriersKnownNulls {

    // The (non-null) object that is compared against null.
    Object target;

    /** Allocates the comparison target. */
    @Setup
    public void setup() {
        target = new Object();
    }

    /** Compares with the object on the left-hand side. */
    @Benchmark
    public void left() {
        doLeft(target);
    }

    /** Compares with the object on the right-hand side. */
    @Benchmark
    public void right() {
        doRight(target);
    }

    // DONT_INLINE keeps the comparison in its own compiled method, which is the
    // method shown in the listing below.
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private boolean doLeft(Object t1) {
        return t1 == null;
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private boolean doRight(Object t2) {
        return null == t2;
    }

    /*
        i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)

        Benchmark                      Mode  Cnt  Score   Error  Units

        # Shenandoah
        ACmpBarriersKnownNulls.left    avgt    5  2.235 ± 0.011  ns/op
        ACmpBarriersKnownNulls.right   avgt    5  2.240 ± 0.089  ns/op

        # G1
        ACmpBarriersKnownNulls.left    avgt    5  1.971 ± 0.001  ns/op
        ACmpBarriersKnownNulls.right   avgt    5  1.974 ± 0.021  ns/op

        # Parallel
        ACmpBarriersKnownNulls.left    avgt    5  1.977 ± 0.026  ns/op
        ACmpBarriersKnownNulls.right   avgt    5  1.973 ± 0.001  ns/op

        The difference is not caused by different compilation of doLeft/doRight methods,
        but rather the additional read barriers in the JMH loop itself. The generated code
        for doLeft/doRight in Shenandoah/G1/Parallel is the same:

        [Verified Entry Point]
         11.16%    7.47%     0x00007f309d542240: mov    %eax,-0x14000(%rsp)
          0.15%    0.07%     0x00007f309d542247: push   %rbp
          0.22%    0.14%     0x00007f309d542248: sub    $0x10,%rsp
         11.05%   10.65%     0x00007f309d54224c: test   %rdx,%rdx
                          ╭  0x00007f309d54224f: je     0x00007f309d54225f
          0.10%    0.06%  │  0x00007f309d542251: xor    %eax,%eax
          0.11%    0.08%  │  0x00007f309d542253: add    $0x10,%rsp
         11.43%   11.21%  │  0x00007f309d542257: pop    %rbp
          0.07%    0.04%  │  0x00007f309d542258: test   %eax,0x18026da2(%rip)
          0.07%    0.01%  │  0x00007f309d54225e: retq

        NOTE(review): the listing stops before the je target at 0x00007f309d54225f;
        presumably the taken branch was cut from the paste - confirm.
     */

}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersRandom.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,106 @@
package org.openjdk.gcbench.runtime.cmp;

import org.openjdk.jmh.annotations.*;

import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * Measures reference comparisons ({@code ==}) over an array whose elements are
 * drawn at random from two distinct objects and {@code null}, comparing each
 * element with its successor. Recorded results and the generated code are in
 * the comment at the end of the class.
 */
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(value = 10, jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:-TieredCompilation"})
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Threads(1)
@State(Scope.Thread)
public class ACmpBarriersRandom {

    // Number of elements in the target array.
    @Param("10000")
    private int size;

    Object[] targets;

    /**
     * Refills the target array before every iteration, picking each element at
     * random from two distinct objects and null.
     */
    @Setup(Level.Iteration)
    public void setup() {
        Object[] cases = new Object[] {new Object(), new Object(), null};
        targets = new Object[size];

        Random r = new Random();
        for (int c = 0; c < size; c++) {
            targets[c] = cases[r.nextInt(cases.length)];
        }
    }

    /** Compares every element with its successor: size - 1 comparisons per call. */
    @Benchmark
    public void test() {
        // Read the field once into a local for the duration of the loop.
        Object[] targets = this.targets;
        for (int c = 0; c < size - 1; c++) {
            acmp(targets[c], targets[c+1]);
        }
    }

    // DONT_INLINE keeps the comparison in its own compiled method, which is the
    // method shown in the listings below.
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    private boolean acmp(Object t1, Object t2) {
        return t1 == t2;
    }


    /*
        i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)

        Benchmark                (size)  Mode  Cnt   Score   Error  Units

        # Shenandoah
        ACmpBarriersRandom.test   10000  avgt   50  81.733 ± 0.237  us/op

        # G1
        ACmpBarriersRandom.test   10000  avgt   50  33.487 ± 0.082  us/op

        # Parallel
        ACmpBarriersRandom.test   10000  avgt   50  33.461 ± 0.049  us/op

        The difference is explained by a more complicated acmp barrier that needs
        to handle false negatives caused by concurrent evacuation.

        Parallel and G1:

        [Verified Entry Point]
          2.62%    3.27%  0x00007f03c5af48c0: sub    $0x18,%rsp
          5.92%    5.57%  0x00007f03c5af48c7: mov    %rbp,0x10(%rsp)
          1.39%    1.64%  0x00007f03c5af48cc: xor    %r10d,%r10d
          1.67%    1.54%  0x00007f03c5af48cf: mov    $0x1,%eax
          4.45%    5.35%  0x00007f03c5af48d4: cmp    %rcx,%rdx          ; compare
          0.68%    1.07%  0x00007f03c5af48d7: cmovne %r10d,%eax         ; choose 0 or 1
          3.11%    3.15%  0x00007f03c5af48db: add    $0x10,%rsp
          4.07%    6.03%  0x00007f03c5af48df: pop    %rbp
          1.06%    1.66%  0x00007f03c5af48e0: test   %eax,0x1326a71a(%rip)
          1.30%    1.22%  0x00007f03c5af48e6: retq

        Shenandoah:

        [Verified Entry Point]
          3.24%    1.91%         0x00007f11fd2addc0: sub    $0x18,%rsp
          0.35%    1.02%         0x00007f11fd2addc7: mov    %rbp,0x10(%rsp)
          0.12%    0.13%         0x00007f11fd2addcc: cmp    %rcx,%rdx          ; compare
                          ╭      0x00007f11fd2addcf: je     0x00007f11fd2adde3
          2.79%    2.58%  │      0x00007f11fd2addd1: test   %rcx,%rcx          ; null check t1
                          │╭     0x00007f11fd2addd4: je     0x00007f11fd2addfe
          1.74%    2.39%  ││     0x00007f11fd2addd6: mov    -0x8(%rcx),%rcx    ; read barrier t1
          5.90%    8.70%  ││ ↗   0x00007f11fd2addda: test   %rdx,%rdx          ; null check t2
                          ││╭│   0x00007f11fd2adddd: je     0x00007f11fd2ade02
          0.25%    0.43%  ││││   0x00007f11fd2adddf: mov    -0x8(%rdx),%rdx    ; read barrier t2
          7.54%    8.56%  ↘│││↗  0x00007f11fd2adde3: xor    %r11d,%r11d        ; same as Parallel/G1:
          1.02%    1.02%   ││││  0x00007f11fd2adde6: mov    $0x1,%eax
          4.91%    8.30%   ││││  0x00007f11fd2addeb: cmp    %rcx,%rdx          ; <--- redundant compare for the "==" path, can reuse the first one
          1.50%    2.22%   ││││  0x00007f11fd2addee: cmovne %r11d,%eax         ; choose 0 or 1
          5.38%    9.70%   ││││  0x00007f11fd2addf2: add    $0x10,%rsp
          0.47%    0.47%   ││││  0x00007f11fd2addf6: pop    %rbp
          5.10%    5.67%   ││││  0x00007f11fd2addf7: test   %eax,0x11df7203(%rip)
          1.00%    0.69%   ││││  0x00007f11fd2addfd: retq
          1.79%    2.44%   ↘│││  0x00007f11fd2addfe: xor    %ecx,%ecx          ; <--- redundant branch, can reuse %rcx above, proven to be zero
          0.50%    0.54%    │╰│  0x00007f11fd2ade00: jmp    0x00007f11fd2addda
          0.02%    0.05%    ↘ │  0x00007f11fd2ade02: xor    %edx,%edx          ; <--- redundant branch, can reuse %rdx above, proven to be zero
          0.48%    0.43%      ╰  0x00007f11fd2ade04: jmp    0x00007f11fd2adde3
     */


}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersPrimitiveFailure.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,52 @@ +package org.openjdk.gcbench.runtime.cmpxchg; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class CasBarriersPrimitiveFailure { + + AtomicInteger ai; + + int value1, value2; + + @Setup + public void setup() { + value1 = 42; + value2 = 43; + ai = new AtomicInteger(); + ai.set(value2); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + ai.compareAndSet(value1, value2); + } + + /* Measures a compareAndSet that always fails: setup() stores value2 (43) while test() expects value1 (42). + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + CasBarriers.test avgt 25 16.418 ± 0.029 ns/op + + # G1 + CasBarriers.test avgt 25 12.545 ± 0.041 ns/op + + # Parallel + CasBarriers.test avgt 25 12.526 ± 0.030 ns/op + + Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test and repeat verbatim across the cmpxchg benchmarks in this changeset, so they were presumably not measured for this class; confirm. + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersPrimitiveSuccess.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,53 @@ +package org.openjdk.gcbench.runtime.cmpxchg; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class CasBarriersPrimitiveSuccess { + + AtomicInteger ai; + + Object t1, t2; /* NOTE(review): t1 and t2 are never read or written in this class */ + + int value; + + @Setup + public void setup() { + value = 42; + ai = new AtomicInteger(); + ai.set(value); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + ai.compareAndSet(value, value); + } + + /* Measures a compareAndSet that always succeeds: the AtomicInteger holds value (42) and test() swaps value for value. + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + CasBarriers.test avgt 25 16.418 ± 0.029 ns/op + + # G1 + CasBarriers.test avgt 25 12.545 ± 0.041 ns/op + + # Parallel + CasBarriers.test avgt 25 12.526 ± 0.030 ns/op + + Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test and repeat verbatim across the cmpxchg benchmarks in this changeset, so they were presumably not measured for this class; confirm. + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersRefFailure.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,51 @@ +package org.openjdk.gcbench.runtime.cmpxchg; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class CasBarriersRefFailure { + + AtomicReference<Object> ai; + + Object t1 = new Object(); + Object t2 = new Object(); + + @Setup + public void setup() { + ai = new AtomicReference<>(); + ai.set(t2); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + ai.compareAndSet(t1, t2); + } + + /* Measures a reference compareAndSet that always fails: setup() stores t2 while test() expects t1, a distinct object. + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + CasBarriers.test avgt 25 16.418 ± 0.029 ns/op + + # G1 + CasBarriers.test avgt 25 12.545 ± 0.041 ns/op + + # Parallel + CasBarriers.test avgt 25 12.526 ± 0.030 ns/op + + Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test and repeat verbatim across the cmpxchg benchmarks in this changeset, so they were presumably not measured for this class; confirm. + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersRefSuccess.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,52 @@ +package org.openjdk.gcbench.runtime.cmpxchg; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; /* NOTE(review): AtomicInteger is not used in this file */ +import java.util.concurrent.atomic.AtomicReference; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class CasBarriersRefSuccess { + + AtomicReference<Object> ai; + + Object t1 = new Object(); + Object t2 = t1; + + @Setup + public void setup() { + ai = new AtomicReference<>(); + ai.set(t1); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + ai.compareAndSet(t1, t2); + } + + /* Measures a reference compareAndSet that always succeeds: t2 aliases t1, and setup() stores t1. + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + CasBarriers.test avgt 25 16.418 ± 0.029 ns/op + + # G1 + CasBarriers.test avgt 25 12.545 ± 0.041 ns/op + + # Parallel + CasBarriers.test avgt 25 12.526 ± 0.030 ns/op + + Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test and repeat verbatim across the cmpxchg benchmarks in this changeset, so they were presumably not measured for this class; confirm. + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/WeakCasLoop.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,80 @@ +package org.openjdk.gcbench.runtime.cmpxchg; + +import org.openjdk.jmh.annotations.*; +import sun.misc.Unsafe; + +import java.lang.reflect.Field; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class WeakCasLoop { + + static final Unsafe U; + static final long OFF_REF; + + static { + try { + Field unsafeField = Unsafe.class.getDeclaredField("theUnsafe"); + unsafeField.setAccessible(true); + U = (Unsafe) unsafeField.get(null); + OFF_REF = U.objectFieldOffset(WeakCasLoop.class.getDeclaredField("ref")); + } catch (Exception e) { + throw new AssertionError(e); + } + } + + volatile Point ref; + + @Setup + public void setup() { + ref = new Point(0, 0); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void strong() { + Point ep, np; + do { + ep = ref; + np = ep.next(); + } while (!U.compareAndSwapObject(this, OFF_REF, ep, np)); + } + + public static class Point { + final int x; + final int y; + + public Point(int x, int y) { + this.x = x; + this.y = y; + } + + public Point next() { + return new Point((x + 1) & 255, (y + 2) & 255); + } + } + + /* Measures a CAS retry loop: strong() re-reads ref, builds ep.next(), and retries compareAndSwapObject until it succeeds. + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + CasBarriers.test avgt 25 16.418 ± 0.029 ns/op + + # G1 + CasBarriers.test avgt 25 12.545 ± 0.041 ns/op + + # Parallel + CasBarriers.test avgt 25 12.526 ± 0.030 ns/op + + Analysis pending. NOTE(review): despite the class name, only a strong CAS variant is present here, and the rows above are labelled CasBarriers.test rather than WeakCasLoop.strong; confirm. + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersArrays.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,118 @@ +package org.openjdk.gcbench.runtime.reads; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class ReadBarriersArrays { + + @Param({"1", "1000", "1000000", "1000000000"}) + private int size; + + int[] target; + + @Setup + public void setup() { + target = new int[size]; + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void plain() { + for (int t : target) { + sink(t); + } + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private void sink(int i) { + /* empty body; the non-inlined call is what consumes the loaded element (see "call sink()" in the listings below) */ + } + + /* + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark (size) Mode Cnt Score Error Units + + # Shenandoah + ReadBarriersArrays.plain 1 avgt 25 0.005 ± 0.001 us/op + ReadBarriersArrays.plain 1000 avgt 25 1.970 ± 0.003 us/op + ReadBarriersArrays.plain 1000000 avgt 25 1869.969 ± 0.554 us/op + ReadBarriersArrays.plain 1000000000 avgt 25 3772318.787 ± 114008.238 us/op + + # G1 + ReadBarriersArrays.plain 1 avgt 25 0.004 ± 0.001 us/op + ReadBarriersArrays.plain 1000 avgt 25 1.993 ± 0.003 us/op + ReadBarriersArrays.plain 1000000 avgt 25 1803.248 ± 2.122 us/op + ReadBarriersArrays.plain 1000000000 avgt 25 1821469.162 ± 10974.715 us/op + + # Parallel + ReadBarriersArrays.plain 1 avgt 25 0.004 ± 0.001 us/op + ReadBarriersArrays.plain 1000 avgt 25 2.000 ± 0.006 us/op + ReadBarriersArrays.plain 1000000 avgt 25 1817.009 ± 35.630 us/op + ReadBarriersArrays.plain 1000000000 avgt 25 1825045.442 ± 9787.079 us/op + + In Shenandoah, the hottest loop looks like this: + + 13.59% 12.98% ↗ 0x00007f33c95428a0: mov 
(%rsp),%r9 + │ 0x00007f33c95428a4: mov -0x8(%r9),%r10 ; <--- read barrier + 0.04% 0.01% │ 0x00007f33c95428a8: mov %r9,(%rsp) + │ 0x00007f33c95428ac: mov 0x10(%r10,%rbp,4),%edx ; array access + 36.49% 35.45% │ 0x00007f33c95428b1: mov 0x8(%rsp),%rsi + 0.03% │ 0x00007f33c95428b6: nop + │ 0x00007f33c95428b7: callq 0x00007f33c1a80f80 ; call sink(); + 0.52% 0.52% │ 0x00007f33c95428bc: inc %ebp ; increment and test loop counter + 0.01% 0.01% │ 0x00007f33c95428be: cmp 0x10(%rsp),%ebp + ╰ 0x00007f33c95428c2: jl 0x00007f33c95428a0 + + In G1 and Parallel it looks like this: + + 14.51% 13.22% ↗ 0x00007fa49c6ceaa0: mov (%rsp),%r10 + 0.01% │ 0x00007fa49c6ceaa4: mov 0x10(%r10,%rbp,4),%edx ; array access + 31.43% 32.02% │ 0x00007fa49c6ceaa9: mov %r10,(%rsp) + 0.84% 0.79% │ 0x00007fa49c6ceaad: mov 0x8(%rsp),%rsi + 1.59% 1.65% │ 0x00007fa49c6ceab2: nop + │ 0x00007fa49c6ceab3: callq 0x00007fa494c0bf80 ; call sink(); + 5.25% 4.46% │ 0x00007fa49c6ceab8: inc %ebp ; increment and test loop counter + 0.05% 0.03% │ 0x00007fa49c6ceaba: cmp 0x10(%rsp),%ebp + 0.03% ╰ 0x00007fa49c6ceabe: jl 0x00007fa49c6ceaa0 + + So, the only difference is the read barrier. It does not affect performance much. + + With the 1G-element array, Shenandoah seems to nose-dive into excessive marking (?): + + ....[Hottest Methods (after inlining)].............................................................. 
+ 37.40% 0.61% libc-2.23.so __memset_avx2 + 23.92% 38.48% libjvm.so ParallelTaskTerminator::offer_termination + 12.42% 18.48% libjvm.so SpinPause + 9.59% 13.47% C2, level 4 org.openjdk.shenandoah.reads.ReadBarriersArrays::plain, version 691 + 4.31% 8.85% C1, level 1 org.openjdk.shenandoah.reads.ReadBarriersArrays::sink, version 647 + 2.62% 5.14% libjvm.so GenericTaskQueueSet<Padded<OverflowTaskQueue<ObjArrayTask, (MemoryType)5, 131072u>, 128ul>, (MemoryType)5>::peek + 1.74% 1.33% libjvm.so ShenandoahInitMarkRootsClosure::do_oop + 1.66% 0.21% libjvm.so ShenandoahHeapRegionSet::claim_next + 1.24% 1.90% [unknown] [unknown] + 1.06% 0.15% libjvm.so ResetBitmapTask::work + 0.79% 2.81% libjvm.so StringTable::possibly_parallel_oops_do + 0.72% 1.69% libjvm.so ShenandoahConcurrentMark::mark_and_push + 0.40% 0.20% libjvm.so ShenandoahHeapRegion::top_at_mark_start + 0.39% 0.36% libjvm.so BitMap::at_put_range + 0.26% 0.27% libjvm.so CMBitMap::clear_range + 0.15% 2.35% libjvm.so nmethod::oops_do + 0.09% 0.01% libjvm.so SCMConcurrentMarkingTask::work + 0.09% 0.22% libjvm.so CodeHeap::next_used + 0.07% 0.25% libjvm.so ShenandoahHeapRegion::init_top_at_mark_start + 0.06% 0.25% libjvm.so SafepointSynchronize::begin + 1.03% 2.83% <...other 257 warm methods...> + ........................................................................ + + Setting -Xmx8g -Xms8g alleviates this problem. (The profile above names the class under org.openjdk.shenandoah.reads, presumably an earlier package name for this benchmark; confirm.) + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersCachePressure.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,92 @@ +package org.openjdk.gcbench.runtime.reads; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.ThreadLocalRandom; /* NOTE(review): ThreadLocalRandom is not used in this file */ +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class ReadBarriersCachePressure { + + @Param({"1", "16", "128", "1024"}) + private int size; + + + int mask; + Object[][][] target; + + @Setup + public void setup() { + target = new Object[size][][]; + for (int c = 0; c < size; c++) { + target[c] = new Object[size][]; + for (int j = 0; j < size; j++) { + target[c][j] = new Object[size]; + } + } + + mask = size - 1; /* works as an index mask because every @Param size above is a power of two */ + } + + private int s; + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void plain() { + Object[][][] tgt = target; + int t = s; + int m = mask; + t = t * 1664525 + 1013904223; + int idx1 = t & m; + t = t * 1664525 + 1013904223; + int idx2 = t & m; + t = t * 1664525 + 1013904223; + int idx3 = t & m; + sink(tgt[idx1][idx2][idx3]); + s = t; + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private void sink(Object o) { + /* empty body; the non-inlined call only consumes the loaded reference */ + } + + /* + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark (size) Mode Cnt Score Error Units + + # Shenandoah + ReadBarriersCachePressure.plain 1 avgt 25 7.806 ± 0.006 ns/op + ReadBarriersCachePressure.plain 16 avgt 25 7.802 ± 0.003 ns/op + ReadBarriersCachePressure.plain 128 avgt 25 9.220 ± 0.024 ns/op + ReadBarriersCachePressure.plain 1024 avgt 25 38.590 ± 0.251 ns/op + + # G1 + ReadBarriersCachePressure.plain 1 avgt 25 6.727 ± 0.014 ns/op + ReadBarriersCachePressure.plain 16 avgt 25 6.736 ± 0.024 ns/op + 
ReadBarriersCachePressure.plain 128 avgt 25 7.075 ± 0.015 ns/op + ReadBarriersCachePressure.plain 1024 avgt 25 36.811 ± 0.259 ns/op + + # Parallel + ReadBarriersCachePressure.plain 1 avgt 25 6.791 ± 0.026 ns/op + ReadBarriersCachePressure.plain 16 avgt 25 6.780 ± 0.002 ns/op + ReadBarriersCachePressure.plain 128 avgt 25 7.087 ± 0.021 ns/op + ReadBarriersCachePressure.plain 1024 avgt 25 36.037 ± 0.264 ns/op + + This benchmark tries to validate the speculation that adding an indirection pointer + before the object has cache capacity implications: i.e. accessing the indirection + pointer for an object aligned at 8 bytes may touch the previous cache line. + + The results do not seem to confirm this speculation: the read barrier performance cost seems to + be roughly consistent across the different sizes. + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersFields.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,140 @@ +package org.openjdk.gcbench.runtime.reads; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class ReadBarriersFields { + + Target target; + + @Setup + public void setup() { + target = new Target(); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void plainField() { + sink(target.plainInt); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void volatileField() { + sink(target.volatileInt); + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private void sink(int i) { + /* empty body; the non-inlined call is what consumes the loaded field (see "call sink()" in the listings below) */ + } + + static class Target { + int plainInt; + volatile int volatileInt; + } + + /* + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + ReadBarriersFields.plainField avgt 25 3.440 ± 0.019 ns/op + ReadBarriersFields.volatileField avgt 25 3.431 ± 0.006 ns/op + + # G1 + ReadBarriersFields.plainField avgt 25 3.149 ± 0.005 ns/op + ReadBarriersFields.volatileField avgt 25 3.147 ± 0.003 ns/op + + # Parallel + ReadBarriersFields.plainField avgt 25 3.149 ± 0.004 ns/op + ReadBarriersFields.volatileField avgt 25 3.152 ± 0.006 ns/op + + The difference is caused by the additional instructions: + + Benchmark Mode Cnt Score Error Units + + # ------------- Shenandoah + ReadBarriersFields.plainField avgt 25 3.440 ± 0.014 ns/op + ReadBarriersFields.plainField:CPI avgt 5 0.380 ± 0.018 #/op <--- better CPI + ReadBarriersFields.plainField:L1-dcache-load-misses avgt 5 0.018 ± 0.005 #/op + 
ReadBarriersFields.plainField:L1-dcache-loads avgt 5 15.646 ± 0.548 #/op <--- more loads + ReadBarriersFields.plainField:L1-dcache-stores avgt 5 7.672 ± 0.292 #/op + ReadBarriersFields.plainField:branch-misses avgt 5 0.008 ± 0.002 #/op + ReadBarriersFields.plainField:branches avgt 5 6.186 ± 0.306 #/op <--- one more branch + ReadBarriersFields.plainField:cycles avgt 5 13.866 ± 0.967 #/op + ReadBarriersFields.plainField:instructions avgt 5 36.500 ± 1.875 #/op + + # ------------- G1 + ReadBarriersFields.plainField avgt 25 3.152 ± 0.008 ns/op + ReadBarriersFields.plainField:CPI avgt 5 0.426 ± 0.017 #/op + ReadBarriersFields.plainField:L1-dcache-load-misses avgt 5 0.019 ± 0.021 #/op + ReadBarriersFields.plainField:L1-dcache-loads avgt 5 12.612 ± 0.146 #/op + ReadBarriersFields.plainField:L1-dcache-stores avgt 5 6.932 ± 0.141 #/op + ReadBarriersFields.plainField:branch-misses avgt 5 0.008 ± 0.002 #/op + ReadBarriersFields.plainField:branches avgt 5 5.226 ± 0.214 #/op + ReadBarriersFields.plainField:cycles avgt 5 12.794 ± 0.929 #/op + ReadBarriersFields.plainField:instructions avgt 5 30.012 ± 1.198 #/op + + # ------------- Parallel + Benchmark Mode Cnt Score Error Units + ReadBarriersFields.plainField avgt 25 3.157 ± 0.012 ns/op + ReadBarriersFields.plainField:CPI avgt 4 0.426 ± 0.027 #/op + ReadBarriersFields.plainField:L1-dcache-load-misses avgt 5 0.020 ± 0.016 #/op + ReadBarriersFields.plainField:L1-dcache-loads avgt 5 12.641 ± 0.446 #/op + ReadBarriersFields.plainField:L1-dcache-stores avgt 5 6.927 ± 0.139 #/op + ReadBarriersFields.plainField:branch-misses avgt 5 0.007 ± 0.002 #/op + ReadBarriersFields.plainField:branches avgt 5 5.212 ± 0.235 #/op + ReadBarriersFields.plainField:cycles avgt 5 12.754 ± 0.771 #/op + ReadBarriersFields.plainField:instructions avgt 4 29.963 ± 1.944 #/op + + These instructions are the read barriers, plus an explicit null check: + + [Verified Entry Point] + 7.25% 7.27% 0x00007fde8d540be0: mov %eax,-0x14000(%rsp) + 0.16% 0.06% 
0x00007fde8d540be7: push %rbp + 0.04% 0x00007fde8d540be8: sub $0x10,%rsp + 7.51% 8.68% 0x00007fde8d540bec: mov -0x8(%rsi),%r10 ; <--- read barrier + 0.04% 0x00007fde8d540bf0: mov 0xc(%r10),%r10d ; get field $target + 0.04% 0.04% 0x00007fde8d540bf4: test %r10d,%r10d ; <--- null check $target + 0.01% ╭ 0x00007fde8d540bf7: je 0x00007fde8d540c18 + 7.36% 8.40% │ 0x00007fde8d540bf9: shl $0x3,%r10 + 0.19% 0.13% │ 0x00007fde8d540bfd: mov -0x8(%r10),%r10 ; <--- read barrier + 1.44% 1.20% │ 0x00007fde8d540c01: mov 0xc(%r10),%edx ; get field $plainInt + 23.63% 29.86% │ 0x00007fde8d540c05: xchg %ax,%ax + 0.11% 0.10% │ 0x00007fde8d540c07: callq 0x00007fde860c2ce0 ; call sink() + 7.41% 5.41% │ 0x00007fde8d540c0c: add $0x10,%rsp + 0.11% 0.06% │ 0x00007fde8d540c10: pop %rbp + 0.01% │ 0x00007fde8d540c11: test %eax,0x18d633e9(%rip) + 7.82% 5.10% │ 0x00007fde8d540c17: retq + + It seems compressed oops prevent the explicit null check from being folded into an implicit one. The same run with + -XX:-UseCompressedOops: + + [Verified Entry Point] + 7.82% 7.02% 0x00007f2ec55430d0: mov %eax,-0x14000(%rsp) + 0.04% 0x00007f2ec55430d7: push %rbp + 0.65% 0.05% 0x00007f2ec55430d8: sub $0x10,%rsp + 6.71% 8.14% 0x00007f2ec55430dc: mov -0x8(%rsi),%r10 ; <--- read barrier + 0.01% 0x00007f2ec55430e0: mov 0x10(%r10),%r10 ; get field $target + 0.54% 0.65% ╭ 0x00007f2ec55430e4: mov -0x8(%r10),%r10 ; <--- read barrier ; implicit exception: dispatches to 0x00007f2ec554310d + 7.54% 9.11% │ 0x00007f2ec55430e8: mov 0x10(%r10),%edx ; get field $plainInt + 20.15% 26.85% │ 0x00007f2ec55430ec: data16 xchg %ax,%ax + │ 0x00007f2ec55430ef: callq 0x00007f2ebe0c81e0 ; call sink(); + 7.02% 5.61% │ 0x00007f2ec55430f4: add $0x10,%rsp + 0.01% 0.01% │ 0x00007f2ec55430f8: pop %rbp + 0.49% 0.71% │ 0x00007f2ec55430f9: test %eax,0x18ccff01(%rip) + 7.19% 4.31% │ 0x00007f2ec55430ff: retq + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersKnownNull.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,97 @@ +package org.openjdk.gcbench.runtime.writes; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Thread) +public class WriteBarriersKnownNull { + + Target target; + + @Setup + public void setup() { + target = new Target(); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + target.field = null; + } + + static class Target { + Object field; + } + + /* + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + WriteBarriersKnownNull.test avgt 25 2.637 ± 0.027 ns/op + + # G1 + WriteBarriersKnownNull.test avgt 25 1.958 ± 0.006 ns/op + + # Parallel + WriteBarriersKnownNull.test avgt 25 1.832 ± 0.097 ns/op + + Mostly the same as the regular barrier, except that the redundant null checks + for the stored value are optimized out. + + Shenandoah: + + [Verified Entry Point] + 9.60% 11.27% 0x00007f7031af6f40: mov %eax,-0x14000(%rsp) + 0.02% 0x00007f7031af6f47: push %rbp + 0x00007f7031af6f48: sub $0x10,%rsp + 10.13% 10.60% 0x00007f7031af6f4c: mov -0x8(%rsi),%r10 ; <--- read barrier + 0.02% 0x00007f7031af6f50: mov 0xc(%r10),%r11d ; get field $target + 0.07% 0x00007f7031af6f54: test %r11d,%r11d ; null check $target <--- NOT IMPLICIT + 0x00007f7031af6f57: je 0x00007f7031af6fc7 + 9.91% 10.59% 0x00007f7031af6f59: mov %r11,%r10 + 0x00007f7031af6f5c: mov -0x8(%r10),%rbx ; <--- read barrier AGAIN + 0.18% 0.12% 0x00007f7031af6f60: cmpb $0x0,0x3d8(%r15) ; "evacuation in progress?" 
+ 0.02% 0x00007f7031af6f68: mov -0x8(%r10),%rbx ; <--- read barrier (in sequence) + 10.08% 8.85% ╭ 0x00007f7031af6f6c: je 0x00007f7031af6f79 ; no evacuation in progress, jump over <--- SHOULD BE MOVED TO SLOWPATH + │ 0x00007f7031af6f6e: xchg %rax,%rbx + │ 0x00007f7031af6f71: callq Stub::shenandoah_wb + │ 0x00007f7031af6f76: xchg %rax,%rbx + ↘ 0x00007f7031af6f79: movsbl 0x378(%r15),%r11d ; SATB test + 0x00007f7031af6f81: test %r11d,%r11d + ╭ 0x00007f7031af6f84: jne 0x00007f7031af6f96 + │↗ 0x00007f7031af6f86: mov %r12d,0xc(%rbx) ; field store + 25.14% 22.09% ││ 0x00007f7031af6f8a: add $0x10,%rsp + ││ 0x00007f7031af6f8e: pop %rbp + ││ 0x00007f7031af6f8f: test %eax,0x1286706b(%rip) + ││ 0x00007f7031af6f95: retq + + + G1: + + [Verified Entry Point] + 7.45% 6.58% 0x00007f3204840cc0: mov %eax,-0x14000(%rsp) + 5.74% 6.06% 0x00007f3204840cc7: push %rbp + 7.74% 7.35% 0x00007f3204840cc8: sub $0x10,%rsp + 0.18% 0.18% 0x00007f3204840ccc: mov 0xc(%rsi),%ebp ; get field $target + 5.84% 6.56% 0x00007f3204840ccf: mov 0xc(%rbp),%r10d ; <--- read old for SATB, plus NPE check + 18.27% 20.87% 0x00007f3204840cd3: movsbl 0x378(%r15),%r8d ; SATB test + 0.02% 0.02% 0x00007f3204840cdb: test %r8d,%r8d + ╭ 0x00007f3204840cde: jne 0x00007f3204840cf0 + │↗ 0x00007f3204840ce0: mov %r12d,0xc(%rbp) ; field store + 4.11% 4.73% ││ 0x00007f3204840ce4: add $0x10,%rsp + 9.60% 10.33% ││ 0x00007f3204840ce8: pop %rbp + 0.30% 0.33% ││ 0x00007f3204840ce9: test %eax,0x11f29311(%rip) + 0.02% ││ 0x00007f3204840cef: retq + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersPrimitive.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,96 @@ +package org.openjdk.gcbench.runtime.writes; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Thread) +public class WriteBarriersPrimitive { + + Target target; + int source; + + @Setup + public void setup() { + target = new Target(); + source = 42; + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + target.field = source; + } + + static class Target { + int field; + } + + /* + i5 4210U, 1.7 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-12) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + WriteBarriersPrimitive.test avgt 25 3.771 ± 0.010 ns/op + + # G1 + WriteBarriersPrimitive.test avgt 25 2.623 ± 0.022 ns/op + + # Parallel + WriteBarriersPrimitive.test avgt 25 2.551 ± 0.008 ns/op + + Shenandoah has to do the write barriers even for primitive stores, + in order to preserve the "no writes into evacuated regions" invariant. + Other collectors do not have to do this. The write barrier code + quality may be improved to amortize the costs. 
+ + Shenandoah: + + [Verified Entry Point] + 6.81% 8.72% 0x00007f85a58dddc0: mov %eax,-0x14000(%rsp) + 3.08% 3.37% 0x00007f85a58dddc7: push %rbp + 0.05% 0.11% 0x00007f85a58dddc8: sub $0x10,%rsp + 8.72% 7.79% 0x00007f85a58dddcc: mov -0x8(%rsi),%r10 ; <--- read barrier ($this) + 0.67% 0.50% 0x00007f85a58dddd0: mov 0x10(%r10),%r11d ; get field $target + 0.08% 0.09% 0x00007f85a58dddd4: mov 0xc(%r10),%r10d ; get field $source + 6.83% 7.37% 0x00007f85a58dddd8: test %r11d,%r11d ; null check $target + ╭ 0x00007f85a58ddddb: je 0x00007f85a58dde0e + 2.30% 0.64% │ 0x00007f85a58ddddd: shl $0x3,%r11 ; unpack $target + 0.61% 0.52% │ 0x00007f85a58ddde1: mov -0x8(%r11),%r11 ; <--- read barrier once; WHY? There is a null check before already + 0.14% 0.09% │ 0x00007f85a58ddde5: cmpb $0x0,0x3d8(%r15) ; evacuation in progress? + 6.70% 6.03% │ 0x00007f85a58ddded: mov -0x8(%r11),%r11 ; <--- read barrier twice (in sequence now) + 3.53% 1.67% │╭ 0x00007f85a58dddf1: je 0x00007f85a58dddfe + ││ 0x00007f85a58dddf3: xchg %rax,%r11 ; <--- barrier slowpath + ││ 0x00007f85a58dddf6: callq Stub::shenandoah_wb + ││ 0x00007f85a58dddfb: xchg %rax,%r11 + 0.53% 0.62% │↘ 0x00007f85a58dddfe: mov %r10d,0xc(%r11) ; actual store + 25.37% 27.26% │ 0x00007f85a58dde02: add $0x10,%rsp + 2.28% 2.20% │ 0x00007f85a58dde06: pop %rbp + 0.72% 0.03% │ 0x00007f85a58dde07: test %eax,0x11df41f3(%rip) + 0.06% 0.09% │ 0x00007f85a58dde0d: retq + + + Parallel: + + [Verified Entry Point] + 4.35% 5.11% 0x00007fc6fdaf3840: mov %eax,-0x14000(%rsp) + 10.05% 9.23% 0x00007fc6fdaf3847: push %rbp + 0.05% 0x00007fc6fdaf3848: sub $0x10,%rsp + 4.65% 4.18% 0x00007fc6fdaf384c: mov 0xc(%rsi),%r11d ; get field $source + 10.83% 9.16% 0x00007fc6fdaf3850: mov 0x10(%rsi),%r10d ; get field $target + 0.02% 0.02% 0x00007fc6fdaf3854: mov %r11d,0xc(%r12,%r10,8) ; actual field store + 14.61% 15.76% 0x00007fc6fdaf3859: add $0x10,%rsp + 0x00007fc6fdaf385d: pop %rbp + 6.13% 6.24% 0x00007fc6fdaf385e: test %eax,0x13c1a79c(%rip) + 0.02% 0x00007fc6fdaf3864: 
retq + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersRef.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,114 @@ +package org.openjdk.gcbench.runtime.writes; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.ThreadLocalRandom; /* NOTE(review): ThreadLocalRandom is not used in this file */ +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Thread) +public class WriteBarriersRef { + + Target target; + Target source; + + @Setup + public void setup() { + target = new Target(); + source = new Target(); + } + + @Benchmark + /* NOTE(review): unlike doStore(), test() carries no @CompilerControl annotation */ + public void test() { + doStore(target, source); + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private void doStore(Target t, Target v) { + t.field = v; + } + + static class Target { + Object field; + } + + /* + i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark Mode Cnt Score Error Units + + # Shenandoah + WriteBarriersRef.test avgt 25 3.222 ± 0.003 ns/op + + # G1 + WriteBarriersRef.test avgt 25 2.341 ± 0.005 ns/op + + # Parallel + WriteBarriersRef.test avgt 25 2.176 ± 0.006 ns/op + + The difference is in a more complicated write barrier that may be improved a little with more + implicit null checks and code quality tuneups. + + Shenandoah: + + [Verified Entry Point] + 5.12% 4.76% 0x00007f51f0845dc0: mov %eax,-0x14000(%rsp) + 3.00% 2.29% 0x00007f51f0845dc7: push %rbp + 0.08% 0.05% 0x00007f51f0845dc8: sub $0x10,%rsp + 4.90% 3.93% 0x00007f51f0845dcc: mov -0x8(%rsi),%r10 ; <--- read barrier + 2.34% 0.63% 0x00007f51f0845dd0: mov 0xc(%r10),%r11d ; get field $target + 0.05% 0.08% 0x00007f51f0845dd4: mov 0x10(%r10),%r10d ; <-- get field $source + 4.68% 5.24% 0x00007f51f0845dd8: test %r11d,%r11d ; nullcheck $target <--- NOT IMPLICIT? 
+ 0x00007f51f0845ddb: je 0x00007f51f0845e6c + 0.05% 0.03% 0x00007f51f0845de1: test %r10d,%r10d ; nullcheck $source <--- NOT IMPLICIT? + ╭ 0x00007f51f0845de4: je 0x00007f51f0845e33 + 2.56% 0.19% │ 0x00007f51f0845de6: shl $0x3,%r10 ; unpack $source + 0.03% │ 0x00007f51f0845dea: mov -0x8(%r10),%rbx ; <--- HUH? Is this a null check trap? Unpacking is not needed then. + 6.58% 5.81% │ ↗ 0x00007f51f0845dee: lea (%r12,%r11,8),%r10 ; unpack $target + │ │ 0x00007f51f0845df2: mov -0x8(%r10),%rbp ; <--- read barrier + 2.00% 0.06% │ │ 0x00007f51f0845df6: cmpb $0x0,0x3d8(%r15) ; "evacuation in progress?" + │ │ 0x00007f51f0845dfe: mov -0x8(%r10),%rbp ; <--- read barrier again (in sequence) + 6.46% 6.90% │╭ │ 0x00007f51f0845e02: je 0x00007f51f0845e0f ; no evacuation? jump over the barrier <--- SHOULD MOVE TO SLOWPATH + ││ │ 0x00007f51f0845e04: xchg %rax,%rbp + ││ │ 0x00007f51f0845e07: callq Stub::shenandoah_wb ; {runtime_call StubRoutines (2)} + ││ │ 0x00007f51f0845e0c: xchg %rax,%rbp + │↘ │ 0x00007f51f0845e0f: movsbl 0x378(%r15),%r11d ; SATB check and jump + 1.29% 1.33% │ │ 0x00007f51f0845e17: test %r11d,%r11d + │ ╭│ 0x00007f51f0845e1a: jne 0x00007f51f0845e37 + │ ││↗ 0x00007f51f0845e1c: mov %rbx,%r10 ; packing $source (note, we unpacked before) + 6.61% 6.90% │ │││ 0x00007f51f0845e1f: shr $0x3,%r10 + │ │││ 0x00007f51f0845e23: mov %r10d,0xc(%rbp) ; field store! 
+ 18.55% 20.33% │ │││ 0x00007f51f0845e27: add $0x10,%rsp + │ │││ 0x00007f51f0845e2b: pop %rbp + 2.93% 3.37% │ │││ 0x00007f51f0845e2c: test %eax,0x11e691ce(%rip) + │ │││ 0x00007f51f0845e32: retq + + + Parallel: + + [Verified Entry Point] + 3.65% 4.20% 0x00007f5169af3040: mov %eax,-0x14000(%rsp) + 9.31% 9.50% 0x00007f5169af3047: push %rbp + 0x00007f5169af3048: sub $0x10,%rsp + 3.17% 4.53% 0x00007f5169af304c: mov 0x10(%rsi),%r11d ; get field $source + 8.67% 7.93% 0x00007f5169af3050: mov 0xc(%rsi),%r10d ; get field $target + ╭ 0x00007f5169af3054: mov %r11d,0xc(%r12,%r10,8) ; field store, implicit exception: dispatches to 0x00007f5169af307b + 8.29% 8.77% │ 0x00007f5169af3059: shl $0x3,%r10 ; card mark update + │ 0x00007f5169af305d: shr $0x9,%r10 + 6.58% 5.85% │ 0x00007f5169af3061: movabs $0x7f517632f000,%r11 + │ 0x00007f5169af306b: mov %r12b,(%r11,%r10,1) + 11.76% 11.87% │ 0x00007f5169af306f: add $0x10,%rsp ; epilog + │ 0x00007f5169af3073: pop %rbp + 5.28% 5.76% │ 0x00007f5169af3074: test %eax,0x145b3f86(%rip) + │ 0x00007f5169af307a: retq + ↘ 0x00007f5169af307b: mov $0xfffffff6,%esi + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/util/Dummy.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,15 @@ +package org.openjdk.gcbench.util; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 1, time = 100, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 1, time = 100, timeUnit = TimeUnit.MILLISECONDS) +@Fork(1) +public class Dummy { /* a single empty benchmark, configured for one 100 ms warmup and one 100 ms measurement iteration */ + + @Benchmark + public void test() {} /* empty body: no work is done here beyond what the harness itself does */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/util/TokenBucketBench.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,35 @@ +package org.openjdk.gcbench.util; + +import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket; +import org.openjdk.gcbench.util.ratelimit.RateLimiter; +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.SECONDS) +@State(Scope.Benchmark) +public class TokenBucketBench { + + private RateLimiter limiter; + + @Param({"10", "100", "1000", "10000", "100000", "1000000", "100000000"}) + int rate; + + @Setup + public void setup() { + limiter = new MultiTokenBucket(rate); + } + + @Benchmark + public void baseline() { + + } + + @Benchmark + public void test() { + limiter.limit(); +// return new Object(); + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/util/ratelimit/MultiTokenBucket.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,216 @@ +package org.openjdk.gcbench.util.ratelimit; + +import java.util.concurrent.atomic.*; + +public class MultiTokenBucket implements RateLimiter { + + static final int QUANTA_PER_SEC = 10; + static final int MS_PER_QUANTUM = 1000 / QUANTA_PER_SEC; + + static final AtomicReferenceFieldUpdater<MultiTokenBucket, Counters> STATE = + AtomicReferenceFieldUpdater.newUpdater(MultiTokenBucket.class, Counters.class, "counters"); + + private final int tokensPerQuantum; + private final long timeBase; + + private final int stateCount; + private final int stateCountMask; + + private volatile Counters counters; + private volatile int currentQuant; + + public MultiTokenBucket(int ratePerSec) { + this.tokensPerQuantum = Math.max(1, ratePerSec / QUANTA_PER_SEC); + this.stateCount = roundToPow2(Runtime.getRuntime().availableProcessors() * 2); + this.stateCountMask = stateCount - 1; + this.timeBase = System.currentTimeMillis(); + STATE.set(this, new Counters(newCounters(), 0)); + + new StampUpdater().start(); + } + + private Counter[] newCounters() { + Counter[] counters = new Counter[stateCount]; + for (int c = 0; c < stateCount; c++) { + counters[c] = new Counter(); + } + return counters; + } + + private static int roundToPow2(int v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; + } + + @Override + public void limit() { + int id = (int)(Thread.currentThread().getId() & stateCountMask); + + while (true) { + int quantId = currentQuant; + + Counters st = STATE.get(this); + Counter[] states = st.states; + int time = st.time; + + if (time == quantId) { + // our time, try to figure out the state + + // try to optimistically poll my own ID + Counter my = states[id]; + + if (my.dec() >= 1) { + return; // success! + } + + // try to steal! 
+ for (int i = id + 1; i < stateCount; i++) { + if (trySteal(my, states[i])) + return; // success! + } + + for (int i = 0; i < id; i++) { + if (trySteal(my, states[i])) + return; // success! + } + } + + // no rush, this is not our quantum: wait before re-spinning + try { + Thread.sleep(1); + } catch (InterruptedException e) { + // ignore + } + } + } + + private boolean trySteal(Counter dst, Counter src) { + if (src.val() != 0) { + int stolen = src.steal(); + if (stolen > 0) { + dst.add(stolen - 1); // borrow one! + return true; + } + } + return false; + } + + class StampUpdater extends Thread { + public StampUpdater() { + setDaemon(true); + setPriority(MAX_PRIORITY); + } + + @Override + public void run() { + int lastQuantId = 0; + while (!Thread.interrupted()) { + int quantId = (int) ((System.currentTimeMillis() - timeBase) / MS_PER_QUANTUM); + if (quantId != lastQuantId) { + Counter[] cnts = newCounters(); + cnts[0].add(tokensPerQuantum); + + currentQuant = quantId; + lastQuantId = quantId; + + STATE.set(MultiTokenBucket.this, new Counters(cnts, quantId)); + } + try { + Thread.sleep(1); + } catch (InterruptedException e) { + // do nothing + } + } + } + } + + static class Counters { + private final Counter[] states; + private final int time; + + public Counters(Counter[] states, int time) { + this.states = states; + this.time = time; + } + } + + private static class Counter_Payload extends Counter_B1 { + static final AtomicIntegerFieldUpdater<Counter_Payload> CURRENT = + AtomicIntegerFieldUpdater.newUpdater(Counter_Payload.class, "cnt"); + + volatile int cnt; + + int val() { + return CURRENT.get(this); + } + + int dec() { + return CURRENT.getAndDecrement(this); + } + + public int steal() { + while (true) { + int val = CURRENT.get(this); + int steal = val / 2; + int remain = val - steal; + if (remain <= 0) + return 0; + if (CURRENT.compareAndSet(this, val, remain)) + return steal; + } + } + + void add(int val) { + CURRENT.addAndGet(this, val); + } + } + + private 
static class Counter_B1 { + boolean p000, p001, p002, p003, p004, p005, p006, p007, p008, p009, p010, p011, p012, p013, p014, p015; + boolean p016, p017, p018, p019, p020, p021, p022, p023, p024, p025, p026, p027, p028, p029, p030, p031; + boolean p032, p033, p034, p035, p036, p037, p038, p039, p040, p041, p042, p043, p044, p045, p046, p047; + boolean p048, p049, p050, p051, p052, p053, p054, p055, p056, p057, p058, p059, p060, p061, p062, p063; + boolean p064, p065, p066, p067, p068, p069, p070, p071, p072, p073, p074, p075, p076, p077, p078, p079; + boolean p080, p081, p082, p083, p084, p085, p086, p087, p088, p089, p090, p091, p092, p093, p094, p095; + boolean p096, p097, p098, p099, p100, p101, p102, p103, p104, p105, p106, p107, p108, p109, p110, p111; + boolean p112, p113, p114, p115, p116, p117, p118, p119, p120, p121, p122, p123, p124, p125, p126, p127; + boolean p128, p129, p130, p131, p132, p133, p134, p135, p136, p137, p138, p139, p140, p141, p142, p143; + boolean p144, p145, p146, p147, p148, p149, p150, p151, p152, p153, p154, p155, p156, p157, p158, p159; + boolean p160, p161, p162, p163, p164, p165, p166, p167, p168, p169, p170, p171, p172, p173, p174, p175; + boolean p176, p177, p178, p179, p180, p181, p182, p183, p184, p185, p186, p187, p188, p189, p190, p191; + boolean p192, p193, p194, p195, p196, p197, p198, p199, p200, p201, p202, p203, p204, p205, p206, p207; + boolean p208, p209, p210, p211, p212, p213, p214, p215, p216, p217, p218, p219, p220, p221, p222, p223; + boolean p224, p225, p226, p227, p228, p229, p230, p231, p232, p233, p234, p235, p236, p237, p238, p239; + boolean p240, p241, p242, p243, p244, p245, p246, p247, p248, p249, p250, p251, p252, p253, p254, p255; + } + + private static class Counter_B2 extends Counter_Payload { + boolean p000, p001, p002, p003, p004, p005, p006, p007, p008, p009, p010, p011, p012, p013, p014, p015; + boolean p016, p017, p018, p019, p020, p021, p022, p023, p024, p025, p026, p027, p028, p029, p030, p031; 
+ boolean p032, p033, p034, p035, p036, p037, p038, p039, p040, p041, p042, p043, p044, p045, p046, p047; + boolean p048, p049, p050, p051, p052, p053, p054, p055, p056, p057, p058, p059, p060, p061, p062, p063; + boolean p064, p065, p066, p067, p068, p069, p070, p071, p072, p073, p074, p075, p076, p077, p078, p079; + boolean p080, p081, p082, p083, p084, p085, p086, p087, p088, p089, p090, p091, p092, p093, p094, p095; + boolean p096, p097, p098, p099, p100, p101, p102, p103, p104, p105, p106, p107, p108, p109, p110, p111; + boolean p112, p113, p114, p115, p116, p117, p118, p119, p120, p121, p122, p123, p124, p125, p126, p127; + boolean p128, p129, p130, p131, p132, p133, p134, p135, p136, p137, p138, p139, p140, p141, p142, p143; + boolean p144, p145, p146, p147, p148, p149, p150, p151, p152, p153, p154, p155, p156, p157, p158, p159; + boolean p160, p161, p162, p163, p164, p165, p166, p167, p168, p169, p170, p171, p172, p173, p174, p175; + boolean p176, p177, p178, p179, p180, p181, p182, p183, p184, p185, p186, p187, p188, p189, p190, p191; + boolean p192, p193, p194, p195, p196, p197, p198, p199, p200, p201, p202, p203, p204, p205, p206, p207; + boolean p208, p209, p210, p211, p212, p213, p214, p215, p216, p217, p218, p219, p220, p221, p222, p223; + boolean p224, p225, p226, p227, p228, p229, p230, p231, p232, p233, p234, p235, p236, p237, p238, p239; + boolean p240, p241, p242, p243, p244, p245, p246, p247, p248, p249, p250, p251, p252, p253, p254, p255; + } + + private static class Counter extends Counter_B2 { + + } + +}
// src/main/java/org/openjdk/gcbench/util/ratelimit/RateLimiter.java
package org.openjdk.gcbench.util.ratelimit;

/**
 * Throttles callers to a configured rate.
 */
public interface RateLimiter {

    /**
     * Called before each rate-limited action. The implementations in this package
     * block the calling thread until a token is available.
     */
    void limit();

}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/util/ratelimit/TokenBucket.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,64 @@ +package org.openjdk.gcbench.util.ratelimit; + +import java.util.concurrent.atomic.AtomicLongFieldUpdater; + +public class TokenBucket implements RateLimiter { + + static final int QUANTA_PER_SEC = 5; + static final int MS_PER_QUANTUM = 1000 / QUANTA_PER_SEC; + + static final AtomicLongFieldUpdater<TokenBucket> STATE = + AtomicLongFieldUpdater.newUpdater(TokenBucket.class, "state"); + + private final int tokensPerQuantum; + private final long timeBase; + private volatile long state; + + public TokenBucket(int ratePerSec) { + this.tokensPerQuantum = Math.max(1, ratePerSec / QUANTA_PER_SEC); + this.timeBase = System.currentTimeMillis(); + } + + private static int timestamp(long l) { + return (int)(l >> 32); + } + + private static int tokens(long l) { + return (int)(l & 0x7FFFFFFF); + } + + private static long pack(int timestamp, int tokens) { + return ((long)timestamp << 32) + tokens; + } + + @Override + public void limit() { + while (true) { + int quantId = (int) ((System.currentTimeMillis() - timeBase) / MS_PER_QUANTUM); + + long cur = STATE.get(this); + int time = timestamp(cur); + int tokens = tokens(cur); + + if (time == quantId && tokens != 0) { + // current quantum has tokens, try to claim and exit + if (STATE.compareAndSet(this, cur, pack(quantId, tokens - 1))) { + return; // success + } else { + continue; // immediate respin + } + } else if (time <= quantId) { + // current or past quantum is empty, try to install a new one, and respin + STATE.compareAndSet(this, cur, pack(quantId + 1, tokensPerQuantum)); + } + + // no rush: wait before respinning + try { + Thread.sleep(1); + } catch (InterruptedException e) { + // ignore + } + } + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/LinkedListGC.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,37 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +import java.util.LinkedList; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(value = 1, jvmArgsAppend = {"-Xmx2g", "-Xms2g"}) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(Threads.MAX) +@State(Scope.Benchmark) +public class LinkedListGC { + + LinkedList<Object> list; + + @Param({"1", "10", "100", "1000"}) + private int size; + + + @Setup + public void setup() { + list = new LinkedList<>(); + for (int c = 0; c < size; c++) { + list.add(new Object()); + } + } + + @Benchmark + public void test() throws InterruptedException { + Thread.sleep(100); + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/LinkedListTraversal.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,192 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +import java.util.LinkedList; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(value = 1, jvmArgsAppend = {"-Xmx2g", "-Xms2g"}) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(Threads.MAX) +@State(Scope.Benchmark) +public class LinkedListTraversal { + + LinkedList<Object> list; + + @Param({"1", "10", "100", "1000"}) + private int size; + + @Param({"0", "1", "10", "100"}) + private int work; + + @Setup + public void setup() { + list = new LinkedList<>(); + for (int c = 0; c < size; c++) { + list.add(new Object()); + } + } + + @Benchmark + public void test() { + for (Object o : list) { + process(o); + } + } + + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + private void process(Object o) { + Blackhole.consumeCPU(work); + } + + /* + i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05) + + Benchmark (size) (work) Mode Cnt Score Error Units + + # ------------- Shenandoah + LinkedListTraversal.test 1 0 avgt 5 17.388 ± 0.424 ns/op + LinkedListTraversal.test 1 1 avgt 5 18.755 ± 0.102 ns/op + LinkedListTraversal.test 1 10 avgt 5 33.846 ± 0.185 ns/op + LinkedListTraversal.test 1 100 avgt 5 230.246 ± 4.150 ns/op + + LinkedListTraversal.test 10 0 avgt 5 126.915 ± 0.619 ns/op + LinkedListTraversal.test 10 1 avgt 5 139.774 ± 1.138 ns/op + LinkedListTraversal.test 10 10 avgt 5 299.499 ± 0.536 ns/op + LinkedListTraversal.test 10 100 avgt 5 2275.352 ± 108.207 ns/op + + LinkedListTraversal.test 100 0 avgt 5 1242.132 ± 12.020 ns/op + LinkedListTraversal.test 100 1 avgt 5 1362.940 ± 18.402 ns/op + LinkedListTraversal.test 100 
10 avgt 5 2937.820 ± 88.949 ns/op + LinkedListTraversal.test 100 100 avgt 5 22461.248 ± 215.900 ns/op + + LinkedListTraversal.test 1000 0 avgt 5 12407.080 ± 784.980 ns/op + LinkedListTraversal.test 1000 1 avgt 5 13568.390 ± 69.637 ns/op + LinkedListTraversal.test 1000 10 avgt 5 28889.154 ± 156.406 ns/op + LinkedListTraversal.test 1000 100 avgt 5 224457.455 ± 1615.782 ns/op + + # ------------- G1 + LinkedListTraversal.test 1 0 avgt 5 11.554 ± 0.044 ns/op + LinkedListTraversal.test 1 1 avgt 5 13.590 ± 0.512 ns/op + LinkedListTraversal.test 1 10 avgt 5 29.012 ± 0.098 ns/op + LinkedListTraversal.test 1 100 avgt 5 224.851 ± 1.311 ns/op + + LinkedListTraversal.test 10 0 avgt 5 101.351 ± 0.610 ns/op + LinkedListTraversal.test 10 1 avgt 5 114.732 ± 0.477 ns/op + LinkedListTraversal.test 10 10 avgt 5 278.078 ± 2.164 ns/op + LinkedListTraversal.test 10 100 avgt 5 2258.527 ± 95.300 ns/op + + LinkedListTraversal.test 100 0 avgt 5 997.334 ± 3.157 ns/op + LinkedListTraversal.test 100 1 avgt 5 1129.239 ± 7.287 ns/op + LinkedListTraversal.test 100 10 avgt 5 2692.583 ± 46.002 ns/op + LinkedListTraversal.test 100 100 avgt 5 22185.850 ± 119.166 ns/op + + LinkedListTraversal.test 1000 0 avgt 5 10459.159 ± 619.495 ns/op + LinkedListTraversal.test 1000 1 avgt 5 11306.074 ± 57.860 ns/op + LinkedListTraversal.test 1000 10 avgt 5 26786.378 ± 98.852 ns/op + LinkedListTraversal.test 1000 100 avgt 5 223273.974 ± 1668.267 ns/op + + # ------------- Parallel + LinkedListTraversal.test 1 0 avgt 5 12.045 ± 0.102 ns/op + LinkedListTraversal.test 1 1 avgt 5 13.521 ± 0.109 ns/op + LinkedListTraversal.test 1 10 avgt 5 28.933 ± 0.170 ns/op + LinkedListTraversal.test 1 100 avgt 5 224.028 ± 1.007 ns/op + + LinkedListTraversal.test 10 0 avgt 5 99.943 ± 0.319 ns/op + LinkedListTraversal.test 10 1 avgt 5 113.020 ± 0.472 ns/op + LinkedListTraversal.test 10 10 avgt 5 278.874 ± 0.981 ns/op + LinkedListTraversal.test 10 100 avgt 5 2240.404 ± 68.685 ns/op + + LinkedListTraversal.test 100 0 avgt 5 984.926 ± 9.101 
ns/op + LinkedListTraversal.test 100 1 avgt 5 1123.761 ± 22.491 ns/op + LinkedListTraversal.test 100 10 avgt 5 2686.100 ± 27.615 ns/op + LinkedListTraversal.test 100 100 avgt 5 22200.145 ± 137.753 ns/op + + LinkedListTraversal.test 1000 0 avgt 5 10608.240 ± 1392.802 ns/op + LinkedListTraversal.test 1000 1 avgt 5 11415.247 ± 448.812 ns/op + LinkedListTraversal.test 1000 10 avgt 5 27146.258 ± 786.984 ns/op + LinkedListTraversal.test 1000 100 avgt 5 223566.680 ± 4465.310 ns/op + + Bottom-line: Shenandoah experiences slowdowns compared to G1 and Parallel, mostly visible + when the work associated with each element is small. For "no operation" mode, the overhead + is around 25%, and explained by more instructions emitted by Shenandoah which do more + memory accesses, see: + + Benchmark (size) (work) Mode Cnt Score Error Units + + # ------------- Shenandoah + LinkedListTraversal.test 1000 0 avgt 50 12354.399 ± 23.683 ns/op + LinkedListTraversal.test:CPI 1000 0 avgt 10 0.581 ± 0.003 #/op + LinkedListTraversal.test:L1-dcache-load-misses 1000 0 avgt 10 664.917 ± 9.217 #/op + LinkedListTraversal.test:L1-dcache-loads 1000 0 avgt 10 32601.580 ± 374.103 #/op <--- !!! + LinkedListTraversal.test:L1-dcache-stores 1000 0 avgt 10 15101.405 ± 176.765 #/op + LinkedListTraversal.test:branch-misses 1000 0 avgt 10 6.186 ± 0.311 #/op + LinkedListTraversal.test:branches 1000 0 avgt 10 13746.542 ± 154.449 #/op + LinkedListTraversal.test:bus-cycles 1000 0 avgt 10 1268.583 ± 13.952 #/op + LinkedListTraversal.test:cycles 1000 0 avgt 10 45603.892 ± 502.818 #/op + LinkedListTraversal.test:dTLB-load-misses 1000 0 avgt 10 1.584 ± 0.228 #/op + LinkedListTraversal.test:dTLB-loads 1000 0 avgt 9 32488.220 ± 487.012 #/op + LinkedListTraversal.test:dTLB-store-misses 1000 0 avgt 10 0.046 ± 0.017 #/op + LinkedListTraversal.test:dTLB-stores 1000 0 avgt 10 15058.174 ± 175.262 #/op + LinkedListTraversal.test:instructions 1000 0 avgt 10 78513.188 ± 756.405 #/op <--- !!! 
+ LinkedListTraversal.test:ref-cycles 1000 0 avgt 10 50832.373 ± 505.813 #/op + + # ------------- G1 + LinkedListTraversal.test 1000 0 avgt 50 10495.699 ± 65.219 ns/op + LinkedListTraversal.test:CPI 1000 0 avgt 10 0.618 ± 0.014 #/op + LinkedListTraversal.test:L1-dcache-load-misses 1000 0 avgt 10 377.307 ± 12.210 #/op + LinkedListTraversal.test:L1-dcache-loads 1000 0 avgt 10 22606.939 ± 518.441 #/op + LinkedListTraversal.test:L1-dcache-stores 1000 0 avgt 10 16238.168 ± 329.001 #/op + LinkedListTraversal.test:branch-misses 1000 0 avgt 10 4.739 ± 0.523 #/op + LinkedListTraversal.test:branches 1000 0 avgt 10 9589.771 ± 226.666 #/op + LinkedListTraversal.test:bus-cycles 1000 0 avgt 10 1077.738 ± 21.578 #/op + LinkedListTraversal.test:cycles 1000 0 avgt 10 38816.240 ± 706.088 #/op + LinkedListTraversal.test:dTLB-load-misses 1000 0 avgt 10 1.343 ± 0.182 #/op + LinkedListTraversal.test:dTLB-loads 1000 0 avgt 10 22527.332 ± 500.666 #/op + LinkedListTraversal.test:dTLB-store-misses 1000 0 avgt 8 0.047 ± 0.058 #/op + LinkedListTraversal.test:dTLB-stores 1000 0 avgt 8 16151.683 ± 322.656 #/op + LinkedListTraversal.test:instructions 1000 0 avgt 10 62848.740 ± 1555.431 #/op + LinkedListTraversal.test:ref-cycles 1000 0 avgt 10 43275.574 ± 748.133 #/op + + # ------------- Parallel + LinkedListTraversal.test 1000 0 avgt 50 10507.829 ± 65.413 ns/op + LinkedListTraversal.test:CPI 1000 0 avgt 10 0.617 ± 0.008 #/op + LinkedListTraversal.test:L1-dcache-load-misses 1000 0 avgt 9 379.135 ± 16.184 #/op + LinkedListTraversal.test:L1-dcache-loads 1000 0 avgt 9 22681.509 ± 390.124 #/op + LinkedListTraversal.test:L1-dcache-stores 1000 0 avgt 9 16162.990 ± 244.165 #/op + LinkedListTraversal.test:branch-misses 1000 0 avgt 10 4.777 ± 0.553 #/op + LinkedListTraversal.test:branches 1000 0 avgt 10 9537.536 ± 122.677 #/op + LinkedListTraversal.test:bus-cycles 1000 0 avgt 10 1072.327 ± 19.018 #/op + LinkedListTraversal.test:cycles 1000 0 avgt 10 38632.620 ± 934.336 #/op + 
LinkedListTraversal.test:dTLB-load-misses 1000 0 avgt 8 1.338 ± 0.148 #/op + LinkedListTraversal.test:dTLB-loads 1000 0 avgt 9 22627.587 ± 392.352 #/op + LinkedListTraversal.test:dTLB-store-misses 1000 0 avgt 8 0.035 ± 0.016 #/op + LinkedListTraversal.test:dTLB-stores 1000 0 avgt 9 16104.147 ± 230.044 #/op + LinkedListTraversal.test:instructions 1000 0 avgt 10 62623.520 ± 1394.639 #/op + LinkedListTraversal.test:ref-cycles 1000 0 avgt 10 42984.093 ± 1044.819 #/op + + These additional memory accesses are read barriers, e.g.: + + Shenandoah: + + 0.88% 0.54% 0x00007f2029549703: mov -0x8(%rcx),%r10 ; implicit exception: dispatches to 0x00007f2029549cb9 + 1.18% 0.95% 0x00007f2029549707: mov 0x10(%r10),%r10d ;*getfield next {reexecute=0 rethrow=0 return_oop=0} + ; - java.util.LinkedList$ListItr::next@32 (line 897) + ; - org.openjdk.shenandoah.scenarios.LinkedListTraversal::test@18 (line 36) + ; - org.openjdk.shenandoah.scenarios.generated.LinkedListTraversal_test_jmhTest::test_avgt_jmhStub@15 (line 213) + + G1/Parallel: + + 0.95% 0.72% 0x00007fe7c554707b: mov 0x10(%r11),%r10d ;*getfield next {reexecute=0 rethrow=0 return_oop=0} + ; - java.util.LinkedList$ListItr::next@32 (line 897) + ; - org.openjdk.shenandoah.scenarios.LinkedListTraversal::test@18 (line 36) + ; - org.openjdk.shenandoah.scenarios.generated.LinkedListTraversal_test_jmhTest::test_avgt_jmhStub@15 (line 213) + ; implicit exception: dispatches to 0x00007fe7c5547305 + + */ + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/ReadWriteBarriers.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,38 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class ReadWriteBarriers { + + AtomicInteger ai; + + @Setup + public void setup() { + ai = new AtomicInteger(); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public Object cas() { + AtomicInteger ai = this.ai; + int cur = ai.get(); + return ai.compareAndSet(cur, cur + 1); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public Object incrAndGet() { + return ai.incrementAndGet(); + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/Synchronizers.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,78 @@ +package org.openjdk.gcbench.wip; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(value = 1, jvmArgsAppend = "-Xss32m") +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Threads(Threads.MAX) +@State(Scope.Benchmark) +public class Synchronizers { + + List<Object> list; + + @Param({"40000"}) + private int size; + + @Setup + public void setup() { + list = new ArrayList<>(); + for (int c = 0; c < size; c++) { + list.add(new Object()); + } + } + + @Benchmark + public void test(Blackhole bh) throws InterruptedException { + recursiveLock(bh, list, 0); + } + + private void recursiveLock(Blackhole bh, List<Object> list, int i) { + if (i < list.size()) { + Object o0 = list.get(i + 0); + Object o1 = list.get(i + 1); + Object o2 = list.get(i + 2); + Object o3 = list.get(i + 3); + Object o4 = list.get(i + 4); + Object o5 = list.get(i + 5); + Object o6 = list.get(i + 6); + Object o7 = list.get(i + 7); + Object o8 = list.get(i + 8); + Object o9 = list.get(i + 9); + synchronized (o0) { + synchronized (o1) { + synchronized (o2) { + synchronized (o3) { + synchronized (o4) { + synchronized (o5) { + synchronized (o6) { + synchronized (o7) { + synchronized (o8) { + synchronized (o9) { + recursiveLock(bh, list, i + 10); + } + } + } + } + } + } + } + } + } + } + } else { + for (int c = 0; c < size; c++) { + bh.consume(list.get(c).hashCode()); + list.set(c, new Object()); + } + } + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/WeakCasBarriers.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,36 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class WeakCasBarriers { + + AtomicReference ai; + + Object t1, t2; + + @Setup + public void setup() { + ai = new AtomicReference(); + t1 = new Object(); + t2 = new Object(); + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + AtomicReference ai = this.ai; + ai.weakCompareAndSet(t1, t2); + ai.weakCompareAndSet(t2, t1); + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/WeakRefs.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,62 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; + +import java.lang.ref.ReferenceQueue; +import java.lang.ref.WeakReference; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class WeakRefs { + + @Param({"1", "10", "100", "1000", "10000"}) + int count; + + ReferenceQueue<Target> refq; + Target[] targets; + MyRef[] refs; + + @Setup + public void setup() { + refq = new ReferenceQueue<>(); + targets = new Target[count]; + refs = new MyRef[count]; + for (int c = 0; c < count; c++) { + Target o = new Target(); + targets[c] = o; + refs[c] = new MyRef(refq, o, c); + } + } + + @Benchmark + public void test() { + targets[ThreadLocalRandom.current().nextInt(count)] = null; + + MyRef r; + while ((r = (MyRef)refq.poll()) != null) { + Target o = new Target(); + targets[r.index] = o; + refs[r.index] = new MyRef(refq, o, r.index); + } + } + + static class MyRef extends WeakReference<Target> { + int index; + public MyRef(ReferenceQueue<Target> refq, Target obj, int idx) { + super(obj, refq); + this.index = idx; + } + } + + static class Target { + + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/WriteBarriersKnownNew.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,35 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class WriteBarriersKnownNew { + + Target target; + Target targetNull; + + @Setup + public void setup() { + target = new Target(); + targetNull = null; + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void test() { + target.field = new Object(); + } + + static class Target { + Object field; + } + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/main/java/org/openjdk/gcbench/wip/WriteBarriersNullCheck.java Wed Nov 23 16:04:53 2016 +0100 @@ -0,0 +1,45 @@ +package org.openjdk.shenandoah.wip; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Threads(1) +@State(Scope.Benchmark) +public class WriteBarriersNullCheck { + + Target target; + Target targetNull; + + @Setup + public void setup() { + target = new Target(); + targetNull = null; + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void storeToNull() { + try { + targetNull.field = null; + } catch (NullPointerException e) { + // expected + } + } + + @Benchmark + @CompilerControl(CompilerControl.Mode.DONT_INLINE) + public void storeNull() { + target.field = null; + } + + static class Target { + Object field; + } + +}