changeset 0:f8496889e1ac

Initial import.
author shade
date Wed, 23 Nov 2016 16:04:53 +0100
parents
children 06dbd0d0fe69
files pom.xml src/main/java/org/openjdk/gcbench/GCBench.java src/main/java/org/openjdk/gcbench/alloc/plain/Objects.java src/main/java/org/openjdk/gcbench/alloc/plain/PrimArray.java src/main/java/org/openjdk/gcbench/alloc/plain/RefArray.java src/main/java/org/openjdk/gcbench/alloc/ratelimited/Objects.java src/main/java/org/openjdk/gcbench/alloc/ratelimited/PrimArray.java src/main/java/org/openjdk/gcbench/alloc/ratelimited/RefArray.java src/main/java/org/openjdk/gcbench/fragger/ArrayFragger.java src/main/java/org/openjdk/gcbench/fragger/LinkedListFragger.java src/main/java/org/openjdk/gcbench/fragger/TreeFragger.java src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersKnownNew.java src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersKnownNulls.java src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersRandom.java src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersPrimitiveFailure.java src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersPrimitiveSuccess.java src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersRefFailure.java src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersRefSuccess.java src/main/java/org/openjdk/gcbench/runtime/cmpxchg/WeakCasLoop.java src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersArrays.java src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersCachePressure.java src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersFields.java src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersKnownNull.java src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersPrimitive.java src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersRef.java src/main/java/org/openjdk/gcbench/util/Dummy.java src/main/java/org/openjdk/gcbench/util/TokenBucketBench.java src/main/java/org/openjdk/gcbench/util/ratelimit/MultiTokenBucket.java src/main/java/org/openjdk/gcbench/util/ratelimit/RateLimiter.java src/main/java/org/openjdk/gcbench/util/ratelimit/TokenBucket.java 
src/main/java/org/openjdk/gcbench/wip/LinkedListGC.java src/main/java/org/openjdk/gcbench/wip/LinkedListTraversal.java src/main/java/org/openjdk/gcbench/wip/ReadWriteBarriers.java src/main/java/org/openjdk/gcbench/wip/Synchronizers.java src/main/java/org/openjdk/gcbench/wip/WeakCasBarriers.java src/main/java/org/openjdk/gcbench/wip/WeakRefs.java src/main/java/org/openjdk/gcbench/wip/WriteBarriersKnownNew.java src/main/java/org/openjdk/gcbench/wip/WriteBarriersNullCheck.java
diffstat 38 files changed, 2930 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pom.xml	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,182 @@
+<!--
+Copyright (c) 2014, Oracle America, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Oracle nor the names of its contributors may be used
+   to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.openjdk</groupId>
+    <artifactId>gc-bench</artifactId>
+    <version>1.0</version>
+    <packaging>jar</packaging>
+
+    <name>JMH benchmark sample: Java</name>
+
+    <!--
+       This is the demo/sample template build script for building Java benchmarks with JMH.
+       Edit as needed.
+    -->
+
+    <prerequisites>
+        <maven>3.0</maven>
+    </prerequisites>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.openjdk.jmh</groupId>
+            <artifactId>jmh-core</artifactId>
+            <version>${jmh.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.openjdk.jol</groupId>
+            <artifactId>jol-core</artifactId>
+            <version>0.6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.openjdk.jmh</groupId>
+            <artifactId>jmh-generator-annprocess</artifactId>
+            <version>${jmh.version}</version>
+            <scope>provided</scope>
+        </dependency>
+    </dependencies>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+
+        <!--
+            JMH version to use with this project.
+          -->
+        <jmh.version>1.17</jmh.version>
+
+        <!--
+            Java source/target to use for compilation.
+          -->
+        <javac.target>1.8</javac.target>
+
+        <!--
+            Name of the benchmark Uber-JAR to generate.
+          -->
+        <uberjar.name>benchmarks</uberjar.name>
+    </properties>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.1</version>
+                <configuration>
+                    <compilerVersion>${javac.target}</compilerVersion>
+                    <source>${javac.target}</source>
+                    <target>${javac.target}</target>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>2.2</version>
+                <executions>
+                    <execution>
+                        <id>uberjar</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <finalName>${uberjar.name}</finalName>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.openjdk.jmh.Main</mainClass>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                    <execution>
+                        <id>gcbench</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <finalName>gcbench</finalName>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.openjdk.gcbench.GCBench</mainClass>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+        <pluginManagement>
+            <plugins>
+                <plugin>
+                    <artifactId>maven-clean-plugin</artifactId>
+                    <version>2.5</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-deploy-plugin</artifactId>
+                    <version>2.8.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-install-plugin</artifactId>
+                    <version>2.5.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-jar-plugin</artifactId>
+                    <version>2.4</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-javadoc-plugin</artifactId>
+                    <version>2.9.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-resources-plugin</artifactId>
+                    <version>2.6</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-site-plugin</artifactId>
+                    <version>3.3</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-source-plugin</artifactId>
+                    <version>2.2.1</version>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-surefire-plugin</artifactId>
+                    <version>2.17</version>
+                </plugin>
+            </plugins>
+        </pluginManagement>
+    </build>
+
+</project>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/GCBench.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,300 @@
+package org.openjdk.gcbench;
+
+import org.openjdk.gcbench.alloc.ratelimited.Objects;
+import org.openjdk.gcbench.alloc.ratelimited.PrimArray;
+import org.openjdk.gcbench.alloc.ratelimited.RefArray;
+import org.openjdk.gcbench.fragger.ArrayFragger;
+import org.openjdk.gcbench.util.Dummy;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.profile.GCProfiler;
+import org.openjdk.jmh.profile.SafepointsProfiler;
+import org.openjdk.jmh.results.Result;
+import org.openjdk.jmh.results.RunResult;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.TimeValue;
+import org.openjdk.jmh.runner.options.VerboseMode;
+
+import java.io.PrintWriter;
+import java.util.Map;
+
+public class GCBench {
+
+    private final PrintWriter pw;
+    private final Options baseOpts;
+    private int maxHeapMB;
+
+    public static void main(String... args) throws RunnerException {
+        // Command-line arguments are not read; the suite is configured in the constructor.
+        new GCBench().run();
+    }
+
+    private void run() throws RunnerException {
+        maxHeapMB = calibrateMaxHeap(); // stored in the field that doRun_ThreadsHeapX reads to size its heap sweep
+        // Only the fragmentation suite is enabled; the two allocation-pressure suites are commented out.
+//        runAllocationPressure_Peak();
+//        runAllocationPressure_RateLimited();
+        runFraggers_RateLimited();
+    }
+
+    private void runAllocationPressure_Peak() throws RunnerException {
+        pw.println("=== PEAK ALLOCATION PRESSURE TESTS");
+        pw.println();
+        // Prints a section banner, then runs the three alloc.plain benchmarks in order.
+        pw.println("Allocates the objects in almost completely empty heap. This tests how well the collector" +
+                " can withstand peak allocation pressure without taking care of anything else.");
+        pw.println();
+        // Fully-qualified names are needed: the same simple names are imported from alloc.ratelimited.
+        pw.println("*** Allocating Object:");
+        pw.println();
+        doRun_ThreadsHeap(org.openjdk.gcbench.alloc.plain.Objects.class);
+        pw.println();
+        // The array benchmarks go through the variant that also sweeps the "size" parameter.
+        pw.println("*** Allocating int[]:");
+        pw.println();
+        doRun_ThreadsHeapSize(org.openjdk.gcbench.alloc.plain.PrimArray.class);
+        pw.println();
+
+        pw.println("*** Allocating Object[]:");
+        pw.println();
+        doRun_ThreadsHeapSize(org.openjdk.gcbench.alloc.plain.RefArray.class);
+        pw.println();
+    }
+
+    private void runAllocationPressure_RateLimited() throws RunnerException {
+        pw.println("=== RATE-LIMITED ALLOCATION PRESSURE TESTS");
+        pw.println();
+        // The continuation literal starts with a space so the two sentences are not glued together.
+        pw.println("Allocates the objects in almost completely empty heap, but with the controlled allocation rate." +
+                " This tests when the collector SLA requirements break under dynamic allocation pressure.");
+        pw.println();
+        // Objects, PrimArray and RefArray here are the alloc.ratelimited classes imported by this file.
+        pw.println("*** Allocating Object:");
+        pw.println();
+        doRun_AllocRate(Objects.class);
+        pw.println();
+        // The array benchmarks go through the variant that prints a "size = ..." header per size.
+        pw.println("*** Allocating int[]:");
+        pw.println();
+        doRun_AllocRateSize(PrimArray.class);
+        pw.println();
+
+        pw.println("*** Allocating Object[]:");
+        pw.println();
+        doRun_AllocRateSize(RefArray.class);
+        pw.println();
+    }
+
+    private void runFraggers_RateLimited() throws RunnerException {
+        pw.println("=== RATE-LIMITED FRAGMENTATION TESTS");
+        pw.println();
+        // ArrayFragger is the only fragger benchmark run from here.
+        pw.println("*** Array fragger:");
+        pw.println();
+        doRun_AllocRate(ArrayFragger.class);
+        pw.println();
+    }
+
+    public GCBench() {
+        pw = new PrintWriter(System.out, true); // second argument turns on auto-flush for println/printf
+        // Base JMH options; the doRun_* and calibrateRate runs derive their Options from these via parent(...).
+        baseOpts = new OptionsBuilder()
+                .detectJvmArgs()
+                .warmupIterations(1)
+                .warmupTime(TimeValue.seconds(1))
+                .measurementIterations(3)
+                .measurementTime(TimeValue.seconds(1))
+                .forks(1)
+                .threads(Threads.MAX) // default thread count; doRun_ThreadsHeapX sets its own per run
+//                .threads(1)
+                .addProfiler(GCProfiler.class) // presumably the source of the "gc.alloc.rate" column - TODO confirm
+                .addProfiler(SafepointsProfiler.class) // presumably the source of the "safepoints.*" columns - TODO confirm
+                .verbosity(VerboseMode.SILENT)
+                .build();
+    }
+
+    private int calibrateMaxHeap() {
+        pw.println("=== Calibrating the target heap size");
+        pw.println();
+        // Probe Dummy runs with growing -Xmx/-Xms; a run that throws RunnerException ends the current round.
+        int baseHeapMB = 1000;
+        int latestSuccessMB = 0;
+        boolean progress;
+        do {
+            progress = false;
+            for (int incr = 100; incr > 0 && incr <= Integer.MAX_VALUE - baseHeapMB; incr *= 2) {
+                int heapMB = baseHeapMB + incr; // megabytes; the loop guard rules out int overflow here
+                pw.print(heapMB + "? ");
+                pw.flush();
+                Options opts = new OptionsBuilder()
+                        .include(Dummy.class.getCanonicalName())
+                        .threads(1)
+                        .jvmArgsAppend("-Xmx" + heapMB + "m", "-Xms" + heapMB + "m")
+                        .verbosity(VerboseMode.SILENT)
+                        .build();
+                try {
+                    new Runner(opts).runSingle();
+                    latestSuccessMB = heapMB;
+                    progress = true;
+                } catch (RunnerException e) {
+                    break; // this size did not run: stop doubling and refine from the last success
+                }
+            }
+            baseHeapMB = latestSuccessMB; // the next round restarts the doubling from the best size so far
+        } while (progress);
+
+        pw.println();
+        pw.println("Max heap size is " + latestSuccessMB + " Mb");
+        pw.println();
+
+        return latestSuccessMB;
+    }
+
+    private int calibrateRate(Class<?> benchmark, int size) throws RunnerException {
+        // Run the benchmark once with "rate" set to Integer.MAX_VALUE and the given "size",
+        // then return the primary score of that run narrowed to an int.
+        Options unthrottled = new OptionsBuilder()
+                .parent(baseOpts)
+                .include(benchmark.getName())
+                .param("size", String.valueOf(size))
+                .param("rate", String.valueOf(Integer.MAX_VALUE))
+                .build();
+        RunResult calibration = new Runner(unthrottled).runSingle();
+        double score = calibration.getPrimaryResult().getScore();
+        return (int) score;
+    }
+
+    private void doRun_ThreadsHeapSize(Class<?> benchmark) {
+        // Sweep the "size" parameter over 1, 100, 10000 and 1000000, one threads/heap table per size.
+        for (int size = 1; size <= 1000000; size *= 100) {
+            pw.println();
+            pw.println("size = " + size);
+            pw.println();
+            doRun_ThreadsHeapX(benchmark, new OptionsBuilder()
+                    .parent(baseOpts)
+                    .param("size", String.valueOf(size))
+                    .build());
+        }
+    }
+
+    private void doRun_ThreadsHeap(Class<?> benchmark) {
+        doRun_ThreadsHeapX(benchmark, baseOpts); // no extra parameters: sweep with the shared base options as-is
+    }
+
+    private void doRun_ThreadsHeapX(Class<?> benchmark, Options baseOpts) {
+        pw.printf("%-10s %-20s %-40s %-40s %-44s %-44s %n",
+                "threads",
+                "heap, MB",
+                "performance",
+                "allocation rate",
+                "pauses (sum, 99%, 99.9%, 99.99%)",
+                "ttsp (sum, 99%, 99.9%, 99.99%)"
+        );
+        // Thread counts double from 1 up to the number of available processors.
+        int maxThreads = Runtime.getRuntime().availableProcessors();
+        for (int threads = 1; threads <= maxThreads; threads *= 2) {
+            pw.println();
+            int step = Math.max(1, maxHeapMB / 8); // never 0, otherwise the heap loop below could not advance
+            int margin = step; // skip the smallest and largest heap sizes by one step
+            for (int heapMB = margin; heapMB < maxHeapMB - margin; heapMB += step) {
+                Options opts = new OptionsBuilder()
+                        .parent(baseOpts)
+                        .include(benchmark.getName())
+                        .threads(threads)
+                        .jvmArgsAppend("-Xmx" + heapMB + "m", "-Xms" + heapMB + "m")
+                        .build();
+                // One table row per (threads, heap) point; secondary results are looked up by metric label.
+                try {
+                    RunResult result = new Runner(opts).runSingle();
+
+                    Result prim = result.getPrimaryResult();
+                    Map<String, Result> sec = result.getSecondaryResults();
+
+                    pw.printf("%-10d %-20d %-40s %-40s %10s %10s %10s %10s %10s %10s %10s %10s %n",
+                            threads,
+                            heapMB,
+                            prim,
+                            sec.get("·gc.alloc.rate"),
+                            sec.get("·safepoints.pause"),
+                            sec.get("·safepoints.pause.p0.99"),
+                            sec.get("·safepoints.pause.p0.999"),
+                            sec.get("·safepoints.pause.p0.9999"),
+                            sec.get("·safepoints.ttsp"),
+                            sec.get("·safepoints.ttsp.p0.99"),
+                            sec.get("·safepoints.ttsp.p0.999"),
+                            sec.get("·safepoints.ttsp.p0.9999")
+                    );
+                } catch (RunnerException e) {
+                    pw.printf("%-10d %-20d run failed: %s%n", threads, heapMB, e.getMessage()); // e.g. OOME; keep sweeping
+                }
+            }
+        }
+    }
+
+
+    private void doRun_AllocRateSize(Class<?> benchmark) throws RunnerException {
+        // NOTE(review): doRun_AllocRateX sets "size" again in its own loop; confirm which value JMH ends up using.
+        for (int size = 1; size <= 1000000; size *= 100) {
+            pw.println();
+            pw.println("size = " + size);
+            pw.println();
+            doRun_AllocRateX(benchmark, new OptionsBuilder()
+                    .parent(baseOpts)
+                    .param("size", String.valueOf(size))
+                    .build());
+        }
+    }
+
+    private void doRun_AllocRate(Class<?> benchmark) throws RunnerException {
+        doRun_AllocRateX(benchmark, baseOpts); // no extra parameters: sweep with the shared base options as-is
+    }
+
+    private void doRun_AllocRateX(Class<?> benchmark, Options baseOpts) throws RunnerException {
+        pw.printf("%-10s %-20s %-40s %-40s %-44s %-44s %n",
+                "size",
+                "target rate",
+                "actual rate",
+                "allocation rate",
+                "pauses",
+                "ttsp"
+                );
+        // For each size: measure the score at rate = Integer.MAX_VALUE, then sweep target rates up to that score.
+        for (int size = 1; size <= 1000000; size *= 100) {
+            int maxRate = calibrateRate(benchmark, size);
+            int step = Math.max(1, maxRate / 10); // never 0: a calibrated rate below 10 used to loop forever
+            pw.println();
+            for (long rate = step; rate <= maxRate; rate += step) { // long: rate + step cannot wrap near Integer.MAX_VALUE
+                Options opts = new OptionsBuilder()
+                        .parent(baseOpts)
+                        .include(benchmark.getName())
+                        .param("size", String.valueOf(size))
+                        .param("rate", String.valueOf(rate))
+                        .build();
+
+                RunResult result = new Runner(opts).runSingle();
+
+                Result prim = result.getPrimaryResult();
+                Map<String, Result> sec = result.getSecondaryResults();
+
+                pw.printf("%-10d %-20d %-40s %-40s %10s %10s %10s %10s %10s %10s %10s %10s %n",
+                        size,
+                        rate,
+                        prim,
+                        sec.get("·gc.alloc.rate"),
+                        sec.get("·safepoints.pause"),
+                        sec.get("·safepoints.pause.p0.99"),
+                        sec.get("·safepoints.pause.p0.999"),
+                        sec.get("·safepoints.pause.p0.9999"),
+                        sec.get("·safepoints.ttsp"),
+                        sec.get("·safepoints.ttsp.p0.99"),
+                        sec.get("·safepoints.ttsp.p0.999"),
+                        sec.get("·safepoints.ttsp.p0.9999")
+                );
+            }
+
+        }
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/alloc/plain/Objects.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,18 @@
+package org.openjdk.gcbench.alloc.plain;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Benchmark)
+public class Objects {
+    // Benchmark body: allocate one java.lang.Object per invocation and return it to the harness.
+    @Benchmark
+    public Object test() {
+        return new Object();
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/alloc/plain/PrimArray.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,21 @@
+package org.openjdk.gcbench.alloc.plain;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Benchmark)
+public class PrimArray {
+    // Length of the int[] allocated by test(); the declared default is 1.
+    @Param({"1"})
+    int size;
+    // Benchmark body: allocate one int[size] per invocation and return it to the harness.
+    @Benchmark
+    public Object test() {
+        return new int[size];
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/alloc/plain/RefArray.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,21 @@
+package org.openjdk.gcbench.alloc.plain;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Benchmark)
+public class RefArray {
+    // Length of the Object[] allocated by test(); the declared default is 1.
+    @Param({"1"})
+    int size;
+    // Benchmark body: allocate one Object[size] per invocation and return it to the harness.
+    @Benchmark
+    public Object test() {
+        return new Object[size];
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/alloc/ratelimited/Objects.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,29 @@
+package org.openjdk.gcbench.alloc.ratelimited;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+public class Objects {
+    // Passed to the MultiTokenBucket constructor in setup(); presumably operations per second - TODO confirm units.
+    @Param({"1"})
+    int rate;
+    // Created in setup(); this class is Scope.Benchmark state, so one instance serves every benchmark thread.
+    private MultiTokenBucket bucket;
+
+    @Setup
+    public void setup() {
+        bucket = new MultiTokenBucket(rate);
+    }
+    // Benchmark body: call bucket.limit() first, then allocate and return one Object.
+    @Benchmark
+    public Object test() {
+        bucket.limit();
+        return new Object();
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/alloc/ratelimited/PrimArray.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,32 @@
+package org.openjdk.gcbench.alloc.ratelimited;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+public class PrimArray {
+    // Created in setup(); this class is Scope.Benchmark state, so one instance serves every benchmark thread.
+    MultiTokenBucket bucket;
+    // Length of the int[] allocated by test(); the declared default is 1.
+    @Param({"1"})
+    int size;
+    // Passed to the MultiTokenBucket constructor in setup(); presumably operations per second - TODO confirm units.
+    @Param({"1"})
+    int rate;
+
+    @Setup
+    public void setup() {
+        bucket = new MultiTokenBucket(rate);
+    }
+    // Benchmark body: call bucket.limit() first, then allocate and return one int[size].
+    @Benchmark
+    public Object test() {
+        bucket.limit();
+        return new int[size];
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/alloc/ratelimited/RefArray.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,33 @@
+package org.openjdk.gcbench.alloc.ratelimited;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+public class RefArray {
+    // Length of the Object[] allocated by test(); the declared default is 1.
+    @Param({"1"})
+    int size;
+    // Passed to the MultiTokenBucket constructor in setup(); presumably operations per second - TODO confirm units.
+    @Param({"1"})
+    int rate;
+    // Created in setup(); this class is Scope.Benchmark state, so one instance serves every benchmark thread.
+    private MultiTokenBucket bucket;
+
+    @Setup
+    public void setup() {
+        bucket = new MultiTokenBucket(rate);
+    }
+    // Benchmark body: call bucket.limit() first, then allocate and return one Object[size].
+    @Benchmark
+    public Object test() {
+        bucket.limit();
+        return new Object[size];
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/fragger/ArrayFragger.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,62 @@
+package org.openjdk.gcbench.fragger;
+
+import org.openjdk.gcbench.util.ratelimit.TokenBucket;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+//@Fork(value = 1, jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Threads(Threads.MAX)
+@State(Scope.Benchmark)
+public class ArrayFragger {
+    // Length in bytes of every byte[] stored into the objects array.
+    @Param({"100"})
+    int size;
+    // Target size of the retained array contents in megabytes; only used to derive count in setup().
+    @Param({"1000"})
+    int ldsMB;
+    // Passed to the TokenBucket constructor in setup(); presumably operations per second - TODO confirm units.
+    @Param({"10"})
+    int rate;
+    // Slots holding the byte[] payloads; filled completely in setup(), overwritten at random by test().
+    Object[] objects;
+    // Number of slots in objects, computed in setup().
+    int count;
+    // Created in setup(); limit() is called at the start of every test() invocation.
+    TokenBucket bucket;
+
+    @Setup
+    public void setup() {
+        bucket = new TokenBucket(rate);
+        count = (int)Math.max(1, (1L * ldsMB * 1024 * 1024) / align(16 + 4 + size, 8)); // assumes 16 + 4 + size bytes per array, rounded up to 8 - TODO confirm the layout estimate
+        objects = new Object[count];
+        for (int c = 0; c < count; c++) {
+            doStore(c, new byte[size]);
+        }
+    }
+    // Returns size unchanged when it is a multiple of align, otherwise the next multiple computed by integer division.
+    public static int align(int size, int align) {
+        if ((size % align) == 0) {
+            return size;
+        } else {
+            return ((size / align) + 1) * align;
+        }
+    }
+    // Benchmark body: after bucket.limit(), replace one randomly chosen slot with a freshly allocated byte[size].
+    @Benchmark
+    public void test() {
+        bucket.limit();
+        doStore(ThreadLocalRandom.current().nextInt(count), new byte[size]);
+    }
+    // Kept out of line via DONT_INLINE; performs the single reference store into the array.
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void doStore(int idx, byte[] obj) {
+        objects[idx] = obj;
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/fragger/LinkedListFragger.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,56 @@
+package org.openjdk.gcbench.fragger;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(Threads.MAX)
+@State(Scope.Benchmark)
+public class LinkedListFragger {
+    // Length in bytes of every byte[] stored in the list.
+    @Param({"1", "10", "100", "1000", "10000"})
+    int objSize;
+    // Target size of the retained list contents in megabytes; only used to derive count in setup().
+    @Param({"10", "100", "1000"})
+    int ldsMB;
+    // List holding the byte[] payloads; filled in setup(), elements replaced at random by test().
+    List<Object> objects;
+    // Number of list elements, computed in setup().
+    int count;
+
+    @Setup
+    public void setup() {
+        count = (int)Math.max(1, (1L * ldsMB * 1024 * 1024) / align(16 + 4 + objSize, 8)); // assumes 16 + 4 + objSize bytes per array, rounded up to 8 - TODO confirm the layout estimate
+        objects = new LinkedList<>(); // diamond instead of the raw type: no unchecked assignment to List<Object>
+        for (int c = 0; c < count; c++) {
+            objects.add(new byte[objSize]);
+        }
+    }
+    // Returns size unchanged when it is a multiple of align, otherwise the next multiple computed by integer division.
+    public static int align(int size, int align) {
+        if ((size % align) == 0) {
+            return size;
+        } else {
+            return ((size / align) + 1) * align;
+        }
+    }
+    // Benchmark body: replace one randomly chosen list element with a freshly allocated byte[objSize].
+    @Benchmark
+    public void test() {
+        doStore(ThreadLocalRandom.current().nextInt(count), new byte[objSize]);
+    }
+    // Kept out of line via DONT_INLINE; List.set on a LinkedList has to walk the list to reach idx.
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void doStore(int idx, byte[] obj) {
+        objects.set(idx, obj);
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/fragger/TreeFragger.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,93 @@
+package org.openjdk.gcbench.fragger;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(Threads.MAX)
+@State(Scope.Benchmark)
+public class TreeFragger {
+    // Length in bytes of every byte[] payload attached to a tree node.
+    @Param({"1", "10", "100", "1000", "10000"})
+    int objSize;
+    // Target size of the retained tree in megabytes; only used to derive count in setup().
+    @Param({"10", "100", "1000"})
+    int ldsMB;
+    // Root of the binary tree built in setup(); test() replaces payloads of existing nodes only.
+    Node root;
+    // Number of addresses (0 .. count-1) for which setup() creates the node path.
+    int count;
+
+    @Setup
+    public void setup() {
+        int sizePerCount =
+                        align(12 + 3*4, 8) +        // Node
+                        align(12 + 4 + objSize, 8); // array
+        count = (int)Math.max(1, (1L * ldsMB * 1024 * 1024) / sizePerCount); // NOTE(review): 12-byte header estimate here - confirm against the target JVM layout
+
+        root = new Node(new byte[objSize]);
+        // For every address, walk its bits from the highest set bit down, creating missing nodes on the way.
+        for (int addr = 0; addr < count; addr++) {
+            Node cur = root;
+            for (int m = 31 - Integer.numberOfLeadingZeros(addr); m >= 0; m--) { // addr == 0 gives m == -1: no steps, stays at root
+                if ((addr & (1 << m)) != 0) {
+                    if (cur.left == null) {
+                        cur.left = new Node(new byte[objSize]);
+                    }
+                    cur = cur.left;
+                } else {
+                    if (cur.right == null) {
+                        cur.right = new Node(new byte[objSize]);
+                    }
+                    cur = cur.right;
+                }
+            }
+        }
+        System.out.println("Setup completed");
+    }
+
+
+    // Returns size unchanged when it is a multiple of align, otherwise the next multiple computed by integer division.
+    public static int align(int size, int align) {
+        if ((size % align) == 0) {
+            return size;
+        } else {
+            return ((size / align) + 1) * align;
+        }
+    }
+    // Benchmark body: replace the payload of the node at a random address with a freshly allocated byte[objSize].
+    @Benchmark
+    public void test() {
+        doStore(ThreadLocalRandom.current().nextInt(count), new byte[objSize]);
+    }
+    // Same bit walk as in setup(), without null checks: it relies on setup() having created every node for addr < count.
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void doStore(int addr, byte[] obj) {
+        Node cur = root;
+        for (int m = 31 - Integer.numberOfLeadingZeros(addr); m >= 0; m--) {
+            if ((addr & (1 << m)) != 0) {
+                cur = cur.left;
+            } else {
+                cur = cur.right;
+            }
+        }
+        cur.payload = obj;
+    }
+    // Tree node: two child links plus the payload reference that doStore overwrites.
+    static class Node {
+        Node left;
+        Node right;
+        Object payload;
+
+        public Node(Object payload) {
+            this.payload = payload;
+        }
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersKnownNew.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,95 @@
+package org.openjdk.gcbench.runtime.cmp;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Thread)
+public class ACmpBarriersKnownNew {
+
+    Object target;
+
+    @Setup
+    public void setup() {
+        target = new Object(); // non-null operand later handed to doLeft/doRight
+    }
+
+    @Benchmark
+    public void left() {
+        doLeft(target); // boolean result is discarded; doLeft is marked DONT_INLINE
+    }
+
+    @Benchmark
+    public void right() {
+        doRight(target); // boolean result is discarded; doRight is marked DONT_INLINE
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private boolean doLeft(Object t1) {
+        return t1 == new Object(); // parameter on the left, fresh allocation on the right: can never be true
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private boolean doRight(Object t2) {
+        return new Object() == t2; // fresh allocation on the left, parameter on the right: can never be true
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                   Mode  Cnt  Score   Error  Units
+
+            # Shenandoah
+            ACmpBarriersKnownNew.left   avgt    5  1.983 ± 0.100  ns/op
+            ACmpBarriersKnownNew.right  avgt    5  1.977 ± 0.005  ns/op
+
+            # G1
+            ACmpBarriersKnownNew.left   avgt    5  2.057 ± 0.019  ns/op
+            ACmpBarriersKnownNew.right  avgt    5  2.059 ± 0.022  ns/op
+
+            # Parallel
+            ACmpBarriersKnownNew.left   avgt    5  2.057 ± 0.030  ns/op
+            ACmpBarriersKnownNew.right  avgt    5  2.060 ± 0.007  ns/op
+
+        The difference is not caused by different compilation of doLeft/doRight methods,
+        but rather the additional read barriers in the JMH loop itself. Note that
+        additional barrier code *IMPROVES* performance.
+
+            Benchmark                                        Mode  Cnt   Score    Error  Units
+
+            # Shenandoah
+            ACmpBarriersKnownNew.left                        avgt   25   1.970 ±  0.004  ns/op
+            ACmpBarriersKnownNew.left:CPI                    avgt    5   0.330 ±  0.012   #/op
+            ACmpBarriersKnownNew.left:L1-dcache-load-misses  avgt    5   0.010 ±  0.005   #/op
+            ACmpBarriersKnownNew.left:L1-dcache-loads        avgt    5  11.095 ±  0.380   #/op  <--- more loads
+            ACmpBarriersKnownNew.left:cycles                 avgt    5   7.820 ±  0.088   #/op  <--- yet, fewer cycles
+            ACmpBarriersKnownNew.left:instructions           avgt    5  23.665 ±  0.833   #/op  <--- a few more instructions
+
+            # G1
+            ACmpBarriersKnownNew.left                        avgt   25   2.061 ±  0.008  ns/op
+            ACmpBarriersKnownNew.left:CPI                    avgt    5   0.363 ±  0.010   #/op
+            ACmpBarriersKnownNew.left:L1-dcache-load-misses  avgt    5   0.010 ±  0.012   #/op
+            ACmpBarriersKnownNew.left:L1-dcache-loads        avgt    5   9.322 ±  0.153   #/op
+            ACmpBarriersKnownNew.left:cycles                 avgt    5   8.115 ±  0.107   #/op
+            ACmpBarriersKnownNew.left:instructions           avgt    5  22.331 ±  0.681   #/op
+
+        The generated code for doLeft/doRight in Shenandoah/G1/Parallel is the same:
+
+                          [Verified Entry Point]
+         12.42%    7.23%    0x00007fbe6953f8c0: sub    $0x18,%rsp
+          0.37%    0.40%    0x00007fbe6953f8c7: mov    %rbp,0x10(%rsp)
+         11.19%   10.82%    0x00007fbe6953f8cc: xor    %eax,%eax           ; always false
+          1.53%    1.76%    0x00007fbe6953f8ce: add    $0x10,%rsp
+          0.22%    0.22%    0x00007fbe6953f8d2: pop    %rbp
+          0.23%    0.12%    0x00007fbe6953f8d3: test   %eax,0x18b70727(%rip)
+         10.71%   14.51%    0x00007fbe6953f8d9: retq
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersKnownNulls.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,77 @@
+package org.openjdk.gcbench.runtime.cmp;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Thread)
+public class ACmpBarriersKnownNulls {
+
+    Object target;
+
+    @Setup
+    public void setup() {
+        target = new Object(); // non-null, so the null comparisons in doLeft/doRight evaluate to false
+    }
+
+    @Benchmark
+    public void left() {
+        doLeft(target); // boolean result is discarded; doLeft is marked DONT_INLINE
+    }
+
+    @Benchmark
+    public void right() {
+        doRight(target); // boolean result is discarded; doRight is marked DONT_INLINE
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private boolean doLeft(Object t1) {
+        return t1 == null; // parameter on the left, null literal on the right
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private boolean doRight(Object t2) {
+        return null == t2; // null literal on the left, parameter on the right
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                     Mode  Cnt  Score   Error  Units
+
+            # Shenandoah
+            ACmpBarriersKnownNulls.left   avgt    5  2.235 ± 0.011  ns/op
+            ACmpBarriersKnownNulls.right  avgt    5  2.240 ± 0.089  ns/op
+
+            # G1
+            ACmpBarriersKnownNulls.left   avgt    5  1.971 ± 0.001  ns/op
+            ACmpBarriersKnownNulls.right  avgt    5  1.974 ± 0.021  ns/op
+
+            # Parallel
+            ACmpBarriersKnownNulls.left   avgt    5  1.977 ± 0.026  ns/op
+            ACmpBarriersKnownNulls.right  avgt    5  1.973 ± 0.001  ns/op
+
+        The difference is not caused by different compilation of doLeft/doRight methods,
+        but rather the additional read barriers in the JMH loop itself. The generated code
+        for doLeft/doRight in Shenandoah/G1/Parallel is the same:
+
+                               [Verified Entry Point]
+             11.16%    7.47%     0x00007f309d542240: mov    %eax,-0x14000(%rsp)
+              0.15%    0.07%     0x00007f309d542247: push   %rbp
+              0.22%    0.14%     0x00007f309d542248: sub    $0x10,%rsp
+             11.05%   10.65%     0x00007f309d54224c: test   %rdx,%rdx
+                              ╭  0x00007f309d54224f: je     0x00007f309d54225f
+              0.10%    0.06%  │  0x00007f309d542251: xor    %eax,%eax
+              0.11%    0.08%  │  0x00007f309d542253: add    $0x10,%rsp
+             11.43%   11.21%  │  0x00007f309d542257: pop    %rbp
+              0.07%    0.04%  │  0x00007f309d542258: test   %eax,0x18026da2(%rip)
+              0.07%    0.01%  │  0x00007f309d54225e: retq
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmp/ACmpBarriersRandom.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,106 @@
+package org.openjdk.gcbench.runtime.cmp;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 10, jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:-TieredCompilation"})
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Threads(1)
+@State(Scope.Thread)
+public class ACmpBarriersRandom {
+
+    @Param("10000")
+    private int size;
+
+    Object[] targets;
+
+    @Setup(Level.Iteration) // Level.Iteration: the array is refilled for each iteration
+    public void setup() {
+        Object[] cases = new Object[] {new Object(), new Object(), null}; // two distinct objects plus null
+        targets = new Object[size];
+
+        Random r = new Random(); // unseeded, so the mix differs between fills
+        for (int c = 0; c < size; c++) {
+            targets[c] = cases[r.nextInt(cases.length)]; // each slot gets one of the three cases
+        }
+    }
+
+    @Benchmark
+    public void test() {
+        Object[] targets = this.targets; // read the field once into a local
+        for (int c = 0; c < size - 1; c++) { // size - 1 comparisons of adjacent elements per op
+            acmp(targets[c], targets[c+1]); // result discarded; acmp is marked DONT_INLINE
+        }
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private boolean acmp(Object t1, Object t2) {
+        return t1 == t2; // reference comparison; either argument may be null
+    }
+
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                (size)  Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            ACmpBarriersRandom.test   10000  avgt   50  81.733 ± 0.237  us/op
+
+            # G1
+            ACmpBarriersRandom.test   10000  avgt   50  33.487 ± 0.082  us/op
+
+            # Parallel
+            ACmpBarriersRandom.test   10000  avgt   50  33.461 ± 0.049  us/op
+
+        The difference is explained by a more complicated acmp barrier that needs
+        to handle false negatives caused by concurrent evacuation.
+
+        Parallel and G1:
+
+                          [Verified Entry Point]
+          2.62%    3.27%    0x00007f03c5af48c0: sub    $0x18,%rsp
+          5.92%    5.57%    0x00007f03c5af48c7: mov    %rbp,0x10(%rsp)
+          1.39%    1.64%    0x00007f03c5af48cc: xor    %r10d,%r10d
+          1.67%    1.54%    0x00007f03c5af48cf: mov    $0x1,%eax
+          4.45%    5.35%    0x00007f03c5af48d4: cmp    %rcx,%rdx         ; compare
+          0.68%    1.07%    0x00007f03c5af48d7: cmovne %r10d,%eax        ; choose 0 or 1
+          3.11%    3.15%    0x00007f03c5af48db: add    $0x10,%rsp
+          4.07%    6.03%    0x00007f03c5af48df: pop    %rbp
+          1.06%    1.66%    0x00007f03c5af48e0: test   %eax,0x1326a71a(%rip)
+          1.30%    1.22%    0x00007f03c5af48e6: retq
+
+        Shenandoah:
+
+                               [Verified Entry Point]
+          3.24%    1.91%         0x00007f11fd2addc0: sub    $0x18,%rsp
+          0.35%    1.02%         0x00007f11fd2addc7: mov    %rbp,0x10(%rsp)
+          0.12%    0.13%         0x00007f11fd2addcc: cmp    %rcx,%rdx           ; compare
+                          ╭      0x00007f11fd2addcf: je     0x00007f11fd2adde3
+          2.79%    2.58%  │      0x00007f11fd2addd1: test   %rcx,%rcx           ; null check t1
+                          │╭     0x00007f11fd2addd4: je     0x00007f11fd2addfe
+          1.74%    2.39%  ││     0x00007f11fd2addd6: mov    -0x8(%rcx),%rcx     ; read barrier t1
+          5.90%    8.70%  ││ ↗   0x00007f11fd2addda: test   %rdx,%rdx           ; null check t2
+                          ││╭│   0x00007f11fd2adddd: je     0x00007f11fd2ade02
+          0.25%    0.43%  ││││   0x00007f11fd2adddf: mov    -0x8(%rdx),%rdx     ; read barrier t2
+          7.54%    8.56%  ↘│││↗  0x00007f11fd2adde3: xor    %r11d,%r11d         ; same as Parallel/G1:
+          1.02%    1.02%   ││││  0x00007f11fd2adde6: mov    $0x1,%eax
+          4.91%    8.30%   ││││  0x00007f11fd2addeb: cmp    %rcx,%rdx           ; <--- redundant compare for the "==" path, can reuse the first one
+          1.50%    2.22%   ││││  0x00007f11fd2addee: cmovne %r11d,%eax          ; choose 0 or 1
+          5.38%    9.70%   ││││  0x00007f11fd2addf2: add    $0x10,%rsp
+          0.47%    0.47%   ││││  0x00007f11fd2addf6: pop    %rbp
+          5.10%    5.67%   ││││  0x00007f11fd2addf7: test   %eax,0x11df7203(%rip)
+          1.00%    0.69%   ││││  0x00007f11fd2addfd: retq
+          1.79%    2.44%   ↘│││  0x00007f11fd2addfe: xor    %ecx,%ecx           ; <--- redundant branch, can reuse %rcx above, proven to be zero
+          0.50%    0.54%    │╰│  0x00007f11fd2ade00: jmp    0x00007f11fd2addda
+          0.02%    0.05%    ↘ │  0x00007f11fd2ade02: xor    %edx,%edx           ; <--- redundant branch, can reuse %rdx above, proven to be zero
+          0.48%    0.43%      ╰  0x00007f11fd2ade04: jmp    0x00007f11fd2adde3
+     */
+
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersPrimitiveFailure.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,52 @@
+package org.openjdk.gcbench.runtime.cmpxchg;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class CasBarriersPrimitiveFailure {
+
+    AtomicInteger ai;
+
+    int value1, value2;
+
+    @Setup
+    public void setup() {
+        value1 = 42; // passed as the expected value in test()
+        value2 = 43; // passed as the new value in test(), and also the initial contents of ai
+        ai = new AtomicInteger();
+        ai.set(value2); // ai holds 43, which differs from value1
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        ai.compareAndSet(value1, value2); // expected 42 vs. stored 43: the CAS fails and ai is left unchanged
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark         Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            CasBarriers.test  avgt   25  16.418 ± 0.029  ns/op
+
+            # G1
+            CasBarriers.test  avgt   25  12.545 ± 0.041  ns/op
+
+            # Parallel
+            CasBarriers.test  avgt   25  12.526 ± 0.030  ns/op
+
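+        Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test rather than CasBarriersPrimitiveFailure.test; presumably copied from another benchmark -- re-measure before relying on them.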
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersPrimitiveSuccess.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,53 @@
+package org.openjdk.gcbench.runtime.cmpxchg;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class CasBarriersPrimitiveSuccess {
+
+    AtomicInteger ai;
+
+    Object t1, t2;
+
+    int value;
+
+    @Setup
+    public void setup() {
+        value = 42; // used as both the expected and the new value in test()
+        ai = new AtomicInteger();
+        ai.set(value); // ai holds the same value the CAS will expect
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        ai.compareAndSet(value, value); // expected == stored: the CAS succeeds and writes back the same value
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark         Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            CasBarriers.test  avgt   25  16.418 ± 0.029  ns/op
+
+            # G1
+            CasBarriers.test  avgt   25  12.545 ± 0.041  ns/op
+
+            # Parallel
+            CasBarriers.test  avgt   25  12.526 ± 0.030  ns/op
+
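+        Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test rather than CasBarriersPrimitiveSuccess.test; presumably copied from another benchmark -- re-measure before relying on them.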
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersRefFailure.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,51 @@
+package org.openjdk.gcbench.runtime.cmpxchg;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class CasBarriersRefFailure {
+
+    AtomicReference<Object> ai;
+
+    Object t1 = new Object();
+    Object t2 = new Object();
+
+    @Setup
+    public void setup() {
+        ai = new AtomicReference<>();
+        ai.set(t2); // ai holds t2, a different object from the t1 that test() expects
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        ai.compareAndSet(t1, t2); // expected t1 vs. stored t2: the CAS fails and ai is left unchanged
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark         Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            CasBarriers.test  avgt   25  16.418 ± 0.029  ns/op
+
+            # G1
+            CasBarriers.test  avgt   25  12.545 ± 0.041  ns/op
+
+            # Parallel
+            CasBarriers.test  avgt   25  12.526 ± 0.030  ns/op
+
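+        Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test rather than CasBarriersRefFailure.test; presumably copied from another benchmark -- re-measure before relying on them.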
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/CasBarriersRefSuccess.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,52 @@
+package org.openjdk.gcbench.runtime.cmpxchg;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class CasBarriersRefSuccess {
+
+    AtomicReference<Object> ai;
+
+    Object t1 = new Object();
+    Object t2 = t1;
+
+    @Setup
+    public void setup() {
+        ai = new AtomicReference<>();
+        ai.set(t1); // ai holds t1; the t2 field refers to the same object
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        ai.compareAndSet(t1, t2); // expected == stored: the CAS succeeds and stores the same reference again
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark         Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            CasBarriers.test  avgt   25  16.418 ± 0.029  ns/op
+
+            # G1
+            CasBarriers.test  avgt   25  12.545 ± 0.041  ns/op
+
+            # Parallel
+            CasBarriers.test  avgt   25  12.526 ± 0.030  ns/op
+
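+        Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test rather than CasBarriersRefSuccess.test; presumably copied from another benchmark -- re-measure before relying on them.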
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/cmpxchg/WeakCasLoop.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,80 @@
+package org.openjdk.gcbench.runtime.cmpxchg;
+
+import org.openjdk.jmh.annotations.*;
+import sun.misc.Unsafe;
+
+import java.lang.reflect.Field;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class WeakCasLoop {
+
+    static final Unsafe U;
+    static final long OFF_REF;
+
+    static { // obtains the Unsafe instance reflectively and caches the offset of the "ref" field
+        try {
+            Field unsafeField = Unsafe.class.getDeclaredField("theUnsafe");
+            unsafeField.setAccessible(true);
+            U = (Unsafe) unsafeField.get(null); // null receiver: read as a static field
+            OFF_REF  = U.objectFieldOffset(WeakCasLoop.class.getDeclaredField("ref"));
+        } catch (Exception e) {
+            throw new AssertionError(e); // abort class initialization, keeping the cause
+        }
+    }
+
+    volatile Point ref;
+
+    @Setup
+    public void setup() {
+        ref = new Point(0, 0); // starting point for the CAS loop in strong()
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void strong() { // CAS retry loop: swaps ref for its successor Point
+        Point ep, np; // ep: point read from ref (expected value); np: its replacement
+        do {
+            ep = ref; // volatile read of the current point
+            np = ep.next(); // allocates the successor
+        } while (!U.compareAndSwapObject(this, OFF_REF, ep, np)); // re-read and retry until the swap succeeds
+    }
+
+    public static class Point { // pair of final ints; next() returns a new instance instead of mutating
+        final int x;
+        final int y;
+
+        public Point(int x, int y) {
+            this.x = x;
+            this.y = y;
+        }
+
+        public Point next() { // successor: x advances by 1 and y by 2
+            return new Point((x + 1) & 255, (y + 2) & 255); // & 255 keeps both coordinates in 0..255
+        }
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark         Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            CasBarriers.test  avgt   25  16.418 ± 0.029  ns/op
+
+            # G1
+            CasBarriers.test  avgt   25  12.545 ± 0.041  ns/op
+
+            # Parallel
+            CasBarriers.test  avgt   25  12.526 ± 0.030  ns/op
+
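+        Analysis pending. NOTE(review): the rows above are labelled CasBarriers.test rather than WeakCasLoop.strong; presumably copied from another benchmark -- re-measure before relying on them.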
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersArrays.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,118 @@
+package org.openjdk.gcbench.runtime.reads;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class ReadBarriersArrays {
+
+    @Param({"1", "1000", "1000000", "1000000000"})
+    private int size;
+
+    int[] target;
+
+    @Setup
+    public void setup() {
+        target = new int[size]; // zero-filled array; its length comes from the size @Param
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void plain() { // one op reads every element of target and passes it to sink
+        for (int t : target) {
+            sink(t); // sink is marked DONT_INLINE
+        }
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void sink(int i) {
+        // intentionally empty: only receives the value read by plain()
+    }
+
+    /*
+       i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                     (size)  Mode  Cnt        Score        Error  Units
+
+            # Shenandoah
+            ReadBarriersArrays.plain           1  avgt   25        0.005 ±      0.001  us/op
+            ReadBarriersArrays.plain        1000  avgt   25        1.970 ±      0.003  us/op
+            ReadBarriersArrays.plain     1000000  avgt   25     1869.969 ±      0.554  us/op
+            ReadBarriersArrays.plain  1000000000  avgt   25  3772318.787 ± 114008.238  us/op
+
+            # G1
+            ReadBarriersArrays.plain           1  avgt   25        0.004 ±     0.001  us/op
+            ReadBarriersArrays.plain        1000  avgt   25        1.993 ±     0.003  us/op
+            ReadBarriersArrays.plain     1000000  avgt   25     1803.248 ±     2.122  us/op
+            ReadBarriersArrays.plain  1000000000  avgt   25  1821469.162 ± 10974.715  us/op
+
+            # Parallel
+            ReadBarriersArrays.plain           1  avgt   25        0.004 ±     0.001  us/op
+            ReadBarriersArrays.plain        1000  avgt   25        2.000 ±     0.006  us/op
+            ReadBarriersArrays.plain     1000000  avgt   25     1817.009 ±    35.630  us/op
+            ReadBarriersArrays.plain  1000000000  avgt   25  1825045.442 ±  9787.079  us/op
+
+        In Shenandoah, the hottest loop looks like this:
+
+             13.59%   12.98%  ↗  0x00007f33c95428a0: mov    (%rsp),%r9
+                              │  0x00007f33c95428a4: mov    -0x8(%r9),%r10      ; <--- read barrier
+              0.04%    0.01%  │  0x00007f33c95428a8: mov    %r9,(%rsp)
+                              │  0x00007f33c95428ac: mov    0x10(%r10,%rbp,4),%edx ; array access
+             36.49%   35.45%  │  0x00007f33c95428b1: mov    0x8(%rsp),%rsi
+              0.03%           │  0x00007f33c95428b6: nop
+                              │  0x00007f33c95428b7: callq  0x00007f33c1a80f80  ; call sink();
+              0.52%    0.52%  │  0x00007f33c95428bc: inc    %ebp                ; increment and test loop counter
+              0.01%    0.01%  │  0x00007f33c95428be: cmp    0x10(%rsp),%ebp
+                              ╰  0x00007f33c95428c2: jl     0x00007f33c95428a0
+
+        In G1 and Parallel it looks like this:
+
+             14.51%   13.22%  ↗  0x00007fa49c6ceaa0: mov    (%rsp),%r10
+              0.01%           │  0x00007fa49c6ceaa4: mov    0x10(%r10,%rbp,4),%edx  ; array access
+             31.43%   32.02%  │  0x00007fa49c6ceaa9: mov    %r10,(%rsp)
+              0.84%    0.79%  │  0x00007fa49c6ceaad: mov    0x8(%rsp),%rsi
+              1.59%    1.65%  │  0x00007fa49c6ceab2: nop
+                              │  0x00007fa49c6ceab3: callq  0x00007fa494c0bf80  ; call sink();
+              5.25%    4.46%  │  0x00007fa49c6ceab8: inc    %ebp                ; increment and test loop counter
+              0.05%    0.03%  │  0x00007fa49c6ceaba: cmp    0x10(%rsp),%ebp
+              0.03%           ╰  0x00007fa49c6ceabe: jl     0x00007fa49c6ceaa0
+
+        So, the only difference is the read barrier. It does not affect performance much.
+
+        With the 1G-element array, Shenandoah nose-dives into excessive marking (?):
+
+            ....[Hottest Methods (after inlining)]..............................................................
+             37.40%    0.61%        libc-2.23.so  __memset_avx2
+             23.92%   38.48%           libjvm.so  ParallelTaskTerminator::offer_termination
+             12.42%   18.48%           libjvm.so  SpinPause
+              9.59%   13.47%         C2, level 4  org.openjdk.shenandoah.reads.ReadBarriersArrays::plain, version 691
+              4.31%    8.85%         C1, level 1  org.openjdk.shenandoah.reads.ReadBarriersArrays::sink, version 647
+              2.62%    5.14%           libjvm.so  GenericTaskQueueSet<Padded<OverflowTaskQueue<ObjArrayTask, (MemoryType)5, 131072u>, 128ul>, (MemoryType)5>::peek
+              1.74%    1.33%           libjvm.so  ShenandoahInitMarkRootsClosure::do_oop
+              1.66%    0.21%           libjvm.so  ShenandoahHeapRegionSet::claim_next
+              1.24%    1.90%           [unknown]  [unknown]
+              1.06%    0.15%           libjvm.so  ResetBitmapTask::work
+              0.79%    2.81%           libjvm.so  StringTable::possibly_parallel_oops_do
+              0.72%    1.69%           libjvm.so  ShenandoahConcurrentMark::mark_and_push
+              0.40%    0.20%           libjvm.so  ShenandoahHeapRegion::top_at_mark_start
+              0.39%    0.36%           libjvm.so  BitMap::at_put_range
+              0.26%    0.27%           libjvm.so  CMBitMap::clear_range
+              0.15%    2.35%           libjvm.so  nmethod::oops_do
+              0.09%    0.01%           libjvm.so  SCMConcurrentMarkingTask::work
+              0.09%    0.22%           libjvm.so  CodeHeap::next_used
+              0.07%    0.25%           libjvm.so  ShenandoahHeapRegion::init_top_at_mark_start
+              0.06%    0.25%           libjvm.so  SafepointSynchronize::begin
+              1.03%    2.83%  <...other 257 warm methods...>
+            ........................................................................
+
+        Setting -Xmx8g -Xms8g alleviates this problem.
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersCachePressure.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,92 @@
+package org.openjdk.gcbench.runtime.reads;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class ReadBarriersCachePressure {
+
+    @Param({"1", "16", "128", "1024"})
+    private int size;
+
+
+    int mask;
+    Object[][][] target;
+
+    @Setup
+    public void setup() { // builds a size x size x size array; the innermost slots are left null
+        target = new Object[size][][];
+        for (int c = 0; c < size; c++) {
+            target[c] = new Object[size][];
+            for (int j = 0; j < size; j++) {
+                target[c][j] = new Object[size];
+            }
+        }
+
+        mask = size - 1; // ANDed with the generator state in plain(); the @Param sizes are all powers of two
+    }
+
+    private int s;
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void plain() { // one op: derive three pseudo-random indices and read one three-level array element
+        Object[][][] tgt = target;
+        int t = s; // generator state carried over from the previous call
+        int m = mask;
+        t = t * 1664525 + 1013904223; // linear congruential step (int overflow wraps)
+        int idx1 = t & m;
+        t = t * 1664525 + 1013904223;
+        int idx2 = t & m;
+        t = t * 1664525 + 1013904223;
+        int idx3 = t & m;
+        sink(tgt[idx1][idx2][idx3]); // three dependent array loads; sink is marked DONT_INLINE
+        s = t; // save the state for the next call
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void sink(Object o) {
+        // intentionally empty: only receives the reference read by plain()
+    }
+
+    /*
+       i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                        (size)  Mode  Cnt   Score   Error  Units
+
+            # Shenandoah
+            ReadBarriersCachePressure.plain       1  avgt   25   7.806 ± 0.006  ns/op
+            ReadBarriersCachePressure.plain      16  avgt   25   7.802 ± 0.003  ns/op
+            ReadBarriersCachePressure.plain     128  avgt   25   9.220 ± 0.024  ns/op
+            ReadBarriersCachePressure.plain    1024  avgt   25  38.590 ± 0.251  ns/op
+
+            # G1
+            ReadBarriersCachePressure.plain       1  avgt   25   6.727 ± 0.014  ns/op
+            ReadBarriersCachePressure.plain      16  avgt   25   6.736 ± 0.024  ns/op
+            ReadBarriersCachePressure.plain     128  avgt   25   7.075 ± 0.015  ns/op
+            ReadBarriersCachePressure.plain    1024  avgt   25  36.811 ± 0.259  ns/op
+
+            # Parallel
+            ReadBarriersCachePressure.plain       1  avgt   25   6.791 ± 0.026  ns/op
+            ReadBarriersCachePressure.plain      16  avgt   25   6.780 ± 0.002  ns/op
+            ReadBarriersCachePressure.plain     128  avgt   25   7.087 ± 0.021  ns/op
+            ReadBarriersCachePressure.plain    1024  avgt   25  36.037 ± 0.264  ns/op
+
+        This benchmark tries to validate the speculation that adding an indirection pointer
+        before the object has cache capacity implications: i.e. accessing the indirection
+        pointer of an object aligned at 8 bytes may touch the previous cache line.
+
+        The speculation does not seem to be confirmed: the read barrier performance cost seems
+        to be consistent across different sizes.
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/reads/ReadBarriersFields.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,140 @@
+package org.openjdk.gcbench.runtime.reads;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class ReadBarriersFields {
+
+    Target target;
+
+    /** Creates the {@link Target} instance whose fields the benchmark methods read. */
+    @Setup
+    public void setup() {
+        target = new Target();
+    }
+
+    /**
+     * Measures a read of the plain (non-volatile) {@code int} field of {@code target};
+     * the loaded value is passed to {@code sink(int)}. The method itself is excluded
+     * from inlining.
+     */
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void plainField() {
+        sink(target.plainInt);
+    }
+
+    /**
+     * Measures a read of the {@code volatile int} field of {@code target};
+     * the loaded value is passed to {@code sink(int)}. The method itself is excluded
+     * from inlining.
+     */
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void volatileField() {
+        sink(target.volatileInt);
+    }
+
+    /**
+     * Empty, non-inlined consumer for the value read by the benchmark methods.
+     *
+     * @param i value handed over by the caller; never inspected
+     */
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void sink(int i) {
+
+    }
+
+    /** Holder object whose fields are read by the benchmark methods. */
+    static class Target {
+        // read by plainField()
+        int plainInt;
+        // read by volatileField()
+        volatile int volatileInt;
+    }
+
+    /*
+       i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                         Mode  Cnt  Score   Error  Units
+
+            # Shenandoah
+            ReadBarriersFields.plainField     avgt   25  3.440 ± 0.019  ns/op
+            ReadBarriersFields.volatileField  avgt   25  3.431 ± 0.006  ns/op
+
+            # G1
+            ReadBarriersFields.plainField     avgt   25  3.149 ± 0.005  ns/op
+            ReadBarriersFields.volatileField  avgt   25  3.147 ± 0.003  ns/op
+
+            # Parallel
+            ReadBarriersFields.plainField     avgt   25  3.149 ± 0.004  ns/op
+            ReadBarriersFields.volatileField  avgt   25  3.152 ± 0.006  ns/op
+
+        The difference is caused by the additional instructions:
+
+            Benchmark                                             Mode  Cnt   Score    Error  Units
+
+            # ------------- Shenandoah
+            ReadBarriersFields.plainField                        avgt   25   3.440 ±  0.014  ns/op
+            ReadBarriersFields.plainField:CPI                    avgt    5   0.380 ±  0.018   #/op  <--- better CPI
+            ReadBarriersFields.plainField:L1-dcache-load-misses  avgt    5   0.018 ±  0.005   #/op
+            ReadBarriersFields.plainField:L1-dcache-loads        avgt    5  15.646 ±  0.548   #/op  <--- more loads
+            ReadBarriersFields.plainField:L1-dcache-stores       avgt    5   7.672 ±  0.292   #/op
+            ReadBarriersFields.plainField:branch-misses          avgt    5   0.008 ±  0.002   #/op
+            ReadBarriersFields.plainField:branches               avgt    5   6.186 ±  0.306   #/op  <--- one more branch
+            ReadBarriersFields.plainField:cycles                 avgt    5  13.866 ±  0.967   #/op
+            ReadBarriersFields.plainField:instructions           avgt    5  36.500 ±  1.875   #/op
+
+            # ------------- G1
+            ReadBarriersFields.plainField                        avgt   25   3.152 ±  0.008  ns/op
+            ReadBarriersFields.plainField:CPI                    avgt    5   0.426 ±  0.017   #/op
+            ReadBarriersFields.plainField:L1-dcache-load-misses  avgt    5   0.019 ±  0.021   #/op
+            ReadBarriersFields.plainField:L1-dcache-loads        avgt    5  12.612 ±  0.146   #/op
+            ReadBarriersFields.plainField:L1-dcache-stores       avgt    5   6.932 ±  0.141   #/op
+            ReadBarriersFields.plainField:branch-misses          avgt    5   0.008 ±  0.002   #/op
+            ReadBarriersFields.plainField:branches               avgt    5   5.226 ±  0.214   #/op
+            ReadBarriersFields.plainField:cycles                 avgt    5  12.794 ±  0.929   #/op
+            ReadBarriersFields.plainField:instructions           avgt    5  30.012 ±  1.198   #/op
+
+            # ------------- Parallel
+            Benchmark                                            Mode  Cnt   Score    Error  Units
+            ReadBarriersFields.plainField                        avgt   25   3.157 ±  0.012  ns/op
+            ReadBarriersFields.plainField:CPI                    avgt    4   0.426 ±  0.027   #/op
+            ReadBarriersFields.plainField:L1-dcache-load-misses  avgt    5   0.020 ±  0.016   #/op
+            ReadBarriersFields.plainField:L1-dcache-loads        avgt    5  12.641 ±  0.446   #/op
+            ReadBarriersFields.plainField:L1-dcache-stores       avgt    5   6.927 ±  0.139   #/op
+            ReadBarriersFields.plainField:branch-misses          avgt    5   0.007 ±  0.002   #/op
+            ReadBarriersFields.plainField:branches               avgt    5   5.212 ±  0.235   #/op
+            ReadBarriersFields.plainField:cycles                 avgt    5  12.754 ±  0.771   #/op
+            ReadBarriersFields.plainField:instructions           avgt    4  29.963 ±  1.944   #/op
+
+        These instructions are the read barriers, plus an explicit null check:
+
+                               [Verified Entry Point]
+              7.25%    7.27%     0x00007fde8d540be0: mov    %eax,-0x14000(%rsp)
+              0.16%    0.06%     0x00007fde8d540be7: push   %rbp
+              0.04%              0x00007fde8d540be8: sub    $0x10,%rsp
+              7.51%    8.68%     0x00007fde8d540bec: mov    -0x8(%rsi),%r10     ; <--- read barrier
+              0.04%              0x00007fde8d540bf0: mov    0xc(%r10),%r10d     ; get field $target
+              0.04%    0.04%     0x00007fde8d540bf4: test   %r10d,%r10d         ; <--- null check $target
+              0.01%           ╭  0x00007fde8d540bf7: je     0x00007fde8d540c18
+              7.36%    8.40%  │  0x00007fde8d540bf9: shl    $0x3,%r10
+              0.19%    0.13%  │  0x00007fde8d540bfd: mov    -0x8(%r10),%r10     ; <--- read barrier
+              1.44%    1.20%  │  0x00007fde8d540c01: mov    0xc(%r10),%edx      ; get field $plainInt
+             23.63%   29.86%  │  0x00007fde8d540c05: xchg   %ax,%ax
+              0.11%    0.10%  │  0x00007fde8d540c07: callq  0x00007fde860c2ce0  ; call sink()
+              7.41%    5.41%  │  0x00007fde8d540c0c: add    $0x10,%rsp
+              0.11%    0.06%  │  0x00007fde8d540c10: pop    %rbp
+                       0.01%  │  0x00007fde8d540c11: test   %eax,0x18d633e9(%rip)
+              7.82%    5.10%  │  0x00007fde8d540c17: retq
+
+         It seems compressed oops are precluding folding the explicit null check. The same run with
+         -XX:-UseCompressedOops:
+
+                               [Verified Entry Point]
+              7.82%    7.02%     0x00007f2ec55430d0: mov    %eax,-0x14000(%rsp)
+              0.04%              0x00007f2ec55430d7: push   %rbp
+              0.65%    0.05%     0x00007f2ec55430d8: sub    $0x10,%rsp
+              6.71%    8.14%     0x00007f2ec55430dc: mov    -0x8(%rsi),%r10    ; <--- read barrier
+                       0.01%     0x00007f2ec55430e0: mov    0x10(%r10),%r10    ; get field $target
+              0.54%    0.65%  ╭  0x00007f2ec55430e4: mov    -0x8(%r10),%r10    ; <--- read barrier    ; implicit exception: dispatches to 0x00007f2ec554310d
+              7.54%    9.11%  │  0x00007f2ec55430e8: mov    0x10(%r10),%edx    ; get field $plainInt
+             20.15%   26.85%  │  0x00007f2ec55430ec: data16 xchg %ax,%ax
+                              │  0x00007f2ec55430ef: callq  0x00007f2ebe0c81e0 ; call sink();
+              7.02%    5.61%  │  0x00007f2ec55430f4: add    $0x10,%rsp
+              0.01%    0.01%  │  0x00007f2ec55430f8: pop    %rbp
+              0.49%    0.71%  │  0x00007f2ec55430f9: test   %eax,0x18ccff01(%rip)
+              7.19%    4.31%  │  0x00007f2ec55430ff: retq
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersKnownNull.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,97 @@
+package org.openjdk.gcbench.runtime.writes;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Thread)
+public class WriteBarriersKnownNull {
+
+    Target target;
+
+    /** Creates the {@link Target} instance that {@code test()} stores into. */
+    @Setup
+    public void setup() {
+        target = new Target();
+    }
+
+    /**
+     * Measures a store of the {@code null} literal into the reference field of
+     * {@code target}. The method itself is excluded from inlining.
+     */
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        target.field = null;
+    }
+
+    /** Holder object with the single reference field written by {@code test()}. */
+    static class Target {
+        Object field;
+    }
+
+    /*
+        i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                    Mode  Cnt  Score   Error  Units
+
+            # Shenandoah
+            WriteBarriersKnownNull.test  avgt   25  2.637 ± 0.027  ns/op
+
+            # G1
+            WriteBarriersKnownNull.test  avgt   25  1.958 ± 0.006  ns/op
+
+            # Parallel
+            WriteBarriersKnownNull.test  avgt   25  1.832 ± 0.097  ns/op
+
+        Mostly the same as the regular barrier, except that the redundant null checks
+        for the stored value are optimized out.
+
+        Shenandoah:
+
+                             [Verified Entry Point]
+          9.60%   11.27%       0x00007f7031af6f40: mov    %eax,-0x14000(%rsp)
+          0.02%                0x00007f7031af6f47: push   %rbp
+                               0x00007f7031af6f48: sub    $0x10,%rsp
+         10.13%   10.60%       0x00007f7031af6f4c: mov    -0x8(%rsi),%r10       ; <--- read barrier
+                   0.02%       0x00007f7031af6f50: mov    0xc(%r10),%r11d       ; get field $target
+          0.07%                0x00007f7031af6f54: test   %r11d,%r11d           ; null check $target <--- NOT IMPLICIT
+                               0x00007f7031af6f57: je     0x00007f7031af6fc7
+          9.91%   10.59%       0x00007f7031af6f59: mov    %r11,%r10
+                               0x00007f7031af6f5c: mov    -0x8(%r10),%rbx       ; <--- read barrier AGAIN
+          0.18%    0.12%       0x00007f7031af6f60: cmpb   $0x0,0x3d8(%r15)      ; "evacuation in progress?"
+          0.02%                0x00007f7031af6f68: mov    -0x8(%r10),%rbx       ; <--- read barrier (in sequence)
+         10.08%    8.85%  ╭    0x00007f7031af6f6c: je     0x00007f7031af6f79    ; no evacuation in progress, jump over <--- SHOULD BE MOVED TO SLOWPATH
+                          │    0x00007f7031af6f6e: xchg   %rax,%rbx
+                          │    0x00007f7031af6f71: callq  Stub::shenandoah_wb
+                          │    0x00007f7031af6f76: xchg   %rax,%rbx
+                          ↘    0x00007f7031af6f79: movsbl 0x378(%r15),%r11d     ; SATB test
+                               0x00007f7031af6f81: test   %r11d,%r11d
+                           ╭   0x00007f7031af6f84: jne    0x00007f7031af6f96
+                           │↗  0x00007f7031af6f86: mov    %r12d,0xc(%rbx)       ; field store
+         25.14%   22.09%   ││  0x00007f7031af6f8a: add    $0x10,%rsp
+                           ││  0x00007f7031af6f8e: pop    %rbp
+                           ││  0x00007f7031af6f8f: test   %eax,0x1286706b(%rip)
+                           ││  0x00007f7031af6f95: retq
+
+
+        G1:
+
+                            [Verified Entry Point]
+          7.45%    6.58%      0x00007f3204840cc0: mov    %eax,-0x14000(%rsp)
+          5.74%    6.06%      0x00007f3204840cc7: push   %rbp
+          7.74%    7.35%      0x00007f3204840cc8: sub    $0x10,%rsp
+          0.18%    0.18%      0x00007f3204840ccc: mov    0xc(%rsi),%ebp         ; get field $target
+          5.84%    6.56%      0x00007f3204840ccf: mov    0xc(%rbp),%r10d        ; <--- read old for SATB, plus NPE check
+         18.27%   20.87%      0x00007f3204840cd3: movsbl 0x378(%r15),%r8d       ; SATB test
+          0.02%    0.02%      0x00007f3204840cdb: test   %r8d,%r8d
+                          ╭   0x00007f3204840cde: jne    0x00007f3204840cf0
+                          │↗  0x00007f3204840ce0: mov    %r12d,0xc(%rbp)        ; field store
+          4.11%    4.73%  ││  0x00007f3204840ce4: add    $0x10,%rsp
+          9.60%   10.33%  ││  0x00007f3204840ce8: pop    %rbp
+          0.30%    0.33%  ││  0x00007f3204840ce9: test   %eax,0x11f29311(%rip)
+                   0.02%  ││  0x00007f3204840cef: retq
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersPrimitive.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,96 @@
+package org.openjdk.gcbench.runtime.writes;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Thread)
+public class WriteBarriersPrimitive {
+
+    Target target;
+    int source;
+
+    /** Creates the {@link Target} to store into and sets the value to be stored (42). */
+    @Setup
+    public void setup() {
+        target = new Target();
+        source = 42;
+    }
+
+    /**
+     * Measures a store of the {@code int} field {@code source} into the {@code int}
+     * field of {@code target}. The method itself is excluded from inlining.
+     */
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        target.field = source;
+    }
+
+    /** Holder object with the single primitive field written by {@code test()}. */
+    static class Target {
+        int field;
+    }
+
+    /*
+       i5 4210U, 1.7 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-12)
+
+           Benchmark                    Mode  Cnt  Score   Error  Units
+
+           # Shenandoah
+           WriteBarriersPrimitive.test  avgt   25  3.771 ± 0.010  ns/op
+
+           # G1
+           WriteBarriersPrimitive.test  avgt   25  2.623 ± 0.022  ns/op
+
+           # Parallel
+           WriteBarriersPrimitive.test  avgt   25  2.551 ± 0.008  ns/op
+
+        Shenandoah has to execute write barriers even for primitive stores,
+        in order to preserve the "no writes into evacuated regions" invariant.
+        Other collectors do not have to do this. The write barrier code
+        quality may be improved to amortize the costs.
+
+        Shenandoah:
+
+                            [Verified Entry Point]
+          6.81%    8.72%      0x00007f85a58dddc0: mov    %eax,-0x14000(%rsp)
+          3.08%    3.37%      0x00007f85a58dddc7: push   %rbp
+          0.05%    0.11%      0x00007f85a58dddc8: sub    $0x10,%rsp
+          8.72%    7.79%      0x00007f85a58dddcc: mov    -0x8(%rsi),%r10        ; <--- read barrier ($this)
+          0.67%    0.50%      0x00007f85a58dddd0: mov    0x10(%r10),%r11d       ; get field $target
+          0.08%    0.09%      0x00007f85a58dddd4: mov    0xc(%r10),%r10d        ; get field $source
+          6.83%    7.37%      0x00007f85a58dddd8: test   %r11d,%r11d            ; null check $target
+                          ╭   0x00007f85a58ddddb: je     0x00007f85a58dde0e
+          2.30%    0.64%  │   0x00007f85a58ddddd: shl    $0x3,%r11              ; unpack $target
+          0.61%    0.52%  │   0x00007f85a58ddde1: mov    -0x8(%r11),%r11        ; <--- read barrier once; WHY? There is a null check before already
+          0.14%    0.09%  │   0x00007f85a58ddde5: cmpb   $0x0,0x3d8(%r15)       ; evacuation in progress?
+          6.70%    6.03%  │   0x00007f85a58ddded: mov    -0x8(%r11),%r11        ; <--- read barrier twice (in sequence now)
+          3.53%    1.67%  │╭  0x00007f85a58dddf1: je     0x00007f85a58dddfe
+                          ││  0x00007f85a58dddf3: xchg   %rax,%r11              ; <--- barrier slowpath
+                          ││  0x00007f85a58dddf6: callq  Stub::shenandoah_wb
+                          ││  0x00007f85a58dddfb: xchg   %rax,%r11
+          0.53%    0.62%  │↘  0x00007f85a58dddfe: mov    %r10d,0xc(%r11)        ; actual store
+         25.37%   27.26%  │   0x00007f85a58dde02: add    $0x10,%rsp
+          2.28%    2.20%  │   0x00007f85a58dde06: pop    %rbp
+          0.72%    0.03%  │   0x00007f85a58dde07: test   %eax,0x11df41f3(%rip)
+          0.06%    0.09%  │   0x00007f85a58dde0d: retq
+
+
+        Parallel:
+
+                          [Verified Entry Point]
+          4.35%    5.11%    0x00007fc6fdaf3840: mov    %eax,-0x14000(%rsp)
+         10.05%    9.23%    0x00007fc6fdaf3847: push   %rbp
+          0.05%             0x00007fc6fdaf3848: sub    $0x10,%rsp
+          4.65%    4.18%    0x00007fc6fdaf384c: mov    0xc(%rsi),%r11d          ; get field $source
+         10.83%    9.16%    0x00007fc6fdaf3850: mov    0x10(%rsi),%r10d         ; get field $target
+          0.02%    0.02%    0x00007fc6fdaf3854: mov    %r11d,0xc(%r12,%r10,8)   ; actual field store
+         14.61%   15.76%    0x00007fc6fdaf3859: add    $0x10,%rsp
+                            0x00007fc6fdaf385d: pop    %rbp
+          6.13%    6.24%    0x00007fc6fdaf385e: test   %eax,0x13c1a79c(%rip)
+          0.02%             0x00007fc6fdaf3864: retq
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/runtime/writes/WriteBarriersRef.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,114 @@
+package org.openjdk.gcbench.runtime.writes;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Thread)
+public class WriteBarriersRef {
+
+    Target target;
+    Target source;
+
+    /**
+     * Creates the two {@link Target} instances used by the benchmark: {@code target} is
+     * the object stored into, {@code source} is the reference being stored.
+     */
+    @Setup
+    public void setup() {
+        target = new Target();
+        source = new Target();
+    }
+
+    /**
+     * Benchmark entry point: passes the {@code target} and {@code source} fields to
+     * {@code doStore(Target, Target)}, which performs the actual reference store.
+     */
+    @Benchmark
+
+    public void test() {
+        doStore(target, source);
+    }
+
+    /**
+     * Stores reference {@code v} into the {@code field} of {@code t}. Excluded from
+     * inlining, so both objects arrive as method arguments.
+     *
+     * @param t object whose field is written
+     * @param v reference written into the field
+     */
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void doStore(Target t, Target v) {
+        t.field = v;
+    }
+
+    /** Holder object with the single reference field written by {@code doStore}. */
+    static class Target {
+        Object field;
+    }
+
+    /*
+       i7 4790K, 4.0 Ghz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+           Benchmark              Mode  Cnt  Score   Error  Units
+
+           # Shenandoah
+           WriteBarriersRef.test  avgt   25  3.222 ± 0.003  ns/op
+
+           # G1
+           WriteBarriersRef.test  avgt   25  2.341 ± 0.005  ns/op
+
+           # Parallel
+           WriteBarriersRef.test  avgt   25  2.176 ± 0.006  ns/op
+
+        The difference comes from a more complicated write barrier, which may be improved a
+        little with more implicit null checks and code quality tune-ups.
+
+        Shenandoah:
+
+                               [Verified Entry Point]
+          5.12%    4.76%         0x00007f51f0845dc0: mov    %eax,-0x14000(%rsp)
+          3.00%    2.29%         0x00007f51f0845dc7: push   %rbp
+          0.08%    0.05%         0x00007f51f0845dc8: sub    $0x10,%rsp
+          4.90%    3.93%         0x00007f51f0845dcc: mov    -0x8(%rsi),%r10     ; <--- read barrier
+          2.34%    0.63%         0x00007f51f0845dd0: mov    0xc(%r10),%r11d     ; get field $target
+          0.05%    0.08%         0x00007f51f0845dd4: mov    0x10(%r10),%r10d    ; <-- get field $source
+          4.68%    5.24%         0x00007f51f0845dd8: test   %r11d,%r11d         ; nullcheck $target <--- NOT IMPLICIT?
+                                 0x00007f51f0845ddb: je     0x00007f51f0845e6c
+          0.05%    0.03%         0x00007f51f0845de1: test   %r10d,%r10d         ; nullcheck $source <--- NOT IMPLICIT?
+                          ╭      0x00007f51f0845de4: je     0x00007f51f0845e33
+          2.56%    0.19%  │      0x00007f51f0845de6: shl    $0x3,%r10           ; unpack $source
+          0.03%           │      0x00007f51f0845dea: mov    -0x8(%r10),%rbx     ; <--- HUH? Is this a null check trap? Unpacking is not needed then.
+          6.58%    5.81%  │  ↗   0x00007f51f0845dee: lea    (%r12,%r11,8),%r10  ; unpack $target
+                          │  │   0x00007f51f0845df2: mov    -0x8(%r10),%rbp     ; <--- read barrier
+          2.00%    0.06%  │  │   0x00007f51f0845df6: cmpb   $0x0,0x3d8(%r15)    ; "evacuation in progress?"
+                          │  │   0x00007f51f0845dfe: mov    -0x8(%r10),%rbp     ; <--- read barrier again (in sequence)
+          6.46%    6.90%  │╭ │   0x00007f51f0845e02: je     0x00007f51f0845e0f  ; no evacuation? jump over the barrier <--- SHOULD MOVE TO SLOWPATH
+                          ││ │   0x00007f51f0845e04: xchg   %rax,%rbp
+                          ││ │   0x00007f51f0845e07: callq  Stub::shenandoah_wb ;   {runtime_call StubRoutines (2)}
+                          ││ │   0x00007f51f0845e0c: xchg   %rax,%rbp
+                          │↘ │   0x00007f51f0845e0f: movsbl 0x378(%r15),%r11d   ; SATB check and jump
+          1.29%    1.33%  │  │   0x00007f51f0845e17: test   %r11d,%r11d
+                          │ ╭│   0x00007f51f0845e1a: jne    0x00007f51f0845e37
+                          │ ││↗  0x00007f51f0845e1c: mov    %rbx,%r10           ; packing $source (note, we unpacked before)
+          6.61%    6.90%  │ │││  0x00007f51f0845e1f: shr    $0x3,%r10
+                          │ │││  0x00007f51f0845e23: mov    %r10d,0xc(%rbp)     ; field store!
+         18.55%   20.33%  │ │││  0x00007f51f0845e27: add    $0x10,%rsp
+                          │ │││  0x00007f51f0845e2b: pop    %rbp
+          2.93%    3.37%  │ │││  0x00007f51f0845e2c: test   %eax,0x11e691ce(%rip)
+                          │ │││  0x00007f51f0845e32: retq
+
+
+        Parallel:
+
+                           [Verified Entry Point]
+          3.65%    4.20%     0x00007f5169af3040: mov    %eax,-0x14000(%rsp)
+          9.31%    9.50%     0x00007f5169af3047: push   %rbp
+                             0x00007f5169af3048: sub    $0x10,%rsp
+          3.17%    4.53%     0x00007f5169af304c: mov    0x10(%rsi),%r11d        ; get field $source
+          8.67%    7.93%     0x00007f5169af3050: mov    0xc(%rsi),%r10d         ; get field $target
+                          ╭  0x00007f5169af3054: mov    %r11d,0xc(%r12,%r10,8)  ; field store, implicit exception: dispatches to 0x00007f5169af307b
+          8.29%    8.77%  │  0x00007f5169af3059: shl    $0x3,%r10               ; card mark update
+                          │  0x00007f5169af305d: shr    $0x9,%r10
+          6.58%    5.85%  │  0x00007f5169af3061: movabs $0x7f517632f000,%r11
+                          │  0x00007f5169af306b: mov    %r12b,(%r11,%r10,1)
+         11.76%   11.87%  │  0x00007f5169af306f: add    $0x10,%rsp              ; epilog
+                          │  0x00007f5169af3073: pop    %rbp
+          5.28%    5.76%  │  0x00007f5169af3074: test   %eax,0x145b3f86(%rip)
+                          │  0x00007f5169af307a: retq
+                          ↘  0x00007f5169af307b: mov    $0xfffffff6,%esi
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/util/Dummy.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,15 @@
+package org.openjdk.gcbench.util;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Benchmark with an empty body, configured for one 100 ms warmup iteration and one
+ * 100 ms measurement iteration in a single fork.
+ */
+@Warmup(iterations = 1, time = 100, timeUnit = TimeUnit.MILLISECONDS)
+@Measurement(iterations = 1, time = 100, timeUnit = TimeUnit.MILLISECONDS)
+@Fork(1)
+public class Dummy {
+
+    /** Intentionally empty benchmark method. */
+    @Benchmark
+    public void test() {}
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/util/TokenBucketBench.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,35 @@
+package org.openjdk.gcbench.util;
+
+import org.openjdk.gcbench.util.ratelimit.MultiTokenBucket;
+import org.openjdk.gcbench.util.ratelimit.RateLimiter;
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Throughput benchmark for {@link MultiTokenBucket}: {@link #test()} calls
+ * {@link RateLimiter#limit()} once per invocation, while {@link #baseline()} is an
+ * empty method to compare against.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+public class TokenBucketBench {
+
+    /** Limiter under test; created by {@link #setup()} for the current {@link #rate}. */
+    private RateLimiter limiter;
+
+    /** Rate per second passed to the {@link MultiTokenBucket} constructor. */
+    @Param({"10", "100", "1000", "10000", "100000", "1000000", "100000000"})
+    int rate;
+
+    /** Builds a fresh limiter configured with {@link #rate}. */
+    @Setup
+    public void setup() {
+        limiter = new MultiTokenBucket(rate);
+    }
+
+    /** Empty method: the cost of an invocation without any rate limiting. */
+    @Benchmark
+    public void baseline() {
+        // intentionally empty
+    }
+
+    /** One rate-limited operation: a single call to {@link RateLimiter#limit()}. */
+    @Benchmark
+    public void test() {
+        limiter.limit();
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/util/ratelimit/MultiTokenBucket.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,216 @@
+package org.openjdk.gcbench.util.ratelimit;
+
+import java.util.concurrent.atomic.*;
+
+public class MultiTokenBucket implements RateLimiter {
+
+    static final int QUANTA_PER_SEC = 10;
+    static final int MS_PER_QUANTUM = 1000 / QUANTA_PER_SEC;
+
+    static final AtomicReferenceFieldUpdater<MultiTokenBucket, Counters> STATE =
+            AtomicReferenceFieldUpdater.newUpdater(MultiTokenBucket.class, Counters.class, "counters");
+
+    private final int tokensPerQuantum;
+    private final long timeBase;
+
+    private final int stateCount;
+    private final int stateCountMask;
+
+    private volatile Counters counters;
+    private volatile int currentQuant;
+
+    /**
+     * Creates a limiter that hands out {@code max(1, ratePerSec / QUANTA_PER_SEC)} tokens
+     * per quantum, spread over a power-of-two number of counter slots derived from the
+     * available processor count, and starts the daemon {@link StampUpdater} thread.
+     *
+     * @param ratePerSec requested number of tokens per second
+     */
+    public MultiTokenBucket(int ratePerSec) {
+        int perQuantum = ratePerSec / QUANTA_PER_SEC;
+        this.tokensPerQuantum = Math.max(1, perQuantum);
+
+        int slots = roundToPow2(Runtime.getRuntime().availableProcessors() * 2);
+        this.stateCount = slots;
+        this.stateCountMask = slots - 1;
+
+        this.timeBase = System.currentTimeMillis();
+
+        // Quantum 0 starts with empty counters; the updater thread installs refilled
+        // ones whenever the quantum id changes.
+        Counters initial = new Counters(newCounters(), 0);
+        STATE.set(this, initial);
+
+        Thread updater = new StampUpdater();
+        updater.start();
+    }
+
+    /** Allocates one fresh {@link Counter} per slot, {@code stateCount} in total. */
+    private Counter[] newCounters() {
+        final Counter[] fresh = new Counter[stateCount];
+        int slot = 0;
+        while (slot < fresh.length) {
+            fresh[slot++] = new Counter();
+        }
+        return fresh;
+    }
+
+    /**
+     * Rounds {@code v} up to the nearest power of two by smearing the highest set bit of
+     * {@code v - 1} into every lower position and adding one.
+     *
+     * @param v value to round; powers of two map to themselves
+     * @return the smallest power of two that is {@code >= v} for {@code 1 <= v <= 2^30}
+     */
+    private static int roundToPow2(int v) {
+        int smeared = v - 1;
+        // shifts of 1, 2, 4, 8 and 16 cover all 32 bits
+        for (int shift = 1; shift <= 16; shift <<= 1) {
+            smeared |= smeared >> shift;
+        }
+        return smeared + 1;
+    }
+
+    /**
+     * Blocks the calling thread until it has obtained one token.
+     *
+     * <p>Each thread maps to a counter slot via its thread id. While the installed
+     * {@link Counters} belong to the current quantum, the caller first tries its own
+     * slot and then tries to steal from every other slot, visiting them in wrap-around
+     * order starting right after its own. If nothing is available, or the counters are
+     * from a different quantum, it sleeps for 1 ms and retries.
+     *
+     * <p>The wait is not cancellable: an interrupt received while sleeping does not
+     * abort it. The interrupt is remembered, though, and the thread's interrupt status
+     * is restored before this method returns, so callers can still observe it.
+     */
+    @Override
+    public void limit() {
+        int id = (int)(Thread.currentThread().getId() & stateCountMask);
+
+        boolean interrupted = false;
+        try {
+            while (true) {
+                int quantId = currentQuant;
+
+                Counters st = STATE.get(this);
+                if (st.time == quantId) {
+                    // counters belong to the current quantum: try to claim a token
+                    Counter[] states = st.states;
+
+                    // optimistically poll the caller's own slot first
+                    Counter my = states[id];
+                    if (my.dec() >= 1) {
+                        return; // success!
+                    }
+
+                    // try to steal: id+1 .. stateCount-1, then 0 .. id-1
+                    // (stateCount is a power of two, so the mask wraps the index)
+                    for (int step = 1; step < stateCount; step++) {
+                        if (trySteal(my, states[(id + step) & stateCountMask])) {
+                            return; // success!
+                        }
+                    }
+                }
+
+                // no tokens, or not our quantum: wait before re-spinning
+                try {
+                    Thread.sleep(1);
+                } catch (InterruptedException e) {
+                    // keep waiting for a token, but do not lose the interrupt
+                    interrupted = true;
+                }
+            }
+        } finally {
+            if (interrupted) {
+                Thread.currentThread().interrupt();
+            }
+        }
+    }
+
+    /**
+     * Attempts to move tokens from {@code src} to {@code dst}. On success one of the
+     * stolen tokens is considered consumed by the caller, and the rest is credited to
+     * {@code dst}.
+     *
+     * @param dst counter receiving the stolen tokens (minus the consumed one)
+     * @param src counter to steal from
+     * @return {@code true} if at least one token was stolen
+     */
+    private boolean trySteal(Counter dst, Counter src) {
+        if (src.val() == 0) {
+            return false;
+        }
+        int stolen = src.steal();
+        if (stolen <= 0) {
+            return false;
+        }
+        dst.add(stolen - 1); // borrow one!
+        return true;
+    }
+
+    /**
+     * Daemon thread that advances the quantum. Roughly every millisecond it derives the
+     * quantum id from the wall-clock time elapsed since {@code timeBase}. When the id
+     * changes, it publishes the new id and installs a fresh set of counters with
+     * {@code tokensPerQuantum} tokens placed in slot 0.
+     *
+     * <p>Interrupting the thread stops it, whether the interrupt arrives while it is
+     * sleeping or while it is running.
+     */
+    class StampUpdater extends Thread {
+        public StampUpdater() {
+            setDaemon(true);
+            setPriority(MAX_PRIORITY);
+        }
+
+        @Override
+        public void run() {
+            int lastQuantId = 0;
+            while (!Thread.interrupted()) {
+                int quantId = (int) ((System.currentTimeMillis() - timeBase) / MS_PER_QUANTUM);
+                if (quantId != lastQuantId) {
+                    // all tokens start in slot 0; other slots obtain theirs by stealing
+                    Counter[] cnts = newCounters();
+                    cnts[0].add(tokensPerQuantum);
+
+                    currentQuant = quantId;
+                    lastQuantId = quantId;
+
+                    STATE.set(MultiTokenBucket.this, new Counters(cnts, quantId));
+                }
+                try {
+                    Thread.sleep(1);
+                } catch (InterruptedException e) {
+                    // sleep() clears the interrupt status when it throws, so the loop
+                    // condition would never see it: stop here instead
+                    return;
+                }
+            }
+        }
+    }
+
+    /** Holder pairing the per-slot counters with the id of the quantum they belong to. */
+    static class Counters {
+        private final Counter[] states;
+        private final int time;
+
+        /**
+         * @param slotCounters per-slot counters for the quantum
+         * @param quantumId    id of the quantum the counters were created for
+         */
+        public Counters(Counter[] slotCounters, int quantumId) {
+            states = slotCounters;
+            time = quantumId;
+        }
+    }
+
+    /**
+     * The actual token counter: a volatile {@code int} manipulated through an
+     * {@link AtomicIntegerFieldUpdater}. The count never drops below zero.
+     */
+    private static class Counter_Payload extends Counter_B1 {
+        static final AtomicIntegerFieldUpdater<Counter_Payload> CURRENT =
+                AtomicIntegerFieldUpdater.newUpdater(Counter_Payload.class, "cnt");
+
+        volatile int cnt;
+
+        /** @return the current number of tokens */
+        int val() {
+            return CURRENT.get(this);
+        }
+
+        /**
+         * Takes one token if any is available.
+         *
+         * <p>An empty counter is left untouched. Decrementing it unconditionally would
+         * drive it negative, and that debt would then swallow tokens credited later
+         * via {@link #add(int)}.
+         *
+         * @return the value observed before the call; {@code >= 1} means a token was taken
+         */
+        int dec() {
+            while (true) {
+                int val = CURRENT.get(this);
+                if (val <= 0) {
+                    return val;
+                }
+                if (CURRENT.compareAndSet(this, val, val - 1)) {
+                    return val;
+                }
+            }
+        }
+
+        /**
+         * Removes half of the available tokens, rounded up, and returns how many were
+         * removed. Rounding up matters for the last token: with a single token left,
+         * rounding down would steal nothing, and that token could only ever be used by
+         * a thread mapped to this very slot.
+         *
+         * @return number of tokens removed, or 0 if the counter is empty
+         */
+        public int steal() {
+            while (true) {
+                int val = CURRENT.get(this);
+                if (val <= 0) {
+                    return 0;
+                }
+                int remain = val / 2;
+                int steal = val - remain;
+                if (CURRENT.compareAndSet(this, val, remain)) {
+                    return steal;
+                }
+            }
+        }
+
+        /** Adds {@code val} tokens to the counter. */
+        void add(int val) {
+            CURRENT.addAndGet(this, val);
+        }
+    }
+
+    /**
+     * Superclass of {@link Counter_Payload} that declares 256 boolean fields which are
+     * never read or written. Presumably padding that keeps the payload's {@code cnt}
+     * field away from whatever precedes the object in memory, to avoid false sharing;
+     * the actual layout is up to the JVM.
+     */
+    private static class Counter_B1 {
+        boolean p000, p001, p002, p003, p004, p005, p006, p007, p008, p009, p010, p011, p012, p013, p014, p015;
+        boolean p016, p017, p018, p019, p020, p021, p022, p023, p024, p025, p026, p027, p028, p029, p030, p031;
+        boolean p032, p033, p034, p035, p036, p037, p038, p039, p040, p041, p042, p043, p044, p045, p046, p047;
+        boolean p048, p049, p050, p051, p052, p053, p054, p055, p056, p057, p058, p059, p060, p061, p062, p063;
+        boolean p064, p065, p066, p067, p068, p069, p070, p071, p072, p073, p074, p075, p076, p077, p078, p079;
+        boolean p080, p081, p082, p083, p084, p085, p086, p087, p088, p089, p090, p091, p092, p093, p094, p095;
+        boolean p096, p097, p098, p099, p100, p101, p102, p103, p104, p105, p106, p107, p108, p109, p110, p111;
+        boolean p112, p113, p114, p115, p116, p117, p118, p119, p120, p121, p122, p123, p124, p125, p126, p127;
+        boolean p128, p129, p130, p131, p132, p133, p134, p135, p136, p137, p138, p139, p140, p141, p142, p143;
+        boolean p144, p145, p146, p147, p148, p149, p150, p151, p152, p153, p154, p155, p156, p157, p158, p159;
+        boolean p160, p161, p162, p163, p164, p165, p166, p167, p168, p169, p170, p171, p172, p173, p174, p175;
+        boolean p176, p177, p178, p179, p180, p181, p182, p183, p184, p185, p186, p187, p188, p189, p190, p191;
+        boolean p192, p193, p194, p195, p196, p197, p198, p199, p200, p201, p202, p203, p204, p205, p206, p207;
+        boolean p208, p209, p210, p211, p212, p213, p214, p215, p216, p217, p218, p219, p220, p221, p222, p223;
+        boolean p224, p225, p226, p227, p228, p229, p230, p231, p232, p233, p234, p235, p236, p237, p238, p239;
+        boolean p240, p241, p242, p243, p244, p245, p246, p247, p248, p249, p250, p251, p252, p253, p254, p255;
+    }
+
+    /**
+     * Subclass of {@link Counter_Payload} that declares another 256 boolean fields which
+     * are never read or written. Presumably the trailing counterpart of the padding in
+     * {@link Counter_B1}, keeping {@code cnt} away from whatever follows the object in
+     * memory; the actual layout is up to the JVM.
+     */
+    private static class Counter_B2 extends Counter_Payload {
+        boolean p000, p001, p002, p003, p004, p005, p006, p007, p008, p009, p010, p011, p012, p013, p014, p015;
+        boolean p016, p017, p018, p019, p020, p021, p022, p023, p024, p025, p026, p027, p028, p029, p030, p031;
+        boolean p032, p033, p034, p035, p036, p037, p038, p039, p040, p041, p042, p043, p044, p045, p046, p047;
+        boolean p048, p049, p050, p051, p052, p053, p054, p055, p056, p057, p058, p059, p060, p061, p062, p063;
+        boolean p064, p065, p066, p067, p068, p069, p070, p071, p072, p073, p074, p075, p076, p077, p078, p079;
+        boolean p080, p081, p082, p083, p084, p085, p086, p087, p088, p089, p090, p091, p092, p093, p094, p095;
+        boolean p096, p097, p098, p099, p100, p101, p102, p103, p104, p105, p106, p107, p108, p109, p110, p111;
+        boolean p112, p113, p114, p115, p116, p117, p118, p119, p120, p121, p122, p123, p124, p125, p126, p127;
+        boolean p128, p129, p130, p131, p132, p133, p134, p135, p136, p137, p138, p139, p140, p141, p142, p143;
+        boolean p144, p145, p146, p147, p148, p149, p150, p151, p152, p153, p154, p155, p156, p157, p158, p159;
+        boolean p160, p161, p162, p163, p164, p165, p166, p167, p168, p169, p170, p171, p172, p173, p174, p175;
+        boolean p176, p177, p178, p179, p180, p181, p182, p183, p184, p185, p186, p187, p188, p189, p190, p191;
+        boolean p192, p193, p194, p195, p196, p197, p198, p199, p200, p201, p202, p203, p204, p205, p206, p207;
+        boolean p208, p209, p210, p211, p212, p213, p214, p215, p216, p217, p218, p219, p220, p221, p222, p223;
+        boolean p224, p225, p226, p227, p228, p229, p230, p231, p232, p233, p234, p235, p236, p237, p238, p239;
+        boolean p240, p241, p242, p243, p244, p245, p246, p247, p248, p249, p250, p251, p252, p253, p254, p255;
+    }
+
+    /**
+     * Concrete counter type used by the bucket: inherits the token count and its
+     * operations from {@link Counter_Payload}, together with the unused fields declared
+     * in {@link Counter_B1} and {@link Counter_B2}.
+     */
+    private static class Counter extends Counter_B2 {
+
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/util/ratelimit/RateLimiter.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,7 @@
+package org.openjdk.gcbench.util.ratelimit;
+
+public interface RateLimiter {
+    /** Throttling hook; TokenBucket's implementation blocks the calling thread until a token is claimed. */
+    void limit();
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/util/ratelimit/TokenBucket.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,64 @@
+package org.openjdk.gcbench.util.ratelimit;
+
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+
+public class TokenBucket implements RateLimiter {
+    // Token bucket without locks: wall-clock time is cut into quanta, each quantum carries a fixed token budget.
+    static final int QUANTA_PER_SEC = 5;
+    static final int MS_PER_QUANTUM = 1000 / QUANTA_PER_SEC;
+
+    static final AtomicLongFieldUpdater<TokenBucket> STATE =
+            AtomicLongFieldUpdater.newUpdater(TokenBucket.class, "state");
+
+    private final int tokensPerQuantum;
+    private final long timeBase; // construction time; quantum ids are counted from here
+    private volatile long state; // packed word: quantum id in the high 32 bits, remaining tokens in the low bits
+
+    public TokenBucket(int ratePerSec) { // rate is integer-divided across the quanta, never below 1 token per quantum
+        this.tokensPerQuantum = Math.max(1, ratePerSec / QUANTA_PER_SEC);
+        this.timeBase = System.currentTimeMillis();
+    }
+
+    private static int timestamp(long l) { // quantum id: high 32 bits of a state word
+        return (int)(l >> 32);
+    }
+
+    private static int tokens(long l) { // remaining tokens: low 31 bits of a state word
+        return (int)(l & 0x7FFFFFFF);
+    }
+
+    private static long pack(int timestamp, int tokens) { // tokens must be non-negative to stay in the low half
+        return ((long)timestamp << 32) + tokens;
+    }
+
+    @Override
+    public void limit() { // blocks (polling in 1 ms sleeps) until one token is claimed
+        boolean interrupted = false; // an interrupt never aborts the wait; it is re-asserted on return
+        while (true) {
+            int quantId = (int) ((System.currentTimeMillis() - timeBase) / MS_PER_QUANTUM);
+            long cur = STATE.get(this);
+            int time = timestamp(cur);
+            int tokens = tokens(cur);
+
+            if (time == quantId && tokens != 0) {
+                // current quantum has tokens, try to claim one
+                if (!STATE.compareAndSet(this, cur, pack(quantId, tokens - 1))) {
+                    continue; // lost the race: immediate respin
+                }
+                if (interrupted) {
+                    Thread.currentThread().interrupt(); // hand the swallowed interrupt back to the caller
+                }
+                return; // success
+            } else if (time <= quantId) {
+                // stored quantum is spent or in the past: try to install the next one, then wait for it
+                STATE.compareAndSet(this, cur, pack(quantId + 1, tokensPerQuantum));
+            }
+
+            try {
+                Thread.sleep(1); // no rush: wait before respinning
+            } catch (InterruptedException e) {
+                interrupted = true; // sleep() cleared the flag; remember it instead of dropping it
+            }
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/LinkedListGC.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,37 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.LinkedList;
+import java.util.concurrent.TimeUnit;
+
+@State(Scope.Benchmark)
+@Threads(Threads.MAX)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(value = 1, jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+public class LinkedListGC {
+    // Keeps a linked list of `size` objects reachable while the benchmark threads do nothing but sleep.
+
+    @Param({"1", "10", "100", "1000"})
+    private int size;
+
+    LinkedList<Object> list;
+
+    @Setup
+    public void setup() {
+        LinkedList<Object> fresh = new LinkedList<>();
+        while (fresh.size() < size) {
+            fresh.add(new Object());
+        }
+        list = fresh; // publish only once fully populated
+    }
+
+    @Benchmark
+    public void test() throws InterruptedException {
+        Thread.sleep(100); // the measured operation is just a 100 ms pause
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/LinkedListTraversal.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,192 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.LinkedList;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1, jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(Threads.MAX)
+@State(Scope.Benchmark)
+public class LinkedListTraversal {
+
+    LinkedList<Object> list;
+
+    @Param({"1", "10", "100", "1000"})
+    private int size;
+
+    @Param({"0", "1", "10", "100"})
+    private int work;
+
+    @Setup
+    public void setup() {
+        list = new LinkedList<>(); // start from an empty list on every setup
+        for (int remaining = size; remaining > 0; remaining--) {
+            list.add(new Object()); // one distinct payload object per requested element
+        }
+    }
+
+    @Benchmark
+    public void test() { // walks the whole list once per invocation, handing every element to process()
+        for (Object o : list) {
+            process(o);
+        }
+    }
+
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    private void process(Object o) { // the element itself is not used; only `work` tokens of CPU are consumed
+        Blackhole.consumeCPU(work);
+    }
+
+    /*
+        i7 4790K, 4.0 GHz, Linux x86_64, JDK 9 (Shenandoah, 2016-09-05)
+
+            Benchmark                 (size)  (work)  Mode  Cnt       Score      Error  Units
+
+            # ------------- Shenandoah
+            LinkedListTraversal.test       1       0  avgt    5      17.388 ±    0.424  ns/op
+            LinkedListTraversal.test       1       1  avgt    5      18.755 ±    0.102  ns/op
+            LinkedListTraversal.test       1      10  avgt    5      33.846 ±    0.185  ns/op
+            LinkedListTraversal.test       1     100  avgt    5     230.246 ±    4.150  ns/op
+
+            LinkedListTraversal.test      10       0  avgt    5     126.915 ±    0.619  ns/op
+            LinkedListTraversal.test      10       1  avgt    5     139.774 ±    1.138  ns/op
+            LinkedListTraversal.test      10      10  avgt    5     299.499 ±    0.536  ns/op
+            LinkedListTraversal.test      10     100  avgt    5    2275.352 ±  108.207  ns/op
+
+            LinkedListTraversal.test     100       0  avgt    5    1242.132 ±   12.020  ns/op
+            LinkedListTraversal.test     100       1  avgt    5    1362.940 ±   18.402  ns/op
+            LinkedListTraversal.test     100      10  avgt    5    2937.820 ±   88.949  ns/op
+            LinkedListTraversal.test     100     100  avgt    5   22461.248 ±  215.900  ns/op
+
+            LinkedListTraversal.test    1000       0  avgt    5   12407.080 ±  784.980  ns/op
+            LinkedListTraversal.test    1000       1  avgt    5   13568.390 ±   69.637  ns/op
+            LinkedListTraversal.test    1000      10  avgt    5   28889.154 ±  156.406  ns/op
+            LinkedListTraversal.test    1000     100  avgt    5  224457.455 ± 1615.782  ns/op
+
+            # ------------- G1
+            LinkedListTraversal.test       1       0  avgt    5      11.554 ±    0.044  ns/op
+            LinkedListTraversal.test       1       1  avgt    5      13.590 ±    0.512  ns/op
+            LinkedListTraversal.test       1      10  avgt    5      29.012 ±    0.098  ns/op
+            LinkedListTraversal.test       1     100  avgt    5     224.851 ±    1.311  ns/op
+
+            LinkedListTraversal.test      10       0  avgt    5     101.351 ±    0.610  ns/op
+            LinkedListTraversal.test      10       1  avgt    5     114.732 ±    0.477  ns/op
+            LinkedListTraversal.test      10      10  avgt    5     278.078 ±    2.164  ns/op
+            LinkedListTraversal.test      10     100  avgt    5    2258.527 ±   95.300  ns/op
+
+            LinkedListTraversal.test     100       0  avgt    5     997.334 ±    3.157  ns/op
+            LinkedListTraversal.test     100       1  avgt    5    1129.239 ±    7.287  ns/op
+            LinkedListTraversal.test     100      10  avgt    5    2692.583 ±   46.002  ns/op
+            LinkedListTraversal.test     100     100  avgt    5   22185.850 ±  119.166  ns/op
+
+            LinkedListTraversal.test    1000       0  avgt    5   10459.159 ±  619.495  ns/op
+            LinkedListTraversal.test    1000       1  avgt    5   11306.074 ±   57.860  ns/op
+            LinkedListTraversal.test    1000      10  avgt    5   26786.378 ±   98.852  ns/op
+            LinkedListTraversal.test    1000     100  avgt    5  223273.974 ± 1668.267  ns/op
+
+            # ------------- Parallel
+            LinkedListTraversal.test       1       0  avgt    5      12.045 ±    0.102  ns/op
+            LinkedListTraversal.test       1       1  avgt    5      13.521 ±    0.109  ns/op
+            LinkedListTraversal.test       1      10  avgt    5      28.933 ±    0.170  ns/op
+            LinkedListTraversal.test       1     100  avgt    5     224.028 ±    1.007  ns/op
+
+            LinkedListTraversal.test      10       0  avgt    5      99.943 ±    0.319  ns/op
+            LinkedListTraversal.test      10       1  avgt    5     113.020 ±    0.472  ns/op
+            LinkedListTraversal.test      10      10  avgt    5     278.874 ±    0.981  ns/op
+            LinkedListTraversal.test      10     100  avgt    5    2240.404 ±   68.685  ns/op
+
+            LinkedListTraversal.test     100       0  avgt    5     984.926 ±    9.101  ns/op
+            LinkedListTraversal.test     100       1  avgt    5    1123.761 ±   22.491  ns/op
+            LinkedListTraversal.test     100      10  avgt    5    2686.100 ±   27.615  ns/op
+            LinkedListTraversal.test     100     100  avgt    5   22200.145 ±  137.753  ns/op
+
+            LinkedListTraversal.test    1000       0  avgt    5   10608.240 ± 1392.802  ns/op
+            LinkedListTraversal.test    1000       1  avgt    5   11415.247 ±  448.812  ns/op
+            LinkedListTraversal.test    1000      10  avgt    5   27146.258 ±  786.984  ns/op
+            LinkedListTraversal.test    1000     100  avgt    5  223566.680 ± 4465.310  ns/op
+
+        Bottom line: Shenandoah is slower than G1 and Parallel, most visibly when the work
+        associated with each element is small. In the "no operation" mode (work = 0) the overhead
+        is around 25%; it is explained by the extra instructions Shenandoah emits, which perform
+        more memory accesses, see:
+
+            Benchmark                                       (size)  (work)  Mode  Cnt      Score      Error  Units
+
+            # ------------- Shenandoah
+            LinkedListTraversal.test                          1000       0  avgt   50  12354.399 ±   23.683  ns/op
+            LinkedListTraversal.test:CPI                      1000       0  avgt   10      0.581 ±    0.003   #/op
+            LinkedListTraversal.test:L1-dcache-load-misses    1000       0  avgt   10    664.917 ±    9.217   #/op
+            LinkedListTraversal.test:L1-dcache-loads          1000       0  avgt   10  32601.580 ±  374.103   #/op  <--- !!!
+            LinkedListTraversal.test:L1-dcache-stores         1000       0  avgt   10  15101.405 ±  176.765   #/op
+            LinkedListTraversal.test:branch-misses            1000       0  avgt   10      6.186 ±    0.311   #/op
+            LinkedListTraversal.test:branches                 1000       0  avgt   10  13746.542 ±  154.449   #/op
+            LinkedListTraversal.test:bus-cycles               1000       0  avgt   10   1268.583 ±   13.952   #/op
+            LinkedListTraversal.test:cycles                   1000       0  avgt   10  45603.892 ±  502.818   #/op
+            LinkedListTraversal.test:dTLB-load-misses         1000       0  avgt   10      1.584 ±    0.228   #/op
+            LinkedListTraversal.test:dTLB-loads               1000       0  avgt    9  32488.220 ±  487.012   #/op
+            LinkedListTraversal.test:dTLB-store-misses        1000       0  avgt   10      0.046 ±    0.017   #/op
+            LinkedListTraversal.test:dTLB-stores              1000       0  avgt   10  15058.174 ±  175.262   #/op
+            LinkedListTraversal.test:instructions             1000       0  avgt   10  78513.188 ±  756.405   #/op  <--- !!!
+            LinkedListTraversal.test:ref-cycles               1000       0  avgt   10  50832.373 ±  505.813   #/op
+
+            # ------------- G1
+            LinkedListTraversal.test                          1000       0  avgt   50  10495.699 ±   65.219  ns/op
+            LinkedListTraversal.test:CPI                      1000       0  avgt   10      0.618 ±    0.014   #/op
+            LinkedListTraversal.test:L1-dcache-load-misses    1000       0  avgt   10    377.307 ±   12.210   #/op
+            LinkedListTraversal.test:L1-dcache-loads          1000       0  avgt   10  22606.939 ±  518.441   #/op
+            LinkedListTraversal.test:L1-dcache-stores         1000       0  avgt   10  16238.168 ±  329.001   #/op
+            LinkedListTraversal.test:branch-misses            1000       0  avgt   10      4.739 ±    0.523   #/op
+            LinkedListTraversal.test:branches                 1000       0  avgt   10   9589.771 ±  226.666   #/op
+            LinkedListTraversal.test:bus-cycles               1000       0  avgt   10   1077.738 ±   21.578   #/op
+            LinkedListTraversal.test:cycles                   1000       0  avgt   10  38816.240 ±  706.088   #/op
+            LinkedListTraversal.test:dTLB-load-misses         1000       0  avgt   10      1.343 ±    0.182   #/op
+            LinkedListTraversal.test:dTLB-loads               1000       0  avgt   10  22527.332 ±  500.666   #/op
+            LinkedListTraversal.test:dTLB-store-misses        1000       0  avgt    8      0.047 ±    0.058   #/op
+            LinkedListTraversal.test:dTLB-stores              1000       0  avgt    8  16151.683 ±  322.656   #/op
+            LinkedListTraversal.test:instructions             1000       0  avgt   10  62848.740 ± 1555.431   #/op
+            LinkedListTraversal.test:ref-cycles               1000       0  avgt   10  43275.574 ±  748.133   #/op
+
+            # ------------- Parallel
+            LinkedListTraversal.test                          1000       0  avgt   50  10507.829 ±   65.413  ns/op
+            LinkedListTraversal.test:CPI                      1000       0  avgt   10      0.617 ±    0.008   #/op
+            LinkedListTraversal.test:L1-dcache-load-misses    1000       0  avgt    9    379.135 ±   16.184   #/op
+            LinkedListTraversal.test:L1-dcache-loads          1000       0  avgt    9  22681.509 ±  390.124   #/op
+            LinkedListTraversal.test:L1-dcache-stores         1000       0  avgt    9  16162.990 ±  244.165   #/op
+            LinkedListTraversal.test:branch-misses            1000       0  avgt   10      4.777 ±    0.553   #/op
+            LinkedListTraversal.test:branches                 1000       0  avgt   10   9537.536 ±  122.677   #/op
+            LinkedListTraversal.test:bus-cycles               1000       0  avgt   10   1072.327 ±   19.018   #/op
+            LinkedListTraversal.test:cycles                   1000       0  avgt   10  38632.620 ±  934.336   #/op
+            LinkedListTraversal.test:dTLB-load-misses         1000       0  avgt    8      1.338 ±    0.148   #/op
+            LinkedListTraversal.test:dTLB-loads               1000       0  avgt    9  22627.587 ±  392.352   #/op
+            LinkedListTraversal.test:dTLB-store-misses        1000       0  avgt    8      0.035 ±    0.016   #/op
+            LinkedListTraversal.test:dTLB-stores              1000       0  avgt    9  16104.147 ±  230.044   #/op
+            LinkedListTraversal.test:instructions             1000       0  avgt   10  62623.520 ± 1394.639   #/op
+            LinkedListTraversal.test:ref-cycles               1000       0  avgt   10  42984.093 ± 1044.819   #/op
+
+        These additional memory accesses are read barriers, e.g.:
+
+        Shenandoah:
+
+              0.88%    0.54%  0x00007f2029549703: mov    -0x8(%rcx),%r10    ; implicit exception: dispatches to 0x00007f2029549cb9
+              1.18%    0.95%  0x00007f2029549707: mov    0x10(%r10),%r10d   ;*getfield next {reexecute=0 rethrow=0 return_oop=0}
+                                                                            ; - java.util.LinkedList$ListItr::next@32 (line 897)
+                                                                            ; - org.openjdk.shenandoah.scenarios.LinkedListTraversal::test@18 (line 36)
+                                                                            ; - org.openjdk.shenandoah.scenarios.generated.LinkedListTraversal_test_jmhTest::test_avgt_jmhStub@15 (line 213)
+
+        G1/Parallel:
+
+              0.95%    0.72%  0x00007fe7c554707b: mov    0x10(%r11),%r10d   ;*getfield next {reexecute=0 rethrow=0 return_oop=0}
+                                                                            ; - java.util.LinkedList$ListItr::next@32 (line 897)
+                                                                            ; - org.openjdk.shenandoah.scenarios.LinkedListTraversal::test@18 (line 36)
+                                                                            ; - org.openjdk.shenandoah.scenarios.generated.LinkedListTraversal_test_jmhTest::test_avgt_jmhStub@15 (line 213)
+                                                                            ; implicit exception: dispatches to 0x00007fe7c5547305
+
+     */
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/ReadWriteBarriers.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,38 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+@State(Scope.Benchmark)
+@Threads(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(1)
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+public class ReadWriteBarriers {
+    // Two ways of bumping one shared AtomicInteger: an explicit get + compareAndSet pair, and incrementAndGet.
+    AtomicInteger ai;
+
+    @Setup
+    public void setup() {
+        this.ai = new AtomicInteger();
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public Object cas() { // NOTE(review): Object return type means the boolean outcome is autoboxed
+        final AtomicInteger counter = this.ai;
+        final int seen = counter.get();
+        return counter.compareAndSet(seen, seen + 1);
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public Object incrAndGet() { // NOTE(review): the incremented int is likewise returned boxed
+        return this.ai.incrementAndGet();
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/Synchronizers.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,78 @@
+package org.openjdk.gcbench.wip;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1, jvmArgsAppend = "-Xss32m")
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Threads(Threads.MAX)
+@State(Scope.Benchmark)
+public class Synchronizers {
+    // Takes the monitors of the list elements ten at a time, nesting deeper through recursion.
+    List<Object> list;
+
+    @Param({"40000"})
+    private int size;
+
+    @Setup
+    public void setup() {
+        list = new ArrayList<>();
+        for (int c = 0; c < size; c++) {
+            list.add(new Object());
+        }
+    }
+
+    @Benchmark
+    public void test(Blackhole bh) throws InterruptedException {
+        recursiveLock(bh, list, 0);
+    }
+
+    // Locks elements i..i+9, then recurses for the next ten while still holding them.
+    private void recursiveLock(Blackhole bh, List<Object> list, int i) {
+        if (list.size() - i >= 10) { // need a full group of ten; a shorter tail would index past the end
+            Object o0 = list.get(i + 0);
+            Object o1 = list.get(i + 1);
+            Object o2 = list.get(i + 2);
+            Object o3 = list.get(i + 3);
+            Object o4 = list.get(i + 4);
+            Object o5 = list.get(i + 5);
+            Object o6 = list.get(i + 6);
+            Object o7 = list.get(i + 7);
+            Object o8 = list.get(i + 8);
+            Object o9 = list.get(i + 9);
+            synchronized (o0) {
+                synchronized (o1) {
+                    synchronized (o2) {
+                        synchronized (o3) {
+                            synchronized (o4) {
+                                synchronized (o5) {
+                                    synchronized (o6) {
+                                        synchronized (o7) {
+                                            synchronized (o8) {
+                                                synchronized (o9) {
+                                                    recursiveLock(bh, list, i + 10);
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        } else { // deepest frame: every monitor taken on the way down is still held here
+            for (int c = 0; c < size; c++) {
+                bh.consume(list.get(c).hashCode());
+                list.set(c, new Object());
+            }
+        }
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/WeakCasBarriers.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,36 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Threads(1)
+@State(Scope.Benchmark)
+public class WeakCasBarriers {
+    // Flips one AtomicReference between two fixed objects with a pair of weak CAS operations.
+    AtomicReference<Object> ai;
+
+    Object t1, t2;
+
+    @Setup
+    public void setup() {
+        t1 = new Object();
+        t2 = new Object();
+        ai = new AtomicReference<>(t1); // start at t1: from null both CAS calls below could never succeed
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() { // t1 -> t2, then t2 -> t1, leaving the reference where it started
+        AtomicReference<Object> ai = this.ai;
+        ai.weakCompareAndSet(t1, t2);
+        ai.weakCompareAndSet(t2, t1);
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/WeakRefs.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,62 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.lang.ref.ReferenceQueue;
+import java.lang.ref.WeakReference;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+@State(Scope.Benchmark)
+@Threads(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(1)
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+public class WeakRefs {
+    // Drops strong references at random and re-creates target + weak reference for every ref the queue hands back.
+    @Param({"1", "10", "100", "1000", "10000"})
+    int count;
+
+    ReferenceQueue<Target> refq;
+    Target[] targets;
+    MyRef[] refs;
+
+    @Setup
+    public void setup() {
+        this.refq = new ReferenceQueue<>();
+        this.targets = new Target[count];
+        this.refs = new MyRef[count];
+        for (int slot = 0; slot < count; slot++) {
+            Target fresh = new Target();
+            this.targets[slot] = fresh;
+            this.refs[slot] = new MyRef(refq, fresh, slot);
+        }
+    }
+
+    @Benchmark
+    public void test() {
+        targets[ThreadLocalRandom.current().nextInt(count)] = null;
+
+        // drain whatever has been enqueued so far, refilling each reported slot
+        for (MyRef dead = (MyRef)refq.poll(); dead != null; dead = (MyRef)refq.poll()) {
+            Target fresh = new Target();
+            targets[dead.index] = fresh;
+            refs[dead.index] = new MyRef(refq, fresh, dead.index);
+        }
+    }
+
+    static class MyRef extends WeakReference<Target> {
+        int index; // slot in targets/refs this reference belongs to
+        public MyRef(ReferenceQueue<Target> refq, Target obj, int idx) {
+            super(obj, refq);
+            index = idx;
+        }
+    }
+
+    static class Target {
+        // marker type, no state
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/WriteBarriersKnownNew.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,35 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@State(Scope.Benchmark)
+@Threads(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(1)
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+public class WriteBarriersKnownNew {
+    // Stores a freshly allocated object into a reference field of a long-lived holder.
+    Target target;
+    Target targetNull; // assigned in setup() but not read by the benchmark in this class
+
+    @Setup
+    public void setup() {
+        this.targetNull = null;
+        this.target = new Target();
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void test() {
+        this.target.field = new Object();
+    }
+
+    static class Target {
+        Object field;
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/main/java/org/openjdk/gcbench/wip/WriteBarriersNullCheck.java	Wed Nov 23 16:04:53 2016 +0100
@@ -0,0 +1,45 @@
+package org.openjdk.shenandoah.wip;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+@State(Scope.Benchmark)
+@Threads(1)
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(1)
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+public class WriteBarriersNullCheck {
+    // Null stores: one through a null holder (throws and catches NPE), one into a live holder.
+    Target target;
+    Target targetNull;
+
+    @Setup
+    public void setup() {
+        this.targetNull = null;
+        this.target = new Target();
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void storeToNull() {
+        try {
+            this.targetNull.field = null;
+        } catch (NullPointerException expected) {
+            // the holder is always null here, so the NPE is the path being measured
+        }
+    }
+
+    @Benchmark
+    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+    public void storeNull() {
+        this.target.field = null;
+    }
+
+    static class Target {
+        Object field;
+    }
+
+}