Mercurial > hg > openjdk > jdk9 > hotspot

--- a/src/cpu/aarch64/vm/aarch64.ad	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/aarch64.ad	Thu Jun 04 18:50:05 2015 -0700
@@ -161,70 +161,165 @@
 // the platform ABI treats v8-v15 as callee save). float registers
 // v16-v31 are SOC as per the platform spec

-  reg_def V0   ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()         );
-  reg_def V0_H ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next() );
-  reg_def V1   ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()         );
-  reg_def V1_H ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next() );
-  reg_def V2   ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()         );
-  reg_def V2_H ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next() );
-  reg_def V3   ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()         );
-  reg_def V3_H ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next() );
-  reg_def V4   ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()         );
-  reg_def V4_H ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next() );
-  reg_def V5   ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()         );
-  reg_def V5_H ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next() );
-  reg_def V6   ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()         );
-  reg_def V6_H ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next() );
-  reg_def V7   ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()         );
-  reg_def V7_H ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next() );
-  reg_def V8   ( SOC, SOE, Op_RegF,  8, v8->as_VMReg()         );
-  reg_def V8_H ( SOC, SOE, Op_RegF,  8, v8->as_VMReg()->next() );
-  reg_def V9   ( SOC, SOE, Op_RegF,  9, v9->as_VMReg()         );
-  reg_def V9_H ( SOC, SOE, Op_RegF,  9, v9->as_VMReg()->next() );
-  reg_def V10  ( SOC, SOE, Op_RegF, 10, v10->as_VMReg()        );
-  reg_def V10_H( SOC, SOE, Op_RegF, 10, v10->as_VMReg()->next());
-  reg_def V11  ( SOC, SOE, Op_RegF, 11, v11->as_VMReg()        );
-  reg_def V11_H( SOC, SOE, Op_RegF, 11, v11->as_VMReg()->next());
-  reg_def V12  ( SOC, SOE, Op_RegF, 12, v12->as_VMReg()        );
-  reg_def V12_H( SOC, SOE, Op_RegF, 12, v12->as_VMReg()->next());
-  reg_def V13  ( SOC, SOE, Op_RegF, 13, v13->as_VMReg()        );
-  reg_def V13_H( SOC, SOE, Op_RegF, 13, v13->as_VMReg()->next());
-  reg_def V14  ( SOC, SOE, Op_RegF, 14, v14->as_VMReg()        );
-  reg_def V14_H( SOC, SOE, Op_RegF, 14, v14->as_VMReg()->next());
-  reg_def V15  ( SOC, SOE, Op_RegF, 15, v15->as_VMReg()        );
-  reg_def V15_H( SOC, SOE, Op_RegF, 15, v15->as_VMReg()->next());
-  reg_def V16  ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()        );
-  reg_def V16_H( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next());
-  reg_def V17  ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()        );
-  reg_def V17_H( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next());
-  reg_def V18  ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()        );
-  reg_def V18_H( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next());
-  reg_def V19  ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()        );
-  reg_def V19_H( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next());
-  reg_def V20  ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()        );
-  reg_def V20_H( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next());
-  reg_def V21  ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()        );
-  reg_def V21_H( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next());
-  reg_def V22  ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()        );
-  reg_def V22_H( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next());
-  reg_def V23  ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()        );
-  reg_def V23_H( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next());
-  reg_def V24  ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()        );
-  reg_def V24_H( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next());
-  reg_def V25  ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()        );
-  reg_def V25_H( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next());
-  reg_def V26  ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()        );
-  reg_def V26_H( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next());
-  reg_def V27  ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()        );
-  reg_def V27_H( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next());
-  reg_def V28  ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()        );
-  reg_def V28_H( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next());
-  reg_def V29  ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()        );
-  reg_def V29_H( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next());
-  reg_def V30  ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()        );
-  reg_def V30_H( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next());
-  reg_def V31  ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()        );
-  reg_def V31_H( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next());
+  reg_def V0   ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()          );
+  reg_def V0_H ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next()  );
+  reg_def V0_J ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next(2) );
+  reg_def V0_K ( SOC, SOC, Op_RegF,  0, v0->as_VMReg()->next(3) );
+
+  reg_def V1   ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()          );
+  reg_def V1_H ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next()  );
+  reg_def V1_J ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next(2) );
+  reg_def V1_K ( SOC, SOC, Op_RegF,  1, v1->as_VMReg()->next(3) );
+
+  reg_def V2   ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()          );
+  reg_def V2_H ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next()  );
+  reg_def V2_J ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next(2) );
+  reg_def V2_K ( SOC, SOC, Op_RegF,  2, v2->as_VMReg()->next(3) );
+
+  reg_def V3   ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()          );
+  reg_def V3_H ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next()  );
+  reg_def V3_J ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next(2) );
+  reg_def V3_K ( SOC, SOC, Op_RegF,  3, v3->as_VMReg()->next(3) );
+
+  reg_def V4   ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()          );
+  reg_def V4_H ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next()  );
+  reg_def V4_J ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next(2) );
+  reg_def V4_K ( SOC, SOC, Op_RegF,  4, v4->as_VMReg()->next(3) );
+
+  reg_def V5   ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()          );
+  reg_def V5_H ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next()  );
+  reg_def V5_J ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next(2) );
+  reg_def V5_K ( SOC, SOC, Op_RegF,  5, v5->as_VMReg()->next(3) );
+
+  reg_def V6   ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()          );
+  reg_def V6_H ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next()  );
+  reg_def V6_J ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next(2) );
+  reg_def V6_K ( SOC, SOC, Op_RegF,  6, v6->as_VMReg()->next(3) );
+
+  reg_def V7   ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()          );
+  reg_def V7_H ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next()  );
+  reg_def V7_J ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next(2) );
+  reg_def V7_K ( SOC, SOC, Op_RegF,  7, v7->as_VMReg()->next(3) );
+
+  reg_def V8   ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()          );
+  reg_def V8_H ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()->next()  );
+  reg_def V8_J ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()->next(2) );
+  reg_def V8_K ( SOC, SOC, Op_RegF,  8, v8->as_VMReg()->next(3) );
+
+  reg_def V9   ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()          );
+  reg_def V9_H ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()->next()  );
+  reg_def V9_J ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()->next(2) );
+  reg_def V9_K ( SOC, SOC, Op_RegF,  9, v9->as_VMReg()->next(3) );
+
+  reg_def V10  ( SOC, SOC, Op_RegF, 10, v10->as_VMReg()         );
+  reg_def V10_H( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next() );
+  reg_def V10_J( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(2));
+  reg_def V10_K( SOC, SOC, Op_RegF, 10, v10->as_VMReg()->next(3));
+
+  reg_def V11  ( SOC, SOC, Op_RegF, 11, v11->as_VMReg()         );
+  reg_def V11_H( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next() );
+  reg_def V11_J( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(2));
+  reg_def V11_K( SOC, SOC, Op_RegF, 11, v11->as_VMReg()->next(3));
+
+  reg_def V12  ( SOC, SOC, Op_RegF, 12, v12->as_VMReg()         );
+  reg_def V12_H( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next() );
+  reg_def V12_J( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(2));
+  reg_def V12_K( SOC, SOC, Op_RegF, 12, v12->as_VMReg()->next(3));
+
+  reg_def V13  ( SOC, SOC, Op_RegF, 13, v13->as_VMReg()         );
+  reg_def V13_H( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next() );
+  reg_def V13_J( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(2));
+  reg_def V13_K( SOC, SOC, Op_RegF, 13, v13->as_VMReg()->next(3));
+
+  reg_def V14  ( SOC, SOC, Op_RegF, 14, v14->as_VMReg()         );
+  reg_def V14_H( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next() );
+  reg_def V14_J( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(2));
+  reg_def V14_K( SOC, SOC, Op_RegF, 14, v14->as_VMReg()->next(3));
+
+  reg_def V15  ( SOC, SOC, Op_RegF, 15, v15->as_VMReg()         );
+  reg_def V15_H( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next() );
+  reg_def V15_J( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(2));
+  reg_def V15_K( SOC, SOC, Op_RegF, 15, v15->as_VMReg()->next(3));
+
+  reg_def V16  ( SOC, SOC, Op_RegF, 16, v16->as_VMReg()         );
+  reg_def V16_H( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next() );
+  reg_def V16_J( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(2));
+  reg_def V16_K( SOC, SOC, Op_RegF, 16, v16->as_VMReg()->next(3));
+
+  reg_def V17  ( SOC, SOC, Op_RegF, 17, v17->as_VMReg()         );
+  reg_def V17_H( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next() );
+  reg_def V17_J( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(2));
+  reg_def V17_K( SOC, SOC, Op_RegF, 17, v17->as_VMReg()->next(3));
+
+  reg_def V18  ( SOC, SOC, Op_RegF, 18, v18->as_VMReg()         );
+  reg_def V18_H( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next() );
+  reg_def V18_J( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(2));
+  reg_def V18_K( SOC, SOC, Op_RegF, 18, v18->as_VMReg()->next(3));
+
+  reg_def V19  ( SOC, SOC, Op_RegF, 19, v19->as_VMReg()         );
+  reg_def V19_H( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next() );
+  reg_def V19_J( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(2));
+  reg_def V19_K( SOC, SOC, Op_RegF, 19, v19->as_VMReg()->next(3));
+
+  reg_def V20  ( SOC, SOC, Op_RegF, 20, v20->as_VMReg()         );
+  reg_def V20_H( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next() );
+  reg_def V20_J( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(2));
+  reg_def V20_K( SOC, SOC, Op_RegF, 20, v20->as_VMReg()->next(3));
+
+  reg_def V21  ( SOC, SOC, Op_RegF, 21, v21->as_VMReg()         );
+  reg_def V21_H( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next() );
+  reg_def V21_J( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(2));
+  reg_def V21_K( SOC, SOC, Op_RegF, 21, v21->as_VMReg()->next(3));
+
+  reg_def V22  ( SOC, SOC, Op_RegF, 22, v22->as_VMReg()         );
+  reg_def V22_H( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next() );
+  reg_def V22_J( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(2));
+  reg_def V22_K( SOC, SOC, Op_RegF, 22, v22->as_VMReg()->next(3));
+
+  reg_def V23  ( SOC, SOC, Op_RegF, 23, v23->as_VMReg()         );
+  reg_def V23_H( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next() );
+  reg_def V23_J( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(2));
+  reg_def V23_K( SOC, SOC, Op_RegF, 23, v23->as_VMReg()->next(3));
+
+  reg_def V24  ( SOC, SOC, Op_RegF, 24, v24->as_VMReg()         );
+  reg_def V24_H( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next() );
+  reg_def V24_J( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(2));
+  reg_def V24_K( SOC, SOC, Op_RegF, 24, v24->as_VMReg()->next(3));
+
+  reg_def V25  ( SOC, SOC, Op_RegF, 25, v25->as_VMReg()         );
+  reg_def V25_H( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next() );
+  reg_def V25_J( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(2));
+  reg_def V25_K( SOC, SOC, Op_RegF, 25, v25->as_VMReg()->next(3));
+
+  reg_def V26  ( SOC, SOC, Op_RegF, 26, v26->as_VMReg()         );
+  reg_def V26_H( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next() );
+  reg_def V26_J( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(2));
+  reg_def V26_K( SOC, SOC, Op_RegF, 26, v26->as_VMReg()->next(3));
+
+  reg_def V27  ( SOC, SOC, Op_RegF, 27, v27->as_VMReg()         );
+  reg_def V27_H( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next() );
+  reg_def V27_J( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(2));
+  reg_def V27_K( SOC, SOC, Op_RegF, 27, v27->as_VMReg()->next(3));
+
+  reg_def V28  ( SOC, SOC, Op_RegF, 28, v28->as_VMReg()         );
+  reg_def V28_H( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next() );
+  reg_def V28_J( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(2));
+  reg_def V28_K( SOC, SOC, Op_RegF, 28, v28->as_VMReg()->next(3));
+
+  reg_def V29  ( SOC, SOC, Op_RegF, 29, v29->as_VMReg()         );
+  reg_def V29_H( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next() );
+  reg_def V29_J( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(2));
+  reg_def V29_K( SOC, SOC, Op_RegF, 29, v29->as_VMReg()->next(3));
+
+  reg_def V30  ( SOC, SOC, Op_RegF, 30, v30->as_VMReg()         );
+  reg_def V30_H( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next() );
+  reg_def V30_J( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(2));
+  reg_def V30_K( SOC, SOC, Op_RegF, 30, v30->as_VMReg()->next(3));
+
+  reg_def V31  ( SOC, SOC, Op_RegF, 31, v31->as_VMReg()         );
+  reg_def V31_H( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next() );
+  reg_def V31_J( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(2));
+  reg_def V31_K( SOC, SOC, Op_RegF, 31, v31->as_VMReg()->next(3));

 // ----------------------------
 // Special Registers
@@ -291,42 +386,42 @@
 alloc_class chunk1(

     // no save
-    V16, V16_H,
-    V17, V17_H,
-    V18, V18_H,
-    V19, V19_H,
-    V20, V20_H,
-    V21, V21_H,
-    V22, V22_H,
-    V23, V23_H,
-    V24, V24_H,
-    V25, V25_H,
-    V26, V26_H,
-    V27, V27_H,
-    V28, V28_H,
-    V29, V29_H,
-    V30, V30_H,
-    V31, V31_H,
+    V16, V16_H, V16_J, V16_K,
+    V17, V17_H, V17_J, V17_K,
+    V18, V18_H, V18_J, V18_K,
+    V19, V19_H, V19_J, V19_K,
+    V20, V20_H, V20_J, V20_K,
+    V21, V21_H, V21_J, V21_K,
+    V22, V22_H, V22_J, V22_K,
+    V23, V23_H, V23_J, V23_K,
+    V24, V24_H, V24_J, V24_K,
+    V25, V25_H, V25_J, V25_K,
+    V26, V26_H, V26_J, V26_K,
+    V27, V27_H, V27_J, V27_K,
+    V28, V28_H, V28_J, V28_K,
+    V29, V29_H, V29_J, V29_K,
+    V30, V30_H, V30_J, V30_K,
+    V31, V31_H, V31_J, V31_K,

     // arg registers
-    V0, V0_H,
-    V1, V1_H,
-    V2, V2_H,
-    V3, V3_H,
-    V4, V4_H,
-    V5, V5_H,
-    V6, V6_H,
-    V7, V7_H,
+    V0, V0_H, V0_J, V0_K,
+    V1, V1_H, V1_J, V1_K,
+    V2, V2_H, V2_J, V2_K,
+    V3, V3_H, V3_J, V3_K,
+    V4, V4_H, V4_J, V4_K,
+    V5, V5_H, V5_J, V5_K,
+    V6, V6_H, V6_J, V6_K,
+    V7, V7_H, V7_J, V7_K,

     // non-volatiles
-    V8, V8_H,
-    V9, V9_H,
-    V10, V10_H,
-    V11, V11_H,
-    V12, V12_H,
-    V13, V13_H,
-    V14, V14_H,
-    V15, V15_H,
+    V8, V8_H, V8_J, V8_K,
+    V9, V9_H, V9_J, V9_K,
+    V10, V10_H, V10_J, V10_K,
+    V11, V11_H, V11_J, V11_K,
+    V12, V12_H, V12_J, V12_K,
+    V13, V13_H, V13_J, V13_K,
+    V14, V14_H, V14_J, V14_K,
+    V15, V15_H, V15_J, V15_K,
 );

 alloc_class chunk2(RFLAGS);
@@ -770,6 +865,42 @@
     V31, V31_H
 );

+// Class for all 128bit vector registers
+reg_class vectorx_reg(
+    V0, V0_H, V0_J, V0_K,
+    V1, V1_H, V1_J, V1_K,
+    V2, V2_H, V2_J, V2_K,
+    V3, V3_H, V3_J, V3_K,
+    V4, V4_H, V4_J, V4_K,
+    V5, V5_H, V5_J, V5_K,
+    V6, V6_H, V6_J, V6_K,
+    V7, V7_H, V7_J, V7_K,
+    V8, V8_H, V8_J, V8_K,
+    V9, V9_H, V9_J, V9_K,
+    V10, V10_H, V10_J, V10_K,
+    V11, V11_H, V11_J, V11_K,
+    V12, V12_H, V12_J, V12_K,
+    V13, V13_H, V13_J, V13_K,
+    V14, V14_H, V14_J, V14_K,
+    V15, V15_H, V15_J, V15_K,
+    V16, V16_H, V16_J, V16_K,
+    V17, V17_H, V17_J, V17_K,
+    V18, V18_H, V18_J, V18_K,
+    V19, V19_H, V19_J, V19_K,
+    V20, V20_H, V20_J, V20_K,
+    V21, V21_H, V21_J, V21_K,
+    V22, V22_H, V22_J, V22_K,
+    V23, V23_H, V23_J, V23_K,
+    V24, V24_H, V24_J, V24_K,
+    V25, V25_H, V25_J, V25_K,
+    V26, V26_H, V26_J, V26_K,
+    V27, V27_H, V27_J, V27_K,
+    V28, V28_H, V28_J, V28_K,
+    V29, V29_H, V29_J, V29_K,
+    V30, V30_H, V30_J, V30_K,
+    V31, V31_H, V31_J, V31_K
+);
+
 // Class for 128 bit register v0
 reg_class v0_reg(
     V0, V0_H
@@ -1964,7 +2095,7 @@
   }

   // we have 32 float register * 2 halves
-  if (reg < 60 + 64) {
+  if (reg < 60 + 128) {
     return rc_float;
   }

@@ -2000,6 +2131,78 @@
     return 0;            // Self copy, no move.
   }

+  if (bottom_type()->isa_vect() != NULL) {
+    uint len = 4;
+    if (cbuf) {
+      MacroAssembler _masm(cbuf);
+      uint ireg = ideal_reg();
+      assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity");
+      assert(ireg == Op_VecX, "sanity");
+      if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
+        // stack->stack
+        int src_offset = ra_->reg2offset(src_lo);
+        int dst_offset = ra_->reg2offset(dst_lo);
+        assert((src_offset & 7) && (dst_offset & 7), "unaligned stack offset");
+        len = 8;
+        if (src_offset < 512) {
+          __ ldp(rscratch1, rscratch2, Address(sp, src_offset));
+        } else {
+          __ ldr(rscratch1, Address(sp, src_offset));
+          __ ldr(rscratch2, Address(sp, src_offset+4));
+          len += 4;
+        }
+        if (dst_offset < 512) {
+          __ stp(rscratch1, rscratch2, Address(sp, dst_offset));
+        } else {
+          __ str(rscratch1, Address(sp, dst_offset));
+          __ str(rscratch2, Address(sp, dst_offset+4));
+          len += 4;
+        }
+      } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
+        __ orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), __ T16B,
+               as_FloatRegister(Matcher::_regEncode[src_lo]),
+               as_FloatRegister(Matcher::_regEncode[src_lo]));
+      } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) {
+        __ str(as_FloatRegister(Matcher::_regEncode[src_lo]), __ Q,
+               Address(sp, ra_->reg2offset(dst_lo)));
+      } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) {
+        __ ldr(as_FloatRegister(Matcher::_regEncode[dst_lo]), __ Q,
+               Address(sp, ra_->reg2offset(src_lo)));
+      } else {
+        ShouldNotReachHere();
+      }
+    } else if (st) {
+      if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
+        // stack->stack
+        int src_offset = ra_->reg2offset(src_lo);
+        int dst_offset = ra_->reg2offset(dst_lo);
+        if (src_offset < 512) {
+          st->print("ldp  rscratch1, rscratch2, [sp, #%d]", src_offset);
+        } else {
+          st->print("ldr  rscratch1, [sp, #%d]", src_offset);
+          st->print("\nldr  rscratch2, [sp, #%d]", src_offset+4);
+        }
+        if (dst_offset < 512) {
+          st->print("\nstp  rscratch1, rscratch2, [sp, #%d]", dst_offset);
+        } else {
+          st->print("\nstr  rscratch1, [sp, #%d]", dst_offset);
+          st->print("\nstr  rscratch2, [sp, #%d]", dst_offset+4);
+        }
+        st->print("\t# vector spill, stack to stack");
+      } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
+        st->print("mov  %s, %s\t# vector spill, reg to reg",
+                   Matcher::regName[dst_lo], Matcher::regName[src_lo]);
+      } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) {
+        st->print("str  %s, [sp, #%d]\t# vector spill, reg to stack",
+                   Matcher::regName[src_lo], ra_->reg2offset(dst_lo));
+      } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) {
+        st->print("ldr  %s, [sp, #%d]\t# vector spill, stack to reg",
+                   Matcher::regName[dst_lo], ra_->reg2offset(src_lo));
+      }
+    }
+    return len;
+  }
+
   switch (src_lo_rc) {
   case rc_int:
     if (dst_lo_rc == rc_int) {  // gpr --> gpr copy
@@ -2422,8 +2625,12 @@

 // Vector width in bytes.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
-  // TODO fixme
-  return 0;
+  int size = MIN2(16,(int)MaxVectorSize);
+  // Minimum 2 values in vector
+  if (size < 2*type2aelembytes(bt)) size = 0;
+  // But never < 4
+  if (size < 4) size = 0;
+  return size;
 }

 // Limits on vector size (number of elements) loaded into vector.
@@ -2431,22 +2638,19 @@
   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 }
 const int Matcher::min_vector_size(const BasicType bt) {
-  int max_size = max_vector_size(bt);
-  // Min size which can be loaded into vector is 4 bytes.
-  int size = (type2aelembytes(bt) == 1) ? 4 : 2;
-  return MIN2(size,max_size);
+  //return (type2aelembytes(bt) == 1) ? 4 : 2;
+  // For the moment, only support 1 vector size, 128 bits
+  return max_vector_size(bt);
 }

 // Vector ideal reg.
 const int Matcher::vector_ideal_reg(int len) {
-  // TODO fixme
-  return Op_RegD;
+  return Op_VecX;
 }

 // Only lowest bits of xmm reg are used for vector shift count.
 const int Matcher::vector_shift_count_ideal_reg(int size) {
-  // TODO fixme
-  return Op_RegL;
+  return Op_VecX;
 }

 // AES support not yet implemented
@@ -2657,6 +2861,8 @@

 typedef void (MacroAssembler::* mem_insn)(Register Rt, const Address &adr);
 typedef void (MacroAssembler::* mem_float_insn)(FloatRegister Rt, const Address &adr);
+typedef void (MacroAssembler::* mem_vector_insn)(FloatRegister Rt,
+                                  MacroAssembler::SIMD_RegVariant T, const Address &adr);

   // Used for all non-volatile memory accesses.  The use of
   // $mem->opcode() to discover whether this pattern uses sign-extended
@@ -2724,6 +2930,18 @@
     }
   }

+  static void loadStore(MacroAssembler masm, mem_vector_insn insn,
+                         FloatRegister reg, MacroAssembler::SIMD_RegVariant T,
+                         int opcode, Register base, int index, int size, int disp)
+  {
+    if (index == -1) {
+      (masm.*insn)(reg, T, Address(base, disp));
+    } else {
+      assert(disp == 0, "unsupported address mode");
+      (masm.*insn)(reg, T, Address(base, as_Register(index), Address::lsl(size)));
+    }
+  }
+
 %}


@@ -2855,6 +3073,24 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}

+  enc_class aarch64_enc_ldrvS(vecX dst, memory mem) %{
+    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::S,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_ldrvD(vecX dst, memory mem) %{
+    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::D,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_ldrvQ(vecX dst, memory mem) %{
+    FloatRegister dst_reg = as_FloatRegister($dst$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::ldr, dst_reg, MacroAssembler::Q,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   enc_class aarch64_enc_strb(iRegI src, memory mem) %{
     Register src_reg = as_Register($src$$reg);
     loadStore(MacroAssembler(&cbuf), &MacroAssembler::strb, src_reg, $mem->opcode(),
@@ -2923,6 +3159,24 @@
                as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
   %}

+  enc_class aarch64_enc_strvS(vecX src, memory mem) %{
+    FloatRegister src_reg = as_FloatRegister($src$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::S,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_strvD(vecX src, memory mem) %{
+    FloatRegister src_reg = as_FloatRegister($src$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::D,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
+  enc_class aarch64_enc_strvQ(vecX src, memory mem) %{
+    FloatRegister src_reg = as_FloatRegister($src$$reg);
+    loadStore(MacroAssembler(&cbuf), &MacroAssembler::str, src_reg, MacroAssembler::Q,
+       $mem->opcode(), as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+  %}
+
   // END Non-volatile memory access

   // volatile loads and stores
@@ -4933,6 +5187,16 @@
   interface(REG_INTER);
 %}

+operand vecX()
+%{
+  constraint(ALLOC_IN_RC(vectorx_reg));
+  match(VecX);
+
+  op_cost(0);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
 operand vRegD_V0()
 %{
   constraint(ALLOC_IN_RC(v0_reg));
@@ -5505,6 +5769,7 @@
   interface(REG_INTER)
 %}

+opclass vmem(indirect, indIndex, indOffI, indOffL);

 //----------OPERAND CLASSES----------------------------------------------------
 // Operand Classes are groups of operands that are used as to simplify
@@ -12926,7 +13191,919 @@
   ins_pipe(pipe_class_empty);
 %}

-
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load vector (32 bits)
+instruct loadV4(vecX dst, vmem mem)
+%{
+  predicate(n->as_LoadVector()->memory_size() == 4);
+  match(Set dst (LoadVector mem));
+  ins_cost(4 * INSN_COST);
+  format %{ "ldrs   $dst,$mem\t# vector (32 bits)" %}
+  ins_encode( aarch64_enc_ldrvS(dst, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Load vector (64 bits)
+instruct loadV8(vecX dst, vmem mem)
+%{
+  predicate(n->as_LoadVector()->memory_size() == 8);
+  match(Set dst (LoadVector mem));
+  ins_cost(4 * INSN_COST);
+  format %{ "ldrd   $dst,$mem\t# vector (64 bits)" %}
+  ins_encode( aarch64_enc_ldrvD(dst, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Load Vector (128 bits)
+instruct loadV16(vecX dst, vmem mem)
+%{
+  predicate(n->as_LoadVector()->memory_size() == 16);
+  match(Set dst (LoadVector mem));
+  ins_cost(4 * INSN_COST);
+  format %{ "ldrq   $dst,$mem\t# vector (128 bits)" %}
+  ins_encode( aarch64_enc_ldrvQ(dst, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Vector (32 bits)
+instruct storeV4(vecX src, vmem mem)
+%{
+  predicate(n->as_StoreVector()->memory_size() == 4);
+  match(Set mem (StoreVector mem src));
+  ins_cost(4 * INSN_COST);
+  format %{ "strs   $mem,$src\t# vector (32 bits)" %}
+  ins_encode( aarch64_enc_strvS(src, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Vector (64 bits)
+instruct storeV8(vecX src, vmem mem)
+%{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem src));
+  ins_cost(4 * INSN_COST);
+  format %{ "strd   $mem,$src\t# vector (64 bits)" %}
+  ins_encode( aarch64_enc_strvD(src, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+// Store Vector (128 bits)
+instruct storeV16(vecX src, vmem mem)
+%{
+  predicate(n->as_StoreVector()->memory_size() == 16);
+  match(Set mem (StoreVector mem src));
+  ins_cost(4 * INSN_COST);
+  format %{ "strq   $mem,$src\t# vector (128 bits)" %}
+  ins_encode( aarch64_enc_strvQ(src, mem) );
+  ins_pipe(pipe_class_memory);
+%}
+
+instruct replicate16B(vecX dst, iRegIorL2I src)
+%{
+  match(Set dst (ReplicateB src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (16B)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate16B_imm(vecX dst, immI con)
+%{
+  match(Set dst (ReplicateB con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(16B)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T16B, $con$$constant);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate8S(vecX dst, iRegIorL2I src)
+%{
+  match(Set dst (ReplicateS src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (8S)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T8H, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate8S_imm(vecX dst, immI con)
+%{
+  match(Set dst (ReplicateS con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(8H)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T8H, $con$$constant);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4I(vecX dst, iRegIorL2I src)
+%{
+  match(Set dst (ReplicateI src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (4I)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T4S, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4I_imm(vecX dst, immI con)
+%{
+  match(Set dst (ReplicateI con));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $con\t# vector(4I)" %}
+  ins_encode %{
+    __ mov(as_FloatRegister($dst$$reg), __ T4S, $con$$constant);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2L(vecX dst, iRegL src)
+%{
+  match(Set dst (ReplicateL src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2L)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2D, as_Register($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2L_zero(vecX dst, immI0 zero)
+%{
+  match(Set dst (ReplicateI zero));
+  ins_cost(INSN_COST);
+  format %{ "movi  $dst, $zero\t# vector(4I)" %}
+  ins_encode %{
+    __ eor(as_FloatRegister($dst$$reg), __ T16B,
+           as_FloatRegister($dst$$reg),
+           as_FloatRegister($dst$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate4F(vecX dst, vRegF src)
+%{
+  match(Set dst (ReplicateF src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (4F)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T4S,
+           as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct replicate2D(vecX dst, vRegD src)
+%{
+  match(Set dst (ReplicateD src));
+  ins_cost(INSN_COST);
+  format %{ "dup  $dst, $src\t# vector (2D)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T2D,
+           as_FloatRegister($src$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// ====================REDUCTION ARITHMETIC====================================
+
+instruct reduce_add4I(iRegINoSp dst, iRegIorL2I src1, vecX src2, vecX tmp, iRegI tmp2)
+%{
+  match(Set dst (AddReductionVI src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "addv  $tmp, T4S, $src2\n\t"
+            "umov  $tmp2, $tmp, S, 0\n\t"
+            "addw  $dst, $tmp2, $src1\t add reduction4i"
+  %}
+  ins_encode %{
+    __ addv(as_FloatRegister($tmp$$reg), __ T4S,
+            as_FloatRegister($src2$$reg));
+    __ umov($tmp2$$Register, as_FloatRegister($tmp$$reg), __ S, 0);
+    __ addw($dst$$Register, $tmp2$$Register, $src1$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct reduce_mul4I(iRegINoSp dst, iRegIorL2I src1, vecX src2, vecX tmp, iRegI tmp2)
+%{
+  match(Set dst (MulReductionVI src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP tmp2, TEMP dst);
+  format %{ "ins   $tmp, $src2, 0, 1\n\t"
+            "mul   $tmp, $tmp, $src2\n\t"
+            "umov  $tmp2, $tmp, S, 0\n\t"
+            "mul   $dst, $tmp2, $src1\n\t"
+            "umov  $tmp2, $tmp, S, 1\n\t"
+            "mul   $dst, $tmp2, $dst\t mul reduction4i\n\t"
+  %}
+  ins_encode %{
+    __ ins(as_FloatRegister($tmp$$reg), __ D,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ mulv(as_FloatRegister($tmp$$reg), __ T2S,
+           as_FloatRegister($tmp$$reg), as_FloatRegister($src2$$reg));
+    __ umov($tmp2$$Register, as_FloatRegister($tmp$$reg), __ S, 0);
+    __ mul($dst$$Register, $tmp2$$Register, $src1$$Register);
+    __ umov($tmp2$$Register, as_FloatRegister($tmp$$reg), __ S, 1);
+    __ mul($dst$$Register, $tmp2$$Register, $dst$$Register);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct reduce_add4F(vRegF dst, vRegF src1, vecX src2, vecX tmp)
+%{
+  match(Set dst (AddReductionVF src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "fadds $dst, $src1, $src2\n\t"
+            "ins   $tmp, S, $src2, 0, 1\n\t"
+            "fadds $dst, $dst, $tmp\n\t"
+            "ins   $tmp, S, $src2, 0, 2\n\t"
+            "fadds $dst, $dst, $tmp\n\t"
+            "ins   $tmp, S, $src2, 0, 3\n\t"
+            "fadds $dst, $dst, $tmp\t add reduction4f"
+  %}
+  ins_encode %{
+    __ fadds(as_FloatRegister($dst$$reg),
+             as_FloatRegister($src1$$reg), as_FloatRegister($src2$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ fadds(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 2);
+    __ fadds(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 3);
+    __ fadds(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct reduce_mul4F(vRegF dst, vRegF src1, vecX src2, vecX tmp)
+%{
+  match(Set dst (MulReductionVF src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "fmuls $dst, $src1, $src2\n\t"
+            "ins   $tmp, S, $src2, 0, 1\n\t"
+            "fmuls $dst, $dst, $tmp\n\t"
+            "ins   $tmp, S, $src2, 0, 2\n\t"
+            "fmuls $dst, $dst, $tmp\n\t"
+            "ins   $tmp, S, $src2, 0, 3\n\t"
+            "fmuls $dst, $dst, $tmp\t add reduction4f"
+  %}
+  ins_encode %{
+    __ fmuls(as_FloatRegister($dst$$reg),
+             as_FloatRegister($src1$$reg), as_FloatRegister($src2$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ fmuls(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 2);
+    __ fmuls(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ S,
+           as_FloatRegister($src2$$reg), 0, 3);
+    __ fmuls(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct reduce_add2D(vRegD dst, vRegD src1, vecX src2, vecX tmp)
+%{
+  match(Set dst (AddReductionVD src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "faddd $dst, $src1, $src2\n\t"
+            "ins   $tmp, D, $src2, 0, 1\n\t"
+            "faddd $dst, $dst, $tmp\t add reduction2d"
+  %}
+  ins_encode %{
+    __ faddd(as_FloatRegister($dst$$reg),
+             as_FloatRegister($src1$$reg), as_FloatRegister($src2$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ D,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ faddd(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct reduce_mul2D(vRegD dst, vRegD src1, vecX src2, vecX tmp)
+%{
+  match(Set dst (MulReductionVD src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp, TEMP dst);
+  format %{ "fmuld $dst, $src1, $src2\n\t"
+            "ins   $tmp, D, $src2, 0, 1\n\t"
+            "fmuld $dst, $dst, $tmp\t add reduction2d"
+  %}
+  ins_encode %{
+    __ fmuld(as_FloatRegister($dst$$reg),
+             as_FloatRegister($src1$$reg), as_FloatRegister($src2$$reg));
+    __ ins(as_FloatRegister($tmp$$reg), __ D,
+           as_FloatRegister($src2$$reg), 0, 1);
+    __ fmuld(as_FloatRegister($dst$$reg),
+             as_FloatRegister($dst$$reg), as_FloatRegister($tmp$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// ====================VECTOR ARITHMETIC=======================================
+
+// --------------------------------- ADD --------------------------------------
+
+instruct vadd16B(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AddVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd8S(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AddVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (8H)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd4I(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AddVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd2L(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AddVL src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "addv  $dst,$src1,$src2\t# vector (2L)" %}
+  ins_encode %{
+    __ addv(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd4F(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AddVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fadd  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fadd(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vadd2D(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AddVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fadd  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fadd(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- SUB --------------------------------------
+
+instruct vsub16B(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (SubVB src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub8S(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (SubVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (8H)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub4I(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (SubVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub2L(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (SubVL src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "subv  $dst,$src1,$src2\t# vector (2L)" %}
+  ins_encode %{
+    __ subv(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub4F(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (SubVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fsub  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fsub(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsub2D(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (SubVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fsub  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fsub(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- MUL --------------------------------------
+
+instruct vmul8S(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (MulVS src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (8H)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul4I(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (MulVI src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "mulv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ mulv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul4F(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (MulVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fmul  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fmul(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vmul2D(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (MulVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fmul  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fmul(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- DIV --------------------------------------
+
+instruct vdiv4F(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (DivVF src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fdiv  $dst,$src1,$src2\t# vector (4S)" %}
+  ins_encode %{
+    __ fdiv(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vdiv2D(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (DivVD src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "fdiv  $dst,$src1,$src2\t# vector (2D)" %}
+  ins_encode %{
+    __ fdiv(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- AND --------------------------------------
+
+instruct vand16B(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (AndV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "and  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ andr(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- OR ---------------------------------------
+
+instruct vor16B(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (OrV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "orr  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ orr(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// --------------------------------- XOR --------------------------------------
+
+instruct vxor16B(vecX dst, vecX src1, vecX src2)
+%{
+  match(Set dst (XorV src1 src2));
+  ins_cost(INSN_COST);
+  format %{ "xor  $dst,$src1,$src2\t# vector (16B)" %}
+  ins_encode %{
+    __ eor(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src1$$reg),
+            as_FloatRegister($src2$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// ------------------------------ Shift ---------------------------------------
+
+instruct vshiftcntL(vecX dst, iRegIorL2I cnt) %{
+  match(Set dst (LShiftCntV cnt));
+  format %{ "dup  $dst, $cnt\t# shift count (vecX)" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($cnt$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// Right shifts on aarch64 SIMD are implemented as left shift by -ve amount
+instruct vshiftcntR(vecX dst, iRegIorL2I cnt) %{
+  match(Set dst (RShiftCntV cnt));
+  format %{ "dup  $dst, $cnt\t# shift count (vecX)\n\tneg  $dst, $dst\t T16B" %}
+  ins_encode %{
+    __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($cnt$$reg));
+    __ negr(as_FloatRegister($dst$$reg), __ T16B, as_FloatRegister($dst$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll16B(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (LShiftVB src shift));
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (16B)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl16B(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (16B)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll16B_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (LShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra16B_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (RShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) sh = 7;
+    sh = -sh & 7;
+    __ sshr(as_FloatRegister($dst$$reg), __ T16B,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl16B_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (URShiftVB src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 8) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg), -sh & 7);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll8S(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (LShiftVS src shift));
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (8H)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl8S(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (8H)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll8S_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (LShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ shl(as_FloatRegister($dst$$reg), __ T8H,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra8S_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (RShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) sh = 15;
+    sh = -sh & 15;
+    __ sshr(as_FloatRegister($dst$$reg), __ T8H,
+           as_FloatRegister($src$$reg), sh);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl8S_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (URShiftVS src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant & 31;
+    if (sh >= 16) {
+      __ eor(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg),
+             as_FloatRegister($src$$reg));
+    } else {
+      __ ushr(as_FloatRegister($dst$$reg), __ T8H,
+             as_FloatRegister($src$$reg), -sh & 15);
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll4I(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (LShiftVI src shift));
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (4S)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl4I(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (4S)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll4I_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (LShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    __ shl(as_FloatRegister($dst$$reg), __ T4S,
+           as_FloatRegister($src$$reg),
+           (int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra4I_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (RShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    __ sshr(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl4I_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (URShiftVI src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    __ ushr(as_FloatRegister($dst$$reg), __ T4S,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 31);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll2L(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (LShiftVL src shift));
+  match(Set dst (RShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshl  $dst,$src,$shift\t# vector (2D)" %}
+  ins_encode %{
+    __ sshl(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl2L(vecX dst, vecX src, vecX shift) %{
+  match(Set dst (URShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushl  $dst,$src,$shift\t# vector (2D)" %}
+  ins_encode %{
+    __ ushl(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            as_FloatRegister($shift$$reg));
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsll2L_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (LShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "shl    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    __ shl(as_FloatRegister($dst$$reg), __ T2D,
+           as_FloatRegister($src$$reg),
+           (int)$shift$$constant & 63);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsra2L_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (RShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "sshr    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    __ sshr(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 63);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{
+  match(Set dst (URShiftVL src shift));
+  ins_cost(INSN_COST);
+  format %{ "ushr    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    __ ushr(as_FloatRegister($dst$$reg), __ T2D,
+            as_FloatRegister($src$$reg),
+            -(int)$shift$$constant & 63);
+  %}
+  ins_pipe(pipe_class_default);
+%}

 //----------PEEPHOLE RULES-----------------------------------------------------
 // These must follow all instruction definitions as they use the names
--- a/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -466,6 +466,11 @@
     case base_plus_offset:
       {
         unsigned size = i->get(31, 30);
+        if (i->get(26, 26) && i->get(23, 23)) {
+          // SIMD Q Type - Size = 128 bits
+          assert(size == 0, "bad size");
+          size = 0b100;
+        }
         unsigned mask = (1 << size) - 1;
         if (_offset < 0 || _offset & mask)
           {
@@ -1888,9 +1893,18 @@
   };

   enum SIMD_RegVariant {
-       S32, D64, Q128
+       B, H, S, D, Q
   };

+#define INSN(NAME, op)                                            \
+  void NAME(FloatRegister Rt, SIMD_RegVariant T, const Address &adr) {   \
+    ld_st2((Register)Rt, adr, (int)T & 3, op + ((T==Q) ? 0b10:0b00), 1); \
+  }                                                                      \
+
+  INSN(ldr, 1);
+  INSN(str, 0);
+
+#undef INSN

  private:

@@ -1997,27 +2011,87 @@
     rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0);                              \
   }

-  INSN(eor, 0b101110001);
-  INSN(orr, 0b001110101);
+  INSN(eor,  0b101110001);
+  INSN(orr,  0b001110101);
   INSN(andr, 0b001110001);
-  INSN(bic, 0b001110011);
-  INSN(bif, 0b101110111);
-  INSN(bit, 0b101110101);
-  INSN(bsl, 0b101110011);
-  INSN(orn, 0b001110111);
+  INSN(bic,  0b001110011);
+  INSN(bif,  0b101110111);
+  INSN(bit,  0b101110101);
+  INSN(bsl,  0b101110011);
+  INSN(orn,  0b001110111);
+
+#undef INSN
+
+#define INSN(NAME, opc, opc2)                                                                 \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24);                        \
+    f((int)T >> 1, 23, 22), f(1, 21), rf(Vm, 16), f(opc2, 15, 10);                      \
+    rf(Vn, 5), rf(Vd, 0);                                                               \
+  }
+
+  INSN(addv, 0, 0b100001);
+  INSN(subv, 1, 0b100001);
+  INSN(mulv, 0, 0b100111);
+  INSN(sshl, 0, 0b010001);
+  INSN(ushl, 1, 0b010001);

 #undef INSN

-#define INSN(NAME, opc)                                                                 \
-  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+#define INSN(NAME, opc, opc2) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {                   \
     starti;                                                                             \
     f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24);                        \
-    f((int)T >> 1, 23, 22), f(1, 21), rf(Vm, 16), f(0b100001, 15, 10);                  \
+    f((int)T >> 1, 23, 22), f(opc2, 21, 10);                                            \
     rf(Vn, 5), rf(Vd, 0);                                                               \
   }

-  INSN(addv, 0);
-  INSN(subv, 1);
+  INSN(absr,  0, 0b100000101110);
+  INSN(negr,  1, 0b100000101110);
+  INSN(notr,  1, 0b100000010110);
+  INSN(addv,  0, 0b110001101110);
+
+#undef INSN
+
+#define INSN(NAME, op0, cmode0) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, unsigned imm8, unsigned lsl = 0) {   \
+    unsigned cmode = cmode0;                                                           \
+    unsigned op = op0;                                                                 \
+    starti;                                                                            \
+    assert(lsl == 0 ||                                                                 \
+           ((T == T4H || T == T8H) && lsl == 8) ||                                     \
+           ((T == T2S || T == T4S) && ((lsl >> 3) < 4)), "invalid shift");             \
+    cmode |= lsl >> 2;                                                                 \
+    if (T == T4H || T == T8H) cmode |= 0b1000;                                         \
+    if (!(T == T4H || T == T8H || T == T2S || T == T4S)) {                             \
+      assert(op == 0 && cmode0 == 0, "must be MOVI");                                  \
+      cmode = 0b1110;                                                                  \
+      if (T == T1D || T == T2D) op = 1;                                                \
+    }                                                                                  \
+    f(0, 31), f((int)T & 1, 30), f(op, 29), f(0b0111100000, 28, 19);                   \
+    f(imm8 >> 5, 18, 16), f(cmode, 15, 12), f(0x01, 11, 10), f(imm8 & 0b11111, 9, 5);  \
+    rf(Vd, 0);                                                                         \
+  }
+
+  INSN(movi, 0, 0);
+  INSN(orri, 0, 1);
+  INSN(mvni, 1, 0);
+  INSN(bici, 1, 1);
+
+#undef INSN
+
+#define INSN(NAME, op1, op2, op3) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    starti;                                                                             \
+    assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");                    \
+    f(0, 31), f((int)T & 1, 30), f(op1, 29), f(0b01110, 28, 24), f(op2, 23);            \
+    f(T==T2D ? 1:0, 22); f(1, 21), rf(Vm, 16), f(op3, 15, 10), rf(Vn, 5), rf(Vd, 0);    \
+  }
+
+  INSN(fadd, 0, 0, 0b110101);
+  INSN(fdiv, 1, 0, 0b111111);
+  INSN(fmul, 1, 0, 0b110111);
+  INSN(fsub, 0, 1, 0b110101);

 #undef INSN

@@ -2064,19 +2138,40 @@

 #undef INSN

-  void shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+  void ins(FloatRegister Vd, SIMD_RegVariant T, FloatRegister Vn, int didx, int sidx) {
+    starti;
+    assert(T != Q, "invalid register variant");
+    f(0b01101110000, 31, 21), f(((didx<<1)|1)<<(int)T, 20, 16), f(0, 15);
+    f(sidx<<(int)T, 14, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
+  void umov(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) {
     starti;
-    /* The encodings for the immh:immb fields (bits 22:16) are
-     *   0001 xxx       8B/16B, shift = xxx
-     *   001x xxx       4H/8H,  shift = xxxx
-     *   01xx xxx       2S/4S,  shift = xxxxx
-     *   1xxx xxx       1D/2D,  shift = xxxxxx (1D is RESERVED)
-     */
-    assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");
-    f(0, 31), f(T & 1, 30), f(0b0011110, 29, 23), f((1 << ((T>>1)+3))|shift, 22, 16);
-    f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
+    f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21);
+    f(((idx<<1)|1)<<(int)T, 20, 16), f(0b001111, 15, 10);
+    rf(Vn, 5), rf(Rd, 0);
   }

+#define INSN(NAME, opc, opc2) \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int shift){         \
+    starti;                                                                             \
+    /* The encodings for the immh:immb fields (bits 22:16) are                          \
+     *   0001 xxx       8B/16B, shift = xxx                                             \
+     *   001x xxx       4H/8H,  shift = xxxx                                            \
+     *   01xx xxx       2S/4S,  shift = xxxxx                                           \
+     *   1xxx xxx       1D/2D,  shift = xxxxxx (1D is RESERVED)                         \
+     */                                                                                 \
+    assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");                           \
+    f(0, 31), f(T & 1, 30), f(opc, 29), f(0b011110, 28, 23),                            \
+    f((1 << ((T>>1)+3))|shift, 22, 16); f(opc2, 15, 10), rf(Vn, 5), rf(Vd, 0);          \
+  }
+
+  INSN(shl,  0, 0b010101);
+  INSN(sshr, 0, 0b000001);
+  INSN(ushr, 1, 0b000001);
+
+#undef INSN
+
   void ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
     starti;
     /* The encodings for the immh:immb fields (bits 22:16) are
@@ -2149,6 +2244,23 @@
     rf(Vn, 5), rf(Vd, 0);
   }

+  void dup(FloatRegister Vd, SIMD_Arrangement T, Register Xs)
+  {
+    starti;
+    assert(T != T1D, "reserved encoding");
+    f(0,31), f((int)T & 1, 30), f(0b001110000, 29, 21);
+    f((1 << (T >> 1)), 20, 16), f(0b000011, 15, 10), rf(Xs, 5), rf(Vd, 0);
+  }
+
+  void dup(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int index = 0)
+  {
+    starti;
+    assert(T != T1D, "reserved encoding");
+    f(0, 31), f((int)T & 1, 30), f(0b001110000, 29, 21);
+    f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+    f(0b000001, 15, 10), rf(Vn, 5), rf(Vd, 0);
+  }
+
   // CRC32 instructions
 #define INSN(NAME, sf, sz)                                                \
   void NAME(Register Rd, Register Rn, Register Rm) {                      \
--- a/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -64,7 +64,7 @@
 define_pd_global(intx, PreInflateSpin,           10);

 define_pd_global(bool, RewriteBytecodes,     true);
-define_pd_global(bool, RewriteFrequentPairs, false);
+define_pd_global(bool, RewriteFrequentPairs, true);

 define_pd_global(bool, UseMembar,            true);
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -2802,8 +2802,8 @@
       uzp2(v21, v20, v16, T2D);
       eor(v20, T16B, v17, v21);

-      shl(v16, v28, T2D, 1);
-      shl(v17, v20, T2D, 1);
+      shl(v16, T2D, v28, 1);
+      shl(v17, T2D, v20, 1);

       eor(v0, T16B, v0, v16);
       eor(v1, T16B, v1, v17);
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -37,6 +37,7 @@
   friend class LIR_Assembler;

   using Assembler::mov;
+  using Assembler::movi;

  protected:

@@ -464,6 +465,45 @@

   void movptr(Register r, uintptr_t imm64);

+  // Macro to mov replicated immediate to vector register.
+  // Where imm32 == hex abcdefgh, Vd will get the following values
+  // for different arrangements in T
+  //   T8B:  Vd = ghghghghghghghgh
+  //   T16B: Vd = ghghghghghghghghghghghghghghghgh
+  //   T4H:  Vd = efghefghefghefgh
+  //   T8H:  Vd = efghefghefghefghefghefghefghefgh
+  //   T2S:  Vd = abcdefghabcdefgh
+  //   T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
+  //   T1D/T2D: invalid
+  void mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
+    assert(T != T1D && T != T2D, "invalid arrangement");
+    u_int32_t nimm32 = ~imm32;
+    if (T == T8B || T == T16B) { imm32 &= 0xff; nimm32 &= 0xff; }
+    if (T == T4H || T == T8H) { imm32 &= 0xffff; nimm32 &= 0xffff; }
+    u_int32_t x = imm32;
+    int movi_cnt = 0;
+    int movn_cnt = 0;
+    while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
+    x = nimm32;
+    while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
+    if (movn_cnt < movi_cnt) imm32 = nimm32;
+    unsigned lsl = 0;
+    while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
+    if (movn_cnt < movi_cnt)
+      mvni(Vd, T, imm32 & 0xff, lsl);
+    else
+      movi(Vd, T, imm32 & 0xff, lsl);
+    imm32 >>= 8; lsl += 8;
+    while (imm32) {
+      while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
+      if (movn_cnt < movi_cnt)
+        bici(Vd, T, imm32 & 0xff, lsl);
+      else
+        orri(Vd, T, imm32 & 0xff, lsl);
+      lsl += 8; imm32 >>= 8;
+    }
+  }
+
   // macro instructions for accessing and updating floating point
   // status register
   //
--- a/src/cpu/aarch64/vm/register_aarch64.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/register_aarch64.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -186,7 +186,7 @@
   // it's optoregs.

     number_of_registers = (2 * RegisterImpl::number_of_registers +
-                           2 * FloatRegisterImpl::number_of_registers +
+                           4 * FloatRegisterImpl::number_of_registers +
                            1) // flags
   };
--- a/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/cpu/aarch64/vm/templateTable_aarch64.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -513,23 +513,61 @@
 void TemplateTable::iload_internal(RewriteControl rc) {
   transition(vtos, itos);
   if (RewriteFrequentPairs && rc == may_rewrite) {
-    // TODO : check x86 code for what to do here
-    __ call_Unimplemented();
-  } else {
-    locals_index(r1);
-    __ ldr(r0, iaddress(r1));
+    Label rewrite, done;
+    Register bc = r4;
+
+    // get next bytecode
+    __ load_unsigned_byte(r1, at_bcp(Bytecodes::length_for(Bytecodes::_iload)));
+
+    // if _iload, wait to rewrite to iload2.  We only want to rewrite the
+    // last two iloads in a pair.  Comparing against fast_iload means that
+    // the next bytecode is neither an iload or a caload, and therefore
+    // an iload pair.
+    __ cmpw(r1, Bytecodes::_iload);
+    __ br(Assembler::EQ, done);
+
+    // if _fast_iload rewrite to _fast_iload2
+    __ cmpw(r1, Bytecodes::_fast_iload);
+    __ movw(bc, Bytecodes::_fast_iload2);
+    __ br(Assembler::EQ, rewrite);
+
+    // if _caload rewrite to _fast_icaload
+    __ cmpw(r1, Bytecodes::_caload);
+    __ movw(bc, Bytecodes::_fast_icaload);
+    __ br(Assembler::EQ, rewrite);
+
+    // else rewrite to _fast_iload
+    __ movw(bc, Bytecodes::_fast_iload);
+
+    // rewrite
+    // bc: new bytecode
+    __ bind(rewrite);
+    patch_bytecode(Bytecodes::_iload, bc, r1, false);
+    __ bind(done);
+
   }

+  // do iload, get the local value into tos
+  locals_index(r1);
+  __ ldr(r0, iaddress(r1));
+
 }

 void TemplateTable::fast_iload2()
 {
-  __ call_Unimplemented();
+  transition(vtos, itos);
+  locals_index(r1);
+  __ ldr(r0, iaddress(r1));
+  __ push(itos);
+  locals_index(r1, 3);
+  __ ldr(r0, iaddress(r1));
 }

 void TemplateTable::fast_iload()
 {
-  __ call_Unimplemented();
+  transition(vtos, itos);
+  locals_index(r1);
+  __ ldr(r0, iaddress(r1));
 }

 void TemplateTable::lload()
@@ -721,7 +759,18 @@
 // iload followed by caload frequent pair
 void TemplateTable::fast_icaload()
 {
-  __ call_Unimplemented();
+  transition(vtos, itos);
+  // load index out of locals
+  locals_index(r2);
+  __ ldr(r1, iaddress(r2));
+
+  __ pop_ptr(r0);
+
+  // r0: array
+  // r1: index
+  index_check(r0, r1); // leaves index in r1, kills rscratch1
+  __ lea(r1,  Address(r0, r1, Address::uxtw(1)));
+  __ load_unsigned_short(r0, Address(r1,  arrayOopDesc::base_offset_in_bytes(T_CHAR)));
 }

 void TemplateTable::saload()
@@ -797,7 +846,47 @@
   // These bytecodes with a small amount of code are most profitable
   // to rewrite
   if (RewriteFrequentPairs && rc == may_rewrite) {
-    __ call_Unimplemented();
+    Label rewrite, done;
+    const Register bc = r4;
+
+    // get next bytecode
+    __ load_unsigned_byte(r1, at_bcp(Bytecodes::length_for(Bytecodes::_aload_0)));
+
+    // do actual aload_0
+    aload(0);
+
+    // if _getfield then wait with rewrite
+    __ cmpw(r1, Bytecodes::Bytecodes::_getfield);
+    __ br(Assembler::EQ, done);
+
+    // if _igetfield then reqrite to _fast_iaccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_iaccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmpw(r1, Bytecodes::_fast_igetfield);
+    __ movw(bc, Bytecodes::_fast_iaccess_0);
+    __ br(Assembler::EQ, rewrite);
+
+    // if _agetfield then reqrite to _fast_aaccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_aaccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmpw(r1, Bytecodes::_fast_agetfield);
+    __ movw(bc, Bytecodes::_fast_aaccess_0);
+    __ br(Assembler::EQ, rewrite);
+
+    // if _fgetfield then reqrite to _fast_faccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_faccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmpw(r1, Bytecodes::_fast_fgetfield);
+    __ movw(bc, Bytecodes::_fast_faccess_0);
+    __ br(Assembler::EQ, rewrite);
+
+    // else rewrite to _fast_aload0
+    assert(Bytecodes::java_code(Bytecodes::_fast_aload_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ movw(bc, Bytecodes::Bytecodes::_fast_aload_0);
+
+    // rewrite
+    // bc: new bytecode
+    __ bind(rewrite);
+    patch_bytecode(Bytecodes::_aload_0, bc, r1, false);
+
+    __ bind(done);
   } else {
     aload(0);
   }
--- a/src/os/windows/vm/os_windows.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/os/windows/vm/os_windows.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -3768,7 +3768,7 @@
   return NULL;
 }

-#define EXIT_TIMEOUT     PRODUCT_ONLY(1000) NOT_PRODUCT(4000) /* 1 sec in product, 4 sec in debug */
+#define EXIT_TIMEOUT 300000 /* 5 minutes */

 static BOOL CALLBACK init_crit_sect_call(PINIT_ONCE, PVOID pcrit_sect, PVOID*) {
   InitializeCriticalSection((CRITICAL_SECTION*)pcrit_sect);
--- a/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -1469,7 +1469,9 @@
   } else {
     guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1,
               "Assumption");
-    flag_type = T_BYTE;
+    // Use unsigned type T_BOOLEAN here rather than signed T_BYTE since some platforms, eg. ARM,
+    // need to use unsigned instructions to use the large offset to load the satb_mark_queue.
+    flag_type = T_BOOLEAN;
   }
   LIR_Opr thrd = getThreadPointer();
   LIR_Address* mark_active_flag_addr =
--- a/src/share/vm/gc/cms/cmsOopClosures.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/cmsOopClosures.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -26,6 +26,7 @@
 #define SHARE_VM_GC_CMS_CMSOOPCLOSURES_HPP

 #include "gc/shared/genOopClosures.hpp"
+#include "gc/shared/taskqueue.hpp"
 #include "memory/iterator.hpp"

 /////////////////////////////////////////////////////////////////
--- a/src/share/vm/gc/cms/compactibleFreeListSpace.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/compactibleFreeListSpace.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -641,6 +641,7 @@
 class FreeListSpace_DCTOC : public Filtering_DCTOC {
   CompactibleFreeListSpace* _cfls;
   CMSCollector* _collector;
+  bool _parallel;
 protected:
   // Override.
 #define walk_mem_region_with_cl_DECL(ClosureType)                       \
@@ -661,9 +662,10 @@
                       CMSCollector* collector,
                       ExtendedOopClosure* cl,
                       CardTableModRefBS::PrecisionStyle precision,
-                      HeapWord* boundary) :
+                      HeapWord* boundary,
+                      bool parallel) :
     Filtering_DCTOC(sp, cl, precision, boundary),
-    _cfls(sp), _collector(collector) {}
+    _cfls(sp), _collector(collector), _parallel(parallel) {}
 };

 // We de-virtualize the block-related calls below, since we know that our
@@ -674,10 +676,7 @@
                                                  HeapWord* bottom,              \
                                                  HeapWord* top,                 \
                                                  ClosureType* cl) {             \
-   bool is_par = GenCollectedHeap::heap()->n_par_threads() > 0;                 \
-   if (is_par) {                                                                \
-     assert(GenCollectedHeap::heap()->n_par_threads() ==                        \
-            GenCollectedHeap::heap()->workers()->active_workers(), "Mismatch"); \
+   if (_parallel) {                                                             \
      walk_mem_region_with_cl_par(mr, bottom, top, cl);                          \
    } else {                                                                     \
      walk_mem_region_with_cl_nopar(mr, bottom, top, cl);                        \
@@ -747,8 +746,9 @@
 DirtyCardToOopClosure*
 CompactibleFreeListSpace::new_dcto_cl(ExtendedOopClosure* cl,
                                       CardTableModRefBS::PrecisionStyle precision,
-                                      HeapWord* boundary) {
-  return new FreeListSpace_DCTOC(this, _collector, cl, precision, boundary);
+                                      HeapWord* boundary,
+                                      bool parallel) {
+  return new FreeListSpace_DCTOC(this, _collector, cl, precision, boundary, parallel);
 }


@@ -1897,11 +1897,9 @@
   assert(chunk->is_free() && ffc->is_free(), "Error");
   _bt.split_block((HeapWord*)chunk, chunk->size(), new_size);
   if (rem_sz < SmallForDictionary) {
-    bool is_par = (GenCollectedHeap::heap()->n_par_threads() > 0);
+    // The freeList lock is held, but multiple GC task threads might be executing in parallel.
+    bool is_par = Thread::current()->is_GC_task_thread();
     if (is_par) _indexedFreeListParLocks[rem_sz]->lock();
-    assert(!is_par ||
-           (GenCollectedHeap::heap()->n_par_threads() ==
-            GenCollectedHeap::heap()->workers()->active_workers()), "Mismatch");
     returnChunkToFreeList(ffc);
     split(size, rem_sz);
     if (is_par) _indexedFreeListParLocks[rem_sz]->unlock();
@@ -1972,8 +1970,6 @@

 bool CompactibleFreeListSpace::no_allocs_since_save_marks() {
   assert(_promoInfo.tracking(), "No preceding save_marks?");
-  assert(GenCollectedHeap::heap()->n_par_threads() == 0,
-         "Shouldn't be called if using parallel gc.");
   return _promoInfo.noPromotions();
 }

@@ -1981,8 +1977,6 @@
                                                                             \
 void CompactibleFreeListSpace::                                             \
 oop_since_save_marks_iterate##nv_suffix(OopClosureType* blk) {              \
-  assert(GenCollectedHeap::heap()->n_par_threads() == 0,                    \
-         "Shouldn't be called (yet) during parallel part of gc.");          \
   _promoInfo.promoted_oops_iterate##nv_suffix(blk);                         \
   /*                                                                        \
    * This also restores any displaced headers and removes the elements from \
--- a/src/share/vm/gc/cms/compactibleFreeListSpace.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/compactibleFreeListSpace.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -438,7 +438,8 @@
   // Override: provides a DCTO_CL specific to this kind of space.
   DirtyCardToOopClosure* new_dcto_cl(ExtendedOopClosure* cl,
                                      CardTableModRefBS::PrecisionStyle precision,
-                                     HeapWord* boundary);
+                                     HeapWord* boundary,
+                                     bool parallel);

   void blk_iterate(BlkClosure* cl);
   void blk_iterate_careful(BlkClosureCareful* cl);
--- a/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -2428,14 +2428,18 @@
   MarkRefsIntoClosure notOlder(_span, verification_mark_bm());
   gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.

-  gch->gen_process_roots(_cmsGen->level(),
-                         true,   // younger gens are roots
-                         true,   // activate StrongRootsScope
-                         GenCollectedHeap::ScanningOption(roots_scanning_options()),
-                         should_unload_classes(),
-                         &notOlder,
-                         NULL,
-                         NULL);  // SSS: Provide correct closure
+  {
+    StrongRootsScope srs(1);
+
+    gch->gen_process_roots(&srs,
+                           _cmsGen->level(),
+                           true,   // younger gens are roots
+                           GenCollectedHeap::ScanningOption(roots_scanning_options()),
+                           should_unload_classes(),
+                           &notOlder,
+                           NULL,
+                           NULL);
+  }

   // Now mark from the roots
   MarkFromRootsClosure markFromRootsClosure(this, _span,
@@ -2496,14 +2500,18 @@

   gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.

-  gch->gen_process_roots(_cmsGen->level(),
-                         true,   // younger gens are roots
-                         true,   // activate StrongRootsScope
-                         GenCollectedHeap::ScanningOption(roots_scanning_options()),
-                         should_unload_classes(),
-                         &notOlder,
-                         NULL,
-                         &cld_closure);
+  {
+    StrongRootsScope srs(1);
+
+    gch->gen_process_roots(&srs,
+                           _cmsGen->level(),
+                           true,   // younger gens are roots
+                           GenCollectedHeap::ScanningOption(roots_scanning_options()),
+                           should_unload_classes(),
+                           &notOlder,
+                           NULL,
+                           &cld_closure);
+  }

   // Now mark from the roots
   MarkFromRootsVerifyClosure markFromRootsClosure(this, _span,
@@ -2913,10 +2921,11 @@

 // Parallel initial mark task
 class CMSParInitialMarkTask: public CMSParMarkTask {
+  StrongRootsScope* _strong_roots_scope;
  public:
-  CMSParInitialMarkTask(CMSCollector* collector, uint n_workers) :
-      CMSParMarkTask("Scan roots and young gen for initial mark in parallel",
-                     collector, n_workers) {}
+  CMSParInitialMarkTask(CMSCollector* collector, StrongRootsScope* strong_roots_scope, uint n_workers) :
+      CMSParMarkTask("Scan roots and young gen for initial mark in parallel", collector, n_workers),
+      _strong_roots_scope(strong_roots_scope) {}
   void work(uint worker_id);
 };

@@ -3004,24 +3013,26 @@
       FlexibleWorkGang* workers = gch->workers();
       assert(workers != NULL, "Need parallel worker threads.");
       uint n_workers = workers->active_workers();
-      CMSParInitialMarkTask tsk(this, n_workers);
-      gch->set_par_threads(n_workers);
+
+      StrongRootsScope srs(n_workers);
+
+      CMSParInitialMarkTask tsk(this, &srs, n_workers);
       initialize_sequential_subtasks_for_young_gen_rescan(n_workers);
       if (n_workers > 1) {
-        StrongRootsScope srs;
         workers->run_task(&tsk);
       } else {
-        StrongRootsScope srs;
         tsk.work(0);
       }
-      gch->set_par_threads(0);
     } else {
       // The serial version.
       CLDToOopClosure cld_closure(&notOlder, true);
       gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.
-      gch->gen_process_roots(_cmsGen->level(),
+
+      StrongRootsScope srs(1);
+
+      gch->gen_process_roots(&srs,
+                             _cmsGen->level(),
                              true,   // younger gens are roots
-                             true,   // activate StrongRootsScope
                              GenCollectedHeap::ScanningOption(roots_scanning_options()),
                              should_unload_classes(),
                              &notOlder,
@@ -4452,9 +4463,9 @@

   CLDToOopClosure cld_closure(&par_mri_cl, true);

-  gch->gen_process_roots(_collector->_cmsGen->level(),
+  gch->gen_process_roots(_strong_roots_scope,
+                         _collector->_cmsGen->level(),
                          false,     // yg was scanned above
-                         false,     // this is parallel code
                          GenCollectedHeap::ScanningOption(_collector->CMSCollector::roots_scanning_options()),
                          _collector->should_unload_classes(),
                          &par_mri_cl,
@@ -4478,6 +4489,7 @@
   // The per-thread work queues, available here for stealing.
   OopTaskQueueSet*       _task_queues;
   ParallelTaskTerminator _term;
+  StrongRootsScope*      _strong_roots_scope;

  public:
   // A value of 0 passed to n_workers will cause the number of
@@ -4485,12 +4497,14 @@
   CMSParRemarkTask(CMSCollector* collector,
                    CompactibleFreeListSpace* cms_space,
                    uint n_workers, FlexibleWorkGang* workers,
-                   OopTaskQueueSet* task_queues):
+                   OopTaskQueueSet* task_queues,
+                   StrongRootsScope* strong_roots_scope):
     CMSParMarkTask("Rescan roots and grey objects in parallel",
                    collector, n_workers),
     _cms_space(cms_space),
     _task_queues(task_queues),
-    _term(n_workers, task_queues) { }
+    _term(n_workers, task_queues),
+    _strong_roots_scope(strong_roots_scope) { }

   OopTaskQueueSet* task_queues() { return _task_queues; }

@@ -4588,9 +4602,9 @@
   // ---------- remaining roots --------------
   _timer.reset();
   _timer.start();
-  gch->gen_process_roots(_collector->_cmsGen->level(),
+  gch->gen_process_roots(_strong_roots_scope,
+                         _collector->_cmsGen->level(),
                          false,     // yg was scanned above
-                         false,     // this is parallel code
                          GenCollectedHeap::ScanningOption(_collector->CMSCollector::roots_scanning_options()),
                          _collector->should_unload_classes(),
                          &par_mrias_cl,
@@ -5058,22 +5072,15 @@
   FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   // Choose to use the number of GC workers most recently set
-  // into "active_workers".  If active_workers is not set, set it
-  // to ParallelGCThreads.
+  // into "active_workers".
   uint n_workers = workers->active_workers();
-  if (n_workers == 0) {
-    assert(n_workers > 0, "Should have been set during scavenge");
-    n_workers = ParallelGCThreads;
-    workers->set_active_workers(n_workers);
-  }
+
   CompactibleFreeListSpace* cms_space  = _cmsGen->cmsSpace();

-  CMSParRemarkTask tsk(this,
-    cms_space,
-    n_workers, workers, task_queues());
-
-  // Set up for parallel process_roots work.
-  gch->set_par_threads(n_workers);
+  StrongRootsScope srs(n_workers);
+
+  CMSParRemarkTask tsk(this, cms_space, n_workers, workers, task_queues(), &srs);
+
   // We won't be iterating over the cards in the card table updating
   // the younger_gen cards, so we shouldn't call the following else
   // the verification code as well as subsequent younger_refs_iterate
@@ -5105,15 +5112,12 @@
     // necessarily be so, since it's possible that we are doing
     // ST marking.
     ReferenceProcessorMTDiscoveryMutator mt(ref_processor(), true);
-    StrongRootsScope srs;
     workers->run_task(&tsk);
   } else {
     ReferenceProcessorMTDiscoveryMutator mt(ref_processor(), false);
-    StrongRootsScope srs;
     tsk.work(0);
   }

-  gch->set_par_threads(0);  // 0 ==> non-parallel.
   // restore, single-threaded for now, any preserved marks
   // as a result of work_q overflow
   restore_preserved_marks_if_any();
@@ -5177,11 +5181,11 @@
     verify_work_stacks_empty();

     gch->rem_set()->prepare_for_younger_refs_iterate(false); // Not parallel.
-    StrongRootsScope srs;
-
-    gch->gen_process_roots(_cmsGen->level(),
+    StrongRootsScope srs(1);
+
+    gch->gen_process_roots(&srs,
+                           _cmsGen->level(),
                            true,  // younger gens as roots
-                           false, // use the local StrongRootsScope
                            GenCollectedHeap::ScanningOption(roots_scanning_options()),
                            should_unload_classes(),
                            &mrias_cl,
@@ -5254,18 +5258,14 @@
                       CMSBitMap*       mark_bit_map,
                       AbstractWorkGang* workers,
                       OopTaskQueueSet* task_queues):
-    // XXX Should superclass AGTWOQ also know about AWG since it knows
-    // about the task_queues used by the AWG? Then it could initialize
-    // the terminator() object. See 6984287. The set_for_termination()
-    // below is a temporary band-aid for the regression in 6984287.
     AbstractGangTaskWOopQueues("Process referents by policy in parallel",
-      task_queues),
+      task_queues,
+      workers->active_workers()),
     _task(task),
     _collector(collector), _span(span), _mark_bit_map(mark_bit_map)
   {
     assert(_collector->_span.equals(_span) && !_span.is_empty(),
            "Inconsistency in _span");
-    set_for_termination(workers->active_workers());
   }

   OopTaskQueueSet* task_queues() { return queues(); }
--- a/src/share/vm/gc/cms/parCardTableModRefBS.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/parCardTableModRefBS.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -39,16 +39,11 @@
 void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
                                                              OopsInGenClosure* cl,
                                                              CardTableRS* ct,
-                                                             int n_threads) {
-  assert(n_threads > 0, "Error: expected n_threads > 0");
-  assert((n_threads == 1 && ParallelGCThreads == 0) ||
-         n_threads <= (int)ParallelGCThreads,
-         "# worker threads != # requested!");
-  assert(!Thread::current()->is_VM_thread() || (n_threads == 1), "There is only 1 VM thread");
-  assert(UseDynamicNumberOfGCThreads ||
-         !FLAG_IS_DEFAULT(ParallelGCThreads) ||
-         n_threads == (int)ParallelGCThreads,
-         "# worker threads != # requested!");
+                                                             uint n_threads) {
+  assert(n_threads > 0, "expected n_threads > 0");
+  assert(n_threads <= ParallelGCThreads,
+         err_msg("n_threads: %u > ParallelGCThreads: " UINTX_FORMAT, n_threads, ParallelGCThreads));
+
   // Make sure the LNC array is valid for the space.
   jbyte**   lowest_non_clean;
   uintptr_t lowest_non_clean_base_chunk_index;
@@ -66,7 +61,8 @@

   uint stride = 0;
   while (!pst->is_task_claimed(/* reference */ stride)) {
-    process_stride(sp, mr, stride, n_strides, cl, ct,
+    process_stride(sp, mr, stride, n_strides,
+                   cl, ct,
                    lowest_non_clean,
                    lowest_non_clean_base_chunk_index,
                    lowest_non_clean_chunk_size);
@@ -132,9 +128,13 @@
     assert(chunk_mr.word_size() > 0, "[chunk_card_start > used_end)");
     assert(used.contains(chunk_mr), "chunk_mr should be subset of used");

+    // This function is used by the parallel card table iteration.
+    const bool parallel = true;
+
     DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(),
-                                                     cl->gen_boundary());
-    ClearNoncleanCardWrapper clear_cl(dcto_cl, ct);
+                                                     cl->gen_boundary(),
+                                                     parallel);
+    ClearNoncleanCardWrapper clear_cl(dcto_cl, ct, parallel);


     // Process the chunk.
--- a/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/parNewGeneration.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -567,23 +567,15 @@
 }

 ParNewGenTask::ParNewGenTask(ParNewGeneration* gen, Generation* old_gen,
-                             HeapWord* young_old_boundary, ParScanThreadStateSet* state_set) :
+                             HeapWord* young_old_boundary, ParScanThreadStateSet* state_set,
+                             StrongRootsScope* strong_roots_scope) :
     AbstractGangTask("ParNewGeneration collection"),
     _gen(gen), _old_gen(old_gen),
     _young_old_boundary(young_old_boundary),
-    _state_set(state_set)
+    _state_set(state_set),
+    _strong_roots_scope(strong_roots_scope)
   {}

-// Reset the terminator for the given number of
-// active threads.
-void ParNewGenTask::set_for_termination(uint active_workers) {
-  _state_set->reset(active_workers, _gen->promotion_failed());
-  // Should the heap be passed in?  There's only 1 for now so
-  // grab it instead.
-  GenCollectedHeap* gch = GenCollectedHeap::heap();
-  gch->set_n_termination(active_workers);
-}
-
 void ParNewGenTask::work(uint worker_id) {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
   // Since this is being done in a separate thread, need new resource
@@ -603,10 +595,10 @@
                                            false);

   par_scan_state.start_strong_roots();
-  gch->gen_process_roots(_gen->level(),
+  gch->gen_process_roots(_strong_roots_scope,
+                         _gen->level(),
                          true,  // Process younger gens, if any,
                                 // as strong roots.
-                         false, // no scope; this is parallel code
                          GenCollectedHeap::SO_ScavengeCodeCache,
                          GenCollectedHeap::StrongAndWeakRoots,
                          &par_scan_state.to_space_root_closure(),
@@ -759,9 +751,6 @@

 private:
   virtual void work(uint worker_id);
-  virtual void set_for_termination(uint active_workers) {
-    _state_set.terminator()->reset_for_reuse(active_workers);
-  }
 private:
   ParNewGeneration&      _gen;
   ProcessTask&           _task;
@@ -838,7 +827,6 @@
 {
   _state_set.flush();
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  gch->set_par_threads(0);  // 0 ==> non-parallel.
   gch->save_marks();
 }

@@ -939,33 +927,35 @@
   to()->clear(SpaceDecorator::Mangle);

   gch->save_marks();
-  assert(workers != NULL, "Need parallel worker threads.");
-  uint n_workers = active_workers;

   // Set the correct parallelism (number of queues) in the reference processor
-  ref_processor()->set_active_mt_degree(n_workers);
+  ref_processor()->set_active_mt_degree(active_workers);

   // Always set the terminator for the active number of workers
   // because only those workers go through the termination protocol.
-  ParallelTaskTerminator _term(n_workers, task_queues());
-  ParScanThreadStateSet thread_state_set(workers->active_workers(),
+  ParallelTaskTerminator _term(active_workers, task_queues());
+  ParScanThreadStateSet thread_state_set(active_workers,
                                          *to(), *this, *_old_gen, *task_queues(),
                                          _overflow_stacks, desired_plab_sz(), _term);

-  ParNewGenTask tsk(this, _old_gen, reserved().end(), &thread_state_set);
-  gch->set_par_threads(n_workers);
-  gch->rem_set()->prepare_for_younger_refs_iterate(true);
-  // It turns out that even when we're using 1 thread, doing the work in a
-  // separate thread causes wide variance in run times.  We can't help this
-  // in the multi-threaded case, but we special-case n=1 here to get
-  // repeatable measurements of the 1-thread overhead of the parallel code.
-  if (n_workers > 1) {
-    StrongRootsScope srs;
-    workers->run_task(&tsk);
-  } else {
-    StrongRootsScope srs;
-    tsk.work(0);
+  thread_state_set.reset(active_workers, promotion_failed());
+
+  {
+    StrongRootsScope srs(active_workers);
+
+    ParNewGenTask tsk(this, _old_gen, reserved().end(), &thread_state_set, &srs);
+    gch->rem_set()->prepare_for_younger_refs_iterate(true);
+    // It turns out that even when we're using 1 thread, doing the work in a
+    // separate thread causes wide variance in run times.  We can't help this
+    // in the multi-threaded case, but we special-case n=1 here to get
+    // repeatable measurements of the 1-thread overhead of the parallel code.
+    if (active_workers > 1) {
+      workers->run_task(&tsk);
+    } else {
+      tsk.work(0);
+    }
   }
+
   thread_state_set.reset(0 /* Bad value in debug if not reset */,
                          promotion_failed());

@@ -995,7 +985,6 @@
                                               _gc_timer, _gc_tracer.gc_id());
   } else {
     thread_state_set.flush();
-    gch->set_par_threads(0);  // 0 ==> non-parallel.
     gch->save_marks();
     stats = rp->process_discovered_references(&is_alive, &keep_alive,
                                               &evacuate_followers, NULL,
@@ -1033,7 +1022,7 @@
   to()->set_concurrent_iteration_safe_limit(to()->top());

   if (ResizePLAB) {
-    plab_stats()->adjust_desired_plab_sz(n_workers);
+    plab_stats()->adjust_desired_plab_sz(active_workers);
   }

   if (PrintGC && !PrintGCDetails) {
@@ -1477,9 +1466,9 @@
     _ref_processor =
       new ReferenceProcessor(_reserved,                  // span
                              ParallelRefProcEnabled && (ParallelGCThreads > 1), // mt processing
-                             (int) ParallelGCThreads,    // mt processing degree
+                             (uint) ParallelGCThreads,   // mt processing degree
                              refs_discovery_is_mt(),     // mt discovery
-                             (int) ParallelGCThreads,    // mt discovery degree
+                             (uint) ParallelGCThreads,   // mt discovery degree
                              refs_discovery_is_atomic(), // atomic_discovery
                              NULL);                      // is_alive_non_header
   }
--- a/src/share/vm/gc/cms/parNewGeneration.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/parNewGeneration.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -39,6 +39,7 @@
 class ParRootScanWithoutBarrierClosure;
 class ParRootScanWithBarrierTwoGensClosure;
 class ParEvacuateFollowersClosure;
+class StrongRootsScope;

 // It would be better if these types could be kept local to the .cpp file,
 // but they must be here to allow ParScanClosure::do_oop_work to be defined
@@ -237,20 +238,18 @@
   Generation*                  _old_gen;
   HeapWord*                    _young_old_boundary;
   class ParScanThreadStateSet* _state_set;
+  StrongRootsScope*            _strong_roots_scope;

 public:
   ParNewGenTask(ParNewGeneration*      gen,
                 Generation*            old_gen,
                 HeapWord*              young_old_boundary,
-                ParScanThreadStateSet* state_set);
+                ParScanThreadStateSet* state_set,
+                StrongRootsScope*      strong_roots_scope);

   HeapWord* young_old_boundary() { return _young_old_boundary; }

   void work(uint worker_id);
-
-  // Reset the terminator in ParScanThreadStateSet for
-  // "active_workers" threads.
-  virtual void set_for_termination(uint active_workers);
 };

 class KeepAliveClosure: public DefNewGeneration::KeepAliveClosure {
--- a/src/share/vm/gc/cms/parOopClosures.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/parOopClosures.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -26,6 +26,7 @@
 #define SHARE_VM_GC_CMS_PAROOPCLOSURES_HPP

 #include "gc/shared/genOopClosures.hpp"
+#include "gc/shared/taskqueue.hpp"
 #include "memory/padded.hpp"

 // Closures for ParNewGeneration
--- a/src/share/vm/gc/cms/yieldingWorkgroup.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/cms/yieldingWorkgroup.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -147,6 +147,13 @@
   bool completed() const { return _status == COMPLETED; }
   bool aborted()   const { return _status == ABORTED; }
   bool active()    const { return _status == ACTIVE; }
+
+  // This method configures the task for proper termination.
+  // Some tasks do not have any requirements on termination
+  // and may inherit this method that does nothing.  Some
+  // tasks do some coordination on termination and override
+  // this method to implement that coordination.
+  virtual void set_for_termination(uint active_workers) {}
 };
 // Class YieldingWorkGang: A subclass of WorkGang.
 // In particular, a YieldingWorkGang is made up of
--- a/src/share/vm/gc/g1/collectionSetChooser.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/collectionSetChooser.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -158,20 +158,10 @@
   hr->calc_gc_efficiency();
 }

-void CollectionSetChooser::prepare_for_par_region_addition(uint n_regions,
+void CollectionSetChooser::prepare_for_par_region_addition(uint n_threads,
+                                                           uint n_regions,
                                                            uint chunk_size) {
   _first_par_unreserved_idx = 0;
-  uint n_threads = (uint) ParallelGCThreads;
-  if (UseDynamicNumberOfGCThreads) {
-    assert(G1CollectedHeap::heap()->workers()->active_workers() > 0,
-      "Should have been set earlier");
-    // This is defensive code. As the assertion above says, the number
-    // of active threads should be > 0, but in case there is some path
-    // or some improperly initialized variable with leads to no
-    // active threads, protect against that in a product build.
-    n_threads = MAX2(G1CollectedHeap::heap()->workers()->active_workers(),
-                     1U);
-  }
   uint max_waste = n_threads * chunk_size;
   // it should be aligned with respect to chunk_size
   uint aligned_n_regions = (n_regions + chunk_size - 1) / chunk_size * chunk_size;
--- a/src/share/vm/gc/g1/collectionSetChooser.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/collectionSetChooser.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -121,7 +121,7 @@

   // Must be called before calls to claim_array_chunk().
   // n_regions is the number of regions, chunk_size the chunk size.
-  void prepare_for_par_region_addition(uint n_regions, uint chunk_size);
+  void prepare_for_par_region_addition(uint n_threads, uint n_regions, uint chunk_size);
   // Returns the first index in a contiguous chunk of chunk_size indexes
   // that the calling thread has reserved.  These must be set by the
   // calling thread using set_region() (to NULL if necessary).
--- a/src/share/vm/gc/g1/concurrentG1Refine.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/concurrentG1Refine.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -35,7 +35,7 @@
 {
   // Ergonomically select initial concurrent refinement parameters
   if (FLAG_IS_DEFAULT(G1ConcRefinementGreenZone)) {
-    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, MAX2<int>(ParallelGCThreads, 1));
+    FLAG_SET_DEFAULT(G1ConcRefinementGreenZone, (intx)ParallelGCThreads);
   }
   set_green_zone(G1ConcRefinementGreenZone);
--- a/src/share/vm/gc/g1/concurrentMark.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/concurrentMark.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -518,7 +518,7 @@
   _markStack(this),
   // _finger set in set_non_marking_state

-  _max_worker_id(MAX2((uint)ParallelGCThreads, 1U)),
+  _max_worker_id((uint)ParallelGCThreads),
   // _active_tasks set in set_non_marking_state
   // _tasks set inside the constructor
   _task_queues(new CMTaskQueueSet((int) _max_worker_id)),
@@ -1218,15 +1218,13 @@
     "Maximum number of marking threads exceeded");

   uint active_workers = MAX2(1U, parallel_marking_threads());
+  assert(active_workers > 0, "Should have been set");

   // Parallel task terminator is set in "set_concurrency_and_phase()"
   set_concurrency_and_phase(active_workers, true /* concurrent */);

   CMConcurrentMarkingTask markingTask(this, cmThread());
   _parallel_workers->set_active_workers(active_workers);
-  // Don't set _n_par_threads because it affects MT in process_roots()
-  // and the decisions on that MT processing is made elsewhere.
-  assert(_parallel_workers->active_workers() > 0, "Should have been set");
   _parallel_workers->run_task(&markingTask);
   print_stats();
 }
@@ -1761,28 +1759,20 @@
   }
 };

-class G1ParNoteEndTask;
-
 class G1NoteEndOfConcMarkClosure : public HeapRegionClosure {
   G1CollectedHeap* _g1;
-  size_t _max_live_bytes;
-  uint _regions_claimed;
   size_t _freed_bytes;
   FreeRegionList* _local_cleanup_list;
   HeapRegionSetCount _old_regions_removed;
   HeapRegionSetCount _humongous_regions_removed;
   HRRSCleanupTask* _hrrs_cleanup_task;
-  double _claimed_region_time;
-  double _max_region_time;

 public:
   G1NoteEndOfConcMarkClosure(G1CollectedHeap* g1,
                              FreeRegionList* local_cleanup_list,
                              HRRSCleanupTask* hrrs_cleanup_task) :
     _g1(g1),
-    _max_live_bytes(0), _regions_claimed(0),
     _freed_bytes(0),
-    _claimed_region_time(0.0), _max_region_time(0.0),
     _local_cleanup_list(local_cleanup_list),
     _old_regions_removed(),
     _humongous_regions_removed(),
@@ -1799,10 +1789,7 @@
     // We use a claim value of zero here because all regions
     // were claimed with value 1 in the FinalCount task.
     _g1->reset_gc_time_stamps(hr);
-    double start = os::elapsedTime();
-    _regions_claimed++;
     hr->note_end_of_marking();
-    _max_live_bytes += hr->max_live_bytes();

     if (hr->used() > 0 && hr->max_live_bytes() == 0 && !hr->is_young()) {
       _freed_bytes += hr->used();
@@ -1819,18 +1806,8 @@
       hr->rem_set()->do_cleanup_work(_hrrs_cleanup_task);
     }

-    double region_time = (os::elapsedTime() - start);
-    _claimed_region_time += region_time;
-    if (region_time > _max_region_time) {
-      _max_region_time = region_time;
-    }
     return false;
   }
-
-  size_t max_live_bytes() { return _max_live_bytes; }
-  uint regions_claimed() { return _regions_claimed; }
-  double claimed_region_time_sec() { return _claimed_region_time; }
-  double max_region_time_sec() { return _max_region_time; }
 };

 class G1ParNoteEndTask: public AbstractGangTask {
@@ -1838,14 +1815,12 @@

 protected:
   G1CollectedHeap* _g1h;
-  size_t _max_live_bytes;
-  size_t _freed_bytes;
   FreeRegionList* _cleanup_list;
   HeapRegionClaimer _hrclaimer;

 public:
   G1ParNoteEndTask(G1CollectedHeap* g1h, FreeRegionList* cleanup_list, uint n_workers) :
-      AbstractGangTask("G1 note end"), _g1h(g1h), _max_live_bytes(0), _freed_bytes(0), _cleanup_list(cleanup_list), _hrclaimer(n_workers) {
+      AbstractGangTask("G1 note end"), _g1h(g1h), _cleanup_list(cleanup_list), _hrclaimer(n_workers) {
   }

   void work(uint worker_id) {
@@ -1861,8 +1836,6 @@
     {
       MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
       _g1h->decrement_summary_bytes(g1_note_end.freed_bytes());
-      _max_live_bytes += g1_note_end.max_live_bytes();
-      _freed_bytes += g1_note_end.freed_bytes();

       // If we iterate over the global cleanup list at the end of
       // cleanup to do this printing we will not guarantee to only
@@ -1887,8 +1860,6 @@
       HeapRegionRemSet::finish_cleanup_task(&hrrs_cleanup_task);
     }
   }
-  size_t max_live_bytes() { return _max_live_bytes; }
-  size_t freed_bytes() { return _freed_bytes; }
 };

 class G1ParScrubRemSetTask: public AbstractGangTask {
@@ -1938,18 +1909,10 @@

   HeapRegionRemSet::reset_for_cleanup_tasks();

-  uint n_workers;
-
   // Do counting once more with the world stopped for good measure.
   G1ParFinalCountTask g1_par_count_task(g1h, &_region_bm, &_card_bm);

-  g1h->set_par_threads();
-  n_workers = g1h->n_par_threads();
-  assert(g1h->n_par_threads() == n_workers,
-         "Should not have been reset");
   g1h->workers()->run_task(&g1_par_count_task);
-  // Done with the parallel phase so reset to 0.
-  g1h->set_par_threads(0);

   if (VerifyDuringGC) {
     // Verify that the counting data accumulated during marking matches
@@ -1965,10 +1928,7 @@
                                                  &expected_region_bm,
                                                  &expected_card_bm);

-    g1h->set_par_threads((int)n_workers);
     g1h->workers()->run_task(&g1_par_verify_task);
-    // Done with the parallel phase so reset to 0.
-    g1h->set_par_threads(0);

     guarantee(g1_par_verify_task.failures() == 0, "Unexpected accounting failures");
   }
@@ -1990,11 +1950,11 @@

   g1h->reset_gc_time_stamp();

+  uint n_workers = _g1h->workers()->active_workers();
+
   // Note end of marking in all heap regions.
   G1ParNoteEndTask g1_par_note_end_task(g1h, &_cleanup_list, n_workers);
-  g1h->set_par_threads((int)n_workers);
   g1h->workers()->run_task(&g1_par_note_end_task);
-  g1h->set_par_threads(0);
   g1h->check_gc_time_stamps();

   if (!cleanup_list_is_empty()) {
@@ -2009,9 +1969,7 @@
   if (G1ScrubRemSets) {
     double rs_scrub_start = os::elapsedTime();
     G1ParScrubRemSetTask g1_par_scrub_rs_task(g1h, &_region_bm, &_card_bm, n_workers);
-    g1h->set_par_threads((int)n_workers);
     g1h->workers()->run_task(&g1_par_scrub_rs_task);
-    g1h->set_par_threads(0);

     double rs_scrub_end = os::elapsedTime();
     double this_rs_scrub_time = (rs_scrub_end - rs_scrub_start);
@@ -2020,7 +1978,7 @@

   // this will also free any regions totally full of garbage objects,
   // and sort the regions.
-  g1h->g1_policy()->record_concurrent_mark_cleanup_end((int)n_workers);
+  g1h->g1_policy()->record_concurrent_mark_cleanup_end();

   // Statistics.
   double end = os::elapsedTime();
@@ -2312,9 +2270,7 @@
   // and overflow handling in CMTask::do_marking_step() knows
   // how many workers to wait for.
   _cm->set_concurrency(_active_workers);
-  _g1h->set_par_threads(_active_workers);
   _workers->run_task(&proc_task_proxy);
-  _g1h->set_par_threads(0);
 }

 class G1CMRefEnqueueTaskProxy: public AbstractGangTask {
@@ -2344,9 +2300,7 @@
   // and overflow handling in CMTask::do_marking_step() knows
   // how many workers to wait for.
   _cm->set_concurrency(_active_workers);
-  _g1h->set_par_threads(_active_workers);
   _workers->run_task(&enq_task_proxy);
-  _g1h->set_par_threads(0);
 }

 void ConcurrentMark::weakRefsWorkParallelPart(BoolObjectClosure* is_alive, bool purged_classes) {
@@ -2608,27 +2562,23 @@

   g1h->ensure_parsability(false);

-  StrongRootsScope srs;
   // this is remark, so we'll use up all active threads
   uint active_workers = g1h->workers()->active_workers();
-  if (active_workers == 0) {
-    assert(active_workers > 0, "Should have been set earlier");
-    active_workers = (uint) ParallelGCThreads;
-    g1h->workers()->set_active_workers(active_workers);
-  }
   set_concurrency_and_phase(active_workers, false /* concurrent */);
   // Leave _parallel_marking_threads at it's
   // value originally calculated in the ConcurrentMark
   // constructor and pass values of the active workers
   // through the gang in the task.

-  CMRemarkTask remarkTask(this, active_workers);
-  // We will start all available threads, even if we decide that the
-  // active_workers will be fewer. The extra ones will just bail out
-  // immediately.
-  g1h->set_par_threads(active_workers);
-  g1h->workers()->run_task(&remarkTask);
-  g1h->set_par_threads(0);
+  {
+    StrongRootsScope srs(active_workers);
+
+    CMRemarkTask remarkTask(this, active_workers);
+    // We will start all available threads, even if we decide that the
+    // active_workers will be fewer. The extra ones will just bail out
+    // immediately.
+    g1h->workers()->run_task(&remarkTask);
+  }

   SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
   guarantee(has_overflown() ||
@@ -3001,9 +2951,7 @@
   G1AggregateCountDataTask g1_par_agg_task(_g1h, this, &_card_bm,
                                            _max_worker_id, n_workers);

-  _g1h->set_par_threads(n_workers);
   _g1h->workers()->run_task(&g1_par_agg_task);
-  _g1h->set_par_threads(0);
 }

 // Clear the per-worker arrays used to store the per-region counting data
--- a/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -1326,27 +1326,10 @@
         AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
                                                 workers()->active_workers(),
                                                 Threads::number_of_non_daemon_threads());
-      assert(UseDynamicNumberOfGCThreads ||
-             n_workers == workers()->total_workers(),
-             "If not dynamic should be using all the  workers");
       workers()->set_active_workers(n_workers);
-      // Set parallel threads in the heap (_n_par_threads) only
-      // before a parallel phase and always reset it to 0 after
-      // the phase so that the number of parallel threads does
-      // no get carried forward to a serial phase where there
-      // may be code that is "possibly_parallel".
-      set_par_threads(n_workers);

       ParRebuildRSTask rebuild_rs_task(this);
-      assert(UseDynamicNumberOfGCThreads ||
-             workers()->active_workers() == workers()->total_workers(),
-             "Unless dynamic should use total workers");
-      // Use the most recent number of  active workers
-      assert(workers()->active_workers() > 0,
-             "Active workers not properly set");
-      set_par_threads(workers()->active_workers());
       workers()->run_task(&rebuild_rs_task);
-      set_par_threads(0);

       // Rebuild the strong code root lists for each region
       rebuild_strong_code_roots();
@@ -1769,7 +1752,7 @@
   _allocator = G1Allocator::create_allocator(this);
   _humongous_object_threshold_in_words = HeapRegion::GrainWords / 2;

-  int n_queues = MAX2((int)ParallelGCThreads, 1);
+  int n_queues = (int)ParallelGCThreads;
   _task_queues = new RefToScanQueueSet(n_queues);

   uint n_rem_sets = HeapRegionRemSet::num_par_rem_sets();
@@ -2081,11 +2064,11 @@
     new ReferenceProcessor(mr,    // span
                            ParallelRefProcEnabled && (ParallelGCThreads > 1),
                                 // mt processing
-                           (int) ParallelGCThreads,
+                           (uint) ParallelGCThreads,
                                 // degree of mt processing
                            (ParallelGCThreads > 1) || (ConcGCThreads > 1),
                                 // mt discovery
-                           (int) MAX2(ParallelGCThreads, ConcGCThreads),
+                           (uint) MAX2(ParallelGCThreads, ConcGCThreads),
                                 // degree of mt discovery
                            false,
                                 // Reference discovery is not atomic
@@ -2098,11 +2081,11 @@
     new ReferenceProcessor(mr,    // span
                            ParallelRefProcEnabled && (ParallelGCThreads > 1),
                                 // mt processing
-                           MAX2((int)ParallelGCThreads, 1),
+                           (uint) ParallelGCThreads,
                                 // degree of mt processing
                            (ParallelGCThreads > 1),
                                 // mt discovery
-                           MAX2((int)ParallelGCThreads, 1),
+                           (uint) ParallelGCThreads,
                                 // degree of mt discovery
                            true,
                                 // Reference discovery is atomic
@@ -2502,8 +2485,7 @@
   assert(_worker_cset_start_region != NULL, "sanity");
   assert(_worker_cset_start_region_time_stamp != NULL, "sanity");

-  int n_queues = MAX2((int)ParallelGCThreads, 1);
-  for (int i = 0; i < n_queues; i++) {
+  for (uint i = 0; i < ParallelGCThreads; i++) {
     _worker_cset_start_region[i] = NULL;
     _worker_cset_start_region_time_stamp[i] = 0;
   }
@@ -2541,9 +2523,6 @@
   result = g1_policy()->collection_set();
   uint cs_size = g1_policy()->cset_region_length();
   uint active_workers = workers()->active_workers();
-  assert(UseDynamicNumberOfGCThreads ||
-           active_workers == workers()->total_workers(),
-           "Unless dynamic should use total workers");

   uint end_ind   = (cs_size * worker_i) / active_workers;
   uint start_ind = 0;
@@ -3021,7 +3000,7 @@
     G1VerifyCodeRootBlobClosure blobsCl(&codeRootsCl);

     {
-      G1RootProcessor root_processor(this);
+      G1RootProcessor root_processor(this, 1);
       root_processor.process_all_roots(&rootsCl,
                                        &cldCl,
                                        &blobsCl);
@@ -3042,13 +3021,7 @@
     if (GCParallelVerificationEnabled && ParallelGCThreads > 1) {

       G1ParVerifyTask task(this, vo);
-      assert(UseDynamicNumberOfGCThreads ||
-        workers()->active_workers() == workers()->total_workers(),
-        "If not dynamic should be using all the workers");
-      uint n_workers = workers()->active_workers();
-      set_par_threads(n_workers);
       workers()->run_task(&task);
-      set_par_threads(0);
       if (task.failures()) {
         failures = true;
       }
@@ -3572,6 +3545,10 @@
 };
 #endif // ASSERT

+uint G1CollectedHeap::num_task_queues() const {
+  return _task_queues->size();
+}
+
 #if TASKQUEUE_STATS
 void G1CollectedHeap::print_taskqueue_stats_hdr(outputStream* const st) {
   st->print_raw_cr("GC Task Stats");
@@ -3583,7 +3560,7 @@
   print_taskqueue_stats_hdr(st);

   TaskQueueStats totals;
-  const uint n = workers()->total_workers();
+  const uint n = num_task_queues();
   for (uint i = 0; i < n; ++i) {
     st->print("%3u ", i); task_queue(i)->stats.print(st); st->cr();
     totals += task_queue(i)->stats;
@@ -3594,7 +3571,7 @@
 }

 void G1CollectedHeap::reset_taskqueue_stats() {
-  const uint n = workers()->total_workers();
+  const uint n = num_task_queues();
   for (uint i = 0; i < n; ++i) {
     task_queue(i)->stats.reset();
   }
@@ -3696,9 +3673,6 @@
     uint active_workers = AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
                                                                   workers()->active_workers(),
                                                                   Threads::number_of_non_daemon_threads());
-    assert(UseDynamicNumberOfGCThreads ||
-           active_workers == workers()->total_workers(),
-           "If not dynamic should be using all the  workers");
     workers()->set_active_workers(active_workers);

     double pause_start_sec = os::elapsedTime();
@@ -3873,8 +3847,7 @@

         if (evacuation_failed()) {
           _allocator->set_used(recalculate_used());
-          uint n_queues = MAX2((int)ParallelGCThreads, 1);
-          for (uint i = 0; i < n_queues; i++) {
+          for (uint i = 0; i < ParallelGCThreads; i++) {
             if (_evacuation_failed_info_array[i].has_failed()) {
               _gc_tracer_stw->report_evacuation_failed(_evacuation_failed_info_array[i]);
             }
@@ -4041,10 +4014,8 @@
 void G1CollectedHeap::remove_self_forwarding_pointers() {
   double remove_self_forwards_start = os::elapsedTime();

-  set_par_threads();
   G1ParRemoveSelfForwardPtrsTask rsfp_task(this);
   workers()->run_task(&rsfp_task);
-  set_par_threads(0);

   // Now restore saved marks, if any.
   assert(_objs_with_preserved_marks.size() ==
@@ -4308,12 +4279,13 @@
   Mutex* stats_lock() { return &_stats_lock; }

 public:
-  G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor)
+  G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
     : AbstractGangTask("G1 collection"),
       _g1h(g1h),
       _queues(task_queues),
       _root_processor(root_processor),
-      _terminator(0, _queues),
+      _terminator(n_workers, _queues),
+      _n_workers(n_workers),
       _stats_lock(Mutex::leaf, "parallel G1 stats lock", true)
   {}

@@ -4325,12 +4297,6 @@

   ParallelTaskTerminator* terminator() { return &_terminator; }

-  virtual void set_for_termination(uint active_workers) {
-    _root_processor->set_num_workers(active_workers);
-    terminator()->reset_for_reuse(active_workers);
-    _n_workers = active_workers;
-  }
-
   // Helps out with CLD processing.
   //
   // During InitialMark we need to:
@@ -4811,19 +4777,14 @@

   G1ParallelCleaningTask g1_unlink_task(is_alive, process_strings, process_symbols,
                                         n_workers, class_unloading_occurred);
-  set_par_threads(n_workers);
   workers()->run_task(&g1_unlink_task);
-  set_par_threads(0);
 }

 void G1CollectedHeap::unlink_string_and_symbol_table(BoolObjectClosure* is_alive,
                                                      bool process_strings, bool process_symbols) {
   {
-    uint n_workers = workers()->active_workers();
     G1StringSymbolTableUnlinkTask g1_unlink_task(is_alive, process_strings, process_symbols);
-    set_par_threads(n_workers);
     workers()->run_task(&g1_unlink_task);
-    set_par_threads(0);
   }

   if (G1StringDedup::is_enabled()) {
@@ -4851,13 +4812,9 @@
 void G1CollectedHeap::redirty_logged_cards() {
   double redirty_logged_cards_start = os::elapsedTime();

-  uint n_workers = workers()->active_workers();
-
   G1RedirtyLoggedCardsTask redirty_task(&dirty_card_queue_set());
   dirty_card_queue_set().reset_for_par_iteration();
-  set_par_threads(n_workers);
   workers()->run_task(&redirty_task);
-  set_par_threads(0);

   DirtyCardQueueSet& dcq = JavaThread::dirty_card_queue_set();
   dcq.merge_bufferlists(&dirty_card_queue_set());
@@ -5093,9 +5050,7 @@
   ParallelTaskTerminator terminator(_active_workers, _queues);
   G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator);

-  _g1h->set_par_threads(_active_workers);
   _workers->run_task(&proc_task_proxy);
-  _g1h->set_par_threads(0);
 }

 // Gang task for parallel reference enqueueing.
@@ -5124,9 +5079,7 @@

   G1STWRefEnqueueTaskProxy enq_task_proxy(enq_task);

-  _g1h->set_par_threads(_active_workers);
   _workers->run_task(&enq_task_proxy);
-  _g1h->set_par_threads(0);
 }

 // End of weak reference support closures
@@ -5219,7 +5172,7 @@
 };

 // Weak Reference processing during an evacuation pause (part 1).
-void G1CollectedHeap::process_discovered_references(uint no_of_gc_workers) {
+void G1CollectedHeap::process_discovered_references() {
   double ref_proc_start = os::elapsedTime();

   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5246,17 +5199,14 @@
   // referents points to another object which is also referenced by an
   // object discovered by the STW ref processor.

-  assert(no_of_gc_workers == workers()->active_workers(), "Need to reset active GC workers");
-
-  set_par_threads(no_of_gc_workers);
+  uint no_of_gc_workers = workers()->active_workers();
+
   G1ParPreserveCMReferentsTask keep_cm_referents(this,
                                                  no_of_gc_workers,
                                                  _task_queues);

   workers()->run_task(&keep_cm_referents);

-  set_par_threads(0);
-
   // Closure to test whether a referent is alive.
   G1STWIsAliveClosure is_alive(this);

@@ -5330,7 +5280,7 @@
 }

 // Weak Reference processing during an evacuation pause (part 2).
-void G1CollectedHeap::enqueue_discovered_references(uint no_of_gc_workers) {
+void G1CollectedHeap::enqueue_discovered_references() {
   double ref_enq_start = os::elapsedTime();

   ReferenceProcessor* rp = _ref_processor_stw;
@@ -5344,12 +5294,12 @@
   } else {
     // Parallel reference enqueueing

-    assert(no_of_gc_workers == workers()->active_workers(),
-           "Need to reset active workers");
-    assert(rp->num_q() == no_of_gc_workers, "sanity");
-    assert(no_of_gc_workers <= rp->max_num_q(), "sanity");
-
-    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, no_of_gc_workers);
+    uint n_workers = workers()->active_workers();
+
+    assert(rp->num_q() == n_workers, "sanity");
+    assert(n_workers <= rp->max_num_q(), "sanity");
+
+    G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, n_workers);
     rp->enqueue_discovered_references(&par_task_executor);
   }

@@ -5380,11 +5330,6 @@
   hot_card_cache->set_use_cache(false);

   const uint n_workers = workers()->active_workers();
-  assert(UseDynamicNumberOfGCThreads ||
-         n_workers == workers()->total_workers(),
-         "If not dynamic should be using all the  workers");
-  set_par_threads(n_workers);
-

   init_for_evac_failure(NULL);

@@ -5393,19 +5338,16 @@
   double end_par_time_sec;

   {
-    G1RootProcessor root_processor(this);
-    G1ParTask g1_par_task(this, _task_queues, &root_processor);
+    G1RootProcessor root_processor(this, n_workers);
+    G1ParTask g1_par_task(this, _task_queues, &root_processor, n_workers);
     // InitialMark needs claim bits to keep track of the marked-through CLDs.
     if (g1_policy()->during_initial_mark_pause()) {
       ClassLoaderDataGraph::clear_claimed_marks();
     }

-     // The individual threads will set their evac-failure closures.
-     if (PrintTerminationStats) G1ParScanThreadState::print_termination_stats_hdr();
-     // These tasks use ShareHeap::_process_strong_tasks
-     assert(UseDynamicNumberOfGCThreads ||
-            workers()->active_workers() == workers()->total_workers(),
-            "If not dynamic should be using all the  workers");
+    // The individual threads will set their evac-failure closures.
+    if (PrintTerminationStats) G1ParScanThreadState::print_termination_stats_hdr();
+
     workers()->run_task(&g1_par_task);
     end_par_time_sec = os::elapsedTime();

@@ -5425,14 +5367,12 @@
         (os::elapsedTime() - end_par_time_sec) * 1000.0;
   phase_times->record_code_root_fixup_time(code_root_fixup_time_ms);

-  set_par_threads(0);
-
   // Process any discovered reference objects - we have
   // to do this _before_ we retire the GC alloc regions
   // as we may have to copy some 'reachable' referent
   // objects (and their reachable sub-graphs) that were
   // not copied during the pause.
-  process_discovered_references(n_workers);
+  process_discovered_references();

   if (G1StringDedup::is_enabled()) {
     double fixup_start = os::elapsedTime();
@@ -5474,7 +5414,7 @@
   // will log these updates (and dirty their associated
   // cards). We need these updates logged to update any
   // RSets.
-  enqueue_discovered_references(n_workers);
+  enqueue_discovered_references();

   redirty_logged_cards();
   COMPILER2_PRESENT(DerivedPointerTable::update_pointers());
@@ -5779,9 +5719,7 @@
     // Iterate over the dirty cards region list.
     G1ParCleanupCTTask cleanup_task(ct_bs, this);

-    set_par_threads();
     workers()->run_task(&cleanup_task);
-    set_par_threads(0);
 #ifndef PRODUCT
     if (G1VerifyCTCleanup || VerifyAfterGC) {
       G1VerifyCardTableCleanup cleanup_verifier(this, ct_bs);
@@ -6314,21 +6252,6 @@
   g1mm()->update_eden_size();
 }

-void G1CollectedHeap::set_par_threads() {
-  // Don't change the number of workers.  Use the value previously set
-  // in the workgroup.
-  uint n_workers = workers()->active_workers();
-  assert(UseDynamicNumberOfGCThreads ||
-           n_workers == workers()->total_workers(),
-      "Otherwise should be using the total number of workers");
-  if (n_workers == 0) {
-    assert(false, "Should have been set in prior evacuation pause.");
-    n_workers = ParallelGCThreads;
-    workers()->set_active_workers(n_workers);
-  }
-  set_par_threads(n_workers);
-}
-
 // Methods for the GC alloc regions

 HeapRegion* G1CollectedHeap::new_gc_alloc_region(size_t word_size,
--- a/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -606,11 +606,11 @@

   // Process any reference objects discovered during
   // an incremental evacuation pause.
-  void process_discovered_references(uint no_of_gc_workers);
+  void process_discovered_references();

   // Enqueue any remaining discovered references
   // after processing.
-  void enqueue_discovered_references(uint no_of_gc_workers);
+  void enqueue_discovered_references();

 public:
   FlexibleWorkGang* workers() const { return _workers; }
@@ -981,6 +981,8 @@

   RefToScanQueue *task_queue(uint i) const;

+  uint num_task_queues() const;
+
   // A set of cards where updates happened during the GC
   DirtyCardQueueSet& dirty_card_queue_set() { return _dirty_card_queue_set; }

@@ -1012,11 +1014,6 @@
   // Initialize weak reference processing.
   void ref_processing_init();

-  // Explicitly import set_par_threads into this scope
-  using CollectedHeap::set_par_threads;
-  // Set _n_par_threads according to a policy TBD.
-  void set_par_threads();
-
   virtual Name kind() const {
     return CollectedHeap::G1CollectedHeap;
   }
--- a/src/share/vm/gc/g1/g1CollectorPolicy.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectorPolicy.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -1587,14 +1587,17 @@
 }

 void
-G1CollectorPolicy::record_concurrent_mark_cleanup_end(uint n_workers) {
+G1CollectorPolicy::record_concurrent_mark_cleanup_end() {
   _collectionSetChooser->clear();

+  FlexibleWorkGang* workers = _g1->workers();
+  uint n_workers = workers->active_workers();
+
   uint n_regions = _g1->num_regions();
   uint chunk_size = calculate_parallel_work_chunk_size(n_workers, n_regions);
-  _collectionSetChooser->prepare_for_par_region_addition(n_regions, chunk_size);
+  _collectionSetChooser->prepare_for_par_region_addition(n_workers, n_regions, chunk_size);
   ParKnownGarbageTask par_known_garbage_task(_collectionSetChooser, chunk_size, n_workers);
-  _g1->workers()->run_task(&par_known_garbage_task);
+  workers->run_task(&par_known_garbage_task);

   _collectionSetChooser->sort_regions();
--- a/src/share/vm/gc/g1/g1CollectorPolicy.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1CollectorPolicy.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -692,7 +692,7 @@

   // Record start, end, and completion of cleanup.
   void record_concurrent_mark_cleanup_start();
-  void record_concurrent_mark_cleanup_end(uint n_workers);
+  void record_concurrent_mark_cleanup_end();
   void record_concurrent_mark_cleanup_completed();

   // Records the information about the heap size for reporting in
--- a/src/share/vm/gc/g1/g1MarkSweep.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1MarkSweep.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -127,7 +127,7 @@

   MarkingCodeBlobClosure follow_code_closure(&GenMarkSweep::follow_root_closure, !CodeBlobToOopClosure::FixRelocations);
   {
-    G1RootProcessor root_processor(g1h);
+    G1RootProcessor root_processor(g1h, 1);
     root_processor.process_strong_roots(&GenMarkSweep::follow_root_closure,
                                         &GenMarkSweep::follow_cld_closure,
                                         &follow_code_closure);
@@ -237,7 +237,7 @@

   CodeBlobToOopClosure adjust_code_closure(&GenMarkSweep::adjust_pointer_closure, CodeBlobToOopClosure::FixRelocations);
   {
-    G1RootProcessor root_processor(g1h);
+    G1RootProcessor root_processor(g1h, 1);
     root_processor.process_all_roots(&GenMarkSweep::adjust_pointer_closure,
                                      &GenMarkSweep::adjust_cld_closure,
                                      &adjust_code_closure);
--- a/src/share/vm/gc/g1/g1OopClosures.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1OopClosures.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -50,8 +50,8 @@
   _par_scan_state = par_scan_state;
   _worker_id = par_scan_state->queue_num();

-  assert(_worker_id < MAX2((uint)ParallelGCThreads, 1u),
-         err_msg("The given worker id %u must be less than the number of threads %u", _worker_id, MAX2((uint)ParallelGCThreads, 1u)));
+  assert(_worker_id < ParallelGCThreads,
+         err_msg("The given worker id %u must be less than the number of threads " UINTX_FORMAT, _worker_id, ParallelGCThreads));
 }

 // Generate G1 specialized oop_oop_iterate functions.
--- a/src/share/vm/gc/g1/g1RootProcessor.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1RootProcessor.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -90,11 +90,10 @@


 void G1RootProcessor::worker_has_discovered_all_strong_classes() {
-  uint n_workers = _g1h->n_par_threads();
   assert(ClassUnloadingWithConcurrentMark, "Currently only needed when doing G1 Class Unloading");

   uint new_value = (uint)Atomic::add(1, &_n_workers_discovered_strong_classes);
-  if (new_value == n_workers) {
+  if (new_value == n_workers()) {
     // This thread is last. Notify the others.
     MonitorLockerEx ml(&_lock, Mutex::_no_safepoint_check_flag);
     _lock.notify_all();
@@ -102,21 +101,20 @@
 }

 void G1RootProcessor::wait_until_all_strong_classes_discovered() {
-  uint n_workers = _g1h->n_par_threads();
   assert(ClassUnloadingWithConcurrentMark, "Currently only needed when doing G1 Class Unloading");

-  if ((uint)_n_workers_discovered_strong_classes != n_workers) {
+  if ((uint)_n_workers_discovered_strong_classes != n_workers()) {
     MonitorLockerEx ml(&_lock, Mutex::_no_safepoint_check_flag);
-    while ((uint)_n_workers_discovered_strong_classes != n_workers) {
+    while ((uint)_n_workers_discovered_strong_classes != n_workers()) {
       _lock.wait(Mutex::_no_safepoint_check_flag, 0, false);
     }
   }
 }

-G1RootProcessor::G1RootProcessor(G1CollectedHeap* g1h) :
+G1RootProcessor::G1RootProcessor(G1CollectedHeap* g1h, uint n_workers) :
     _g1h(g1h),
     _process_strong_tasks(new SubTasksDone(G1RP_PS_NumElements)),
-    _srs(),
+    _srs(n_workers),
     _lock(Mutex::leaf, "G1 Root Scanning barrier lock", false, Monitor::_safepoint_check_never),
     _n_workers_discovered_strong_classes(0) {}

@@ -206,7 +204,7 @@
     }
   }

-  _process_strong_tasks->all_tasks_completed();
+  _process_strong_tasks->all_tasks_completed(n_workers());
 }

 void G1RootProcessor::process_strong_roots(OopClosure* oops,
@@ -216,7 +214,7 @@
   process_java_roots(oops, clds, clds, NULL, blobs, NULL, 0);
   process_vm_roots(oops, NULL, NULL, 0);

-  _process_strong_tasks->all_tasks_completed();
+  _process_strong_tasks->all_tasks_completed(n_workers());
 }

 void G1RootProcessor::process_all_roots(OopClosure* oops,
@@ -230,7 +228,7 @@
     CodeCache::blobs_do(blobs);
   }

-  _process_strong_tasks->all_tasks_completed();
+  _process_strong_tasks->all_tasks_completed(n_workers());
 }

 void G1RootProcessor::process_java_roots(OopClosure* strong_roots,
@@ -253,7 +251,7 @@

   {
     G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::ThreadRoots, worker_i);
-    bool is_par = _g1h->n_par_threads() > 0;
+    bool is_par = n_workers() > 1;
     Threads::possibly_parallel_oops_do(is_par, strong_roots, thread_stack_clds, strong_code);
   }
 }
@@ -329,6 +327,6 @@
   _g1h->g1_rem_set()->oops_into_collection_set_do(scan_rs, &scavenge_cs_nmethods, worker_i);
 }

-void G1RootProcessor::set_num_workers(uint active_workers) {
-  _process_strong_tasks->set_n_threads(active_workers);
+uint G1RootProcessor::n_workers() const {
+  return _srs.n_threads();
 }
--- a/src/share/vm/gc/g1/g1RootProcessor.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1RootProcessor.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -85,7 +85,7 @@
                         uint worker_i);

 public:
-  G1RootProcessor(G1CollectedHeap* g1h);
+  G1RootProcessor(G1CollectedHeap* g1h, uint n_workers);

   // Apply closures to the strongly and weakly reachable roots in the system
   // in a single pass.
@@ -114,8 +114,8 @@
                             OopClosure* scan_non_heap_weak_roots,
                             uint worker_i);

-  // Inform the root processor about the number of worker threads
-  void set_num_workers(uint active_workers);
+  // Number of worker threads used by the root processor.
+  uint n_workers() const;
 };

 #endif // SHARE_VM_GC_G1_G1ROOTPROCESSOR_HPP
--- a/src/share/vm/gc/g1/g1StringDedup.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1StringDedup.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -153,9 +153,7 @@

   G1StringDedupUnlinkOrOopsDoTask task(is_alive, keep_alive, allow_resize_and_rehash, phase_times);
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  g1h->set_par_threads();
   g1h->workers()->run_task(&task);
-  g1h->set_par_threads(0);
 }

 void G1StringDedup::threads_do(ThreadClosure* tc) {
--- a/src/share/vm/gc/g1/g1StringDedupQueue.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1StringDedupQueue.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -42,7 +42,7 @@
   _cancel(false),
   _empty(true),
   _dropped(0) {
-  _nqueues = MAX2(ParallelGCThreads, (size_t)1);
+  _nqueues = ParallelGCThreads;
   _queues = NEW_C_HEAP_ARRAY(G1StringDedupWorkerQueue, _nqueues, mtGC);
   for (size_t i = 0; i < _nqueues; i++) {
     new (_queues + i) G1StringDedupWorkerQueue(G1StringDedupWorkerQueue::default_segment_size(), _max_cache_size, _max_size);
--- a/src/share/vm/gc/g1/g1StringDedupTable.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/g1/g1StringDedupTable.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -112,7 +112,7 @@
 };

 G1StringDedupEntryCache::G1StringDedupEntryCache() {
-  _nlists = MAX2(ParallelGCThreads, (size_t)1);
+  _nlists = ParallelGCThreads;
   _lists = PaddedArray<G1StringDedupEntryFreeList, mtGC>::create_unfreeable((uint)_nlists);
 }
--- a/src/share/vm/gc/parallel/psParallelCompact.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/parallel/psParallelCompact.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -832,9 +832,9 @@
   _ref_processor =
     new ReferenceProcessor(mr,            // span
                            ParallelRefProcEnabled && (ParallelGCThreads > 1), // mt processing
-                           (int) ParallelGCThreads, // mt processing degree
+                           (uint) ParallelGCThreads, // mt processing degree
                            true,          // mt discovery
-                           (int) ParallelGCThreads, // mt discovery degree
+                           (uint) ParallelGCThreads, // mt discovery degree
                            true,          // atomic_discovery
                            &_is_alive_closure); // non-header is alive closure
   _counters = new CollectorCounters("PSParallelCompact", 1);
@@ -2029,7 +2029,6 @@
     // Set the number of GC threads to be used in this collection
     gc_task_manager()->set_active_gang();
     gc_task_manager()->task_idle_workers();
-    heap->set_par_threads(gc_task_manager()->active_workers());

     TraceCPUTime tcpu(PrintGCDetails, true, gclog_or_tty);
     GCTraceTime t1(GCCauseString("Full GC", gc_cause), PrintGC, !PrintGCDetails, NULL, _gc_tracer.gc_id());
--- a/src/share/vm/gc/parallel/psScavenge.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/parallel/psScavenge.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -382,7 +382,6 @@
     // Get the active number of workers here and use that value
     // throughout the methods.
     uint active_workers = gc_task_manager()->active_workers();
-    heap->set_par_threads(active_workers);

     PSPromotionManager::pre_scavenge();

@@ -846,9 +845,9 @@
   _ref_processor =
     new ReferenceProcessor(mr,                         // span
                            ParallelRefProcEnabled && (ParallelGCThreads > 1), // mt processing
-                           (int) ParallelGCThreads,    // mt processing degree
+                           (uint) ParallelGCThreads,   // mt processing degree
                            true,                       // mt discovery
-                           (int) ParallelGCThreads,    // mt discovery degree
+                           (uint) ParallelGCThreads,   // mt discovery degree
                            true,                       // atomic_discovery
                            NULL);                      // header provides liveness info
--- a/src/share/vm/gc/serial/defNewGeneration.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/serial/defNewGeneration.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -38,6 +38,7 @@
 #include "gc/shared/referencePolicy.hpp"
 #include "gc/shared/space.inline.hpp"
 #include "gc/shared/spaceDecorator.hpp"
+#include "gc/shared/strongRootsScope.hpp"
 #include "memory/iterator.hpp"
 #include "oops/instanceRefKlass.hpp"
 #include "oops/oop.inline.hpp"
@@ -454,7 +455,7 @@
   }
 }

-void DefNewGeneration::younger_refs_iterate(OopsInGenClosure* cl) {
+void DefNewGeneration::younger_refs_iterate(OopsInGenClosure* cl, uint n_threads) {
   assert(false, "NYI -- are you sure you want to call this?");
 }

@@ -625,15 +626,22 @@
   assert(gch->no_allocs_since_save_marks(0),
          "save marks have not been newly set.");

-  gch->gen_process_roots(_level,
-                         true,  // Process younger gens, if any,
-                                // as strong roots.
-                         true,  // activate StrongRootsScope
-                         GenCollectedHeap::SO_ScavengeCodeCache,
-                         GenCollectedHeap::StrongAndWeakRoots,
-                         &fsc_with_no_gc_barrier,
-                         &fsc_with_gc_barrier,
-                         &cld_scan_closure);
+  {
+    // DefNew needs to run with n_threads == 0, to make sure the serial
+    // version of the card table scanning code is used.
+    // See: CardTableModRefBS::non_clean_card_iterate_possibly_parallel.
+    StrongRootsScope srs(0);
+
+    gch->gen_process_roots(&srs,
+                           _level,
+                           true,  // Process younger gens, if any,
+                                  // as strong roots.
+                           GenCollectedHeap::SO_ScavengeCodeCache,
+                           GenCollectedHeap::StrongAndWeakRoots,
+                           &fsc_with_no_gc_barrier,
+                           &fsc_with_gc_barrier,
+                           &cld_scan_closure);
+  }

   // "evacuate followers".
   evacuate_followers.do_void();
--- a/src/share/vm/gc/serial/defNewGeneration.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/serial/defNewGeneration.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -255,7 +255,7 @@
   // Iteration
   void object_iterate(ObjectClosure* blk);

-  void younger_refs_iterate(OopsInGenClosure* cl);
+  void younger_refs_iterate(OopsInGenClosure* cl, uint n_threads);

   void space_iterate(SpaceClosure* blk, bool usedOnly = false);
--- a/src/share/vm/gc/serial/genMarkSweep.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/serial/genMarkSweep.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -40,6 +40,7 @@
 #include "gc/shared/modRefBarrierSet.hpp"
 #include "gc/shared/referencePolicy.hpp"
 #include "gc/shared/space.hpp"
+#include "gc/shared/strongRootsScope.hpp"
 #include "oops/instanceRefKlass.hpp"
 #include "oops/oop.inline.hpp"
 #include "prims/jvmtiExport.hpp"
@@ -200,14 +201,18 @@
   // Need new claim bits before marking starts.
   ClassLoaderDataGraph::clear_claimed_marks();

-  gch->gen_process_roots(level,
-                         false, // Younger gens are not roots.
-                         true,  // activate StrongRootsScope
-                         GenCollectedHeap::SO_None,
-                         GenCollectedHeap::StrongRootsOnly,
-                         &follow_root_closure,
-                         &follow_root_closure,
-                         &follow_cld_closure);
+  {
+    StrongRootsScope srs(1);
+
+    gch->gen_process_roots(&srs,
+                           level,
+                           false, // Younger gens are not roots.
+                           GenCollectedHeap::SO_None,
+                           GenCollectedHeap::StrongRootsOnly,
+                           &follow_root_closure,
+                           &follow_root_closure,
+                           &follow_cld_closure);
+  }

   // Process reference objects found during marking
   {
@@ -284,14 +289,18 @@
   assert(level == 1, "We don't use mark-sweep on young generations.");
   adjust_pointer_closure.set_orig_generation(gch->old_gen());

-  gch->gen_process_roots(level,
-                         false, // Younger gens are not roots.
-                         true,  // activate StrongRootsScope
-                         GenCollectedHeap::SO_AllCodeCache,
-                         GenCollectedHeap::StrongAndWeakRoots,
-                         &adjust_pointer_closure,
-                         &adjust_pointer_closure,
-                         &adjust_cld_closure);
+  {
+    StrongRootsScope srs(1);
+
+    gch->gen_process_roots(&srs,
+                           level,
+                           false, // Younger gens are not roots.
+                           GenCollectedHeap::SO_AllCodeCache,
+                           GenCollectedHeap::StrongAndWeakRoots,
+                           &adjust_pointer_closure,
+                           &adjust_pointer_closure,
+                           &adjust_cld_closure);
+  }

   gch->gen_process_weak_roots(&adjust_pointer_closure);
--- a/src/share/vm/gc/shared/adaptiveSizePolicy.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/adaptiveSizePolicy.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -161,7 +161,7 @@
       }
       _debug_perturbation = !_debug_perturbation;
     }
-    assert((new_active_workers <= (uintx) ParallelGCThreads) &&
+    assert((new_active_workers <= ParallelGCThreads) &&
            (new_active_workers >= min_workers),
       "Jiggled active workers too much");
   }
--- a/src/share/vm/gc/shared/cardGeneration.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/cardGeneration.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -353,8 +353,8 @@
   blk->do_space(space());
 }

-void CardGeneration::younger_refs_iterate(OopsInGenClosure* blk) {
+void CardGeneration::younger_refs_iterate(OopsInGenClosure* blk, uint n_threads) {
   blk->set_generation(this);
-  younger_refs_in_space_iterate(space(), blk);
+  younger_refs_in_space_iterate(space(), blk, n_threads);
   blk->reset_generation();
 }
--- a/src/share/vm/gc/shared/cardGeneration.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/cardGeneration.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -89,7 +89,7 @@

   void space_iterate(SpaceClosure* blk, bool usedOnly = false);

-  void younger_refs_iterate(OopsInGenClosure* blk);
+  void younger_refs_iterate(OopsInGenClosure* blk, uint n_threads);

   bool is_in(const void* p) const;
--- a/src/share/vm/gc/shared/cardTableModRefBS.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/cardTableModRefBS.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -440,31 +440,11 @@
 void CardTableModRefBS::non_clean_card_iterate_possibly_parallel(Space* sp,
                                                                  MemRegion mr,
                                                                  OopsInGenClosure* cl,
-                                                                 CardTableRS* ct) {
+                                                                 CardTableRS* ct,
+                                                                 uint n_threads) {
   if (!mr.is_empty()) {
-    // Caller (process_roots()) claims that all GC threads
-    // execute this call.  With UseDynamicNumberOfGCThreads now all
-    // active GC threads execute this call.  The number of active GC
-    // threads needs to be passed to par_non_clean_card_iterate_work()
-    // to get proper partitioning and termination.
-    //
-    // This is an example of where n_par_threads() is used instead
-    // of workers()->active_workers().  n_par_threads can be set to 0 to
-    // turn off parallelism.  For example when this code is called as
-    // part of verification during root processing then n_par_threads()
-    // may have been set to 0. active_workers is not overloaded with
-    // the meaning that it is a switch to disable parallelism and so keeps
-    // the meaning of the number of active gc workers. If parallelism has
-    // not been shut off by setting n_par_threads to 0, then n_par_threads
-    // should be equal to active_workers.  When a different mechanism for
-    // shutting off parallelism is used, then active_workers can be used in
-    // place of n_par_threads.
-    int n_threads =  GenCollectedHeap::heap()->n_par_threads();
-    bool is_par = n_threads > 0;
-    if (is_par) {
+    if (n_threads > 0) {
 #if INCLUDE_ALL_GCS
-      assert(GenCollectedHeap::heap()->n_par_threads() ==
-             GenCollectedHeap::heap()->workers()->active_workers(), "Mismatch");
       non_clean_card_iterate_parallel_work(sp, mr, cl, ct, n_threads);
 #else  // INCLUDE_ALL_GCS
       fatal("Parallel gc not supported here.");
@@ -472,8 +452,11 @@
     } else {
       // clear_cl finds contiguous dirty ranges of cards to process and clear.

-      DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(), cl->gen_boundary());
-      ClearNoncleanCardWrapper clear_cl(dcto_cl, ct);
+      // This is the single-threaded version used by DefNew.
+      const bool parallel = false;
+
+      DirtyCardToOopClosure* dcto_cl = sp->new_dcto_cl(cl, precision(), cl->gen_boundary(), parallel);
+      ClearNoncleanCardWrapper clear_cl(dcto_cl, ct, parallel);

       clear_cl.do_MemRegion(mr);
     }
--- a/src/share/vm/gc/shared/cardTableModRefBS.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/cardTableModRefBS.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -178,14 +178,15 @@
   // region mr in the given space and apply cl to any dirty sub-regions
   // of mr. Clears the dirty cards as they are processed.
   void non_clean_card_iterate_possibly_parallel(Space* sp, MemRegion mr,
-                                                OopsInGenClosure* cl, CardTableRS* ct);
+                                                OopsInGenClosure* cl, CardTableRS* ct,
+                                                uint n_threads);

  private:
   // Work method used to implement non_clean_card_iterate_possibly_parallel()
   // above in the parallel case.
   void non_clean_card_iterate_parallel_work(Space* sp, MemRegion mr,
                                             OopsInGenClosure* cl, CardTableRS* ct,
-                                            int n_threads);
+                                            uint n_threads);

  protected:
   // Dirty the bytes corresponding to "mr" (not all of which must be
--- a/src/share/vm/gc/shared/cardTableRS.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/cardTableRS.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -102,9 +102,10 @@
 }

 void CardTableRS::younger_refs_iterate(Generation* g,
-                                       OopsInGenClosure* blk) {
+                                       OopsInGenClosure* blk,
+                                       uint n_threads) {
   _last_cur_val_in_gen[g->level()+1] = cur_youngergen_card_val();
-  g->younger_refs_iterate(blk);
+  g->younger_refs_iterate(blk, n_threads);
 }

 inline bool ClearNoncleanCardWrapper::clear_card(jbyte* entry) {
@@ -164,15 +165,8 @@
 }

 ClearNoncleanCardWrapper::ClearNoncleanCardWrapper(
-  DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct) :
-    _dirty_card_closure(dirty_card_closure), _ct(ct) {
-    // Cannot yet substitute active_workers for n_par_threads
-    // in the case where parallelism is being turned off by
-    // setting n_par_threads to 0.
-    _is_par = (GenCollectedHeap::heap()->n_par_threads() > 0);
-    assert(!_is_par ||
-           (GenCollectedHeap::heap()->n_par_threads() ==
-            GenCollectedHeap::heap()->workers()->active_workers()), "Mismatch");
+  DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct, bool is_par) :
+    _dirty_card_closure(dirty_card_closure), _ct(ct), _is_par(is_par) {
 }

 bool ClearNoncleanCardWrapper::is_word_aligned(jbyte* entry) {
@@ -272,7 +266,8 @@
 }

 void CardTableRS::younger_refs_in_space_iterate(Space* sp,
-                                                OopsInGenClosure* cl) {
+                                                OopsInGenClosure* cl,
+                                                uint n_threads) {
   const MemRegion urasm = sp->used_region_at_save_marks();
 #ifdef ASSERT
   // Convert the assertion check to a warning if we are running
@@ -301,7 +296,7 @@
     ShouldNotReachHere();
   }
 #endif
-  _ct_bs->non_clean_card_iterate_possibly_parallel(sp, urasm, cl, this);
+  _ct_bs->non_clean_card_iterate_possibly_parallel(sp, urasm, cl, this, n_threads);
 }

 void CardTableRS::clear_into_younger(Generation* old_gen) {
--- a/src/share/vm/gc/shared/cardTableRS.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/cardTableRS.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -56,7 +56,7 @@

   CardTableModRefBSForCTRS* _ct_bs;

-  virtual void younger_refs_in_space_iterate(Space* sp, OopsInGenClosure* cl);
+  virtual void younger_refs_in_space_iterate(Space* sp, OopsInGenClosure* cl, uint n_threads);

   void verify_space(Space* s, HeapWord* gen_start);

@@ -116,7 +116,7 @@
   // Card table entries are cleared before application; "blk" is
   // responsible for dirtying if the oop is still older-to-younger after
   // closure application.
-  void younger_refs_iterate(Generation* g, OopsInGenClosure* blk);
+  void younger_refs_iterate(Generation* g, OopsInGenClosure* blk, uint n_threads);

   void inline_write_ref_field_gc(void* field, oop new_val) {
     jbyte* byte = _ct_bs->byte_for(field);
@@ -183,7 +183,7 @@
   bool is_word_aligned(jbyte* entry);

 public:
-  ClearNoncleanCardWrapper(DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct);
+  ClearNoncleanCardWrapper(DirtyCardToOopClosure* dirty_card_closure, CardTableRS* ct, bool is_par);
   void do_MemRegion(MemRegion mr);
 };
--- a/src/share/vm/gc/shared/collectedHeap.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/collectedHeap.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -160,8 +160,7 @@
 // Memory state functions.


-CollectedHeap::CollectedHeap() : _n_par_threads(0)
-{
+CollectedHeap::CollectedHeap() {
   const size_t max_len = size_t(arrayOopDesc::max_array_length(T_INT));
   const size_t elements_per_word = HeapWordSize / sizeof(jint);
   _filler_array_max_size = align_object_size(filler_array_hdr_size() +
--- a/src/share/vm/gc/shared/collectedHeap.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/collectedHeap.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -101,7 +101,6 @@
  protected:
   BarrierSet* _barrier_set;
   bool _is_gc_active;
-  uint _n_par_threads;

   unsigned int _total_collections;          // ... started
   unsigned int _total_full_collections;     // ... started
@@ -291,12 +290,6 @@
   }
   GCCause::Cause gc_cause() { return _gc_cause; }

-  // Number of threads currently working on GC tasks.
-  uint n_par_threads() { return _n_par_threads; }
-
-  // May be overridden to set additional parallelism.
-  virtual void set_par_threads(uint t) { _n_par_threads = t; };
-
   // General obj/array allocation facilities.
   inline static oop obj_allocate(KlassHandle klass, int size, TRAPS);
   inline static oop array_allocate(KlassHandle klass, int size, int length, TRAPS);
--- a/src/share/vm/gc/shared/genCollectedHeap.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/genCollectedHeap.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -561,16 +561,6 @@
   return collector_policy()->satisfy_failed_allocation(size, is_tlab);
 }

-void GenCollectedHeap::set_par_threads(uint t) {
-  assert(t == 0 || !UseSerialGC, "Cannot have parallel threads");
-  CollectedHeap::set_par_threads(t);
-  set_n_termination(t);
-}
-
-void GenCollectedHeap::set_n_termination(uint t) {
-  _process_strong_tasks->set_n_threads(t);
-}
-
 #ifdef ASSERT
 class AssertNonScavengableClosure: public OopClosure {
 public:
@@ -582,15 +572,13 @@
 static AssertNonScavengableClosure assert_is_non_scavengable_closure;
 #endif

-void GenCollectedHeap::process_roots(bool activate_scope,
+void GenCollectedHeap::process_roots(StrongRootsScope* scope,
                                      ScanningOption so,
                                      OopClosure* strong_roots,
                                      OopClosure* weak_roots,
                                      CLDClosure* strong_cld_closure,
                                      CLDClosure* weak_cld_closure,
                                      CodeBlobClosure* code_roots) {
-  StrongRootsScope srs(activate_scope);
-
   // General roots.
   assert(Threads::thread_claim_parity() != 0, "must have called prologue code");
   assert(code_roots != NULL, "code root closure should always be set");
@@ -609,7 +597,7 @@
   // Only process code roots from thread stacks if we aren't visiting the entire CodeCache anyway
   CodeBlobClosure* roots_from_code_p = (so & SO_AllCodeCache) ? NULL : code_roots;

-  bool is_par = n_par_threads() > 0;
+  bool is_par = scope->n_threads() > 1;
   Threads::possibly_parallel_oops_do(is_par, strong_roots, roots_from_clds_p, roots_from_code_p);

   if (!_process_strong_tasks->is_task_claimed(GCH_PS_Universe_oops_do)) {
@@ -669,9 +657,9 @@

 }

-void GenCollectedHeap::gen_process_roots(int level,
+void GenCollectedHeap::gen_process_roots(StrongRootsScope* scope,
+                                         int level,
                                          bool younger_gens_as_roots,
-                                         bool activate_scope,
                                          ScanningOption so,
                                          bool only_strong_roots,
                                          OopsInGenClosure* not_older_gens,
@@ -689,7 +677,7 @@
   OopsInGenClosure* weak_roots = only_strong_roots ? NULL : not_older_gens;
   CLDClosure* weak_cld_closure = only_strong_roots ? NULL : cld_closure;

-  process_roots(activate_scope, so,
+  process_roots(scope, so,
                 not_older_gens, weak_roots,
                 cld_closure, weak_cld_closure,
                 &mark_code_closure);
@@ -707,11 +695,11 @@
   // older-gen scanning.
   if (level == 0) {
     older_gens->set_generation(_old_gen);
-    rem_set()->younger_refs_iterate(_old_gen, older_gens);
+    rem_set()->younger_refs_iterate(_old_gen, older_gens, scope->n_threads());
     older_gens->reset_generation();
   }

-  _process_strong_tasks->all_tasks_completed();
+  _process_strong_tasks->all_tasks_completed(scope->n_threads());
 }
--- a/src/share/vm/gc/shared/genCollectedHeap.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/genCollectedHeap.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -30,8 +30,9 @@
 #include "gc/shared/collectorPolicy.hpp"
 #include "gc/shared/generation.hpp"

+class FlexibleWorkGang;
+class StrongRootsScope;
 class SubTasksDone;
-class FlexibleWorkGang;

 // A "GenCollectedHeap" is a CollectedHeap that uses generational
 // collection.  It has two generations, young and old.
@@ -363,9 +364,6 @@
   // asserted to be this type.
   static GenCollectedHeap* heap();

-  void set_par_threads(uint t);
-  void set_n_termination(uint t);
-
   // Invoke the "do_oop" method of one of the closures "not_older_gens"
   // or "older_gens" on root locations for the generation at
   // "level".  (The "older_gens" closure is used for scanning references
@@ -385,7 +383,7 @@
   };

  private:
-  void process_roots(bool activate_scope,
+  void process_roots(StrongRootsScope* scope,
                      ScanningOption so,
                      OopClosure* strong_roots,
                      OopClosure* weak_roots,
@@ -393,24 +391,13 @@
                      CLDClosure* weak_cld_closure,
                      CodeBlobClosure* code_roots);

-  void gen_process_roots(int level,
-                         bool younger_gens_as_roots,
-                         bool activate_scope,
-                         ScanningOption so,
-                         OopsInGenClosure* not_older_gens,
-                         OopsInGenClosure* weak_roots,
-                         OopsInGenClosure* older_gens,
-                         CLDClosure* cld_closure,
-                         CLDClosure* weak_cld_closure,
-                         CodeBlobClosure* code_closure);
-
  public:
   static const bool StrongAndWeakRoots = false;
   static const bool StrongRootsOnly    = true;

-  void gen_process_roots(int level,
+  void gen_process_roots(StrongRootsScope* scope,
+                         int level,
                          bool younger_gens_as_roots,
-                         bool activate_scope,
                          ScanningOption so,
                          bool only_strong_roots,
                          OopsInGenClosure* not_older_gens,
--- a/src/share/vm/gc/shared/genOopClosures.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/genOopClosures.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -35,11 +35,6 @@
 class DefNewGeneration;
 class KlassRemSet;

-template<class E, MEMFLAGS F, unsigned int N> class GenericTaskQueue;
-typedef GenericTaskQueue<oop, mtGC, TASKQUEUE_SIZE> OopTaskQueue;
-template<class T, MEMFLAGS F> class GenericTaskQueueSet;
-typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet;
-
 // Closure for iterating roots from a particular generation
 // Note: all classes deriving from this MUST call this do_barrier
 // method at the end of their own do_oop method!
--- a/src/share/vm/gc/shared/genRemSet.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/genRemSet.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -77,10 +77,11 @@
   //  1) that are in objects allocated in "g" at the time of the last call
   //     to "save_Marks", and
   //  2) that point to objects in younger generations.
-  virtual void younger_refs_iterate(Generation* g, OopsInGenClosure* blk) = 0;
+  virtual void younger_refs_iterate(Generation* g, OopsInGenClosure* blk, uint n_threads) = 0;

   virtual void younger_refs_in_space_iterate(Space* sp,
-                                             OopsInGenClosure* cl) = 0;
+                                             OopsInGenClosure* cl,
+                                             uint n_threads) = 0;

   // This method is used to notify the remembered set that "new_val" has
   // been written into "field" by the garbage collector.
--- a/src/share/vm/gc/shared/generation.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/generation.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -293,9 +293,10 @@
 }

 void Generation::younger_refs_in_space_iterate(Space* sp,
-                                               OopsInGenClosure* cl) {
+                                               OopsInGenClosure* cl,
+                                               uint n_threads) {
   GenRemSet* rs = GenCollectedHeap::heap()->rem_set();
-  rs->younger_refs_in_space_iterate(sp, cl);
+  rs->younger_refs_in_space_iterate(sp, cl, n_threads);
 }

 class GenerationObjIterateClosure : public SpaceClosure {
--- a/src/share/vm/gc/shared/generation.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/generation.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -122,7 +122,7 @@
   // The iteration is only over objects allocated at the start of the
   // iterations; objects allocated as a result of applying the closure are
   // not included.
-  void younger_refs_in_space_iterate(Space* sp, OopsInGenClosure* cl);
+  void younger_refs_in_space_iterate(Space* sp, OopsInGenClosure* cl, uint n_threads);

  public:
   // The set of possible generation kinds.
@@ -526,7 +526,7 @@
   // in the current generation that contain pointers to objects in younger
   // generations. Objects allocated since the last "save_marks" call are
   // excluded.
-  virtual void younger_refs_iterate(OopsInGenClosure* cl) = 0;
+  virtual void younger_refs_iterate(OopsInGenClosure* cl, uint n_threads) = 0;

   // Inform a generation that it longer contains references to objects
   // in any younger generation.    [e.g. Because younger gens are empty,
--- a/src/share/vm/gc/shared/space.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/space.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -181,7 +181,8 @@

 DirtyCardToOopClosure* Space::new_dcto_cl(ExtendedOopClosure* cl,
                                           CardTableModRefBS::PrecisionStyle precision,
-                                          HeapWord* boundary) {
+                                          HeapWord* boundary,
+                                          bool parallel) {
   return new DirtyCardToOopClosure(this, cl, precision, boundary);
 }

@@ -260,7 +261,8 @@
 DirtyCardToOopClosure*
 ContiguousSpace::new_dcto_cl(ExtendedOopClosure* cl,
                              CardTableModRefBS::PrecisionStyle precision,
-                             HeapWord* boundary) {
+                             HeapWord* boundary,
+                             bool parallel) {
   return new ContiguousSpaceDCTOC(this, cl, precision, boundary);
 }
--- a/src/share/vm/gc/shared/space.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/space.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -183,7 +183,8 @@
   // operate. ResourceArea allocated.
   virtual DirtyCardToOopClosure* new_dcto_cl(ExtendedOopClosure* cl,
                                              CardTableModRefBS::PrecisionStyle precision,
-                                             HeapWord* boundary = NULL);
+                                             HeapWord* boundary,
+                                             bool parallel);

   // If "p" is in the space, returns the address of the start of the
   // "block" that contains "p".  We say "block" instead of "object" since
@@ -629,7 +630,8 @@
   // Override.
   DirtyCardToOopClosure* new_dcto_cl(ExtendedOopClosure* cl,
                                      CardTableModRefBS::PrecisionStyle precision,
-                                     HeapWord* boundary = NULL);
+                                     HeapWord* boundary,
+                                     bool parallel);

   // Apply "blk->do_oop" to the addresses of all reference fields in objects
   // starting with the _saved_mark_word, which was noted during a generation's
--- a/src/share/vm/gc/shared/strongRootsScope.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/strongRootsScope.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -28,24 +28,18 @@
 #include "gc/shared/strongRootsScope.hpp"
 #include "runtime/thread.hpp"

-MarkScope::MarkScope(bool activate) : _active(activate) {
-  if (_active) {
-    nmethod::oops_do_marking_prologue();
-  }
+MarkScope::MarkScope() {
+  nmethod::oops_do_marking_prologue();
 }

 MarkScope::~MarkScope() {
-  if (_active) {
-    nmethod::oops_do_marking_epilogue();
-  }
+  nmethod::oops_do_marking_epilogue();
 }

-StrongRootsScope::StrongRootsScope(bool activate) : MarkScope(activate) {
-  if (_active) {
-    Threads::change_thread_claim_parity();
-    // Zero the claimed high water mark in the StringTable
-    StringTable::clear_parallel_claimed_index();
-  }
+StrongRootsScope::StrongRootsScope(uint n_threads) : _n_threads(n_threads) {
+  Threads::change_thread_claim_parity();
+  // Zero the claimed high water mark in the StringTable
+  StringTable::clear_parallel_claimed_index();
 }

 StrongRootsScope::~StrongRootsScope() {
--- a/src/share/vm/gc/shared/strongRootsScope.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/strongRootsScope.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -29,18 +29,21 @@

 class MarkScope : public StackObj {
  protected:
-  bool _active;
- public:
-  MarkScope(bool activate = true);
+  MarkScope();
   ~MarkScope();
 };

 // Sets up and tears down the required state for parallel root processing.

 class StrongRootsScope : public MarkScope {
+  // Number of threads participating in the roots processing.
+  const uint _n_threads;
+
  public:
-  StrongRootsScope(bool activate = true);
+  StrongRootsScope(uint n_threads);
   ~StrongRootsScope();
+
+  uint n_threads() const { return _n_threads; }
 };

 #endif // SHARE_VM_GC_SHARED_STRONGROOTSSCOPE_HPP
--- a/src/share/vm/gc/shared/taskqueue.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/taskqueue.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -382,6 +382,8 @@
   bool steal(uint queue_num, int* seed, E& t);

   bool peek();
+
+  uint size() const { return _n; }
 };

 template<class T, MEMFLAGS F> void
--- a/src/share/vm/gc/shared/workgroup.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/workgroup.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -133,8 +133,6 @@
 }

 void WorkGang::run_task(AbstractGangTask* task, uint no_of_parallel_workers) {
-  task->set_for_termination(no_of_parallel_workers);
-
   // This thread is executed by the VM thread which does not block
   // on ordinary MutexLocker's.
   MutexLockerEx ml(monitor(), Mutex::_no_safepoint_check_flag);
@@ -434,7 +432,7 @@
 // SubTasksDone functions.

 SubTasksDone::SubTasksDone(uint n) :
-  _n_tasks(n), _n_threads(1), _tasks(NULL) {
+  _n_tasks(n), _tasks(NULL) {
   _tasks = NEW_C_HEAP_ARRAY(uint, n, mtInternal);
   guarantee(_tasks != NULL, "alloc failure");
   clear();
@@ -444,12 +442,6 @@
   return _tasks != NULL;
 }

-void SubTasksDone::set_n_threads(uint t) {
-  assert(_claimed == 0 || _threads_completed == _n_threads,
-         "should not be called while tasks are being processed!");
-  _n_threads = (t == 0 ? 1 : t);
-}
-
 void SubTasksDone::clear() {
   for (uint i = 0; i < _n_tasks; i++) {
     _tasks[i] = 0;
@@ -477,7 +469,7 @@
   return res;
 }

-void SubTasksDone::all_tasks_completed() {
+void SubTasksDone::all_tasks_completed(uint n_threads) {
   jint observed = _threads_completed;
   jint old;
   do {
@@ -485,7 +477,10 @@
     observed = Atomic::cmpxchg(old+1, &_threads_completed, old);
   } while (observed != old);
   // If this was the last thread checking in, clear the tasks.
-  if (observed+1 == (jint)_n_threads) clear();
+  uint adjusted_thread_count = (n_threads == 0 ? 1 : n_threads);
+  if (observed + 1 == (jint)adjusted_thread_count) {
+    clear();
+  }
 }
--- a/src/share/vm/gc/shared/workgroup.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/gc/shared/workgroup.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -59,13 +59,6 @@
   // The argument tells you which member of the gang you are.
   virtual void work(uint worker_id) = 0;

-  // This method configures the task for proper termination.
-  // Some tasks do not have any requirements on termination
-  // and may inherit this method that does nothing.  Some
-  // tasks do some coordination on termination and override
-  // this method to implement that coordination.
-  virtual void set_for_termination(uint active_workers) {};
-
   // Debugging accessor for the name.
   const char* name() const PRODUCT_RETURN_(return NULL;);
   int counter() { return _counter; }
@@ -99,12 +92,9 @@
   OopTaskQueueSet*       _queues;
   ParallelTaskTerminator _terminator;
  public:
-  AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues) :
-    AbstractGangTask(name), _queues(queues), _terminator(0, _queues) {}
+  AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues, uint n_threads) :
+    AbstractGangTask(name), _queues(queues), _terminator(n_threads, _queues) {}
   ParallelTaskTerminator* terminator() { return &_terminator; }
-  virtual void set_for_termination(uint active_workers) {
-    terminator()->reset_for_reuse(active_workers);
-  }
   OopTaskQueueSet* queues() { return _queues; }
 };

@@ -315,16 +305,20 @@
   uint _active_workers;
  public:
   // Constructor and destructor.
-  // Initialize active_workers to a minimum value.  Setting it to
-  // the parameter "workers" will initialize it to a maximum
-  // value which is not desirable.
   FlexibleWorkGang(const char* name, uint workers,
                    bool are_GC_task_threads,
                    bool  are_ConcurrentGC_threads) :
     WorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads),
-    _active_workers(UseDynamicNumberOfGCThreads ? 1U : ParallelGCThreads) {}
-  // Accessors for fields
-  virtual uint active_workers() const { return _active_workers; }
+    _active_workers(UseDynamicNumberOfGCThreads ? 1U : workers) {}
+
+  // Accessors for fields.
+  virtual uint active_workers() const {
+    assert(_active_workers <= _total_workers,
+           err_msg("_active_workers: %u > _total_workers: %u", _active_workers, _total_workers));
+    assert(UseDynamicNumberOfGCThreads || _active_workers == _total_workers,
+           "Unless dynamic should use total workers");
+    return _active_workers;
+  }
   void set_active_workers(uint v) {
     assert(v <= _total_workers,
            "Trying to set more workers active than there are");
@@ -390,12 +384,6 @@
 class SubTasksDone: public CHeapObj<mtInternal> {
   uint* _tasks;
   uint _n_tasks;
-  // _n_threads is used to determine when a sub task is done.
-  // It does not control how many threads will execute the subtask
-  // but must be initialized to the number that do execute the task
-  // in order to correctly decide when the subtask is done (all the
-  // threads working on the task have finished).
-  uint _n_threads;
   uint _threads_completed;
 #ifdef ASSERT
   volatile uint _claimed;
@@ -413,11 +401,6 @@
   // True iff the object is in a valid state.
   bool valid();

-  // Get/set the number of parallel threads doing the tasks to "t".  Can only
-  // be called before tasks start or after they are complete.
-  uint n_threads() { return _n_threads; }
-  void set_n_threads(uint t);
-
   // Returns "false" if the task "t" is unclaimed, and ensures that task is
   // claimed.  The task "t" is required to be within the range of "this".
   bool is_task_claimed(uint t);
@@ -426,7 +409,9 @@
   // tasks that it will try to claim.  Every thread in the parallel task
   // must execute this.  (When the last thread does so, the task array is
   // cleared.)
-  void all_tasks_completed();
+  //
+  // n_threads - Number of threads executing the sub-tasks.
+  void all_tasks_completed(uint n_threads);

   // Destructor.
   ~SubTasksDone();
--- a/src/share/vm/memory/iterator.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/memory/iterator.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -381,9 +381,4 @@
   template <class OopClosureType>             static bool do_metadata(OopClosureType* closure);
 };

-// Helper to convert the oop iterate macro suffixes into bool values that can be used by template functions.
-#define nvs_nv_to_bool true
-#define nvs_v_to_bool  false
-#define nvs_to_bool(nv_suffix) nvs##nv_suffix##_to_bool
-
 #endif // SHARE_VM_MEMORY_ITERATOR_HPP
--- a/src/share/vm/oops/arrayKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/arrayKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -144,4 +144,36 @@
   void oop_verify_on(oop obj, outputStream* st);
 };

+// Array oop iteration macros for declarations.
+// Used to generate the declarations in the *ArrayKlass header files.
+
+#define OOP_OOP_ITERATE_DECL_RANGE(OopClosureType, nv_suffix)                                  \
+  int oop_oop_iterate_range##nv_suffix(oop obj, OopClosureType* closure, int start, int end);
+
+#if INCLUDE_ALL_GCS
+// Named NO_BACKWARDS because the definition used by *ArrayKlass isn't reversed, see below.
+#define OOP_OOP_ITERATE_DECL_NO_BACKWARDS(OopClosureType, nv_suffix)           \
+  int oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure);
+#endif // INCLUDE_ALL_GCS
+
+
+// Array oop iteration macros for definitions.
+// Used to generate the definitions in the *ArrayKlass.inline.hpp files.
+
+#define OOP_OOP_ITERATE_DEFN_RANGE(KlassType, OopClosureType, nv_suffix)                                 \
+                                                                                                         \
+int KlassType::oop_oop_iterate_range##nv_suffix(oop obj, OopClosureType* closure, int start, int end) {  \
+  return oop_oop_iterate_range<nvs_to_bool(nv_suffix)>(obj, closure, start, end);                        \
+}
+
+#if INCLUDE_ALL_GCS
+#define OOP_OOP_ITERATE_DEFN_NO_BACKWARDS(KlassType, OopClosureType, nv_suffix)          \
+int KlassType::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
+  /* No reverse implementation ATM. */                                                   \
+  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                          \
+}
+#else
+#define OOP_OOP_ITERATE_DEFN_NO_BACKWARDS(KlassType, OopClosureType, nv_suffix)
+#endif
+
 #endif // SHARE_VM_OOPS_ARRAYKLASS_HPP
--- a/src/share/vm/oops/instanceClassLoaderKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceClassLoaderKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -87,19 +87,12 @@

  public:

-#define InstanceClassLoaderKlass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)   \
-  int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* blk);                    \
-  int oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* blk, MemRegion mr);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceClassLoaderKlass_OOP_OOP_ITERATE_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceClassLoaderKlass_OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL)

 #if INCLUDE_ALL_GCS
-#define InstanceClassLoaderKlass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix)  \
-  int oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* blk);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceClassLoaderKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceClassLoaderKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_BACKWARDS)
 #endif // INCLUDE_ALL_GCS

 };
--- a/src/share/vm/oops/instanceClassLoaderKlass.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceClassLoaderKlass.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -78,33 +78,9 @@
   return size;
 }

-
-#define InstanceClassLoaderKlass_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)              \
-                                                                                              \
-int InstanceClassLoaderKlass::oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                               \
-}
-
-#if INCLUDE_ALL_GCS
-#define InstanceClassLoaderKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)              \
-                                                                                                        \
-int InstanceClassLoaderKlass::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate_reverse<nvs_to_bool(nv_suffix)>(obj, closure);                                 \
-}
-#else
-#define InstanceClassLoaderKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-#endif
-
-
-#define InstanceClassLoaderKlass_OOP_OOP_ITERATE_DEFN_m(OopClosureType, nv_suffix)                              \
-                                                                                                                \
-int InstanceClassLoaderKlass::oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) {  \
-  return oop_oop_iterate_bounded<nvs_to_bool(nv_suffix)>(obj, closure, mr);                                     \
-}
-
 #define ALL_INSTANCE_CLASS_LOADER_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-  InstanceClassLoaderKlass_OOP_OOP_ITERATE_DEFN(          OopClosureType, nv_suffix)     \
-  InstanceClassLoaderKlass_OOP_OOP_ITERATE_DEFN_m(        OopClosureType, nv_suffix)     \
-  InstanceClassLoaderKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
+  OOP_OOP_ITERATE_DEFN(          InstanceClassLoaderKlass, OopClosureType, nv_suffix)    \
+  OOP_OOP_ITERATE_DEFN_BOUNDED(  InstanceClassLoaderKlass, OopClosureType, nv_suffix)    \
+  OOP_OOP_ITERATE_DEFN_BACKWARDS(InstanceClassLoaderKlass, OopClosureType, nv_suffix)

 #endif // SHARE_VM_OOPS_INSTANCECLASSLOADERKLASS_INLINE_HPP
--- a/src/share/vm/oops/instanceKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -1084,19 +1084,12 @@

  public:

-#define InstanceKlass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)                   \
-  int  oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure);                    \
-  int  oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceKlass_OOP_OOP_ITERATE_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceKlass_OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL)

 #if INCLUDE_ALL_GCS
-#define InstanceKlass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix)  \
-  int  oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_BACKWARDS)
 #endif // INCLUDE_ALL_GCS

   u2 idnum_allocated_count() const      { return _idnum_allocated_count; }
--- a/src/share/vm/oops/instanceKlass.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceKlass.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -27,6 +27,7 @@

 #include "memory/iterator.hpp"
 #include "oops/instanceKlass.hpp"
+#include "oops/klass.hpp"
 #include "oops/oop.inline.hpp"
 #include "utilities/debug.hpp"
 #include "utilities/globalDefinitions.hpp"
@@ -187,29 +188,9 @@

 #undef INLINE

-
-#define InstanceKlass_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)              \
-int InstanceKlass::oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                    \
-}
-
-#if INCLUDE_ALL_GCS
-#define InstanceKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)              \
-int InstanceKlass::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate_reverse<nvs_to_bool(nv_suffix)>(obj, closure);                      \
-}
-#else
-#define InstanceKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-#endif
-
-#define InstanceKlass_OOP_OOP_ITERATE_DEFN_m(OopClosureType, nv_suffix)                              \
-int InstanceKlass::oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) {  \
-  return oop_oop_iterate_bounded<nvs_to_bool(nv_suffix)>(obj, closure, mr);                          \
-}
-
 #define ALL_INSTANCE_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-  InstanceKlass_OOP_OOP_ITERATE_DEFN(          OopClosureType, nv_suffix)   \
-  InstanceKlass_OOP_OOP_ITERATE_DEFN_m(        OopClosureType, nv_suffix)   \
-  InstanceKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
+  OOP_OOP_ITERATE_DEFN(          InstanceKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_BOUNDED(  InstanceKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_BACKWARDS(InstanceKlass, OopClosureType, nv_suffix)

 #endif // SHARE_VM_OOPS_INSTANCEKLASS_INLINE_HPP
--- a/src/share/vm/oops/instanceMirrorKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceMirrorKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -149,19 +149,12 @@

  public:

-#define InstanceMirrorKlass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)           \
-  int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* blk);                       \
-  int oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* blk, MemRegion mr);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceMirrorKlass_OOP_OOP_ITERATE_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceMirrorKlass_OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL)

 #if INCLUDE_ALL_GCS
-#define InstanceMirrorKlass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix) \
-  int oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* blk);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceMirrorKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceMirrorKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_BACKWARDS)
 #endif // INCLUDE_ALL_GCS
 };
--- a/src/share/vm/oops/instanceMirrorKlass.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceMirrorKlass.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -27,6 +27,7 @@
 #include "classfile/javaClasses.hpp"
 #include "oops/instanceKlass.inline.hpp"
 #include "oops/instanceMirrorKlass.hpp"
+#include "oops/klass.hpp"
 #include "oops/oop.inline.hpp"
 #include "utilities/debug.hpp"
 #include "utilities/globalDefinitions.hpp"
@@ -132,33 +133,9 @@
   return oop_size(obj);
 }

-
-#define InstanceMirrorKlass_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)              \
-                                                                                         \
-int InstanceMirrorKlass::oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                          \
-}
-
-#if INCLUDE_ALL_GCS
-#define InstanceMirrorKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)              \
-                                                                                                   \
-int InstanceMirrorKlass::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate_reverse<nvs_to_bool(nv_suffix)>(obj, closure);                            \
-}
-#else
-#define InstanceMirrorKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-#endif
-
-
-#define InstanceMirrorKlass_OOP_OOP_ITERATE_DEFN_m(OopClosureType, nv_suffix)                              \
-                                                                                                           \
-int InstanceMirrorKlass::oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) {  \
-  return oop_oop_iterate_bounded<nvs_to_bool(nv_suffix)>(obj, closure, mr);                                \
-}
-
 #define ALL_INSTANCE_MIRROR_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-  InstanceMirrorKlass_OOP_OOP_ITERATE_DEFN(          OopClosureType, nv_suffix)    \
-  InstanceMirrorKlass_OOP_OOP_ITERATE_DEFN_m(        OopClosureType, nv_suffix)    \
-  InstanceMirrorKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
+  OOP_OOP_ITERATE_DEFN(          InstanceMirrorKlass, OopClosureType, nv_suffix)   \
+  OOP_OOP_ITERATE_DEFN_BOUNDED(  InstanceMirrorKlass, OopClosureType, nv_suffix)   \
+  OOP_OOP_ITERATE_DEFN_BACKWARDS(InstanceMirrorKlass, OopClosureType, nv_suffix)

 #endif // SHARE_VM_OOPS_INSTANCEMIRRORKLASS_INLINE_HPP
--- a/src/share/vm/oops/instanceRefKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceRefKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -119,19 +119,12 @@

  public:

-#define InstanceRefKlass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)               \
-  int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure);                    \
-  int oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceRefKlass_OOP_OOP_ITERATE_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceRefKlass_OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL)

 #if INCLUDE_ALL_GCS
-#define InstanceRefKlass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix)     \
-  int oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(InstanceRefKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(InstanceRefKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_BACKWARDS)
 #endif // INCLUDE_ALL_GCS

   static void release_and_notify_pending_list_lock(BasicLock *pending_list_basic_lock);
--- a/src/share/vm/oops/instanceRefKlass.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/instanceRefKlass.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -141,34 +141,9 @@

 // Macro to define InstanceRefKlass::oop_oop_iterate for virtual/nonvirtual for
 // all closures.  Macros calling macros above for each oop size.
-
-#define InstanceRefKlass_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)              \
-                                                                                      \
-int InstanceRefKlass::oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                       \
-}
-
-#if INCLUDE_ALL_GCS
-#define InstanceRefKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)              \
-                                                                                                \
-int InstanceRefKlass::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate_reverse<nvs_to_bool(nv_suffix)>(obj, closure);                         \
-}
-#else
-#define InstanceRefKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-#endif
-
-
-#define InstanceRefKlass_OOP_OOP_ITERATE_DEFN_m(OopClosureType, nv_suffix)                              \
-                                                                                                        \
-int InstanceRefKlass::oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) {  \
-  return oop_oop_iterate_bounded<nvs_to_bool(nv_suffix)>(obj, closure, mr);                             \
-}
-
 #define ALL_INSTANCE_REF_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-  InstanceRefKlass_OOP_OOP_ITERATE_DEFN(          OopClosureType, nv_suffix)    \
-  InstanceRefKlass_OOP_OOP_ITERATE_DEFN_m(        OopClosureType, nv_suffix)    \
-  InstanceRefKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-
+  OOP_OOP_ITERATE_DEFN(          InstanceRefKlass, OopClosureType, nv_suffix)   \
+  OOP_OOP_ITERATE_DEFN_BOUNDED(  InstanceRefKlass, OopClosureType, nv_suffix)   \
+  OOP_OOP_ITERATE_DEFN_BACKWARDS(InstanceRefKlass, OopClosureType, nv_suffix)

 #endif // SHARE_VM_OOPS_INSTANCEREFKLASS_INLINE_HPP
--- a/src/share/vm/oops/klass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/klass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -583,20 +583,20 @@

   // Iterators specialized to particular subtypes
   // of ExtendedOopClosure, to avoid closure virtual calls.
-#define Klass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)                                      \
-  virtual int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) = 0;                    \
-  /* Iterates "closure" over all the oops in "obj" (of type "this") within "mr". */                \
-  virtual int oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) = 0;
+#define Klass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)                                          \
+  virtual int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) = 0;                        \
+  /* Iterates "closure" over all the oops in "obj" (of type "this") within "mr". */                    \
+  virtual int oop_oop_iterate_bounded##nv_suffix(oop obj, OopClosureType* closure, MemRegion mr) = 0;

   ALL_OOP_OOP_ITERATE_CLOSURES_1(Klass_OOP_OOP_ITERATE_DECL)
   ALL_OOP_OOP_ITERATE_CLOSURES_2(Klass_OOP_OOP_ITERATE_DECL)

 #if INCLUDE_ALL_GCS
-#define Klass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix)                    \
+#define Klass_OOP_OOP_ITERATE_DECL_BACKWARDS(OopClosureType, nv_suffix)                    \
   virtual int oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) = 0;

-  ALL_OOP_OOP_ITERATE_CLOSURES_1(Klass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(Klass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(Klass_OOP_OOP_ITERATE_DECL_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(Klass_OOP_OOP_ITERATE_DECL_BACKWARDS)
 #endif // INCLUDE_ALL_GCS

   virtual void array_klasses_do(void f(Klass* k)) {}
@@ -651,4 +651,44 @@
   void klass_update_barrier_set_pre(oop* p, oop v);
 };

+// Helper to convert the oop iterate macro suffixes into bool values that can be used by template functions.
+#define nvs_nv_to_bool true
+#define nvs_v_to_bool  false
+#define nvs_to_bool(nv_suffix) nvs##nv_suffix##_to_bool
+
+// Oop iteration macros for declarations.
+// Used to generate declarations in the *Klass header files.
+
+#define OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)                                    \
+  int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure);                        \
+  int oop_oop_iterate_bounded##nv_suffix(oop obj, OopClosureType* closure, MemRegion mr);
+
+#if INCLUDE_ALL_GCS
+#define OOP_OOP_ITERATE_DECL_BACKWARDS(OopClosureType, nv_suffix)              \
+  int oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure);
+#endif // INCLUDE_ALL_GCS
+
+
+// Oop iteration macros for definitions.
+// Used to generate definitions in the *Klass.inline.hpp files.
+
+#define OOP_OOP_ITERATE_DEFN(KlassType, OopClosureType, nv_suffix)             \
+int KlassType::oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {  \
+  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                \
+}
+
+#if INCLUDE_ALL_GCS
+#define OOP_OOP_ITERATE_DEFN_BACKWARDS(KlassType, OopClosureType, nv_suffix)             \
+int KlassType::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
+  return oop_oop_iterate_reverse<nvs_to_bool(nv_suffix)>(obj, closure);                  \
+}
+#else
+#define OOP_OOP_ITERATE_DEFN_BACKWARDS(KlassType, OopClosureType, nv_suffix)
+#endif
+
+#define OOP_OOP_ITERATE_DEFN_BOUNDED(KlassType, OopClosureType, nv_suffix)                           \
+int KlassType::oop_oop_iterate_bounded##nv_suffix(oop obj, OopClosureType* closure, MemRegion mr) {  \
+  return oop_oop_iterate_bounded<nvs_to_bool(nv_suffix)>(obj, closure, mr);                          \
+}
+
 #endif // SHARE_VM_OOPS_KLASS_HPP
--- a/src/share/vm/oops/objArrayKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/objArrayKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -163,22 +163,14 @@

  public:

-#define ObjArrayKlass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)   \
-  int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* blk);         \
-  int oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* blk,      \
-                                     MemRegion mr);                     \
-  int oop_oop_iterate_range##nv_suffix(oop obj, OopClosureType* blk,    \
-                                     int start, int end);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(ObjArrayKlass_OOP_OOP_ITERATE_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(ObjArrayKlass_OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_RANGE)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_RANGE)

 #if INCLUDE_ALL_GCS
-#define ObjArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix) \
-  int  oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* blk);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(ObjArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(ObjArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_NO_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_NO_BACKWARDS)
 #endif // INCLUDE_ALL_GCS

   // JVM support
--- a/src/share/vm/oops/objArrayKlass.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/objArrayKlass.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -27,6 +27,8 @@

 #include "memory/memRegion.hpp"
 #include "memory/iterator.inline.hpp"
+#include "oops/arrayKlass.hpp"
+#include "oops/klass.hpp"
 #include "oops/objArrayKlass.hpp"
 #include "oops/objArrayOop.inline.hpp"
 #include "oops/oop.inline.hpp"
@@ -149,41 +151,10 @@
   return size;
 }

-
-#define ObjArrayKlass_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)              \
-                                                                                   \
-int ObjArrayKlass::oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {  \
-  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                    \
-}
-
-#if INCLUDE_ALL_GCS
-#define ObjArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)              \
-int ObjArrayKlass::oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {  \
-  /* No reverse implementation ATM. */                                                       \
-  return oop_oop_iterate<nvs_to_bool(nv_suffix)>(obj, closure);                              \
-}
-#else
-#define ObjArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-#endif
-
-#define ObjArrayKlass_OOP_OOP_ITERATE_DEFN_m(OopClosureType, nv_suffix)                              \
-                                                                                                     \
-int ObjArrayKlass::oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) {  \
-  return oop_oop_iterate_bounded<nvs_to_bool(nv_suffix)>(obj, closure, mr);                          \
-}
-
-#define ObjArrayKlass_OOP_OOP_ITERATE_DEFN_r(OopClosureType, nv_suffix)                                      \
-                                                                                                             \
-int ObjArrayKlass::oop_oop_iterate_range##nv_suffix(oop obj, OopClosureType* closure, int start, int end) {  \
-  return oop_oop_iterate_range<nvs_to_bool(nv_suffix)>(obj, closure, start, end);                            \
-}
-
-
-#define ALL_OBJ_ARRAY_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-  ObjArrayKlass_OOP_OOP_ITERATE_DEFN(          OopClosureType, nv_suffix)    \
-  ObjArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)    \
-  ObjArrayKlass_OOP_OOP_ITERATE_DEFN_m(        OopClosureType, nv_suffix)    \
-  ObjArrayKlass_OOP_OOP_ITERATE_DEFN_r(        OopClosureType, nv_suffix)
-
+#define ALL_OBJ_ARRAY_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)    \
+  OOP_OOP_ITERATE_DEFN(             ObjArrayKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_BOUNDED(     ObjArrayKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_RANGE(       ObjArrayKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_NO_BACKWARDS(ObjArrayKlass, OopClosureType, nv_suffix)

 #endif // SHARE_VM_OOPS_OBJARRAYKLASS_INLINE_HPP
--- a/src/share/vm/oops/oop.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/oop.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -741,7 +741,7 @@
 }                                                                     \
                                                                       \
 inline int oopDesc::oop_iterate(OopClosureType* blk, MemRegion mr) {  \
-  return klass()->oop_oop_iterate##nv_suffix##_m(this, blk, mr);      \
+  return klass()->oop_oop_iterate_bounded##nv_suffix(this, blk, mr);  \
 }
--- a/src/share/vm/oops/typeArrayKlass.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/typeArrayKlass.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -92,24 +92,24 @@
   // The implementation used by all oop_oop_iterate functions in TypeArrayKlasses.
   inline int oop_oop_iterate_impl(oop obj, ExtendedOopClosure* closure);

+  // Wraps oop_oop_iterate_impl to conform to macros.
+  template <bool nv, typename OopClosureType>
+  inline int oop_oop_iterate(oop obj, OopClosureType* closure);
+
+  // Wraps oop_oop_iterate_impl to conform to macros.
+  template <bool nv, typename OopClosureType>
+  inline int oop_oop_iterate_bounded(oop obj, OopClosureType* closure, MemRegion mr);
+
  public:

-#define TypeArrayKlass_OOP_OOP_ITERATE_DECL(OopClosureType, nv_suffix)    \
-  int oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure);       \
-  int oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure,    \
-                                     MemRegion mr);                       \
-  int oop_oop_iterate_range##nv_suffix(oop obj, OopClosureType* closure,  \
-                                     int start, int end);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(TypeArrayKlass_OOP_OOP_ITERATE_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(TypeArrayKlass_OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_RANGE)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_RANGE)

 #if INCLUDE_ALL_GCS
-#define TypeArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DECL(OopClosureType, nv_suffix)  \
-  int  oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure);
-
-  ALL_OOP_OOP_ITERATE_CLOSURES_1(TypeArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
-  ALL_OOP_OOP_ITERATE_CLOSURES_2(TypeArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DECL)
+  ALL_OOP_OOP_ITERATE_CLOSURES_1(OOP_OOP_ITERATE_DECL_NO_BACKWARDS)
+  ALL_OOP_OOP_ITERATE_CLOSURES_2(OOP_OOP_ITERATE_DECL_NO_BACKWARDS)
 #endif // INCLUDE_ALL_GCS
--- a/src/share/vm/oops/typeArrayKlass.inline.hpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/oops/typeArrayKlass.inline.hpp	Thu Jun 04 18:50:05 2015 -0700
@@ -25,6 +25,8 @@
 #ifndef SHARE_VM_OOPS_TYPEARRAYKLASS_INLINE_HPP
 #define SHARE_VM_OOPS_TYPEARRAYKLASS_INLINE_HPP

+#include "oops/arrayKlass.hpp"
+#include "oops/klass.hpp"
 #include "oops/oop.inline.hpp"
 #include "oops/typeArrayKlass.hpp"
 #include "oops/typeArrayOop.hpp"
@@ -39,35 +41,19 @@
   return t->object_size();
 }

-#define TypeArrayKlass_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-                                                                        \
-int TypeArrayKlass::                                                    \
-oop_oop_iterate##nv_suffix(oop obj, OopClosureType* closure) {          \
-  return oop_oop_iterate_impl(obj, closure);                            \
+template <bool nv, typename OopClosureType>
+int TypeArrayKlass::oop_oop_iterate(oop obj, OopClosureType* closure) {
+  return oop_oop_iterate_impl(obj, closure);
 }

-#if INCLUDE_ALL_GCS
-#define TypeArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)  \
-                                                                                  \
-int TypeArrayKlass::                                                              \
-oop_oop_iterate_backwards##nv_suffix(oop obj, OopClosureType* closure) {          \
-  return oop_oop_iterate_impl(obj, closure);                                      \
-}
-#else
-#define TypeArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
-#endif
-
-
-#define TypeArrayKlass_OOP_OOP_ITERATE_DEFN_m(OopClosureType, nv_suffix)          \
-                                                                                  \
-int TypeArrayKlass::                                                              \
-oop_oop_iterate##nv_suffix##_m(oop obj, OopClosureType* closure, MemRegion mr) {  \
-  return oop_oop_iterate_impl(obj, closure);                                      \
+template <bool nv, typename OopClosureType>
+int TypeArrayKlass::oop_oop_iterate_bounded(oop obj, OopClosureType* closure, MemRegion mr) {
+  return oop_oop_iterate_impl(obj, closure);
 }

-#define ALL_TYPE_ARRAY_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)  \
-  TypeArrayKlass_OOP_OOP_ITERATE_DEFN(          OopClosureType, nv_suffix)    \
-  TypeArrayKlass_OOP_OOP_ITERATE_DEFN_m(        OopClosureType, nv_suffix)    \
-  TypeArrayKlass_OOP_OOP_ITERATE_BACKWARDS_DEFN(OopClosureType, nv_suffix)
+#define ALL_TYPE_ARRAY_KLASS_OOP_OOP_ITERATE_DEFN(OopClosureType, nv_suffix)    \
+  OOP_OOP_ITERATE_DEFN(             TypeArrayKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_BOUNDED(     TypeArrayKlass, OopClosureType, nv_suffix)  \
+  OOP_OOP_ITERATE_DEFN_NO_BACKWARDS(TypeArrayKlass, OopClosureType, nv_suffix)

 #endif // SHARE_VM_OOPS_TYPEARRAYKLASS_INLINE_HPP
--- a/src/share/vm/runtime/arguments.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/runtime/arguments.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -1278,10 +1278,8 @@

   // Preferred young gen size for "short" pauses:
   // upper bound depends on # of threads and NewRatio.
-  const uintx parallel_gc_threads =
-    (ParallelGCThreads == 0 ? 1 : ParallelGCThreads);
   const size_t preferred_max_new_size_unaligned =
-    MIN2(max_heap/(NewRatio+1), ScaleForWordSize(young_gen_per_worker * parallel_gc_threads));
+    MIN2(max_heap/(NewRatio+1), ScaleForWordSize(young_gen_per_worker * ParallelGCThreads));
   size_t preferred_max_new_size =
     align_size_up(preferred_max_new_size_unaligned, os::vm_page_size());
--- a/src/share/vm/utilities/elfFile.cpp	Thu Jun 04 09:31:46 2015 -0700
+++ b/src/share/vm/utilities/elfFile.cpp	Thu Jun 04 18:50:05 2015 -0700
@@ -261,7 +261,12 @@
       }
     }
   }
+// AARCH64 defaults to noexecstack. All others default to execstack.
+#ifdef AARCH64
+  return true;
+#else
   return false;
+#endif
 }
 #endif
--- a/test/compiler/stable/StableConfiguration.java	Thu Jun 04 09:31:46 2015 -0700
+++ b/test/compiler/stable/StableConfiguration.java	Thu Jun 04 18:50:05 2015 -0700
@@ -41,10 +41,32 @@
         System.out.println("Server Compiler: " + get());
     }

+    // The method 'get' below returns true if the method is server compiled
+    // and is used by the Stable tests to determine whether methods in
+    // general are being server compiled or not as the -XX:+FoldStableValues
+    // option is only applicable to -server.
+    //
+    // On aarch64 we DeOptimize when patching. This means that when the
+    // method is compiled as a result of -Xcomp it DeOptimizes immediately.
+    // The result is that getMethodCompilationLevel returns 0. This means
+    // the method returns true based on java.vm.name.
+    //
+    // However when the tests are run with -XX:+TieredCompilation and
+    // -XX:TieredStopAtLevel=1 this fails because methods will always
+    // be client compiled.
+    //
+    // Solution is to add a simple method 'get1' which should never be
+    // DeOpted and use that to determine the compilation level instead.
+    static void get1() {
+    }
+
+
+
     // ::get() is among immediately compiled methods.
     static boolean get() {
         try {
-            Method m = StableConfiguration.class.getDeclaredMethod("get");
+            get1();
+            Method m = StableConfiguration.class.getDeclaredMethod("get1");
             int level = WB.getMethodCompilationLevel(m);
             if (level > 0) {
               return (level == 4);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/serviceability/sa/TestClassLoaderStats.java	Thu Jun 04 18:50:05 2015 -0700
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import jdk.test.lib.Platform;
+import jdk.test.lib.ProcessTools;
+import jdk.test.lib.OutputAnalyzer;
+
+/*
+ * @test
+ * @library /testlibrary
+ * @build jdk.test.lib.*
+ * @run main TestClassLoaderStats
+ */
+public class TestClassLoaderStats {
+
+    public static void main(String[] args) throws Exception {
+        if (!Platform.shouldSAAttach()) {
+            System.out.println("SA attach not expected to work - test skipped.");
+            return;
+        }
+
+        ProcessBuilder processBuilder = ProcessTools.createJavaProcessBuilder(
+                "-XX:+UsePerfData",
+                "sun.jvm.hotspot.tools.ClassLoaderStats",
+                Integer.toString(ProcessTools.getProcessId()));
+        OutputAnalyzer output = ProcessTools.executeProcess(processBuilder);
+        System.out.println(output.getOutput());
+
+        output.shouldHaveExitValue(0);
+        output.shouldContain("Debugger attached successfully.");
+        // The class loader stats header needs to be presented in the output:
+        output.shouldMatch("class_loader\\W+classes\\W+bytes\\W+parent_loader\\W+alive?\\W+type");
+        output.stderrShouldNotMatch("[E|e]xception");
+        output.stderrShouldNotMatch("[E|e]rror");
+    }
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/serviceability/sa/TestStackTrace.java	Thu Jun 04 18:50:05 2015 -0700
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import jdk.test.lib.OutputAnalyzer;
+import jdk.test.lib.Platform;
+import jdk.test.lib.ProcessTools;
+
+/*
+ * @test
+ * @library /testlibrary
+ * @build jdk.test.lib.*
+ * @run main TestStackTrace
+ */
+public class TestStackTrace {
+
+    public static void main(String[] args) throws Exception {
+        if (!Platform.shouldSAAttach()) {
+            System.out.println("SA attach not expected to work - test skipped.");
+            return;
+        }
+
+        ProcessBuilder processBuilder = ProcessTools.createJavaProcessBuilder(
+                "-XX:+UsePerfData",
+                "sun.jvm.hotspot.tools.StackTrace",
+                Integer.toString(ProcessTools.getProcessId()));
+        OutputAnalyzer output = ProcessTools.executeProcess(processBuilder);
+        System.out.println(output.getOutput());
+
+        output.shouldHaveExitValue(0);
+        output.shouldContain("Debugger attached successfully.");
+        output.stderrShouldNotMatch("[E|e]xception");
+        output.stderrShouldNotMatch("[E|e]rror");
+     }
+
+}