view src/cpu/aarch64/vm/assembler_aarch64.cpp @ 6034:5ad4c0916974 icedtea-2.6pre15

Add support for A53 multiply accumulate
author adinn
date Thu, 11 Dec 2014 16:42:03 +0000
parents 023d218976e3
children
line wrap: on
line source

/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights
 * reserved.  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE
 * HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <stdio.h>
#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

// Out-of-line definition of the static member Assembler::asm_bp; it is only
// compiled into non-PRODUCT builds.
// NOTE(review): the value looks like a hard-coded address used as a
// debugging aid -- confirm against the declaration and its uses in the
// assembler header before relying on it.
#ifndef PRODUCT
const unsigned long Assembler::asm_bp = 0x00007fffee09ac88;
#endif

#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"

// For the moment we reuse the logical/floating point immediate encode
// and decode functions provided by the simulator. When we move to
// real hardware we will need to pull that code into here.

#include "immediate_aarch64.hpp"

// #include "gc_interface/collectedHeap.inline.hpp"
// #include "interpreter/interpreter.hpp"
// #include "memory/cardTableModRefBS.hpp"
// #include "prims/methodHandles.hpp"
// #include "runtime/biasedLocking.hpp"
// #include "runtime/interfaceSupport.hpp"
// #include "runtime/objectMonitor.hpp"
// #include "runtime/os.hpp"
// #include "runtime/sharedRuntime.hpp"
// #include "runtime/stubRoutines.hpp"
#ifndef SERIALGC
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif


// Declares entry() with C linkage; its definition follows later in this
// file and takes the CodeBuffer that the assembler emits into.
extern "C" void entry(CodeBuffer *cb);

// Shorthand so that "__ foo(...)" expands to "_masm.foo(...)", i.e. a call
// on the assembler object named _masm in the enclosing scope.
#define __ _masm.
#ifdef PRODUCT
// PRODUCT builds: block comments are compiled out entirely and STOP is a
// plain call to stop().
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
// Non-PRODUCT builds: BLOCK_COMMENT forwards to block_comment(), and STOP
// emits the message as a block comment before calling stop().
// NOTE(review): this STOP expands to two statements, so when it is used as
// the body of an unbraced if/else only block_comment() is guarded by the
// condition -- always brace the call site.
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Binds a label and then emits a block comment made of the stringified
// label name followed by ":" (a no-op comment in PRODUCT builds).
// NOTE(review): also expands to two statements; the same bracing caveat
// as for STOP applies.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Forward declaration of a file-local helper; the definition is outside the
// part of the file shown here. In the visible code it is referenced only
// from the commented-out printf loop at the top of entry().
static float unpack(unsigned value);

void entry(CodeBuffer *cb) {

  // {
  //   for (int i = 0; i < 256; i+=16)
  //     {
  // 	printf("\"%20.20g\", ", unpack(i));
  // 	printf("\"%20.20g\", ", unpack(i+1));
  //     }
  //   printf("\n");
  // }

  Assembler _masm(cb);
  address entry = __ pc();

  // Smoke test for assembler

#ifdef ASSERT
// BEGIN  Generated code -- do not edit
// Generated by aarch64-asmtest.py
    Label back, forth;
    __ bind(back);

// ArithOp
    __ add(r19, r22, r7, Assembler::LSL, 28);          //	add	x19, x22, x7, LSL #28
    __ sub(r16, r11, r10, Assembler::LSR, 13);         //	sub	x16, x11, x10, LSR #13
    __ adds(r27, r13, r28, Assembler::ASR, 2);         //	adds	x27, x13, x28, ASR #2
    __ subs(r20, r28, r26, Assembler::ASR, 41);        //	subs	x20, x28, x26, ASR #41
    __ addw(r8, r19, r19, Assembler::ASR, 19);         //	add	w8, w19, w19, ASR #19
    __ subw(r4, r9, r10, Assembler::LSL, 14);          //	sub	w4, w9, w10, LSL #14
    __ addsw(r8, r11, r30, Assembler::LSL, 13);        //	adds	w8, w11, w30, LSL #13
    __ subsw(r0, r25, r19, Assembler::LSL, 9);         //	subs	w0, w25, w19, LSL #9
    __ andr(r20, r0, r21, Assembler::LSL, 19);         //	and	x20, x0, x21, LSL #19
    __ orr(r21, r14, r20, Assembler::LSL, 17);         //	orr	x21, x14, x20, LSL #17
    __ eor(r25, r28, r1, Assembler::LSL, 51);          //	eor	x25, x28, x1, LSL #51
    __ ands(r10, r27, r11, Assembler::ASR, 15);        //	ands	x10, x27, x11, ASR #15
    __ andw(r25, r5, r12, Assembler::ASR, 23);         //	and	w25, w5, w12, ASR #23
    __ orrw(r18, r14, r10, Assembler::LSR, 4);         //	orr	w18, w14, w10, LSR #4
    __ eorw(r4, r21, r5, Assembler::ASR, 22);          //	eor	w4, w21, w5, ASR #22
    __ andsw(r21, r0, r5, Assembler::ASR, 29);         //	ands	w21, w0, w5, ASR #29
    __ bic(r26, r30, r6, Assembler::ASR, 37);          //	bic	x26, x30, x6, ASR #37
    __ orn(r3, r1, r13, Assembler::LSR, 29);           //	orn	x3, x1, x13, LSR #29
    __ eon(r0, r28, r9, Assembler::LSL, 47);           //	eon	x0, x28, x9, LSL #47
    __ bics(r29, r5, r28, Assembler::LSL, 46);         //	bics	x29, x5, x28, LSL #46
    __ bicw(r9, r18, r7, Assembler::LSR, 20);          //	bic	w9, w18, w7, LSR #20
    __ ornw(r26, r13, r25, Assembler::ASR, 24);        //	orn	w26, w13, w25, ASR #24
    __ eonw(r25, r4, r19, Assembler::LSL, 6);          //	eon	w25, w4, w19, LSL #6
    __ bicsw(r5, r26, r4, Assembler::LSR, 24);         //	bics	w5, w26, w4, LSR #24

// AddSubImmOp
    __ addw(r7, r19, 340u);                            //	add	w7, w19, #340
    __ addsw(r8, r0, 401u);                            //	adds	w8, w0, #401
    __ subw(r29, r20, 163u);                           //	sub	w29, w20, #163
    __ subsw(r8, r23, 759u);                           //	subs	w8, w23, #759
    __ add(r1, r12, 523u);                             //	add	x1, x12, #523
    __ adds(r2, r11, 426u);                            //	adds	x2, x11, #426
    __ sub(r14, r29, 716u);                            //	sub	x14, x29, #716
    __ subs(r11, r5, 582u);                            //	subs	x11, x5, #582

// LogicalImmOp
    __ andw(r23, r22, 32768ul);                        //	and	w23, w22, #0x8000
    __ orrw(r4, r10, 4042322160ul);                    //	orr	w4, w10, #0xf0f0f0f0
    __ eorw(r0, r24, 4042322160ul);                    //	eor	w0, w24, #0xf0f0f0f0
    __ andsw(r19, r29, 2139127680ul);                  //	ands	w19, w29, #0x7f807f80
    __ andr(r5, r10, 4503599627354112ul);              //	and	x5, x10, #0xfffffffffc000
    __ orr(r12, r30, 18445618178097414144ul);          //	orr	x12, x30, #0xfffc0000fffc0000
    __ eor(r30, r5, 262128ul);                         //	eor	x30, x5, #0x3fff0
    __ ands(r26, r23, 4194300ul);                      //	ands	x26, x23, #0x3ffffc

// AbsOp
    __ b(__ pc());                                     //	b	.
    __ b(back);                                        //	b	back
    __ b(forth);                                       //	b	forth
    __ bl(__ pc());                                    //	bl	.
    __ bl(back);                                       //	bl	back
    __ bl(forth);                                      //	bl	forth

// RegAndAbsOp
    __ cbzw(r12, __ pc());                             //	cbz	w12, .
    __ cbzw(r12, back);                                //	cbz	w12, back
    __ cbzw(r12, forth);                               //	cbz	w12, forth
    __ cbnzw(r20, __ pc());                            //	cbnz	w20, .
    __ cbnzw(r20, back);                               //	cbnz	w20, back
    __ cbnzw(r20, forth);                              //	cbnz	w20, forth
    __ cbz(r12, __ pc());                              //	cbz	x12, .
    __ cbz(r12, back);                                 //	cbz	x12, back
    __ cbz(r12, forth);                                //	cbz	x12, forth
    __ cbnz(r24, __ pc());                             //	cbnz	x24, .
    __ cbnz(r24, back);                                //	cbnz	x24, back
    __ cbnz(r24, forth);                               //	cbnz	x24, forth
    __ adr(r6, __ pc());                               //	adr	x6, .
    __ adr(r6, back);                                  //	adr	x6, back
    __ adr(r6, forth);                                 //	adr	x6, forth
    __ _adrp(r21, __ pc());                             //	adrp	x21, .

// RegImmAbsOp
    __ tbz(r1, 1, __ pc());                            //	tbz	x1, #1, .
    __ tbz(r1, 1, back);                               //	tbz	x1, #1, back
    __ tbz(r1, 1, forth);                              //	tbz	x1, #1, forth
    __ tbnz(r8, 9, __ pc());                           //	tbnz	x8, #9, .
    __ tbnz(r8, 9, back);                              //	tbnz	x8, #9, back
    __ tbnz(r8, 9, forth);                             //	tbnz	x8, #9, forth

// MoveWideImmOp
    __ movnw(r12, 23175, 0);                           //	movn	w12, #23175, lsl 0
    __ movzw(r11, 20476, 16);                          //	movz	w11, #20476, lsl 16
    __ movkw(r21, 3716, 0);                            //	movk	w21, #3716, lsl 0
    __ movn(r29, 28661, 48);                           //	movn	x29, #28661, lsl 48
    __ movz(r3, 6927, 0);                              //	movz	x3, #6927, lsl 0
    __ movk(r22, 9828, 16);                            //	movk	x22, #9828, lsl 16

// BitfieldOp
    __ sbfm(r12, r8, 6, 22);                           //	sbfm	x12, x8, #6, #22
    __ bfmw(r19, r25, 25, 19);                         //	bfm	w19, w25, #25, #19
    __ ubfmw(r9, r12, 29, 15);                         //	ubfm	w9, w12, #29, #15
    __ sbfm(r28, r25, 16, 16);                         //	sbfm	x28, x25, #16, #16
    __ bfm(r12, r5, 4, 25);                            //	bfm	x12, x5, #4, #25
    __ ubfm(r0, r10, 6, 8);                            //	ubfm	x0, x10, #6, #8

// ExtractOp
    __ extrw(r4, r13, r26, 24);                        //	extr	w4, w13, w26, #24
    __ extr(r23, r30, r24, 31);                        //	extr	x23, x30, x24, #31

// CondBranchOp
    __ br(Assembler::EQ, __ pc());                     //	b.EQ	.
    __ br(Assembler::EQ, back);                        //	b.EQ	back
    __ br(Assembler::EQ, forth);                       //	b.EQ	forth
    __ br(Assembler::NE, __ pc());                     //	b.NE	.
    __ br(Assembler::NE, back);                        //	b.NE	back
    __ br(Assembler::NE, forth);                       //	b.NE	forth
    __ br(Assembler::HS, __ pc());                     //	b.HS	.
    __ br(Assembler::HS, back);                        //	b.HS	back
    __ br(Assembler::HS, forth);                       //	b.HS	forth
    __ br(Assembler::CS, __ pc());                     //	b.CS	.
    __ br(Assembler::CS, back);                        //	b.CS	back
    __ br(Assembler::CS, forth);                       //	b.CS	forth
    __ br(Assembler::LO, __ pc());                     //	b.LO	.
    __ br(Assembler::LO, back);                        //	b.LO	back
    __ br(Assembler::LO, forth);                       //	b.LO	forth
    __ br(Assembler::CC, __ pc());                     //	b.CC	.
    __ br(Assembler::CC, back);                        //	b.CC	back
    __ br(Assembler::CC, forth);                       //	b.CC	forth
    __ br(Assembler::MI, __ pc());                     //	b.MI	.
    __ br(Assembler::MI, back);                        //	b.MI	back
    __ br(Assembler::MI, forth);                       //	b.MI	forth
    __ br(Assembler::PL, __ pc());                     //	b.PL	.
    __ br(Assembler::PL, back);                        //	b.PL	back
    __ br(Assembler::PL, forth);                       //	b.PL	forth
    __ br(Assembler::VS, __ pc());                     //	b.VS	.
    __ br(Assembler::VS, back);                        //	b.VS	back
    __ br(Assembler::VS, forth);                       //	b.VS	forth
    __ br(Assembler::VC, __ pc());                     //	b.VC	.
    __ br(Assembler::VC, back);                        //	b.VC	back
    __ br(Assembler::VC, forth);                       //	b.VC	forth
    __ br(Assembler::HI, __ pc());                     //	b.HI	.
    __ br(Assembler::HI, back);                        //	b.HI	back
    __ br(Assembler::HI, forth);                       //	b.HI	forth
    __ br(Assembler::LS, __ pc());                     //	b.LS	.
    __ br(Assembler::LS, back);                        //	b.LS	back
    __ br(Assembler::LS, forth);                       //	b.LS	forth
    __ br(Assembler::GE, __ pc());                     //	b.GE	.
    __ br(Assembler::GE, back);                        //	b.GE	back
    __ br(Assembler::GE, forth);                       //	b.GE	forth
    __ br(Assembler::LT, __ pc());                     //	b.LT	.
    __ br(Assembler::LT, back);                        //	b.LT	back
    __ br(Assembler::LT, forth);                       //	b.LT	forth
    __ br(Assembler::GT, __ pc());                     //	b.GT	.
    __ br(Assembler::GT, back);                        //	b.GT	back
    __ br(Assembler::GT, forth);                       //	b.GT	forth
    __ br(Assembler::LE, __ pc());                     //	b.LE	.
    __ br(Assembler::LE, back);                        //	b.LE	back
    __ br(Assembler::LE, forth);                       //	b.LE	forth
    __ br(Assembler::AL, __ pc());                     //	b.AL	.
    __ br(Assembler::AL, back);                        //	b.AL	back
    __ br(Assembler::AL, forth);                       //	b.AL	forth
    __ br(Assembler::NV, __ pc());                     //	b.NV	.
    __ br(Assembler::NV, back);                        //	b.NV	back
    __ br(Assembler::NV, forth);                       //	b.NV	forth

// ImmOp
    __ svc(12729);                                     //	svc	#12729
    __ hvc(6788);                                      //	hvc	#6788
    __ smc(1535);                                      //	smc	#1535
    __ brk(16766);                                     //	brk	#16766
    __ hlt(9753);                                      //	hlt	#9753

// Op
    __ nop();                                          //	nop	
    __ eret();                                         //	eret	
    __ drps();                                         //	drps	
    __ isb();                                          //	isb	

// SystemOp
    __ dsb(Assembler::SY);                             //	dsb	SY
    __ dmb(Assembler::ISHST);                          //	dmb	ISHST

// OneRegOp
    __ br(r2);                                         //	br	x2
    __ blr(r5);                                        //	blr	x5

// LoadStoreExclusiveOp
    __ stxr(r20, r21, r2);                             //	stxr	w20, x21, [x2]
    __ stlxr(r7, r29, r7);                             //	stlxr	w7, x29, [x7]
    __ ldxr(r5, r16);                                  //	ldxr	x5, [x16]
    __ ldaxr(r27, r29);                                //	ldaxr	x27, [x29]
    __ stlr(r0, r29);                                  //	stlr	x0, [x29]
    __ ldar(r21, r28);                                 //	ldar	x21, [x28]

// LoadStoreExclusiveOp
    __ stxrw(r24, r24, r7);                            //	stxr	w24, w24, [x7]
    __ stlxrw(r21, r26, r28);                          //	stlxr	w21, w26, [x28]
    __ ldxrw(r21, r6);                                 //	ldxr	w21, [x6]
    __ ldaxrw(r15, r30);                               //	ldaxr	w15, [x30]
    __ stlrw(r19, r3);                                 //	stlr	w19, [x3]
    __ ldarw(r22, r2);                                 //	ldar	w22, [x2]

// LoadStoreExclusiveOp
    __ stxrh(r18, r15, r0);                            //	stxrh	w18, w15, [x0]
    __ stlxrh(r11, r5, r28);                           //	stlxrh	w11, w5, [x28]
    __ ldxrh(r29, r6);                                 //	ldxrh	w29, [x6]
    __ ldaxrh(r18, r7);                                //	ldaxrh	w18, [x7]
    __ stlrh(r25, r28);                                //	stlrh	w25, [x28]
    __ ldarh(r2, r19);                                 //	ldarh	w2, [x19]

// LoadStoreExclusiveOp
    __ stxrb(r10, r30, r1);                            //	stxrb	w10, w30, [x1]
    __ stlxrb(r20, r21, r22);                          //	stlxrb	w20, w21, [x22]
    __ ldxrb(r25, r2);                                 //	ldxrb	w25, [x2]
    __ ldaxrb(r24, r5);                                //	ldaxrb	w24, [x5]
    __ stlrb(r16, r3);                                 //	stlrb	w16, [x3]
    __ ldarb(r22, r29);                                //	ldarb	w22, [x29]

// LoadStoreExclusiveOp
    __ ldxp(r8, r2, r19);                              //	ldxp	x8, x2, [x19]
    __ ldaxp(r7, r19, r14);                            //	ldaxp	x7, x19, [x14]
    __ stxp(r8, r27, r28, r5);                         //	stxp	w8, x27, x28, [x5]
    __ stlxp(r6, r8, r14, r6);                         //	stlxp	w6, x8, x14, [x6]

// LoadStoreExclusiveOp
    __ ldxpw(r25, r4, r22);                            //	ldxp	w25, w4, [x22]
    __ ldaxpw(r14, r14, r15);                          //	ldaxp	w14, w14, [x15]
    __ stxpw(r20, r26, r8, r10);                       //	stxp	w20, w26, w8, [x10]
    __ stlxpw(r23, r18, r18, r18);                     //	stlxp	w23, w18, w18, [x18]

// base_plus_unscaled_offset 
// LoadStoreOp
    __ str(r30, Address(r11, 99));                     //	str	x30, [x11, 99]
    __ strw(r23, Address(r25, -77));                   //	str	w23, [x25, -77]
    __ strb(r2, Address(r14, 3));                      //	strb	w2, [x14, 3]
    __ strh(r9, Address(r10, 5));                      //	strh	w9, [x10, 5]
    __ ldr(r20, Address(r15, 57));                     //	ldr	x20, [x15, 57]
    __ ldrw(r12, Address(r16, -78));                   //	ldr	w12, [x16, -78]
    __ ldrb(r22, Address(r26, -3));                    //	ldrb	w22, [x26, -3]
    __ ldrh(r30, Address(r19, -47));                   //	ldrh	w30, [x19, -47]
    __ ldrsb(r9, Address(r10, -12));                   //	ldrsb	x9, [x10, -12]
    __ ldrsh(r28, Address(r17, 14));                   //	ldrsh	x28, [x17, 14]
    __ ldrshw(r3, Address(r5, 10));                    //	ldrsh	w3, [x5, 10]
    __ ldrsw(r17, Address(r17, -91));                  //	ldrsw	x17, [x17, -91]
    __ ldrd(v2, Address(r20, -17));                    //	ldr	d2, [x20, -17]
    __ ldrs(v22, Address(r7, -10));                    //	ldr	s22, [x7, -10]
    __ strd(v30, Address(r18, -223));                  //	str	d30, [x18, -223]
    __ strs(v13, Address(r22, 21));                    //	str	s13, [x22, 21]

// pre 
// LoadStoreOp
    __ str(r9, Address(__ pre(r18, -112)));            //	str	x9, [x18, -112]!
    __ strw(r29, Address(__ pre(r23, 11)));            //	str	w29, [x23, 11]!
    __ strb(r18, Address(__ pre(r12, -1)));            //	strb	w18, [x12, -1]!
    __ strh(r16, Address(__ pre(r20, -23)));           //	strh	w16, [x20, -23]!
    __ ldr(r3, Address(__ pre(r29, 9)));               //	ldr	x3, [x29, 9]!
    __ ldrw(r25, Address(__ pre(r3, 19)));             //	ldr	w25, [x3, 19]!
    __ ldrb(r1, Address(__ pre(r29, -1)));             //	ldrb	w1, [x29, -1]!
    __ ldrh(r8, Address(__ pre(r29, -57)));            //	ldrh	w8, [x29, -57]!
    __ ldrsb(r5, Address(__ pre(r14, -13)));           //	ldrsb	x5, [x14, -13]!
    __ ldrsh(r10, Address(__ pre(r27, 1)));            //	ldrsh	x10, [x27, 1]!
    __ ldrshw(r11, Address(__ pre(r10, 25)));          //	ldrsh	w11, [x10, 25]!
    __ ldrsw(r4, Address(__ pre(r22, -92)));           //	ldrsw	x4, [x22, -92]!
    __ ldrd(v11, Address(__ pre(r23, 8)));             //	ldr	d11, [x23, 8]!
    __ ldrs(v25, Address(__ pre(r19, 54)));            //	ldr	s25, [x19, 54]!
    __ strd(v1, Address(__ pre(r7, -174)));            //	str	d1, [x7, -174]!
    __ strs(v8, Address(__ pre(r25, 54)));             //	str	s8, [x25, 54]!

// post 
// LoadStoreOp
    __ str(r5, Address(__ post(r11, 37)));             //	str	x5, [x11], 37
    __ strw(r24, Address(__ post(r15, 19)));           //	str	w24, [x15], 19
    __ strb(r15, Address(__ post(r26, -1)));           //	strb	w15, [x26], -1
    __ strh(r18, Address(__ post(r18, -6)));           //	strh	w18, [x18], -6
    __ ldr(r7, Address(__ post(r2, -230)));            //	ldr	x7, [x2], -230
    __ ldrw(r27, Address(__ post(r11, -27)));          //	ldr	w27, [x11], -27
    __ ldrb(r18, Address(__ post(r3, -25)));           //	ldrb	w18, [x3], -25
    __ ldrh(r10, Address(__ post(r24, -32)));          //	ldrh	w10, [x24], -32
    __ ldrsb(r22, Address(__ post(r10, 4)));           //	ldrsb	x22, [x10], 4
    __ ldrsh(r17, Address(__ post(r12, 25)));          //	ldrsh	x17, [x12], 25
    __ ldrshw(r8, Address(__ post(r7, -62)));          //	ldrsh	w8, [x7], -62
    __ ldrsw(r23, Address(__ post(r22, -51)));         //	ldrsw	x23, [x22], -51
    __ ldrd(v24, Address(__ post(r25, 48)));           //	ldr	d24, [x25], 48
    __ ldrs(v21, Address(__ post(r12, -10)));          //	ldr	s21, [x12], -10
    __ strd(v18, Address(__ post(r13, -222)));         //	str	d18, [x13], -222
    __ strs(v16, Address(__ post(r1, -41)));           //	str	s16, [x1], -41

// base_plus_reg 
// LoadStoreOp
    __ str(r2, Address(r22, r15, Address::sxtw(0)));   //	str	x2, [x22, w15, sxtw #0]
    __ strw(r2, Address(r16, r29, Address::lsl(0)));   //	str	w2, [x16, x29, lsl #0]
    __ strb(r20, Address(r18, r14, Address::uxtw(0))); //	strb	w20, [x18, w14, uxtw #0]
    __ strh(r6, Address(r19, r20, Address::sxtx(1)));  //	strh	w6, [x19, x20, sxtx #1]
    __ ldr(r14, Address(r29, r14, Address::sxtw(0)));  //	ldr	x14, [x29, w14, sxtw #0]
    __ ldrw(r16, Address(r20, r12, Address::sxtw(2))); //	ldr	w16, [x20, w12, sxtw #2]
    __ ldrb(r9, Address(r12, r0, Address::sxtw(0)));   //	ldrb	w9, [x12, w0, sxtw #0]
    __ ldrh(r12, Address(r17, r3, Address::lsl(1)));   //	ldrh	w12, [x17, x3, lsl #1]
    __ ldrsb(r2, Address(r17, r3, Address::sxtx(0)));  //	ldrsb	x2, [x17, x3, sxtx #0]
    __ ldrsh(r7, Address(r1, r17, Address::uxtw(1)));  //	ldrsh	x7, [x1, w17, uxtw #1]
    __ ldrshw(r25, Address(r15, r18, Address::sxtw(1))); //	ldrsh	w25, [x15, w18, sxtw #1]
    __ ldrsw(r23, Address(r21, r12, Address::lsl(0))); //	ldrsw	x23, [x21, x12, lsl #0]
    __ ldrd(v5, Address(r13, r8, Address::lsl(3)));    //	ldr	d5, [x13, x8, lsl #3]
    __ ldrs(v3, Address(r10, r22, Address::lsl(2)));   //	ldr	s3, [x10, x22, lsl #2]
    __ strd(v14, Address(r2, r27, Address::sxtw(0)));  //	str	d14, [x2, w27, sxtw #0]
    __ strs(v20, Address(r6, r25, Address::lsl(0)));   //	str	s20, [x6, x25, lsl #0]

// base_plus_scaled_offset 
// LoadStoreOp
    __ str(r30, Address(r7, 16256));                   //	str	x30, [x7, 16256]
    __ strw(r15, Address(r8, 7588));                   //	str	w15, [x8, 7588]
    __ strb(r11, Address(r0, 1866));                   //	strb	w11, [x0, 1866]
    __ strh(r3, Address(r17, 3734));                   //	strh	w3, [x17, 3734]
    __ ldr(r2, Address(r7, 14224));                    //	ldr	x2, [x7, 14224]
    __ ldrw(r5, Address(r9, 7396));                    //	ldr	w5, [x9, 7396]
    __ ldrb(r28, Address(r9, 1721));                   //	ldrb	w28, [x9, 1721]
    __ ldrh(r2, Address(r20, 3656));                   //	ldrh	w2, [x20, 3656]
    __ ldrsb(r22, Address(r14, 1887));                 //	ldrsb	x22, [x14, 1887]
    __ ldrsh(r8, Address(r0, 4080));                   //	ldrsh	x8, [x0, 4080]
    __ ldrshw(r0, Address(r30, 3916));                 //	ldrsh	w0, [x30, 3916]
    __ ldrsw(r24, Address(r19, 6828));                 //	ldrsw	x24, [x19, 6828]
    __ ldrd(v24, Address(r12, 13032));                 //	ldr	d24, [x12, 13032]
    __ ldrs(v8, Address(r8, 7452));                    //	ldr	s8, [x8, 7452]
    __ strd(v10, Address(r15, 15992));                 //	str	d10, [x15, 15992]
    __ strs(v26, Address(r19, 6688));                  //	str	s26, [x19, 6688]

// pcrel 
// LoadStoreOp
    __ ldr(r10, forth);                                //	ldr	x10, forth
    __ ldrw(r3, __ pc());                              //	ldr	w3, .

// LoadStoreOp
    __ prfm(Address(r23, 9));                          //	prfm	PLDL1KEEP, [x23, 9]

// LoadStoreOp
    __ prfm(back);                                     //	prfm	PLDL1KEEP, back

// LoadStoreOp
    __ prfm(Address(r3, r8, Address::uxtw(0)));        //	prfm	PLDL1KEEP, [x3, w8, uxtw #0]

// LoadStoreOp
    __ prfm(Address(r11, 15080));                      //	prfm	PLDL1KEEP, [x11, 15080]

// AddSubCarryOp
    __ adcw(r13, r9, r28);                             //	adc	w13, w9, w28
    __ adcsw(r27, r19, r28);                           //	adcs	w27, w19, w28
    __ sbcw(r19, r18, r6);                             //	sbc	w19, w18, w6
    __ sbcsw(r14, r20, r3);                            //	sbcs	w14, w20, w3
    __ adc(r16, r14, r8);                              //	adc	x16, x14, x8
    __ adcs(r0, r29, r8);                              //	adcs	x0, x29, x8
    __ sbc(r8, r24, r20);                              //	sbc	x8, x24, x20
    __ sbcs(r12, r28, r0);                             //	sbcs	x12, x28, x0

// AddSubExtendedOp
    __ addw(r23, r6, r16, ext::uxtb, 4);               //	add	w23, w6, w16, uxtb #4
    __ addsw(r25, r25, r23, ext::sxth, 2);             //	adds	w25, w25, w23, sxth #2
    __ sub(r26, r22, r4, ext::uxtx, 1);                //	sub	x26, x22, x4, uxtx #1
    __ subsw(r17, r29, r19, ext::sxtx, 3);             //	subs	w17, w29, w19, sxtx #3
    __ add(r11, r30, r21, ext::uxtb, 3);               //	add	x11, x30, x21, uxtb #3
    __ adds(r16, r19, r0, ext::sxtb, 2);               //	adds	x16, x19, x0, sxtb #2
    __ sub(r11, r9, r25, ext::sxtx, 1);                //	sub	x11, x9, x25, sxtx #1
    __ subs(r17, r20, r12, ext::sxtb, 4);              //	subs	x17, x20, x12, sxtb #4

// ConditionalCompareOp
    __ ccmnw(r13, r11, 3u, Assembler::LE);             //	ccmn	w13, w11, #3, LE
    __ ccmpw(r13, r12, 2u, Assembler::HI);             //	ccmp	w13, w12, #2, HI
    __ ccmn(r3, r2, 12u, Assembler::NE);               //	ccmn	x3, x2, #12, NE
    __ ccmp(r7, r21, 3u, Assembler::VS);               //	ccmp	x7, x21, #3, VS

// ConditionalCompareImmedOp
    __ ccmnw(r2, 14, 4, Assembler::CC);                //	ccmn	w2, #14, #4, CC
    __ ccmpw(r17, 17, 6, Assembler::PL);               //	ccmp	w17, #17, #6, PL
    __ ccmn(r10, 12, 0, Assembler::CS);                //	ccmn	x10, #12, #0, CS
    __ ccmp(r21, 18, 14, Assembler::GE);               //	ccmp	x21, #18, #14, GE

// ConditionalSelectOp
    __ cselw(r21, r13, r12, Assembler::GT);            //	csel	w21, w13, w12, GT
    __ csincw(r10, r27, r15, Assembler::LS);           //	csinc	w10, w27, w15, LS
    __ csinvw(r0, r13, r9, Assembler::HI);             //	csinv	w0, w13, w9, HI
    __ csnegw(r18, r4, r26, Assembler::VS);            //	csneg	w18, w4, w26, VS
    __ csel(r12, r29, r7, Assembler::LS);              //	csel	x12, x29, x7, LS
    __ csinc(r6, r7, r20, Assembler::VC);              //	csinc	x6, x7, x20, VC
    __ csinv(r22, r21, r3, Assembler::LE);             //	csinv	x22, x21, x3, LE
    __ csneg(r19, r12, r27, Assembler::LS);            //	csneg	x19, x12, x27, LS

// TwoRegOp
    __ rbitw(r0, r16);                                 //	rbit	w0, w16
    __ rev16w(r17, r23);                               //	rev16	w17, w23
    __ revw(r17, r14);                                 //	rev	w17, w14
    __ clzw(r24, r30);                                 //	clz	w24, w30
    __ clsw(r24, r22);                                 //	cls	w24, w22
    __ rbit(r3, r17);                                  //	rbit	x3, x17
    __ rev16(r12, r13);                                //	rev16	x12, x13
    __ rev32(r9, r22);                                 //	rev32	x9, x22
    __ rev(r0, r0);                                    //	rev	x0, x0
    __ clz(r5, r16);                                   //	clz	x5, x16
    __ cls(r25, r22);                                  //	cls	x25, x22

// ThreeRegOp
    __ udivw(r29, r4, r0);                             //	udiv	w29, w4, w0
    __ sdivw(r0, r29, r29);                            //	sdiv	w0, w29, w29
    __ lslvw(r5, r17, r21);                            //	lslv	w5, w17, w21
    __ lsrvw(r9, r9, r18);                             //	lsrv	w9, w9, w18
    __ asrvw(r1, r27, r8);                             //	asrv	w1, w27, w8
    __ rorvw(r18, r20, r13);                           //	rorv	w18, w20, w13
    __ udiv(r8, r25, r12);                             //	udiv	x8, x25, x12
    __ sdiv(r7, r5, r28);                              //	sdiv	x7, x5, x28
    __ lslv(r5, r17, r27);                             //	lslv	x5, x17, x27
    __ lsrv(r23, r26, r20);                            //	lsrv	x23, x26, x20
    __ asrv(r28, r8, r28);                             //	asrv	x28, x8, x28
    __ rorv(r3, r29, r4);                              //	rorv	x3, x29, x4

// FourRegMulOp
    __ maddw(r17, r14, r26, r21);                      //	madd	w17, w14, w26, w21
    __ msubw(r1, r30, r11, r11);                       //	msub	w1, w30, w11, w11
    __ madd(r1, r17, r6, r28);                         //	madd	x1, x17, x6, x28
    __ msub(r30, r6, r30, r8);                         //	msub	x30, x6, x30, x8
    __ smaddl(r21, r6, r14, r8);                       //	smaddl	x21, w6, w14, x8
    __ smsubl(r10, r10, r24, r19);                     //	smsubl	x10, w10, w24, x19
    __ umaddl(r20, r18, r14, r24);                     //	umaddl	x20, w18, w14, x24
    __ umsubl(r18, r2, r5, r5);                        //	umsubl	x18, w2, w5, x5

// ThreeRegFloatOp
    __ fmuls(v8, v18, v13);                            //	fmul	s8, s18, s13
    __ fdivs(v2, v14, v28);                            //	fdiv	s2, s14, s28
    __ fadds(v15, v12, v28);                           //	fadd	s15, s12, s28
    __ fsubs(v0, v12, v1);                             //	fsub	s0, s12, s1
    __ fmuls(v15, v29, v4);                            //	fmul	s15, s29, s4
    __ fmuld(v12, v1, v23);                            //	fmul	d12, d1, d23
    __ fdivd(v27, v8, v18);                            //	fdiv	d27, d8, d18
    __ faddd(v23, v20, v11);                           //	fadd	d23, d20, d11
    __ fsubd(v8, v12, v18);                            //	fsub	d8, d12, d18
    __ fmuld(v26, v24, v23);                           //	fmul	d26, d24, d23

// FourRegFloatOp
    __ fmadds(v21, v23, v13, v25);                     //	fmadd	s21, s23, s13, s25
    __ fmsubs(v22, v10, v1, v14);                      //	fmsub	s22, s10, s1, s14
    __ fnmadds(v14, v20, v2, v30);                     //	fnmadd	s14, s20, s2, s30
    __ fnmadds(v7, v29, v22, v22);                     //	fnmadd	s7, s29, s22, s22
    __ fmaddd(v13, v5, v15, v5);                       //	fmadd	d13, d5, d15, d5
    __ fmsubd(v14, v12, v5, v10);                      //	fmsub	d14, d12, d5, d10
    __ fnmaddd(v10, v19, v0, v1);                      //	fnmadd	d10, d19, d0, d1
    __ fnmaddd(v20, v2, v2, v0);                       //	fnmadd	d20, d2, d2, d0

// TwoRegFloatOp
    __ fmovs(v25, v9);                                 //	fmov	s25, s9
    __ fabss(v20, v4);                                 //	fabs	s20, s4
    __ fnegs(v3, v27);                                 //	fneg	s3, s27
    __ fsqrts(v1, v2);                                 //	fsqrt	s1, s2
    __ fcvts(v30, v0);                                 //	fcvt	d30, s0
    __ fmovd(v12, v4);                                 //	fmov	d12, d4
    __ fabsd(v1, v27);                                 //	fabs	d1, d27
    __ fnegd(v8, v22);                                 //	fneg	d8, d22
    __ fsqrtd(v11, v11);                               //	fsqrt	d11, d11
    __ fcvtd(v22, v28);                                //	fcvt	s22, d28

// FloatConvertOp
    __ fcvtzsw(r28, v22);                              //	fcvtzs	w28, s22
    __ fcvtzs(r20, v27);                               //	fcvtzs	x20, s27
    __ fcvtzdw(r14, v0);                               //	fcvtzs	w14, d0
    __ fcvtzd(r26, v11);                               //	fcvtzs	x26, d11
    __ scvtfws(v28, r22);                              //	scvtf	s28, w22
    __ scvtfs(v16, r10);                               //	scvtf	s16, x10
    __ scvtfwd(v8, r21);                               //	scvtf	d8, w21
    __ scvtfd(v21, r28);                               //	scvtf	d21, x28
    __ fmovs(r24, v24);                                //	fmov	w24, s24
    __ fmovd(r8, v19);                                 //	fmov	x8, d19
    __ fmovs(v8, r12);                                 //	fmov	s8, w12
    __ fmovd(v6, r7);                                  //	fmov	d6, x7

// TwoRegFloatOp
    __ fcmps(v30, v16);                                //	fcmp	s30, s16
    __ fcmpd(v25, v11);                                //	fcmp	d25, d11
    __ fcmps(v11, 0.0);                                //	fcmp	s11, #0.0
    __ fcmpd(v11, 0.0);                                //	fcmp	d11, #0.0

// LoadStorePairOp
    __ stpw(r29, r12, Address(r17, 128));              //	stp	w29, w12, [x17, #128]
    __ ldpw(r22, r18, Address(r14, -96));              //	ldp	w22, w18, [x14, #-96]
    __ ldpsw(r11, r16, Address(r1, 64));               //	ldpsw	x11, x16, [x1, #64]
    __ stp(r0, r11, Address(r26, 112));                //	stp	x0, x11, [x26, #112]
    __ ldp(r7, r1, Address(r26, 16));                  //	ldp	x7, x1, [x26, #16]

// LoadStorePairOp
    __ stpw(r10, r7, Address(__ pre(r24, 0)));         //	stp	w10, w7, [x24, #0]!
    __ ldpw(r7, r28, Address(__ pre(r24, -256)));      //	ldp	w7, w28, [x24, #-256]!
    __ ldpsw(r25, r28, Address(__ pre(r21, -240)));    //	ldpsw	x25, x28, [x21, #-240]!
    __ stp(r20, r18, Address(__ pre(r14, -16)));       //	stp	x20, x18, [x14, #-16]!
    __ ldp(r8, r10, Address(__ pre(r13, 80)));         //	ldp	x8, x10, [x13, #80]!

// LoadStorePairOp
    __ stpw(r26, r24, Address(__ post(r2, -128)));     //	stp	w26, w24, [x2], #-128
    __ ldpw(r2, r25, Address(__ post(r21, -192)));     //	ldp	w2, w25, [x21], #-192
    __ ldpsw(r17, r2, Address(__ post(r21, -144)));    //	ldpsw	x17, x2, [x21], #-144
    __ stp(r12, r10, Address(__ post(r11, 96)));       //	stp	x12, x10, [x11], #96
    __ ldp(r24, r6, Address(__ post(r17, -32)));       //	ldp	x24, x6, [x17], #-32

// LoadStorePairOp
    __ stnpw(r3, r30, Address(r14, -224));             //	stnp	w3, w30, [x14, #-224]
    __ ldnpw(r15, r20, Address(r26, -144));            //	ldnp	w15, w20, [x26, #-144]
    __ stnp(r22, r25, Address(r12, -128));             //	stnp	x22, x25, [x12, #-128]
    __ ldnp(r27, r22, Address(r17, -176));             //	ldnp	x27, x22, [x17, #-176]

// FloatImmediateOp
    __ fmovd(v0, 2.0);                                 //	fmov d0, #2.0
    __ fmovd(v0, 2.125);                               //	fmov d0, #2.125
    __ fmovd(v0, 4.0);                                 //	fmov d0, #4.0
    __ fmovd(v0, 4.25);                                //	fmov d0, #4.25
    __ fmovd(v0, 8.0);                                 //	fmov d0, #8.0
    __ fmovd(v0, 8.5);                                 //	fmov d0, #8.5
    __ fmovd(v0, 16.0);                                //	fmov d0, #16.0
    __ fmovd(v0, 17.0);                                //	fmov d0, #17.0
    __ fmovd(v0, 0.125);                               //	fmov d0, #0.125
    __ fmovd(v0, 0.1328125);                           //	fmov d0, #0.1328125
    __ fmovd(v0, 0.25);                                //	fmov d0, #0.25
    __ fmovd(v0, 0.265625);                            //	fmov d0, #0.265625
    __ fmovd(v0, 0.5);                                 //	fmov d0, #0.5
    __ fmovd(v0, 0.53125);                             //	fmov d0, #0.53125
    __ fmovd(v0, 1.0);                                 //	fmov d0, #1.0
    __ fmovd(v0, 1.0625);                              //	fmov d0, #1.0625
    __ fmovd(v0, -2.0);                                //	fmov d0, #-2.0
    __ fmovd(v0, -2.125);                              //	fmov d0, #-2.125
    __ fmovd(v0, -4.0);                                //	fmov d0, #-4.0
    __ fmovd(v0, -4.25);                               //	fmov d0, #-4.25
    __ fmovd(v0, -8.0);                                //	fmov d0, #-8.0
    __ fmovd(v0, -8.5);                                //	fmov d0, #-8.5
    __ fmovd(v0, -16.0);                               //	fmov d0, #-16.0
    __ fmovd(v0, -17.0);                               //	fmov d0, #-17.0
    __ fmovd(v0, -0.125);                              //	fmov d0, #-0.125
    __ fmovd(v0, -0.1328125);                          //	fmov d0, #-0.1328125
    __ fmovd(v0, -0.25);                               //	fmov d0, #-0.25
    __ fmovd(v0, -0.265625);                           //	fmov d0, #-0.265625
    __ fmovd(v0, -0.5);                                //	fmov d0, #-0.5
    __ fmovd(v0, -0.53125);                            //	fmov d0, #-0.53125
    __ fmovd(v0, -1.0);                                //	fmov d0, #-1.0
    __ fmovd(v0, -1.0625);                             //	fmov d0, #-1.0625

    __ bind(forth);

/*
aarch64ops.o:     file format elf64-littleaarch64


Disassembly of section .text:

0000000000000000 <back>:
   0:	8b0772d3 	add	x19, x22, x7, lsl #28
   4:	cb4a3570 	sub	x16, x11, x10, lsr #13
   8:	ab9c09bb 	adds	x27, x13, x28, asr #2
   c:	eb9aa794 	subs	x20, x28, x26, asr #41
  10:	0b934e68 	add	w8, w19, w19, asr #19
  14:	4b0a3924 	sub	w4, w9, w10, lsl #14
  18:	2b1e3568 	adds	w8, w11, w30, lsl #13
  1c:	6b132720 	subs	w0, w25, w19, lsl #9
  20:	8a154c14 	and	x20, x0, x21, lsl #19
  24:	aa1445d5 	orr	x21, x14, x20, lsl #17
  28:	ca01cf99 	eor	x25, x28, x1, lsl #51
  2c:	ea8b3f6a 	ands	x10, x27, x11, asr #15
  30:	0a8c5cb9 	and	w25, w5, w12, asr #23
  34:	2a4a11d2 	orr	w18, w14, w10, lsr #4
  38:	4a855aa4 	eor	w4, w21, w5, asr #22
  3c:	6a857415 	ands	w21, w0, w5, asr #29
  40:	8aa697da 	bic	x26, x30, x6, asr #37
  44:	aa6d7423 	orn	x3, x1, x13, lsr #29
  48:	ca29bf80 	eon	x0, x28, x9, lsl #47
  4c:	ea3cb8bd 	bics	x29, x5, x28, lsl #46
  50:	0a675249 	bic	w9, w18, w7, lsr #20
  54:	2ab961ba 	orn	w26, w13, w25, asr #24
  58:	4a331899 	eon	w25, w4, w19, lsl #6
  5c:	6a646345 	bics	w5, w26, w4, lsr #24
  60:	11055267 	add	w7, w19, #0x154
  64:	31064408 	adds	w8, w0, #0x191
  68:	51028e9d 	sub	w29, w20, #0xa3
  6c:	710bdee8 	subs	w8, w23, #0x2f7
  70:	91082d81 	add	x1, x12, #0x20b
  74:	b106a962 	adds	x2, x11, #0x1aa
  78:	d10b33ae 	sub	x14, x29, #0x2cc
  7c:	f10918ab 	subs	x11, x5, #0x246
  80:	121102d7 	and	w23, w22, #0x8000
  84:	3204cd44 	orr	w4, w10, #0xf0f0f0f0
  88:	5204cf00 	eor	w0, w24, #0xf0f0f0f0
  8c:	72099fb3 	ands	w19, w29, #0x7f807f80
  90:	92729545 	and	x5, x10, #0xfffffffffc000
  94:	b20e37cc 	orr	x12, x30, #0xfffc0000fffc0000
  98:	d27c34be 	eor	x30, x5, #0x3fff0
  9c:	f27e4efa 	ands	x26, x23, #0x3ffffc
  a0:	14000000 	b	a0 <back+0xa0>
  a4:	17ffffd7 	b	0 <back>
  a8:	1400017f 	b	6a4 <forth>
  ac:	94000000 	bl	ac <back+0xac>
  b0:	97ffffd4 	bl	0 <back>
  b4:	9400017c 	bl	6a4 <forth>
  b8:	3400000c 	cbz	w12, b8 <back+0xb8>
  bc:	34fffa2c 	cbz	w12, 0 <back>
  c0:	34002f2c 	cbz	w12, 6a4 <forth>
  c4:	35000014 	cbnz	w20, c4 <back+0xc4>
  c8:	35fff9d4 	cbnz	w20, 0 <back>
  cc:	35002ed4 	cbnz	w20, 6a4 <forth>
  d0:	b400000c 	cbz	x12, d0 <back+0xd0>
  d4:	b4fff96c 	cbz	x12, 0 <back>
  d8:	b4002e6c 	cbz	x12, 6a4 <forth>
  dc:	b5000018 	cbnz	x24, dc <back+0xdc>
  e0:	b5fff918 	cbnz	x24, 0 <back>
  e4:	b5002e18 	cbnz	x24, 6a4 <forth>
  e8:	10000006 	adr	x6, e8 <back+0xe8>
  ec:	10fff8a6 	adr	x6, 0 <back>
  f0:	10002da6 	adr	x6, 6a4 <forth>
  f4:	90000015 	adrp	x21, 0 <back>
  f8:	36080001 	tbz	w1, #1, f8 <back+0xf8>
  fc:	360ff821 	tbz	w1, #1, 0 <back>
 100:	36082d21 	tbz	w1, #1, 6a4 <forth>
 104:	37480008 	tbnz	w8, #9, 104 <back+0x104>
 108:	374ff7c8 	tbnz	w8, #9, 0 <back>
 10c:	37482cc8 	tbnz	w8, #9, 6a4 <forth>
 110:	128b50ec 	movn	w12, #0x5a87
 114:	52a9ff8b 	movz	w11, #0x4ffc, lsl #16
 118:	7281d095 	movk	w21, #0xe84
 11c:	92edfebd 	movn	x29, #0x6ff5, lsl #48
 120:	d28361e3 	movz	x3, #0x1b0f
 124:	f2a4cc96 	movk	x22, #0x2664, lsl #16
 128:	9346590c 	sbfx	x12, x8, #6, #17
 12c:	33194f33 	bfi	w19, w25, #7, #20
 130:	531d3d89 	ubfiz	w9, w12, #3, #16
 134:	9350433c 	sbfx	x28, x25, #16, #1
 138:	b34464ac 	bfxil	x12, x5, #4, #22
 13c:	d3462140 	ubfx	x0, x10, #6, #3
 140:	139a61a4 	extr	w4, w13, w26, #24
 144:	93d87fd7 	extr	x23, x30, x24, #31
 148:	54000000 	b.eq	148 <back+0x148>
 14c:	54fff5a0 	b.eq	0 <back>
 150:	54002aa0 	b.eq	6a4 <forth>
 154:	54000001 	b.ne	154 <back+0x154>
 158:	54fff541 	b.ne	0 <back>
 15c:	54002a41 	b.ne	6a4 <forth>
 160:	54000002 	b.cs	160 <back+0x160>
 164:	54fff4e2 	b.cs	0 <back>
 168:	540029e2 	b.cs	6a4 <forth>
 16c:	54000002 	b.cs	16c <back+0x16c>
 170:	54fff482 	b.cs	0 <back>
 174:	54002982 	b.cs	6a4 <forth>
 178:	54000003 	b.cc	178 <back+0x178>
 17c:	54fff423 	b.cc	0 <back>
 180:	54002923 	b.cc	6a4 <forth>
 184:	54000003 	b.cc	184 <back+0x184>
 188:	54fff3c3 	b.cc	0 <back>
 18c:	540028c3 	b.cc	6a4 <forth>
 190:	54000004 	b.mi	190 <back+0x190>
 194:	54fff364 	b.mi	0 <back>
 198:	54002864 	b.mi	6a4 <forth>
 19c:	54000005 	b.pl	19c <back+0x19c>
 1a0:	54fff305 	b.pl	0 <back>
 1a4:	54002805 	b.pl	6a4 <forth>
 1a8:	54000006 	b.vs	1a8 <back+0x1a8>
 1ac:	54fff2a6 	b.vs	0 <back>
 1b0:	540027a6 	b.vs	6a4 <forth>
 1b4:	54000007 	b.vc	1b4 <back+0x1b4>
 1b8:	54fff247 	b.vc	0 <back>
 1bc:	54002747 	b.vc	6a4 <forth>
 1c0:	54000008 	b.hi	1c0 <back+0x1c0>
 1c4:	54fff1e8 	b.hi	0 <back>
 1c8:	540026e8 	b.hi	6a4 <forth>
 1cc:	54000009 	b.ls	1cc <back+0x1cc>
 1d0:	54fff189 	b.ls	0 <back>
 1d4:	54002689 	b.ls	6a4 <forth>
 1d8:	5400000a 	b.ge	1d8 <back+0x1d8>
 1dc:	54fff12a 	b.ge	0 <back>
 1e0:	5400262a 	b.ge	6a4 <forth>
 1e4:	5400000b 	b.lt	1e4 <back+0x1e4>
 1e8:	54fff0cb 	b.lt	0 <back>
 1ec:	540025cb 	b.lt	6a4 <forth>
 1f0:	5400000c 	b.gt	1f0 <back+0x1f0>
 1f4:	54fff06c 	b.gt	0 <back>
 1f8:	5400256c 	b.gt	6a4 <forth>
 1fc:	5400000d 	b.le	1fc <back+0x1fc>
 200:	54fff00d 	b.le	0 <back>
 204:	5400250d 	b.le	6a4 <forth>
 208:	5400000e 	b.al	208 <back+0x208>
 20c:	54ffefae 	b.al	0 <back>
 210:	540024ae 	b.al	6a4 <forth>
 214:	5400000f 	b.nv	214 <back+0x214>
 218:	54ffef4f 	b.nv	0 <back>
 21c:	5400244f 	b.nv	6a4 <forth>
 220:	d4063721 	svc	#0x31b9
 224:	d4035082 	hvc	#0x1a84
 228:	d400bfe3 	smc	#0x5ff
 22c:	d4282fc0 	brk	#0x417e
 230:	d444c320 	hlt	#0x2619
 234:	d503201f 	nop
 238:	d69f03e0 	eret
 23c:	d6bf03e0 	drps
 240:	d5033fdf 	isb
 244:	d5033f9f 	dsb	sy
 248:	d5033abf 	dmb	ishst
 24c:	d61f0040 	br	x2
 250:	d63f00a0 	blr	x5
 254:	c8147c55 	stxr	w20, x21, [x2]
 258:	c807fcfd 	stlxr	w7, x29, [x7]
 25c:	c85f7e05 	ldxr	x5, [x16]
 260:	c85fffbb 	ldaxr	x27, [x29]
 264:	c89fffa0 	stlr	x0, [x29]
 268:	c8dfff95 	ldar	x21, [x28]
 26c:	88187cf8 	stxr	w24, w24, [x7]
 270:	8815ff9a 	stlxr	w21, w26, [x28]
 274:	885f7cd5 	ldxr	w21, [x6]
 278:	885fffcf 	ldaxr	w15, [x30]
 27c:	889ffc73 	stlr	w19, [x3]
 280:	88dffc56 	ldar	w22, [x2]
 284:	48127c0f 	stxrh	w18, w15, [x0]
 288:	480bff85 	stlxrh	w11, w5, [x28]
 28c:	485f7cdd 	ldxrh	w29, [x6]
 290:	485ffcf2 	ldaxrh	w18, [x7]
 294:	489fff99 	stlrh	w25, [x28]
 298:	48dffe62 	ldarh	w2, [x19]
 29c:	080a7c3e 	stxrb	w10, w30, [x1]
 2a0:	0814fed5 	stlxrb	w20, w21, [x22]
 2a4:	085f7c59 	ldxrb	w25, [x2]
 2a8:	085ffcb8 	ldaxrb	w24, [x5]
 2ac:	089ffc70 	stlrb	w16, [x3]
 2b0:	08dfffb6 	ldarb	w22, [x29]
 2b4:	c87f0a68 	ldxp	x8, x2, [x19]
 2b8:	c87fcdc7 	ldaxp	x7, x19, [x14]
 2bc:	c82870bb 	stxp	w8, x27, x28, [x5]
 2c0:	c826b8c8 	stlxp	w6, x8, x14, [x6]
 2c4:	887f12d9 	ldxp	w25, w4, [x22]
 2c8:	887fb9ee 	ldaxp	w14, w14, [x15]
 2cc:	8834215a 	stxp	w20, w26, w8, [x10]
 2d0:	8837ca52 	stlxp	w23, w18, w18, [x18]
 2d4:	f806317e 	str	x30, [x11,#99]
 2d8:	b81b3337 	str	w23, [x25,#-77]
 2dc:	39000dc2 	strb	w2, [x14,#3]
 2e0:	78005149 	strh	w9, [x10,#5]
 2e4:	f84391f4 	ldr	x20, [x15,#57]
 2e8:	b85b220c 	ldr	w12, [x16,#-78]
 2ec:	385fd356 	ldrb	w22, [x26,#-3]
 2f0:	785d127e 	ldrh	w30, [x19,#-47]
 2f4:	389f4149 	ldrsb	x9, [x10,#-12]
 2f8:	79801e3c 	ldrsh	x28, [x17,#14]
 2fc:	79c014a3 	ldrsh	w3, [x5,#10]
 300:	b89a5231 	ldrsw	x17, [x17,#-91]
 304:	fc5ef282 	ldr	d2, [x20,#-17]
 308:	bc5f60f6 	ldr	s22, [x7,#-10]
 30c:	fc12125e 	str	d30, [x18,#-223]
 310:	bc0152cd 	str	s13, [x22,#21]
 314:	f8190e49 	str	x9, [x18,#-112]!
 318:	b800befd 	str	w29, [x23,#11]!
 31c:	381ffd92 	strb	w18, [x12,#-1]!
 320:	781e9e90 	strh	w16, [x20,#-23]!
 324:	f8409fa3 	ldr	x3, [x29,#9]!
 328:	b8413c79 	ldr	w25, [x3,#19]!
 32c:	385fffa1 	ldrb	w1, [x29,#-1]!
 330:	785c7fa8 	ldrh	w8, [x29,#-57]!
 334:	389f3dc5 	ldrsb	x5, [x14,#-13]!
 338:	78801f6a 	ldrsh	x10, [x27,#1]!
 33c:	78c19d4b 	ldrsh	w11, [x10,#25]!
 340:	b89a4ec4 	ldrsw	x4, [x22,#-92]!
 344:	fc408eeb 	ldr	d11, [x23,#8]!
 348:	bc436e79 	ldr	s25, [x19,#54]!
 34c:	fc152ce1 	str	d1, [x7,#-174]!
 350:	bc036f28 	str	s8, [x25,#54]!
 354:	f8025565 	str	x5, [x11],#37
 358:	b80135f8 	str	w24, [x15],#19
 35c:	381ff74f 	strb	w15, [x26],#-1
 360:	781fa652 	strh	w18, [x18],#-6
 364:	f851a447 	ldr	x7, [x2],#-230
 368:	b85e557b 	ldr	w27, [x11],#-27
 36c:	385e7472 	ldrb	w18, [x3],#-25
 370:	785e070a 	ldrh	w10, [x24],#-32
 374:	38804556 	ldrsb	x22, [x10],#4
 378:	78819591 	ldrsh	x17, [x12],#25
 37c:	78dc24e8 	ldrsh	w8, [x7],#-62
 380:	b89cd6d7 	ldrsw	x23, [x22],#-51
 384:	fc430738 	ldr	d24, [x25],#48
 388:	bc5f6595 	ldr	s21, [x12],#-10
 38c:	fc1225b2 	str	d18, [x13],#-222
 390:	bc1d7430 	str	s16, [x1],#-41
 394:	f82fcac2 	str	x2, [x22,w15,sxtw]
 398:	b83d6a02 	str	w2, [x16,x29]
 39c:	382e5a54 	strb	w20, [x18,w14,uxtw #0]
 3a0:	7834fa66 	strh	w6, [x19,x20,sxtx #1]
 3a4:	f86ecbae 	ldr	x14, [x29,w14,sxtw]
 3a8:	b86cda90 	ldr	w16, [x20,w12,sxtw #2]
 3ac:	3860d989 	ldrb	w9, [x12,w0,sxtw #0]
 3b0:	78637a2c 	ldrh	w12, [x17,x3,lsl #1]
 3b4:	38a3fa22 	ldrsb	x2, [x17,x3,sxtx #0]
 3b8:	78b15827 	ldrsh	x7, [x1,w17,uxtw #1]
 3bc:	78f2d9f9 	ldrsh	w25, [x15,w18,sxtw #1]
 3c0:	b8ac6ab7 	ldrsw	x23, [x21,x12]
 3c4:	fc6879a5 	ldr	d5, [x13,x8,lsl #3]
 3c8:	bc767943 	ldr	s3, [x10,x22,lsl #2]
 3cc:	fc3bc84e 	str	d14, [x2,w27,sxtw]
 3d0:	bc3968d4 	str	s20, [x6,x25]
 3d4:	f91fc0fe 	str	x30, [x7,#16256]
 3d8:	b91da50f 	str	w15, [x8,#7588]
 3dc:	391d280b 	strb	w11, [x0,#1866]
 3e0:	791d2e23 	strh	w3, [x17,#3734]
 3e4:	f95bc8e2 	ldr	x2, [x7,#14224]
 3e8:	b95ce525 	ldr	w5, [x9,#7396]
 3ec:	395ae53c 	ldrb	w28, [x9,#1721]
 3f0:	795c9282 	ldrh	w2, [x20,#3656]
 3f4:	399d7dd6 	ldrsb	x22, [x14,#1887]
 3f8:	799fe008 	ldrsh	x8, [x0,#4080]
 3fc:	79de9bc0 	ldrsh	w0, [x30,#3916]
 400:	b99aae78 	ldrsw	x24, [x19,#6828]
 404:	fd597598 	ldr	d24, [x12,#13032]
 408:	bd5d1d08 	ldr	s8, [x8,#7452]
 40c:	fd1f3dea 	str	d10, [x15,#15992]
 410:	bd1a227a 	str	s26, [x19,#6688]
 414:	5800148a 	ldr	x10, 6a4 <forth>
 418:	18000003 	ldr	w3, 418 <back+0x418>
 41c:	f88092e0 	prfm	pldl1keep, [x23,#9]
 420:	d8ffdf00 	prfm	pldl1keep, 0 <back>
 424:	f8a84860 	prfm	pldl1keep, [x3,w8,uxtw]
 428:	f99d7560 	prfm	pldl1keep, [x11,#15080]
 42c:	1a1c012d 	adc	w13, w9, w28
 430:	3a1c027b 	adcs	w27, w19, w28
 434:	5a060253 	sbc	w19, w18, w6
 438:	7a03028e 	sbcs	w14, w20, w3
 43c:	9a0801d0 	adc	x16, x14, x8
 440:	ba0803a0 	adcs	x0, x29, x8
 444:	da140308 	sbc	x8, x24, x20
 448:	fa00038c 	sbcs	x12, x28, x0
 44c:	0b3010d7 	add	w23, w6, w16, uxtb #4
 450:	2b37ab39 	adds	w25, w25, w23, sxth #2
 454:	cb2466da 	sub	x26, x22, x4, uxtx #1
 458:	6b33efb1 	subs	w17, w29, w19, sxtx #3
 45c:	8b350fcb 	add	x11, x30, w21, uxtb #3
 460:	ab208a70 	adds	x16, x19, w0, sxtb #2
 464:	cb39e52b 	sub	x11, x9, x25, sxtx #1
 468:	eb2c9291 	subs	x17, x20, w12, sxtb #4
 46c:	3a4bd1a3 	ccmn	w13, w11, #0x3, le
 470:	7a4c81a2 	ccmp	w13, w12, #0x2, hi
 474:	ba42106c 	ccmn	x3, x2, #0xc, ne
 478:	fa5560e3 	ccmp	x7, x21, #0x3, vs
 47c:	3a4e3844 	ccmn	w2, #0xe, #0x4, cc
 480:	7a515a26 	ccmp	w17, #0x11, #0x6, pl
 484:	ba4c2940 	ccmn	x10, #0xc, #0x0, cs
 488:	fa52aaae 	ccmp	x21, #0x12, #0xe, ge
 48c:	1a8cc1b5 	csel	w21, w13, w12, gt
 490:	1a8f976a 	csinc	w10, w27, w15, ls
 494:	5a8981a0 	csinv	w0, w13, w9, hi
 498:	5a9a6492 	csneg	w18, w4, w26, vs
 49c:	9a8793ac 	csel	x12, x29, x7, ls
 4a0:	9a9474e6 	csinc	x6, x7, x20, vc
 4a4:	da83d2b6 	csinv	x22, x21, x3, le
 4a8:	da9b9593 	csneg	x19, x12, x27, ls
 4ac:	5ac00200 	rbit	w0, w16
 4b0:	5ac006f1 	rev16	w17, w23
 4b4:	5ac009d1 	rev	w17, w14
 4b8:	5ac013d8 	clz	w24, w30
 4bc:	5ac016d8 	cls	w24, w22
 4c0:	dac00223 	rbit	x3, x17
 4c4:	dac005ac 	rev16	x12, x13
 4c8:	dac00ac9 	rev32	x9, x22
 4cc:	dac00c00 	rev	x0, x0
 4d0:	dac01205 	clz	x5, x16
 4d4:	dac016d9 	cls	x25, x22
 4d8:	1ac0089d 	udiv	w29, w4, w0
 4dc:	1add0fa0 	sdiv	w0, w29, w29
 4e0:	1ad52225 	lsl	w5, w17, w21
 4e4:	1ad22529 	lsr	w9, w9, w18
 4e8:	1ac82b61 	asr	w1, w27, w8
 4ec:	1acd2e92 	ror	w18, w20, w13
 4f0:	9acc0b28 	udiv	x8, x25, x12
 4f4:	9adc0ca7 	sdiv	x7, x5, x28
 4f8:	9adb2225 	lsl	x5, x17, x27
 4fc:	9ad42757 	lsr	x23, x26, x20
 500:	9adc291c 	asr	x28, x8, x28
 504:	9ac42fa3 	ror	x3, x29, x4
 508:	1b1a55d1 	madd	w17, w14, w26, w21
 50c:	1b0bafc1 	msub	w1, w30, w11, w11
 510:	9b067221 	madd	x1, x17, x6, x28
 514:	9b1ea0de 	msub	x30, x6, x30, x8
 518:	9b2e20d5 	smaddl	x21, w6, w14, x8
 51c:	9b38cd4a 	smsubl	x10, w10, w24, x19
 520:	9bae6254 	umaddl	x20, w18, w14, x24
 524:	9ba59452 	umsubl	x18, w2, w5, x5
 528:	1e2d0a48 	fmul	s8, s18, s13
 52c:	1e3c19c2 	fdiv	s2, s14, s28
 530:	1e3c298f 	fadd	s15, s12, s28
 534:	1e213980 	fsub	s0, s12, s1
 538:	1e240baf 	fmul	s15, s29, s4
 53c:	1e77082c 	fmul	d12, d1, d23
 540:	1e72191b 	fdiv	d27, d8, d18
 544:	1e6b2a97 	fadd	d23, d20, d11
 548:	1e723988 	fsub	d8, d12, d18
 54c:	1e770b1a 	fmul	d26, d24, d23
 550:	1f0d66f5 	fmadd	s21, s23, s13, s25
 554:	1f01b956 	fmsub	s22, s10, s1, s14
 558:	1f227a8e 	fnmadd	s14, s20, s2, s30
 55c:	1f365ba7 	fnmadd	s7, s29, s22, s22
 560:	1f4f14ad 	fmadd	d13, d5, d15, d5
 564:	1f45a98e 	fmsub	d14, d12, d5, d10
 568:	1f60066a 	fnmadd	d10, d19, d0, d1
 56c:	1f620054 	fnmadd	d20, d2, d2, d0
 570:	1e204139 	fmov	s25, s9
 574:	1e20c094 	fabs	s20, s4
 578:	1e214363 	fneg	s3, s27
 57c:	1e21c041 	fsqrt	s1, s2
 580:	1e22c01e 	fcvt	d30, s0
 584:	1e60408c 	fmov	d12, d4
 588:	1e60c361 	fabs	d1, d27
 58c:	1e6142c8 	fneg	d8, d22
 590:	1e61c16b 	fsqrt	d11, d11
 594:	1e624396 	fcvt	s22, d28
 598:	1e3802dc 	fcvtzs	w28, s22
 59c:	9e380374 	fcvtzs	x20, s27
 5a0:	1e78000e 	fcvtzs	w14, d0
 5a4:	9e78017a 	fcvtzs	x26, d11
 5a8:	1e2202dc 	scvtf	s28, w22
 5ac:	9e220150 	scvtf	s16, x10
 5b0:	1e6202a8 	scvtf	d8, w21
 5b4:	9e620395 	scvtf	d21, x28
 5b8:	1e260318 	fmov	w24, s24
 5bc:	9e660268 	fmov	x8, d19
 5c0:	1e270188 	fmov	s8, w12
 5c4:	9e6700e6 	fmov	d6, x7
 5c8:	1e3023c0 	fcmp	s30, s16
 5cc:	1e6b2320 	fcmp	d25, d11
 5d0:	1e202168 	fcmp	s11, #0.0
 5d4:	1e602168 	fcmp	d11, #0.0
 5d8:	2910323d 	stp	w29, w12, [x17,#128]
 5dc:	297449d6 	ldp	w22, w18, [x14,#-96]
 5e0:	6948402b 	ldpsw	x11, x16, [x1,#64]
 5e4:	a9072f40 	stp	x0, x11, [x26,#112]
 5e8:	a9410747 	ldp	x7, x1, [x26,#16]
 5ec:	29801f0a 	stp	w10, w7, [x24,#0]!
 5f0:	29e07307 	ldp	w7, w28, [x24,#-256]!
 5f4:	69e272b9 	ldpsw	x25, x28, [x21,#-240]!
 5f8:	a9bf49d4 	stp	x20, x18, [x14,#-16]!
 5fc:	a9c529a8 	ldp	x8, x10, [x13,#80]!
 600:	28b0605a 	stp	w26, w24, [x2],#-128
 604:	28e866a2 	ldp	w2, w25, [x21],#-192
 608:	68ee0ab1 	ldpsw	x17, x2, [x21],#-144
 60c:	a886296c 	stp	x12, x10, [x11],#96
 610:	a8fe1a38 	ldp	x24, x6, [x17],#-32
 614:	282479c3 	stnp	w3, w30, [x14,#-224]
 618:	286e534f 	ldnp	w15, w20, [x26,#-144]
 61c:	a8386596 	stnp	x22, x25, [x12,#-128]
 620:	a8755a3b 	ldnp	x27, x22, [x17,#-176]
 624:	1e601000 	fmov	d0, #2.000000000000000000e+00
 628:	1e603000 	fmov	d0, #2.125000000000000000e+00
 62c:	1e621000 	fmov	d0, #4.000000000000000000e+00
 630:	1e623000 	fmov	d0, #4.250000000000000000e+00
 634:	1e641000 	fmov	d0, #8.000000000000000000e+00
 638:	1e643000 	fmov	d0, #8.500000000000000000e+00
 63c:	1e661000 	fmov	d0, #1.600000000000000000e+01
 640:	1e663000 	fmov	d0, #1.700000000000000000e+01
 644:	1e681000 	fmov	d0, #1.250000000000000000e-01
 648:	1e683000 	fmov	d0, #1.328125000000000000e-01
 64c:	1e6a1000 	fmov	d0, #2.500000000000000000e-01
 650:	1e6a3000 	fmov	d0, #2.656250000000000000e-01
 654:	1e6c1000 	fmov	d0, #5.000000000000000000e-01
 658:	1e6c3000 	fmov	d0, #5.312500000000000000e-01
 65c:	1e6e1000 	fmov	d0, #1.000000000000000000e+00
 660:	1e6e3000 	fmov	d0, #1.062500000000000000e+00
 664:	1e701000 	fmov	d0, #-2.000000000000000000e+00
 668:	1e703000 	fmov	d0, #-2.125000000000000000e+00
 66c:	1e721000 	fmov	d0, #-4.000000000000000000e+00
 670:	1e723000 	fmov	d0, #-4.250000000000000000e+00
 674:	1e741000 	fmov	d0, #-8.000000000000000000e+00
 678:	1e743000 	fmov	d0, #-8.500000000000000000e+00
 67c:	1e761000 	fmov	d0, #-1.600000000000000000e+01
 680:	1e763000 	fmov	d0, #-1.700000000000000000e+01
 684:	1e781000 	fmov	d0, #-1.250000000000000000e-01
 688:	1e783000 	fmov	d0, #-1.328125000000000000e-01
 68c:	1e7a1000 	fmov	d0, #-2.500000000000000000e-01
 690:	1e7a3000 	fmov	d0, #-2.656250000000000000e-01
 694:	1e7c1000 	fmov	d0, #-5.000000000000000000e-01
 698:	1e7c3000 	fmov	d0, #-5.312500000000000000e-01
 69c:	1e7e1000 	fmov	d0, #-1.000000000000000000e+00
 6a0:	1e7e3000 	fmov	d0, #-1.062500000000000000e+00
 */

  static const unsigned int insns[] =
  {
    0x8b0772d3,     0xcb4a3570,     0xab9c09bb,     0xeb9aa794,
    0x0b934e68,     0x4b0a3924,     0x2b1e3568,     0x6b132720,
    0x8a154c14,     0xaa1445d5,     0xca01cf99,     0xea8b3f6a,
    0x0a8c5cb9,     0x2a4a11d2,     0x4a855aa4,     0x6a857415,
    0x8aa697da,     0xaa6d7423,     0xca29bf80,     0xea3cb8bd,
    0x0a675249,     0x2ab961ba,     0x4a331899,     0x6a646345,
    0x11055267,     0x31064408,     0x51028e9d,     0x710bdee8,
    0x91082d81,     0xb106a962,     0xd10b33ae,     0xf10918ab,
    0x121102d7,     0x3204cd44,     0x5204cf00,     0x72099fb3,
    0x92729545,     0xb20e37cc,     0xd27c34be,     0xf27e4efa,
    0x14000000,     0x17ffffd7,     0x1400017f,     0x94000000,
    0x97ffffd4,     0x9400017c,     0x3400000c,     0x34fffa2c,
    0x34002f2c,     0x35000014,     0x35fff9d4,     0x35002ed4,
    0xb400000c,     0xb4fff96c,     0xb4002e6c,     0xb5000018,
    0xb5fff918,     0xb5002e18,     0x10000006,     0x10fff8a6,
    0x10002da6,     0x90000015,     0x36080001,     0x360ff821,
    0x36082d21,     0x37480008,     0x374ff7c8,     0x37482cc8,
    0x128b50ec,     0x52a9ff8b,     0x7281d095,     0x92edfebd,
    0xd28361e3,     0xf2a4cc96,     0x9346590c,     0x33194f33,
    0x531d3d89,     0x9350433c,     0xb34464ac,     0xd3462140,
    0x139a61a4,     0x93d87fd7,     0x54000000,     0x54fff5a0,
    0x54002aa0,     0x54000001,     0x54fff541,     0x54002a41,
    0x54000002,     0x54fff4e2,     0x540029e2,     0x54000002,
    0x54fff482,     0x54002982,     0x54000003,     0x54fff423,
    0x54002923,     0x54000003,     0x54fff3c3,     0x540028c3,
    0x54000004,     0x54fff364,     0x54002864,     0x54000005,
    0x54fff305,     0x54002805,     0x54000006,     0x54fff2a6,
    0x540027a6,     0x54000007,     0x54fff247,     0x54002747,
    0x54000008,     0x54fff1e8,     0x540026e8,     0x54000009,
    0x54fff189,     0x54002689,     0x5400000a,     0x54fff12a,
    0x5400262a,     0x5400000b,     0x54fff0cb,     0x540025cb,
    0x5400000c,     0x54fff06c,     0x5400256c,     0x5400000d,
    0x54fff00d,     0x5400250d,     0x5400000e,     0x54ffefae,
    0x540024ae,     0x5400000f,     0x54ffef4f,     0x5400244f,
    0xd4063721,     0xd4035082,     0xd400bfe3,     0xd4282fc0,
    0xd444c320,     0xd503201f,     0xd69f03e0,     0xd6bf03e0,
    0xd5033fdf,     0xd5033f9f,     0xd5033abf,     0xd61f0040,
    0xd63f00a0,     0xc8147c55,     0xc807fcfd,     0xc85f7e05,
    0xc85fffbb,     0xc89fffa0,     0xc8dfff95,     0x88187cf8,
    0x8815ff9a,     0x885f7cd5,     0x885fffcf,     0x889ffc73,
    0x88dffc56,     0x48127c0f,     0x480bff85,     0x485f7cdd,
    0x485ffcf2,     0x489fff99,     0x48dffe62,     0x080a7c3e,
    0x0814fed5,     0x085f7c59,     0x085ffcb8,     0x089ffc70,
    0x08dfffb6,     0xc87f0a68,     0xc87fcdc7,     0xc82870bb,
    0xc826b8c8,     0x887f12d9,     0x887fb9ee,     0x8834215a,
    0x8837ca52,     0xf806317e,     0xb81b3337,     0x39000dc2,
    0x78005149,     0xf84391f4,     0xb85b220c,     0x385fd356,
    0x785d127e,     0x389f4149,     0x79801e3c,     0x79c014a3,
    0xb89a5231,     0xfc5ef282,     0xbc5f60f6,     0xfc12125e,
    0xbc0152cd,     0xf8190e49,     0xb800befd,     0x381ffd92,
    0x781e9e90,     0xf8409fa3,     0xb8413c79,     0x385fffa1,
    0x785c7fa8,     0x389f3dc5,     0x78801f6a,     0x78c19d4b,
    0xb89a4ec4,     0xfc408eeb,     0xbc436e79,     0xfc152ce1,
    0xbc036f28,     0xf8025565,     0xb80135f8,     0x381ff74f,
    0x781fa652,     0xf851a447,     0xb85e557b,     0x385e7472,
    0x785e070a,     0x38804556,     0x78819591,     0x78dc24e8,
    0xb89cd6d7,     0xfc430738,     0xbc5f6595,     0xfc1225b2,
    0xbc1d7430,     0xf82fcac2,     0xb83d6a02,     0x382e5a54,
    0x7834fa66,     0xf86ecbae,     0xb86cda90,     0x3860d989,
    0x78637a2c,     0x38a3fa22,     0x78b15827,     0x78f2d9f9,
    0xb8ac6ab7,     0xfc6879a5,     0xbc767943,     0xfc3bc84e,
    0xbc3968d4,     0xf91fc0fe,     0xb91da50f,     0x391d280b,
    0x791d2e23,     0xf95bc8e2,     0xb95ce525,     0x395ae53c,
    0x795c9282,     0x399d7dd6,     0x799fe008,     0x79de9bc0,
    0xb99aae78,     0xfd597598,     0xbd5d1d08,     0xfd1f3dea,
    0xbd1a227a,     0x5800148a,     0x18000003,     0xf88092e0,
    0xd8ffdf00,     0xf8a84860,     0xf99d7560,     0x1a1c012d,
    0x3a1c027b,     0x5a060253,     0x7a03028e,     0x9a0801d0,
    0xba0803a0,     0xda140308,     0xfa00038c,     0x0b3010d7,
    0x2b37ab39,     0xcb2466da,     0x6b33efb1,     0x8b350fcb,
    0xab208a70,     0xcb39e52b,     0xeb2c9291,     0x3a4bd1a3,
    0x7a4c81a2,     0xba42106c,     0xfa5560e3,     0x3a4e3844,
    0x7a515a26,     0xba4c2940,     0xfa52aaae,     0x1a8cc1b5,
    0x1a8f976a,     0x5a8981a0,     0x5a9a6492,     0x9a8793ac,
    0x9a9474e6,     0xda83d2b6,     0xda9b9593,     0x5ac00200,
    0x5ac006f1,     0x5ac009d1,     0x5ac013d8,     0x5ac016d8,
    0xdac00223,     0xdac005ac,     0xdac00ac9,     0xdac00c00,
    0xdac01205,     0xdac016d9,     0x1ac0089d,     0x1add0fa0,
    0x1ad52225,     0x1ad22529,     0x1ac82b61,     0x1acd2e92,
    0x9acc0b28,     0x9adc0ca7,     0x9adb2225,     0x9ad42757,
    0x9adc291c,     0x9ac42fa3,     0x1b1a55d1,     0x1b0bafc1,
    0x9b067221,     0x9b1ea0de,     0x9b2e20d5,     0x9b38cd4a,
    0x9bae6254,     0x9ba59452,     0x1e2d0a48,     0x1e3c19c2,
    0x1e3c298f,     0x1e213980,     0x1e240baf,     0x1e77082c,
    0x1e72191b,     0x1e6b2a97,     0x1e723988,     0x1e770b1a,
    0x1f0d66f5,     0x1f01b956,     0x1f227a8e,     0x1f365ba7,
    0x1f4f14ad,     0x1f45a98e,     0x1f60066a,     0x1f620054,
    0x1e204139,     0x1e20c094,     0x1e214363,     0x1e21c041,
    0x1e22c01e,     0x1e60408c,     0x1e60c361,     0x1e6142c8,
    0x1e61c16b,     0x1e624396,     0x1e3802dc,     0x9e380374,
    0x1e78000e,     0x9e78017a,     0x1e2202dc,     0x9e220150,
    0x1e6202a8,     0x9e620395,     0x1e260318,     0x9e660268,
    0x1e270188,     0x9e6700e6,     0x1e3023c0,     0x1e6b2320,
    0x1e202168,     0x1e602168,     0x2910323d,     0x297449d6,
    0x6948402b,     0xa9072f40,     0xa9410747,     0x29801f0a,
    0x29e07307,     0x69e272b9,     0xa9bf49d4,     0xa9c529a8,
    0x28b0605a,     0x28e866a2,     0x68ee0ab1,     0xa886296c,
    0xa8fe1a38,     0x282479c3,     0x286e534f,     0xa8386596,
    0xa8755a3b,     0x1e601000,     0x1e603000,     0x1e621000,
    0x1e623000,     0x1e641000,     0x1e643000,     0x1e661000,
    0x1e663000,     0x1e681000,     0x1e683000,     0x1e6a1000,
    0x1e6a3000,     0x1e6c1000,     0x1e6c3000,     0x1e6e1000,
    0x1e6e3000,     0x1e701000,     0x1e703000,     0x1e721000,
    0x1e723000,     0x1e741000,     0x1e743000,     0x1e761000,
    0x1e763000,     0x1e781000,     0x1e783000,     0x1e7a1000,
    0x1e7a3000,     0x1e7c1000,     0x1e7c3000,     0x1e7e1000,
    0x1e7e3000, 
  };
// END  Generated code -- do not edit

  {
    bool ok = true;
    unsigned int *insns1 = (unsigned int *)entry;
    for (unsigned int i = 0; i < sizeof insns / sizeof insns[0]; i++) {
      if (insns[i] != insns1[i]) {
	ok = false;
	printf("Ours:\n");
	Disassembler::decode((address)&insns1[i], (address)&insns1[i+1]);
	printf("Theirs:\n");
	Disassembler::decode((address)&insns[i], (address)&insns[i+1]);
	printf("\n");
      }
    }
    assert(ok, "Assembler smoke test failed");
  }

#ifndef PRODUCT

  address PC = __ pc();
  __ ld1(v0, __ T16B, Address(r16)); // No offset
  __ ld1(v0, __ T16B, __ post(r16, 0)); // Post-index
  __ ld1(v0, __ T16B, Address(r16, r17)); // 


#endif // PRODUCT
#endif // ASSERT
}

#undef __

// Implementation of Assembler

// Emit a 64-bit data word into the code stream.
//
// When rtype is relocInfo::none the word is emitted as-is.  Otherwise
// a simple relocation spec is built for rtype and the work is handed
// to the RelocationHolder overload, which records the relocation
// before emitting the word.
void Assembler::emit_data64(jlong data,
                            relocInfo::relocType rtype,
                            int format) {
  if (rtype != relocInfo::none) {
    emit_data64(data, Relocation::spec_simple(rtype), format);
    return;
  }
  emit_long64(data);
}

// Emit a 64-bit data word together with the relocation described by
// rspec.  Must be called while an instruction mark is set; the
// relocation is attached to that mark, not to the data word itself.
void Assembler::emit_data64(jlong data,
                            RelocationHolder const& rspec,
                            int format) {
  address mark = inst_mark();
  assert(mark != NULL, "must be inside InstructionMark");

  // AbstractAssembler::relocate is not intended for embedded words,
  // so relocate to the enclosing instruction instead.
  code_section()->relocate(mark, rspec, format);
  emit_long64(data);
}

extern "C" {
  // Debugging aid: disassemble len instructions relative to the address
  // start.  A positive len decodes the len instructions beginning at
  // start; a negative len decodes the -len instructions that end at
  // start.  The count is scaled by 4 to turn it into a byte length.
  void das(uint64_t start, int len) {
    ResourceMark rm;
    // Scale with a multiplication rather than "len <<= 2": negative
    // counts are a supported input and left-shifting a negative int is
    // undefined behaviour before C++20.
    len *= 4;
    if (len < 0)
      Disassembler::decode((address)start + len, (address)start);
    else
      Disassembler::decode((address)start, (address)start + len);
  }

  // Debugging aid: disassemble the single instruction located at the
  // address passed in insn.  Note that, despite its name, insn is used
  // as an address (it is forwarded to das as the start address).
  JNIEXPORT void das1(unsigned long insn) {
    das(insn, 1);
  }
}

#define gas_assert(ARG1) assert(ARG1, #ARG1)

#define __ as->

// Materialize the effective address described by this Address into
// register r, emitting code through the macro assembler as.
//
//   base_plus_offset      r = _base + _offset (nothing is emitted when
//                         the offset is zero and r already is _base)
//   base_plus_offset_reg  r = _base + extended/shifted _index
//   literal               r = target(), using mov when no relocation
//                         is attached and movptr otherwise
//
// Any other addressing mode is a programming error.
void Address::lea(MacroAssembler *as, Register r) const {
  const relocInfo::relocType rtype =
    (relocInfo::relocType) _rspec.reloc()->type();

  switch (_mode) {
  case base_plus_offset:
    if (_offset > 0) {
      __ add(r, _base, _offset);
    } else if (!(_offset == 0 && _base == r)) {
      // Zero or negative offset: subtract the negated offset.
      __ sub(r, _base, -_offset);
    }
    // Remaining case (zero offset, r == _base) is a nop.
    break;

  case base_plus_offset_reg:
    __ add(r, _base, _index, _ext.op(), MAX(_ext.shift(), 0));
    break;

  case literal:
    if (rtype != relocInfo::none) {
      __ movptr(r, (uint64_t)target());
    } else {
      __ mov(r, target());
    }
    break;

  default:
    ShouldNotReachHere();
  }
}

// Address-taking form of adrp.  Not supported at the plain Assembler
// level: calling it is treated as a programming error and trips
// ShouldNotReachHere().  None of the arguments are read.
void Assembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  ShouldNotReachHere();
}

#undef __

#define starti Instruction_aarch64 do_not_use(this); set_current(&do_not_use)

  // Emit an ADR instruction that loads the address adr into Rd as a
  // pc-relative value.  The byte distance from the current pc is split
  // into its two low bits (instruction bits 30:29) and the remaining
  // upper part (signed field in bits 23:5).  Bit 31 is 0 for ADR.
  void Assembler::adr(Register Rd, address adr) {
    const long delta = adr - pc();
    const int delta_lo = delta & 3;
    const long delta_hi = delta >> 2;
    starti;
    f(0, 31);
    f(delta_lo, 30, 29);
    f(0b10000, 28, 24);
    sf(delta_hi, 23, 5);
    rf(Rd, 0);
  }

  // Emit an ADRP instruction for the 4K page containing adr.  The
  // distance is measured in pages (addresses shifted right by 12)
  // between the current pc and adr, then split like ADR: two low bits
  // in instruction bits 30:29, the rest as a signed field in bits 23:5.
  // Bit 31 is 1 for ADRP.
  void Assembler::_adrp(Register Rd, address adr) {
    const uint64_t here_page = (uint64_t)pc() >> 12;
    const uint64_t there_page = (uint64_t)adr >> 12;
    const long page_delta = there_page - here_page;
    const int page_delta_lo = page_delta & 3;
    const long page_delta_hi = page_delta >> 2;
    starti;
    f(1, 31);
    f(page_delta_lo, 30, 29);
    f(0b10000, 28, 24);
    sf(page_delta_hi, 23, 5);
    rf(Rd, 0);
  }

#undef starti

// Build a literal-mode Address for target, choosing the relocation
// spec that corresponds to rtype.  Relocation types not listed below
// are not supported and trip ShouldNotReachHere().
Address::Address(address target, relocInfo::relocType rtype) : _mode(literal){
  _target = target;
  _is_lval = false;

  switch (rtype) {
  case relocInfo::none:
    _rspec = RelocationHolder::none;
    break;

  case relocInfo::oop_type:
    // Oops are special: normally they would live in their own section,
    // but in places such as icBuffer they are literals in the code
    // stream with no section of their own.  No relocation spec is
    // assigned here, so we get a literal address which is always
    // patchable.
    break;

  case relocInfo::poll_type:
  case relocInfo::poll_return_type:
    _rspec = Relocation::spec_simple(rtype);
    break;

  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;

  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;

  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;

  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;

  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;

  default:
    ShouldNotReachHere();
    break;
  }
}

void Assembler::b(const Address &dest) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  b(dest.target());
}

void Assembler::bl(const Address &dest) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  bl(dest.target());
}

// Emit adr of dest's target into r, recording dest's relocation spec
// against the start of the instruction.
void Assembler::adr(Register r, const Address &dest) {
  InstructionMark im(this);
  address insn_start = inst_mark();
  code_section()->relocate(insn_start, dest.rspec());
  adr(r, dest.target());
}

// Conditional branch to a label.  For an unbound label a patch site is
// registered with the label and a branch to the current pc is emitted
// in its place.
void Assembler::br(Condition cc, Label &L) {
  if (!L.is_bound()) {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    br(cc, pc());
    return;
  }
  br(cc, target(L));
}

// Apply an unconditional-branch emitter to a label.  For an unbound
// label a patch site is registered and the emitter is given the
// current pc as its destination.
void Assembler::wrap_label(Label &L, Assembler::uncond_branch_insn insn) {
  if (!L.is_bound()) {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    (this->*insn)(pc());
    return;
  }
  (this->*insn)(target(L));
}

// Apply a compare-and-branch emitter (register operand r) to a label.
// For an unbound label a patch site is registered and the emitter is
// given the current pc as its destination.
void Assembler::wrap_label(Register r, Label &L, compare_and_branch_insn insn) {
  if (!L.is_bound()) {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    (this->*insn)(r, pc());
    return;
  }
  (this->*insn)(r, target(L));
}

// Apply a test-and-branch emitter (register r, bit number bitpos) to a
// label.  For an unbound label a patch site is registered and the
// emitter is given the current pc as its destination.
void Assembler::wrap_label(Register r, int bitpos, Label &L,
                           test_and_branch_insn insn) {
  if (!L.is_bound()) {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    (this->*insn)(r, bitpos, pc());
    return;
  }
  (this->*insn)(r, bitpos, target(L));
}

// Apply a prefetch emitter with operation op to a label.  For an
// unbound label a patch site is registered and the emitter is given
// the current pc as its address.
void Assembler::wrap_label(Label &L, prfop op, prefetch_insn insn) {
  if (!L.is_bound()) {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    (this->*insn)(pc(), op);
    return;
  }
  (this->*insn)(target(L), op);
}

  // An "all-purpose" add/subtract immediate, per ARM documentation:
  // A "programmer-friendly" assembler may accept a negative immediate
  // between -(2^24 -1) and -1 inclusive, causing it to convert a
  // requested ADD operation to a SUB, or vice versa, and then encode
  // the absolute value of the immediate as for uimm24.
// Encode an add/subtract (immediate) instruction.
//   Rd, Rn      destination and source registers
//   uimm        the immediate, reinterpreted below as a signed int
//   op          opcode bits 31:29; its low bit says whether flags are set
//   negated_op  opcode to use instead when the immediate is negative
void Assembler::add_sub_immediate(Register Rd, Register Rn, unsigned uimm, int op,
                                  int negated_op) {
  // Sample the S bit before op is possibly swapped for negated_op.
  const bool sets_flags = op & 1;

  // View the incoming bits as a signed value.
  union {
    unsigned u;
    int imm;
  };
  u = uimm;

  if (imm < 0) {
    // Negative immediate: flip to the opposite operation on |imm|.
    imm = -imm;
    op = negated_op;
  }
  assert(Rd != sp || imm % 16 == 0, "misaligned stack");

  // A value that is a multiple of 4096 is encoded as imm12 with the
  // shift field set.
  bool shift = false;
  if (imm >= (1 << 11) && ((imm >> 12) << 12 == imm)) {
    imm >>= 12;
    shift = true;
  }

  f(op, 31, 29);
  f(0b10001, 28, 24);
  f(shift, 23, 22);
  f(imm, 21, 10);

  // add/subtract immediate ops with the S bit set treat r31 as zr;
  // with S unset they use sp.
  if (sets_flags) {
    zrf(Rd, 0);
  } else {
    srf(Rd, 0);
  }

  srf(Rn, 5);
}

// Return true if imm can be used as the immediate of an add/subtract
// (immediate) instruction: its magnitude must either fit in 12 bits,
// or fit in 24 bits with the low 12 bits all zero.
bool Assembler::operand_valid_for_add_sub_immediate(long imm) {
  const unsigned long uimm = uabs(imm);
  if (uimm < (1ul << 12))
    return true;
  // Larger values are only accepted when they are a multiple of 4096.
  return uimm < (1ul << 24) && ((uimm >> 12) << 12 == uimm);
}

// Return true if imm has a logical-immediate encoding; the encoder
// reports failure by returning 0xffffffff.
bool Assembler::operand_valid_for_logical_immediate(bool is32, uint64_t imm) {
  const uint32_t encoding = encode_logical_immediate(is32, imm);
  return encoding != 0xffffffff;
}

// Return the raw 64-bit pattern that represents d.
static uint64_t doubleTo64Bits(jdouble d) {
  union {
    jdouble as_double;
    uint64_t as_bits;
  };

  as_double = d;
  return as_bits;
}

// Return true if imm can be materialised as a floating-point
// immediate operand.
bool Assembler::operand_valid_for_float_immediate(double imm) {
  const uint64_t imm_bits = doubleTo64Bits(imm);

  // An all-zero bit pattern can be sourced from ZR.
  if (imm_bits == 0)
    return true;

  // Otherwise round-trip through the encoder and decoder; the value
  // is valid only if it comes back with the identical bit pattern.
  unsigned result = encoding_for_fp_immediate(imm);
  return imm_bits == fp_immediate_for_encoding(result, true);
}

// Record relocation rspec at code address at in the current code
// section.
void Assembler::relocate(address at, const RelocationHolder& rspec) {
  code_section()->relocate(at, rspec);
}

// Record relocation rspec by forwarding to the AbstractAssembler
// implementation.
void Assembler::relocate(const RelocationHolder& rspec) {
  AbstractAssembler::relocate(rspec);
}

// The fill byte for this platform is zero.
int AbstractAssembler::code_fill_byte() {
  const int fill = 0;
  return fill;
}

// n.b. this is implemented in subclass MacroAssembler
void Assembler::bang_stack_with_offset(int offset) {
  // Not provided at this level; reaching here is an error.
  Unimplemented();
}


// these are the functions provided by the simulator which are used to
// encode and decode logical immediates and floating point immediates
//
//   u_int64_t logical_immediate_for_encoding(u_int32_t encoding);
//
//   u_int32_t encoding_for_logical_immediate(u_int64_t immediate);
//
//   u_int64_t fp_immediate_for_encoding(u_int32_t imm8, int is_dp);
//
//   u_int32_t encoding_for_fp_immediate(float immediate);
//
// we currently import these from the simulator library but the
// definitions will need to be moved to here when we switch to real
// hardware.

// and now the routines called by the assembler which encapsulate the
// above encode and decode functions

// Encode imm as a logical immediate, returning 0xffffffff when a
// 32-bit request carries unexpected bits in its upper half.
uint32_t asm_util::encode_logical_immediate(bool is32, uint64_t imm) {
  if (!is32)
    return encoding_for_logical_immediate(imm);

  /* Allow all zeros or all ones in top 32-bits, so that
     constant expressions like ~1 are permitted. */
  const uint64_t upper = imm >> 32;
  if (upper != 0 && upper != 0xffffffff)
    return 0xffffffff;

  /* Replicate the 32 lower bits to the 32 upper bits.  */
  const uint64_t lower = imm & 0xffffffff;
  return encoding_for_logical_immediate(lower | (lower << 32));
}

// Return the packed floating-point immediate encoding for value.
// The value is narrowed to float before encoding; the guarantee
// checks that decoding the result yields value again.
unsigned Assembler::pack(double value) {
  unsigned result = encoding_for_fp_immediate((float)value);
  guarantee(unpack(result) == value,
            "Invalid floating-point immediate operand");
  return result;
}

// Packed operands for  Floating-point Move (immediate)

// Decode a packed floating-point immediate: the decoder's result is
// stored as an unsigned and read back as a float.
static float unpack(unsigned value) {
  union {
    unsigned as_bits;
    float as_float;
  };
  as_bits = fp_immediate_for_encoding(value, 0);
  return as_float;
}

// Implementation of MacroAssembler

// Patch the instruction, or short instruction sequence, starting at
// branch so that it refers to target.  The instruction at branch is
// decoded to decide which immediate field(s) to rewrite.
//
// Returns the size in bytes of the sequence that was patched
// (number of instructions * NativeInstruction::instruction_size).
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Word offset used by all the single-instruction branch/load forms.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    // Bit 31 distinguishes the page-relative form (adrp) from adr.
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 3 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first 2 cases we must check that Rx is the same in the
      // adrp and the subsequent ldr/str or add instruction. Otherwise
      // we could accidentally end up treating a type 3 relocation as
      // a type 1 or 2 just because it happened to be followed by a
      // random unrelated ldr/str or add instruction.
      //
      // In the case of a type 3 relocation, we know that these are
      // only generated for the safepoint polling page, the crc table
      // base or the card type byte map base so we assert as much
      // and of course that the offset is 0.
      //
      // In jdk7 the card type byte map base is aligned on a 1K
      // boundary which may fail to be 4K aligned. In that case the
      // card table load will fall into category 2.

      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
              Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The immediate is scaled by the access size held in bits 31:30.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        assert (((jbyte *)target !=
                 ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base) ||
                (offset_lo & 0x3FFl) == 0, "offset must be 0x400 aligned for crc_table");
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else {
        assert((jbyte *)target ==
                ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
               target == StubRoutines::crc_table_addr() ||
               (address)target == os::get_polling_page(),
               "adrp must be polling page, crc_table or byte map base");
        assert(offset_lo == 0, "offset must be 0 for polling page, crc_table or byte map base");
      }
    }
    // Split the (byte or page) offset into the two-bit low part held
    // in bits 30:29 and the signed high part held in bits 23:5.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(pd_call_destination(branch) == target, "should be");
    // Three instructions (at branch, branch+4 and branch+8) were
    // rewritten above, so the reported size must cover all of them.
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch the move-immediate sequence at insn_addr so that it
// materialises the oop o.  Returns the size in bytes of the patched
// sequence (two instructions for a narrow oop, three for a wide one).
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  // Both the narrow and the wide sequence have a movk as their
  // second instruction.
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    // Successive 16-bit chunks go into the three instructions.
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

// Decode the instruction insn, located at insn_addr, and return the
// address it refers to.  Branch-like forms fall through to the common
// return at the bottom, which adds the decoded word offset (scaled by
// 4) to insn_addr; the other forms return directly.
//
// NOTE(review): the load-literal test below masks with 0b011011 whereas
// pd_patch_instruction_size masks with 0b111011 -- confirm whether the
// looser match here is intentional.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    // Reassemble the immediate: low two bits from 30:29, the signed
    // remainder from 23:5.
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    // Bit 31 set selects the page-relative form, whose immediate
    // counts 4K pages.
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      // Clear the in-page bits to obtain the page base.
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   [ 2 - adrp    Rx, target_page         ] Not handled
      //   [    add     Ry, Rx, #offset_in_page  ]
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the case of type 1 we check that the register is the same and
      // return the target_page + the offset within the page.
      //
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only. The only cases this is generated is for
      // the safepoint polling page or for the card table byte map base so
      // we assert as much.
      //
      // Note: Strangely, we do not handle 'type 2' relocation (adrp followed
      // by add) which is handled in pd_patch_instruction above.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
		Instruction_aarch64::extract(insn, 4, 0) ==
			Instruction_aarch64::extract(insn2, 9, 5)) {
	// Load/store register (unsigned immediate)
	// The stored immediate is scaled by the access size in bits 31:30.
	unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
	unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
	return address(target_page + (byte_offset << size));
      } else {
	assert((jbyte *)target_page ==
		((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
               (address)target_page == os::get_polling_page(),
	       "adrp must be polling page or byte map base");
	return (address)target_page;
      }
    } else {
      // The non-page form is not expected here.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    // Reassemble the 48-bit value from the three 16-bit immediates.
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
		   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
		   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // This form (the one pd_patch_instruction_size treats as
    // "nothing to do") has no target.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  // Common exit for the branch forms: offset is in instruction words.
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a dsb with the SY option.  Neither register argument is used.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  const Assembler::barrier kind = Assembler::SY;
  dsb(kind);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  if (clear_pc) {
    str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
  }
}

// Calls to C land
//
// When entering C land, the frame pointer (rfp) and stack pointer of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame in the current thread (rthread).
//   last_java_sp  register holding the sp to record; sp itself is first
//                 copied into scratch, an invalid register means esp
//   last_java_fp  optional; stored only if valid
//   last_java_pc  optional; stored only if valid
//   scratch       used only when last_java_sp is sp
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {
  if (last_java_pc->is_valid()) {
    Address pc_slot(rthread,
                    JavaThread::frame_anchor_offset()
                    + JavaFrameAnchor::last_Java_pc_offset());
    str(last_java_pc, pc_slot);
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    // Go through scratch rather than storing sp directly.
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  Address sp_slot(rthread, JavaThread::last_Java_sp_offset());
  str(last_java_sp, sp_slot);

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    Address fp_slot(rthread, JavaThread::last_Java_fp_offset());
    str(last_java_fp, fp_slot);
  }
}

// Variant taking the last Java pc as a code address.  The address is
// materialised into scratch with adr and stored in the thread's frame
// anchor; sp and fp are then recorded by the register variant.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  // FIXME: Falling back to the current pc is almost never correct.  We
  // should delete all cases of set_last_Java_frame with
  // last_java_pc=NULL and use the correct return address instead.
  adr(scratch, last_java_pc != NULL ? last_java_pc : pc());

  Address pc_slot(rthread,
                  JavaThread::frame_anchor_offset()
                  + JavaFrameAnchor::last_Java_pc_offset());
  str(scratch, pc_slot);

  // The pc is already stored, so pass noreg for it.
  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// Variant taking the last Java pc as a label.  A bound label is
// resolved to its address; for an unbound label a patch site is
// registered and the address variant is called with a NULL pc.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (!L.is_bound()) {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
    return;
  }
  set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
}

// Emit the biased-locking fast path for acquiring the lock on the
// object in obj_reg.
//   lock_reg, obj_reg, swap_reg  must be distinct registers
//   tmp_reg                      temporary; rscratch2 is used if noreg
//   swap_reg_contains_mark       if false, the mark word is loaded from
//                                the object into swap_reg here
//   done                         branched to when the lock was taken
//                                via biasing
//   slow_case                    passed to cmpxchgptr as the failure
//                                target for the bias/rebias attempts
//   counters                     optional statistics counters
// Code falls through to the end of the emitted sequence when the
// caller should continue with the normal CAS-based locking scheme.
// Returns the code offset of the mark-word load (the implicit null
// check), or -1 if no load was emitted.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  if (tmp_reg == noreg) {
    tmp_reg = rscratch2;
  }
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// Emit the biased-locking check on unlock: load the mark word of the
// object in obj_reg and branch to done if it carries the biased-lock
// pattern.  temp_reg is clobbered.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  Address mark_addr(obj_reg, oopDesc::mark_offset_in_bytes());
  ldr(temp_reg, mark_addr);
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}


// added to make this compile

// NOTE(review): presumably this supplies the definition of the noreg
// register constant used throughout this file -- confirm against the
// REGISTER_DEFINITION macro.
REGISTER_DEFINITION(Register, noreg);

// Move arg into c_rarg0 unless it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (arg == c_rarg0) {
    return;
  }
  masm->mov(c_rarg0, arg);
}

// Move arg into c_rarg1 unless it is already there.
static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (arg == c_rarg1) {
    return;
  }
  masm->mov(c_rarg1, arg);
}

// Move arg into c_rarg2 unless it is already there.
static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (arg == c_rarg2) {
    return;
  }
  masm->mov(c_rarg2, arg);
}

// Move arg into c_rarg3 unless it is already there.
static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (arg == c_rarg3) {
    return;
  }
  masm->mov(c_rarg3, arg);
}

// Common code for calling into the VM at entry_point.
//   oop_result           if valid, receives the oop result afterwards
//   java_thread          thread register; noreg means rthread (and only
//                        rthread is accepted)
//   last_java_sp         sp register to record; noreg means esp
//   number_of_arguments  passed through to call_VM_leaf_base
//   check_exceptions     if true, emit a pending-exception check that
//                        jumps to the forward-exception stub
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // The label is used as the last Java pc and is handed on to
  // call_VM_leaf_base.
  Label retaddr;
  set_last_Java_frame(last_java_sp, rfp, retaddr, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &retaddr);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    Label no_exception;
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    cbz(rscratch1, no_exception);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(no_exception);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    // !!! FIXME AARCH64 -- retained this, it is in sparc but not in x86 !!!
    get_vm_result(oop_result, java_thread);
  }
}

// Forward to call_VM_base with the thread and last-Java-sp registers
// left as noreg.
void MacroAssembler::call_VM_helper(Register oop_result,
                                    address entry_point,
                                    int number_of_arguments,
                                    bool check_exceptions) {
  call_VM_base(oop_result,
               noreg,   // java_thread
               noreg,   // last_java_sp
               entry_point,
               number_of_arguments,
               check_exceptions);
}

// Call entry.  The reachability test is currently hard-wired to true,
// so a direct bl is always emitted; the lea/blr path is kept for when
// a real test is available.
void MacroAssembler::call(Address entry) {
  const bool is_reachable = true; // reachable(entry)
  if (!is_reachable) {
    lea(rscratch1, entry);
    blr(rscratch1);
    return;
  }
  bl(entry);
}

// Implementation of call_VM versions

// call_VM with no register arguments.
void MacroAssembler::call_VM(Register oop_result, address entry_point,
                             bool check_exceptions) {
  const int nargs = 0;
  call_VM_helper(oop_result, entry_point, nargs, check_exceptions);
}

// call_VM with one register argument, which is moved to c_rarg1.
void MacroAssembler::call_VM(Register oop_result, address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  const int nargs = 1;
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, nargs, check_exceptions);
}

// call_VM with two register arguments (moved to c_rarg1, c_rarg2).
// The higher-numbered argument register is filled first, so arg_1
// must not already live in c_rarg2.
void MacroAssembler::call_VM(Register oop_result, address entry_point,
                             Register arg_1, Register arg_2,
                             bool check_exceptions) {
  const int nargs = 2;
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, nargs, check_exceptions);
}

// call_VM with three register arguments (moved to c_rarg1..c_rarg3).
// Argument registers are filled from the highest down; the asserts
// check that no pending argument sits in a register about to be
// overwritten.
void MacroAssembler::call_VM(Register oop_result, address entry_point,
                             Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  const int nargs = 3;

  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);

  call_VM_helper(oop_result, entry_point, nargs, check_exceptions);
}

// call_VM with an explicit last_java_sp and an argument count; the
// thread register passed to call_VM_base is rthread.
void MacroAssembler::call_VM(Register oop_result, Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result,
               rthread,
               last_java_sp,
               entry_point,
               number_of_arguments,
               check_exceptions);
}

// call_VM with an explicit last_java_sp and one register argument,
// which is moved to c_rarg1.
void MacroAssembler::call_VM(Register oop_result, Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  const int nargs = 1;
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, nargs, check_exceptions);
}

// call_VM with an explicit last_java_sp and two register arguments
// (moved to c_rarg1, c_rarg2).  c_rarg2 is filled first, so arg_1
// must not already live there.
void MacroAssembler::call_VM(Register oop_result, Register last_java_sp,
                             address entry_point,
                             Register arg_1, Register arg_2,
                             bool check_exceptions) {
  const int nargs = 2;
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, nargs, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  // Arguments are marshalled highest-numbered first; verify that none of the
  // arguments still waiting to be marshalled would be overwritten on the way.
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);

  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  // Fetch the thread's vm_result slot into oop_result, then clear the slot by
  // storing zr, and finally (when oop verification is on) check the value.
  Address vm_result_slot(java_thread, JavaThread::vm_result_offset());

  ldr(oop_result, vm_result_slot);
  str(zr, vm_result_slot);
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  // Fetch the thread's vm_result_2 slot into metadata_result and clear the
  // slot by storing zr.  No oop verification is done on this value.
  Address vm_result_2_slot(java_thread, JavaThread::vm_result_2_offset());

  ldr(metadata_result, vm_result_2_slot);
  str(zr, vm_result_2_slot);
}

void MacroAssembler::align(int modulus) {
  // Pad with nops until the current code offset is a multiple of modulus.
  for (;;) {
    if (offset() % modulus == 0) {
      break;
    }
    nop();
  }
}

// these are no-ops overridden by InterpreterMacroAssembler

// Base-class hook: emits nothing here.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

// Base-class hook: emits nothing here.
void MacroAssembler::check_and_handle_popframe(Register java_thread) { }

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  // If the delayed value is already non-zero at code generation time it can
  // be folded, together with offset, into a constant and no code is emitted.
  const intptr_t known_value = *delayed_value_addr;
  if (known_value != 0) {
    return RegisterOrConstant(known_value + offset);
  }

  // Otherwise emit code that reads the value from its home location into tmp
  // (load indirectly to solve generation ordering problem) ...
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  // ... and applies the offset at run time.
  if (offset != 0) {
    add(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

void MacroAssembler:: notify(int type) {
  // Every notification type is forwarded to Assembler::notify.
  if (type != bytecode_start) {
    Assembler:: notify(type);
    return;
  }

  // bytecode_start: the frame set-up/tear-down around the notification is
  // currently disabled, so this path is equivalent to the one above.
  // set_last_Java_frame(esp, rfp, (address)NULL);
  Assembler:: notify(type);
  // reset_last_Java_frame(true, false);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  // Register usage: recv_klass is modified (the scaled itable index and the
  // method offset are added to it), scan_temp is scratch, and method_result
  // doubles as the scan's working register before receiving the result.
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = instanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_temp = vtable length (a 32-bit field)
  ldrw(scan_temp, Address(recv_klass, instanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * 8 + vtable_base
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The compare sequence is emitted twice: the first (peeled) copy branches
  // forward to found_method on a hit; the second copy sits at the bottom of
  // the loop and branches back to 'search' on a miss, falling through to
  // found_method on a hit.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  // NOTE(review): this is a 64-bit load of the itableOffsetEntry offset field,
  // whereas the vtable length above is read with a 32-bit ldrw.  Confirm the
  // width of the offset field; if it is a 32-bit int this should be ldrw.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  // method_result = *(adjusted recv_klass + offset of this interface's itable)
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  // Loads into method_result the method slot of vtable entry vtable_index of
  // recv_klass.  The index may be a register or a compile-time constant.
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");

  // Byte offset, from the klass, of the method slot of vtable entry 0.
  const int first_method_offset =
      instanceKlass::vtable_start_offset() * wordSize
      + vtableEntry::method_offset_in_bytes();

  if (!vtable_index.is_register()) {
    // Constant index: fold everything into a single displacement.
    const int entry_offset =
        first_method_offset + vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, entry_offset));
    return;
  }

  // Register index: scale it onto the klass first, then load the slot.
  lea(method_result, Address(recv_klass,
                             vtable_index.as_register(),
                             Address::lsl(LogBytesPerWord)));
  ldr(method_result, Address(method_result, first_method_offset));
}

// Emits a complete subtype check: the fast path followed by the slow path.
// Control is transferred to L_success when the check succeeds; on failure
// execution falls through to the code following this sequence.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  // Fast path: NULL slow-path label means "fall through into the slow path".
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  // Slow path: NULL failure label means "fall through on failure".
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


// Emits the fast part of a subtype check.
//   sub_klass, super_klass : the klasses to compare (not modified here)
//   temp_reg               : receives the super check offset when it has to
//                            be loaded (super_check_offset constant == -1)
//   L_success / L_failure / L_slow_path :
//                            branch targets for the three outcomes; at most
//                            one may be NULL, meaning "fall through"
//   super_check_offset     : register, known constant, or -1 for "load it"
// rscratch1 is used as scratch for the displayed supertype.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Replace a NULL label by the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
  // (It emits nothing when the target is the fall-through label itself.)
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The super check offset is read with a 32-bit load into temp_reg.
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);      // and compare it with the wanted super

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset only known at run time: a hit is success; on a miss the offset
    // decides between slow path (it addressed the cache) and failure.
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// Scans count pointer-sized words at [addr] for an occurrence of value
// (generic version).
//
// addr is advanced past each word examined, count is decremented for each
// word that did not match, and scratch is clobbered.  On exit the condition
// flags are those of the last compare (EQ on a hit).  When count is zero on
// entry no compare is emitted at all, so the flags are whatever the caller
// set up beforehand.
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
				Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// Scans count 4-byte words at [addr] for an occurrence of value
// (generic version).
//
// Same contract as repne_scan, but the elements are loaded and compared as
// 32-bit quantities and addr advances by wordSize/2 per element.
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
				Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  // !!! FIXME AARCH64 -- if this only gets called when CompressedOops
  // is true and repne_scan only gets called when CompressedOops is
  // false then the size passed in the post call should be heapOopSize
  // both here and in repne_scan above. if it is used more generally
  // for 32 bit searches and repne_scan is used for 64 bit searches
  // then size needs to be wordSize/2 here and wordSize above.
  ldrw(scratch, post(addr, wordSize/2));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// Emits the slow part of a subtype check: a linear scan of sub_klass's
// secondary supers array for super_klass.
//   sub_klass, super_klass : the klasses to compare (preserved)
//   temp_reg, temp2_reg    : temps; r0/r2/r5 are not spilled when they are
//                            one of these (temp2_reg may be noreg)
//   L_success / L_failure  : branch targets; at most one may be NULL,
//                            meaning "fall through"
//   set_cond_codes         : not used by this implementation
// On success super_klass is also stored into sub_klass's secondary super
// cache.  r0, r2 and r5 are used by the scan and are saved/restored around it
// unless they are temps; rscratch1 (and rscratch2 in non-PRODUCT builds) are
// clobbered.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  // Replace a NULL label by the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The scan below uses fixed registers (r0, r2, r5), which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // r2 is overwritten with the array length

  // Work out which of the fixed registers need saving.
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  // r0 is modified either by the move below or by the oop encoding.
  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).  This has to
  // happen before r5 and r2 are overwritten below; without it the scan would
  // search for whatever r0 happened to contain.
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the partial subtype check counter.
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the 32 bit array length.
  ldrw(r2, Address(r5, arrayOopDesc::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, arrayOopDesc::base_offset_in_bytes(T_OBJECT));

  // This part is tricky, as values in supers array could be 32 or 64 bit wide
  // and we store values in objArrays always encoded, thus we need to encode
  // the value of r0 before repne.  Note that r0 is dead after the repne.
  if (UseCompressedOops) {
    encode_heap_oop_not_null(r0); // Changes flags.
    cmp(sp, zr); // Clear Z flag; SP is never zero
    repne_scanw(r5, r0, r2, rscratch1);
  } else {
    cmp(sp, zr); // Clear Z flag; SP is never zero
    // Scan R2 words at [R5] for an occurrence of R0.
    // Set NZ/Z based on last compare.
    repne_scan(r5, r0, r2, rscratch1);
  }

  // Unspill the temp. registers (the flags from the scan are still needed
  // by the branch that follows):
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Emits a call to the verify_oop stub for the value in reg.  Nothing is
// emitted unless VerifyOops is set.  s is included in the message passed to
// the stub.  r0, rscratch1, rscratch2 and lr are saved on the stack around
// the call and restored afterwards.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the message and keep a copy of it via code_string.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // Arguments for the stub: the oop in r0, the message in rscratch1.
  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop, but the value to verify is loaded from memory at addr.
// Nothing is emitted unless VerifyOops is set.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    // Build the message and keep a copy of it via code_string.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    // Four words were pushed above, hence the 4 * wordSize correction.
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // Returns the address, relative to esp, of expression stack slot
  // arg_slot + extra_slot_offset.  For a register slot index code is emitted
  // and rscratch1 is clobbered; for a constant one nothing is emitted.
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif

  if (!arg_slot.is_constant()) {
    // rscratch1 = esp + (slot << log2(element size))
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }

  return Address(esp, arg_slot.as_constant() * stackElementSize
                 + offset);
}

// Leaf call with only general-purpose arguments: forwards to
// call_VM_leaf_base1 with zero floating point arguments and an integral
// return type.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
				       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

// Emits a leaf call to entry_point.
//   number_of_gp_arguments / number_of_fp_arguments / type :
//            argument counts and return type handed on to blrt
//   retaddr: if non-NULL, bound to the instruction following the call
// rscratch1 and rmethod are saved on the stack around the call.
void MacroAssembler::call_VM_leaf_base1(address entry_point,
					int number_of_gp_arguments,
					int number_of_fp_arguments,
					ret_type type,
					Label *retaddr) {
  // !!! FIXME AARCH64 we normally need to save rmethod as it is
  // volatile.  however we don't need to when calling from the
  // interpreter.
  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

// Leaf call whose arguments have already been marshalled by the caller.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

// Leaf call with one register argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

// Leaf call with two register arguments.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  // The arguments are marshalled in ascending order here, so arg_1 must not
  // be sitting in c_rarg0 unless marshalling arg_0 leaves c_rarg0 unchanged.
  assert(arg_1 != c_rarg0 || arg_0 == c_rarg0, "smashed arg");
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

// Leaf call with three register arguments.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
				  Register arg_1, Register arg_2) {
  // The arguments are marshalled in ascending order here, so a later argument
  // must not be sitting in an argument register that an earlier step has
  // already overwritten with a different value.
  assert(arg_1 != c_rarg0 || arg_0 == c_rarg0, "smashed arg");
  assert(arg_2 != c_rarg0 || arg_0 == c_rarg0, "smashed arg");
  assert(arg_2 != c_rarg1 || arg_1 == c_rarg1, "smashed arg");
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

// One-argument leaf call that names MacroAssembler::call_VM_leaf_base
// explicitly, so this class's implementation is always the one used.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

// Two-argument leaf call that names MacroAssembler::call_VM_leaf_base
// explicitly.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  // arg_1 is marshalled first, so arg_0 must not be sitting in c_rarg1.
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  // Three-argument leaf call that names MacroAssembler::call_VM_leaf_base
  // explicitly.  Arguments are marshalled highest-numbered first; make sure
  // none of the ones still pending gets overwritten on the way.
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  assert(arg_0 != c_rarg1, "smashed arg");

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);

  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  // Four-argument leaf call that names MacroAssembler::call_VM_leaf_base
  // explicitly.  Arguments are marshalled highest-numbered first; make sure
  // none of the ones still pending gets overwritten on the way.
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  assert(arg_0 != c_rarg1, "smashed arg");

  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);

  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (!needs_explicit_null_check(offset)) {
    // Nothing to emit: the (later) access of M[reg + offset] will itself
    // provoke the OS NULL exception if reg = NULL.
    return;
  }

  // Provoke the OS NULL exception now if reg = NULL by accessing M[reg]
  // without changing any registers (the load targets zr).
  // NOTE: this is plenty to provoke a segv
  ldr(zr, Address(reg));
}

// MacroAssembler protected routines needed to implement
// public methods

// Loads the target of dest into r as a pointer constant (via movptr) and
// records dest's relocation spec at the start of the emitted sequence.
void MacroAssembler::mov(Register r, Address dest) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
//
// The sequence always has the same shape (one movz followed by two movk),
// whatever the value of imm64.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    // Annotate the generated code with the constant being materialized.
    // (The space before PRIX64 keeps the literal from being lexed as a
    // user-defined-literal suffix by C++11 compilers.)
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  movz(r, imm64 & 0xffff);        // bits 0..15
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);    // bits 16..31
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);    // bits 32..47
}

// Materializes the 64-bit constant imm64 in dst using as few instructions as
// this scheme finds: a single ORR with a logical immediate when the value is
// encodable as one, otherwise a MOVZ or MOVN followed by MOVKs for the
// 16-bit chunks that differ from the all-zeros / all-ones background.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    // Annotate the generated code with the constant being materialized.
    // (The space before PRIX64 keeps the literal from being lexed as a
    // user-defined-literal suffix by C++11 compilers.)
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    // Split into four 16-bit chunks and count the all-zeros and all-ones ones.
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
	zero_count++;
      } else if (imm_h[i] == 0xffffL) {
	neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      // one MOVZ will do
      for (i = 0; i < 4; i++) {
	if (imm_h[i] != 0L) {
	  movz(dst, (u_int32_t)imm_h[i], (i << 4));
	  break;
	}
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (i = 0; i < 4; i++) {
	if (imm_h[i] != 0xffffL) {
	  movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
	  break;
	}
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
	if (imm_h[i] != 0L) {
	  movz(dst, (u_int32_t)imm_h[i], (i << 4));
	  i++;
	  break;
	}
      }
      for (;i < 4; i++) {
	if (imm_h[i] != 0L) {
	  movk(dst, (u_int32_t)imm_h[i], (i << 4));
	}
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
	if (imm_h[i] != 0xffffL) {
	  movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
	  i++;
	  break;
	}
      }
      for (;i < 4; i++) {
	if (imm_h[i] != 0xffffL) {
	  movk(dst, (u_int32_t)imm_h[i], (i << 4));
	}
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
	if (imm_h[i] != 0L) {
	  movz(dst, (u_int32_t)imm_h[i], (i << 4));
	  i++;
	  break;
	}
      }
      for (;i < 4; i++) {
	if (imm_h[i] != 0x0L) {
	  movk(dst, (u_int32_t)imm_h[i], (i << 4));
	}
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
	if (imm_h[i] != 0xffffL) {
	  movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
	  i++;
	  break;
	}
      }
      for (;i < 4; i++) {
	if (imm_h[i] != 0xffffL) {
	  movk(dst, (u_int32_t)imm_h[i], (i << 4));
	}
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
	movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// Materializes the 32-bit constant imm32 in dst: a single ORR with a logical
// immediate when the value is encodable as one, otherwise a single MOVZ or
// MOVN when one 16-bit half is all zeros or all ones, otherwise MOVZ + MOVK.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
    {
      // Annotate the generated code with the constant being materialized.
      // (The space before PRIX32 keeps the literal from being lexed as a
      // user-defined-literal suffix by C++11 compilers.)
      char buffer[64];
      snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
      block_comment(buffer);
    }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use a single MOVZ or MOVN, or a MOVZ followed by a MOVK,
    // to build up the constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      // low half is zero: MOVZ of the high half
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      // low half is all ones: MOVN of the inverted high half
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      // high half is zero: MOVZ of the low half
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      // high half is all ones: MOVN of the inverted low half
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
//
// Three strategies are tried in turn: use the offset directly as an
// immediate, split it into an add-immediate plus a small immediate offset,
// or load the whole offset into Rd and add the base.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    // Work in units of (1 << shift): bits 12..23 of the scaled offset go
    // into an add, the remainder stays in the returned Address.
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
	&& Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Emits a load-exclusive / store-exclusive retry loop that adds 1 to the
// 32-bit word at [counter_addr].  tmp is clobbered.
//
// NOTE(review): stxrw below is given tmp both as the register that receives
// the store status and as the register holding the data to store.  The Arm
// Architecture Reference Manual appears to make STXR with identical status
// and data registers CONSTRAINED UNPREDICTABLE -- confirm, and if so give the
// status its own register (this needs an additional temp in the signature).
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
  Label retry_load;
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
  stxrw(tmp, tmp, counter_addr);
  cbnzw(tmp, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
				    bool want_remainder, Register scratch)
{
  // Java idiv / irem on 32-bit operands.
  //
  //   ra      : dividend
  //   rb      : divisor
  //   result  : quotient (ra idiv rb), or remainder (ra irem rb) when
  //             want_remainder is set
  //   scratch : holds the quotient while the remainder is computed; must
  //             differ from both ra and rb
  //
  // Returns the code offset of the divide instruction, which may be needed
  // for implicit exceptions.

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  const int idivl_offset = offset();

  if (want_remainder) {
    // remainder = ra - (ra / rb) * rb
    sdivw(scratch, ra, rb);
    Assembler::msubw(result, scratch, rb, ra);
  } else {
    sdivw(result, ra, rb);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
				    bool want_remainder, Register scratch)
{
  // Java ldiv / lrem on 64-bit operands.
  //
  //   ra      : dividend
  //   rb      : divisor
  //   result  : quotient (ra idiv rb), or remainder (ra irem rb) when
  //             want_remainder is set
  //   scratch : holds the quotient while the remainder is computed; must
  //             differ from both ra and rb
  //
  // Returns the code offset of the divide instruction, which may be needed
  // for implicit exceptions.

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  const int idivq_offset = offset();

  if (want_remainder) {
    // remainder = ra - (ra / rb) * rb
    sdiv(scratch, ra, rb);
    msub(result, scratch, rb, ra);
  } else {
    sdiv(result, ra, rb);
  }

  return idivq_offset;
}

// MacroAssembler routines found actually to be needed

// Pushes src: esp is pre-decremented by one word and src is stored there.
void MacroAssembler::push(Register src)
{
  str(src, Address(pre(esp, -1 * wordSize)));
}

// Pops into dst: loads the word at esp, then post-increments esp by one word.
void MacroAssembler::pop(Register dst)
{
  ldr(dst, Address(post(esp, 1 * wordSize)));
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // Emit a halfword load (ldrh) and return the code offset at which it was
  // emitted.
  const int insn_offset = offset();
  ldrh(dst, src);
  return insn_offset;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // Emit a byte load (ldrb) and return the code offset at which it was
  // emitted.
  const int insn_offset = offset();
  ldrb(dst, src);
  return insn_offset;
}

int MacroAssembler::load_signed_short(Register dst, Address src) {
  // Emit a signed halfword load (ldrsh) and return the code offset at which
  // it was emitted.
  const int insn_offset = offset();
  ldrsh(dst, src);
  return insn_offset;
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  // Emit a signed byte load (ldrsb) and return the code offset at which it
  // was emitted.
  const int insn_offset = offset();
  ldrsb(dst, src);
  return insn_offset;
}

int MacroAssembler::load_signed_short32(Register dst, Address src) {
  // Emit the 32-bit form of the signed halfword load (ldrshw) and return the
  // code offset at which it was emitted.
  const int insn_offset = offset();
  ldrshw(dst, src);
  return insn_offset;
}

int MacroAssembler::load_signed_byte32(Register dst, Address src) {
  // Emit the 32-bit form of the signed byte load (ldrsbw) and return the
  // code offset at which it was emitted.
  const int insn_offset = offset();
  ldrsbw(dst, src);
  return insn_offset;
}

void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  // Loads a value of 1, 2, 4 or 8 bytes from src into dst.  is_signed only
  // affects the 1- and 2-byte cases; dst2 is not used.
  if (size_in_bytes == 8) {
    ldr(dst, src);
  } else if (size_in_bytes == 4) {
    ldrw(dst, src);
  } else if (size_in_bytes == 2) {
    if (is_signed) {
      load_signed_short(dst, src);
    } else {
      load_unsigned_short(dst, src);
    }
  } else if (size_in_bytes == 1) {
    if (is_signed) {
      load_signed_byte(dst, src);
    } else {
      load_unsigned_byte(dst, src);
    }
  } else {
    ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  // Stores the low 1, 2, 4 or 8 bytes of src to dst.  src2 is not used.
  if (size_in_bytes == 8) {
    str(src, dst);
  } else if (size_in_bytes == 4) {
    strw(src, dst);
  } else if (size_in_bytes == 2) {
    strh(src, dst);
  } else if (size_in_bytes == 1) {
    strb(src, dst);
  } else {
    ShouldNotReachHere();
  }
}

void MacroAssembler::decrementw(Register reg, int value)
{
  // 32-bit reg -= value.  Negative values are handed to incrementw, zero
  // emits nothing, values below 1 << 12 use an immediate, and anything larger
  // goes through rscratch2 (so reg must not be rscratch2 in that case).
  if (value < 0) {
    incrementw(reg, -value);
  } else if (value == 0) {
    // nothing to emit
  } else if (value < (1 << 12)) {
    subw(reg, reg, value);
  } else {
    guarantee(reg != rscratch2, "invalid dst for register decrement");
    movw(rscratch2, (unsigned)value);
    subw(reg, reg, rscratch2);
  }
}

void MacroAssembler::decrement(Register reg, int value)
{
  // 64-bit reg -= value.  Negative values are handed to increment, zero
  // emits nothing, values below 1 << 12 use an immediate, and anything larger
  // goes through rscratch2 (so reg must not be rscratch2 in that case).
  if (value < 0) {
    increment(reg, -value);
  } else if (value == 0) {
    // nothing to emit
  } else if (value < (1 << 12)) {
    sub(reg, reg, value);
  } else {
    assert(reg != rscratch2, "invalid dst for register decrement");
    mov(rscratch2, (unsigned long)value);
    sub(reg, reg, rscratch2);
  }
}

// Emits a load / decrement / store sequence on the word at dst, using
// rscratch1 as the working register (so dst must not be formed from it).
// The emitted sequence is a plain ldrw/strw pair.
void MacroAssembler::decrementw(Address dst, int value)
{
  const Register scratch = rscratch1;
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  ldrw(scratch, dst);
  decrementw(scratch, value);
  strw(scratch, dst);
}

// Emits a load / decrement / store sequence on the value at dst, using
// rscratch1 as the working register (so dst must not be formed from it).
// The emitted sequence is a plain ldr/str pair.
void MacroAssembler::decrement(Address dst, int value)
{
  const Register scratch = rscratch1;
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  ldr(scratch, dst);
  decrement(scratch, value);
  str(scratch, dst);
}

// Emits code adding the constant value to reg using the w-form
// instructions.  May clobber rscratch2 when value is 1 << 12 or more.
void MacroAssembler::incrementw(Register reg, int value)
{
  // Nothing to emit for a zero adjustment.
  if (value == 0) return;

  if (value < 0) {
    // A negative increment is a decrement by the magnitude.
    decrementw(reg, -value);
  } else if (value < (1 << 12)) {
    // Below 1 << 12 the amount is handed to addw directly as an immediate.
    addw(reg, reg, value);
  } else {
    // Otherwise the amount is staged in rscratch2 first, which is why
    // reg itself must not be rscratch2.
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    addw(reg, reg, rscratch2);
  }
}

// Emits code adding the constant value to reg.  May clobber rscratch2
// when value is 1 << 12 or more.
void MacroAssembler::increment(Register reg, int value)
{
  // Nothing to emit for a zero adjustment.
  if (value == 0) return;

  if (value < 0) {
    // A negative increment is a decrement by the magnitude.
    decrement(reg, -value);
  } else if (value < (1 << 12)) {
    // Below 1 << 12 the amount is handed to add directly as an immediate.
    add(reg, reg, value);
  } else {
    // Otherwise the amount is staged in rscratch2 first, which is why
    // reg itself must not be rscratch2.  value is positive on this path.
    assert(reg != rscratch2, "invalid dst for register increment");
    movw(rscratch2, (unsigned)value);
    add(reg, reg, rscratch2);
  }
}

// Emits a load / increment / store sequence on the word at dst, using
// rscratch1 as the working register (so dst must not be formed from it).
// The emitted sequence is a plain ldrw/strw pair.
void MacroAssembler::incrementw(Address dst, int value)
{
  const Register scratch = rscratch1;
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldrw(scratch, dst);
  incrementw(scratch, value);
  strw(scratch, dst);
}

// Emits a load / increment / store sequence on the value at dst, using
// rscratch1 as the working register (so dst must not be formed from it).
// The emitted sequence is a plain ldr/str pair.
void MacroAssembler::increment(Address dst, int value)
{
  const Register scratch = rscratch1;
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldr(scratch, dst);
  increment(scratch, value);
  str(scratch, dst);
}


// Pushes the registers selected by bits 0..30 of the mask onto the stack
// addressed by sp (see push() for the layout).
void MacroAssembler::pusha() {
  const unsigned int regs_0_to_30 = 0x7fffffff;
  push(regs_0_to_30, sp);
}

// Pops the registers selected by bits 0..30 of the mask from the stack
// addressed by sp; the inverse of pusha().
void MacroAssembler::popa() {
  const unsigned int regs_0_to_30 = 0x7fffffff;
  pop(regs_0_to_30, sp);
}

// Push the registers named in bitset (bit n selects register n, for n in
// 0..30) onto the stack addressed by `stack`.  Don't push sp.
// Registers are stored in pairs; zr is appended to the selection so that
// an odd selection is padded out to an even number of slots.
// Returns the number of words pushed.
int MacroAssembler::push(unsigned int bitset, Register stack) {
  // Collect the encodings of the selected registers in ascending order.
  unsigned char selected[32];
  int n = 0;
  for (int r = 0; r <= 30; r++) {
    if ((bitset >> r) & 1) {
      selected[n++] = r;
    }
  }
  // Append zr as a possible pad, then round down to an even count: for
  // an odd selection the pad is kept, for an even one it is dropped.
  selected[n++] = zr->encoding_nocheck();
  n &= ~1;

  int words_pushed = 0;

  // The first pair allocates the whole area with a pre-indexed store ...
  if (n != 0) {
    stp(as_Register(selected[0]), as_Register(selected[1]),
        Address(pre(stack, -n * wordSize)));
    words_pushed += 2;
  }
  // ... and the remaining pairs are stored at positive offsets into it.
  for (int slot = 2; slot < n; slot += 2) {
    stp(as_Register(selected[slot]), as_Register(selected[slot + 1]),
        Address(stack, slot * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == n, "oops, pushed != count");

  return n;
}

// Pop the registers named in bitset (bit n selects register n, for n in
// 0..30) from the stack addressed by `stack`, mirroring push(): zr pads
// an odd selection to an even number of slots.
// Returns the number of words popped.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  // Collect the encodings of the selected registers in ascending order.
  unsigned char selected[32];
  int n = 0;
  for (int r = 0; r <= 30; r++) {
    if ((bitset >> r) & 1) {
      selected[n++] = r;
    }
  }
  // Same padding rule as push().
  selected[n++] = zr->encoding_nocheck();
  n &= ~1;

  int words_popped = 0;

  // Reload every pair except the first from its offset in the area ...
  for (int slot = 2; slot < n; slot += 2) {
    ldp(as_Register(selected[slot]), as_Register(selected[slot + 1]),
        Address(stack, slot * wordSize));
    words_popped += 2;
  }
  // ... then reload the first pair with a post-indexed load that also
  // releases the whole area.
  if (n != 0) {
    ldp(as_Register(selected[0]), as_Register(selected[1]),
        Address(post(stack, n * wordSize)));
    words_popped += 2;
  }

  assert(words_popped == n, "oops, pushed != count");

  return n;
}
#ifdef ASSERT
// Debug-build hook intended to check that rheapbase still holds the
// narrow oop base.  The whole body is currently compiled out with
// "#if 0", so this function emits no code and msg is unused.
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
#endif

void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  // call(c_rarg3);
  blrt(c_rarg3, 3, 0, 1);
  hlt(0);
}

// If a constant does not fit in an immediate field, generate some
// number of MOV instructions and then perform the operation.
//
// insn1 is the immediate form of the add/sub instruction and insn2 the
// shifted-register form.  Three strategies are tried in order:
//   1. the immediate is directly encodable: one insn1;
//   2. its magnitude is below 1 << 24: two insn1, applying the part
//      above bit 11 and then the low 12 bits;
//   3. otherwise: materialise the constant in Rd and use insn2, which
//      requires Rd and Rn to be different registers.
void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
                                           add_sub_imm_insn insn1,
                                           add_sub_reg_insn insn2) {
  assert(Rd != zr, "Rd = zr and not setting flags?");

  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
    return;
  }

  if (uabs(imm) < (1 << 24)) {
    (this->*insn1)(Rd, Rn, imm & -(1 << 12));
    (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    return;
  }

  assert_different_registers(Rd, Rn);
  mov(Rd, (uint64_t)imm);
  (this->*insn2)(Rd, Rn, Rd, LSL, 0);
}

// Separate version which sets the flags. Optimisations are more
// restricted because we must set the flags correctly: either the
// immediate is directly encodable (one insn1), or the constant is
// materialised in Rd and the shifted-register form insn2 is used.  The
// latter needs Rd to differ from both Rn and zr.
void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
                                             add_sub_imm_insn insn1,
                                             add_sub_reg_insn insn2) {
  if (operand_valid_for_add_sub_immediate((int)imm)) {
    (this->*insn1)(Rd, Rn, imm);
    return;
  }

  assert_different_registers(Rd, Rn);
  assert(Rd != zr, "overflow in immediate operand");
  mov(Rd, (uint64_t)imm);
  (this->*insn2)(Rd, Rn, Rd, LSL, 0);
}


// Emits Rd = Rn + increment, dispatching to the constant or register
// overload of add depending on what the RegisterOrConstant holds.
void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (!increment.is_register()) {
    add(Rd, Rn, increment.as_constant());
    return;
  }
  add(Rd, Rn, increment.as_register());
}

// Emits the w-form Rd = Rn + increment, dispatching to the constant or
// register overload of addw depending on what the RegisterOrConstant holds.
void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
  if (!increment.is_register()) {
    addw(Rd, Rn, increment.as_constant());
    return;
  }
  addw(Rd, Rn, increment.as_register());
}

// !!! FIXME AARCH64 -- check this is correct !!!
// Emits code that (re)loads rheapbase with the narrow oop base.  Emits
// nothing unless UseCompressedOops is set.
void MacroAssembler::reinit_heapbase()
{
  if (!UseCompressedOops) {
    return;
  }

  if (Universe::heap() == NULL) {
    // No heap yet: emit a load of the base through its address so the
    // value is read when the generated code runs.
    lea(rheapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
    ldr(rheapbase, Address(rheapbase));
    return;
  }

  // The heap exists, so the current base value is used as a constant.
  if (Universe::narrow_oop_base() != NULL) {
    mov(rheapbase, Universe::narrow_oop_base());
  } else {
    mov(rheapbase, zr);
  }
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch64 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

// Emits a compare-and-exchange loop on the memory word at [addr].
//   oldv    - expected value; on mismatch it receives the value loaded
//             from memory
//   newv    - value to store when the word matches oldv
//   addr    - plain register holding the address of the word
//   tmp     - scratch: holds the loaded value, then the store status
//   succeed - branched to once the exclusive store has succeeded
//   fail    - if non-NULL, branched to on mismatch; otherwise the
//             mismatch path falls through to the code that follows
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  Label again, mismatch;

  bind(again);
  // Load-acquire exclusive and bail out if the word is not the expected one.
  ldaxr(tmp, addr);
  cmp(tmp, oldv);
  br(Assembler::NE, mismatch);
  // Store-release exclusive; tmp receives the status and is zero when
  // the store went through.
  stlxr(tmp, newv, addr);
  cbzw(tmp, succeed);
  // The store did not go through: start over, so that failure is only
  // ever reported on the strength of a freshly loaded value.
  b(again);

  bind(mismatch);
  membar(AnyAny);
  // Hand the value actually found in memory back in oldv.
  mov(oldv, tmp);
  if (fail != NULL) {
    b(*fail);
  }
}

// Word-sized variant of cmpxchgptr: emits a compare-and-exchange loop on
// the memory word at [addr] using ldaxrw/stlxrw.
//   oldv    - expected value; on mismatch it receives the value loaded
//             from memory
//   newv    - value to store when the word matches oldv
//   addr    - plain register holding the address of the word
//   tmp     - scratch: holds the loaded value, then the store status
//   succeed - branched to once the exclusive store has succeeded
//   fail    - if non-NULL, branched to on mismatch; otherwise the
//             mismatch path falls through to the code that follows
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  Label again, mismatch;

  bind(again);
  // Load-acquire exclusive and bail out if the word is not the expected one.
  ldaxrw(tmp, addr);
  // NOTE(review): this uses cmp rather than cmpw although the load is
  // the w form -- confirm callers pass oldv in a form that compares
  // correctly against the loaded value.
  cmp(tmp, oldv);
  br(Assembler::NE, mismatch);
  // Store-release exclusive; tmp receives the status and is zero when
  // the store went through.
  stlxrw(tmp, newv, addr);
  cbzw(tmp, succeed);
  // The store did not go through: start over, so that failure is only
  // ever reported on the strength of a freshly loaded value.
  b(again);

  bind(mismatch);
  membar(AnyAny);
  // Hand the value actually found in memory back in oldv.
  mov(oldv, tmp);
  if (fail != NULL) {
    b(*fail);
  }
}

// Returns true when a, b and c are pairwise distinct registers.  If b
// holds a constant rather than a register only a and c are compared.
static bool different(Register a, RegisterOrConstant b, Register c) {
  if (a == c) {
    return false;
  }
  if (b.is_constant()) {
    return true;
  }
  const Register breg = b.as_register();
  return a != breg && breg != c;
}

// Generates MacroAssembler::atomic_<OP>(prev, incr, addr): a load
// exclusive / OP / store exclusive retry loop that applies OP with incr
// to the memory word at [addr].  If prev is a valid register it receives
// the value the word held before the update.
//
//   LDXR/STXR - the exclusive load/store instructions to use
//   OP        - the operation computing the new value (add, addw)
//   IOP       - the inverse of OP (sub, subw)
//
// Register usage: rscratch1 holds the new value and rscratch2 receives
// the store-exclusive status.  The status register of a store exclusive
// must not be the same register as the data being stored (the
// architecture leaves that combination unpredictable), so the status
// cannot share rscratch1 with the new value.  When the old value could
// not be loaded directly into a caller register distinct from incr and
// addr it lives in rscratch2 and is overwritten by the status; in that
// case it is reconstructed afterwards as IOP(new value, incr).
#define ATOMIC_OP(LDXR, OP, IOP, STXR)					\
void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
  Register result = rscratch2;						\
  if (prev->is_valid())							\
    result = different(prev, incr, addr) ? prev : rscratch2;		\
									\
  Label retry_load;							\
  bind(retry_load);							\
  LDXR(result, addr);							\
  OP(rscratch1, result, incr);						\
  STXR(rscratch2, rscratch1, addr);					\
  cbnzw(rscratch2, retry_load);						\
  if (prev->is_valid() && result == rscratch2) {			\
    /* old value was clobbered by the status: recompute it */		\
    if (incr.is_register())						\
      IOP(prev, rscratch1, incr.as_register());				\
    else								\
      IOP(prev, rscratch1, incr.as_constant());				\
  }									\
}

ATOMIC_OP(ldxr, add, sub, stxr)
ATOMIC_OP(ldxrw, addw, subw, stxrw)

#undef ATOMIC_OP

// Generates MacroAssembler::atomic_<OP>(prev, newv, addr): a load
// exclusive / store exclusive retry loop that replaces the memory word
// at [addr] with newv.  If prev is a valid register it receives the
// value the word held before the exchange.
//
// The old value is loaded straight into prev when prev, newv and addr
// are pairwise different; otherwise it is loaded into rscratch2 and
// copied to prev after the loop.  rscratch1 receives the store status.
#define ATOMIC_XCHG(OP, LDXR, STXR)					\
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {	\
  Register result = rscratch2;						\
  if (prev->is_valid())							\
    result = different(prev, newv, addr) ? prev : rscratch2;		\
									\
  Label retry_load;							\
  bind(retry_load);							\
  LDXR(result, addr);							\
  STXR(rscratch1, newv, addr);						\
  cbnzw(rscratch1, retry_load);						\
  if (prev->is_valid() && prev != result)				\
    mov(prev, result);							\
}

ATOMIC_XCHG(xchg, ldxr, stxr)
ATOMIC_XCHG(xchgw, ldxrw, stxrw)

#undef ATOMIC_XCHG

// Emits code adding an allocation size to the allocated-bytes field of a
// JavaThread.
//   thread            - register holding the thread; rthread is used if
//                       this is not a valid register
//   var_size_in_bytes - register holding the size; takes precedence over
//                       con_size_in_bytes when valid
//   con_size_in_bytes - constant size, used otherwise
//   t1                - temporary register (required)
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  const Register thr = thread->is_valid() ? thread : rthread;
  assert(t1->is_valid(), "need temp reg");

  // Load, bump and write back the field.
  ldr(t1, Address(thr, in_bytes(JavaThread::allocated_bytes_offset())));
  if (!var_size_in_bytes->is_valid()) {
    add(t1, t1, con_size_in_bytes);
  } else {
    add(t1, t1, var_size_in_bytes);
  }
  str(t1, Address(thr, in_bytes(JavaThread::allocated_bytes_offset())));
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime target of the code emitted by MacroAssembler::stop().
//   msg  - the message passed to stop()
//   pc   - code address of the stop site
//   regs - the register save area written by pusha(): 32 words indexed
//          by register number, where slot 31 is the zr padding word
//
// With ShowMessageBoxOnError set, a message box is shown and, if the
// user asks for it, pc and every saved register are printed before
// hitting BREAKPOINT.  Otherwise the message is printed and an
// assertion failure is raised.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
{
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      // Dump every slot of the save area, r29 included.  Single-digit
      // register names get a leading space so the columns line up.
      for (int i = 0; i < 32; i++) {
        tty->print_cr("%sr%d = 0x%016lx", (i < 10) ? " " : "", i, regs[i]);
      }
      BREAKPOINT;
    }
    // Restore the thread state saved above.
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}

#ifdef BUILTIN_SIM
// Routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code which directly follows the
// stub.
//
// gp_arg_count, fp_arg_count and ret_type are packed into a single
// "call type" word (4 bits each for the two counts, 2 bits for the
// return type) that is patched into the last 8 bytes of the copied
// prolog.  If prolog_ptr is non-NULL it is patched into the 8 bytes
// before that.

extern "C" {
int aarch64_stub_prolog_size();
void aarch64_stub_prolog();
void aarch64_prolog();
}

void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  const int gp_bits  = gp_arg_count & 0xf;
  const int fp_bits  = (fp_arg_count & 0xf) << 4;
  const int ret_bits = (ret_type & 0x3) << 8;
  int calltype = (ret_bits | fp_bits | gp_bits);

  // Reserve room for the prolog by emitting enough nops to cover it,
  // rounding its size up to whole 4-byte instructions.
  address prolog_start = pc();
  int prolog_bytes = aarch64_stub_prolog_size();
  int nop_count = (prolog_bytes + 3) / 4;
  for (int emitted = 0; emitted < nop_count; emitted++) {
    nop();
  }

  // Overwrite the reserved space with the x86 to ARM entry code.
  memcpy(prolog_start, (void*)aarch64_stub_prolog, prolog_bytes);

  // Write the address of the setup routine and the call format into
  // the tail of the copied code.
  u_int64_t *tail = (u_int64_t *)(prolog_start + prolog_bytes);
  if (prolog_ptr) {
    tail[-2] = (u_int64_t)prolog_ptr;
  }
  tail[-1] = calltype;
}
#endif

// Emits code saving CPU state on the stack: first the integer registers
// selected by mask 0x3fffffff (all except lr & sp), then the 32 float
// registers as 16 stpd pairs, from (30, 31) down to (0, 1).
void MacroAssembler::push_CPU_state() {
  push(0x3fffffff, sp);         // integer registers except lr & sp

  for (int pair = 15; pair >= 0; pair--) {
    const int lo = 2 * pair;
    stpd(as_FloatRegister(lo), as_FloatRegister(lo + 1),
         Address(pre(sp, -2 * wordSize)));
  }
}

// Emits code restoring the state saved by push_CPU_state(): the 32 float
// registers as 16 ldpd pairs, from (0, 1) up to (30, 31), followed by
// the integer registers selected by mask 0x3fffffff.
void MacroAssembler::pop_CPU_state() {
  for (int pair = 0; pair < 16; pair++) {
    const int lo = 2 * pair;
    ldpd(as_FloatRegister(lo), as_FloatRegister(lo + 1),
         Address(post(sp, 2 * wordSize)));
  }

  pop(0x3fffffff, sp);         // integer registers except lr & sp
}

/**
 * Emits code to fold one byte into a CRC-32 using a lookup table.
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC;
 *                      overwritten with the table entry that was used.
 * @param [in]table     Register containing the table of crc constants.
 *
 * The emitted code computes:
 *
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  // Table index: low byte of (val ^ crc).
  eor(val, val, crc);
  andr(val, val, 0xff);
  // Fetch the entry; the index is scaled by 4 (lsl 2) in the address.
  ldrw(val, Address(table, val, Address::lsl(2)));
  // Combine the entry with the crc shifted right by 8.
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the data to fold into the CRC;
 *                      overwritten.
 * @param [in]tmp       Scratch register.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 * @param [in]upper     When true, v is shifted right by 32 before being
 *                      combined with crc; when false it is used unshifted.
 *
 * The emitted code computes:
 *
 *   v = crc ^ (upper ? v >> 32 : v)
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[(v>>24)&0xff]
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // Combine the crc with the selected half of v.
  if (upper) {
    eor(v, crc, v, LSR, 32);
  } else {
    eor(v, crc, v, LSL, 0);
  }

  // Byte 0 indexes table3; its entry starts the new crc.
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));

  // Byte 1 indexes table2.
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);

  // Byte 2 indexes table1.
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);

  // Byte 3 indexes table0.
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

/**
 * Emits code to update a CRC-32 over a byte buffer.  One of three
 * strategies is generated, chosen when the code is emitted:
 *   - UseCRC32: crc32x/crc32w/crc32b instructions, 64 bytes per
 *     iteration of the main loop, then 4 bytes, then single bytes;
 *   - UseNeon: a pmull-based folding loop for the bulk of the data,
 *     followed by the table-driven code below for the remainder;
 *   - otherwise: table-driven, 16 bytes per iteration of the main loop,
 *     then 4 bytes, then single bytes.
 *
 * @param crc    register containing existing CRC (32-bit); the emitted
 *               code inverts it on entry and again on exit
 * @param buf    register pointing to input byte buffer (byte*); advanced
 *               by the post-indexed loads
 * @param len    register containing number of bytes; clobbered
 * @param table0 register that will contain address of CRC table
 * @param table1 register set to table0 + 1*256*sizeof(juint)
 * @param table2 register set to table0 + 2*256*sizeof(juint)
 * @param table3 register set to table0 + 3*256*sizeof(juint)
 * @param tmp    scratch register
 * @param tmp2   scratch register
 * @param tmp3   scratch register
 *
 * The table registers are left untouched on the UseCRC32 path.  The
 * UseNeon path additionally uses vector registers v0, v1, v4-v7 and
 * v16-v28.
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

    // Invert the incoming crc (orn with zr is a bitwise NOT).
    ornw(crc, zr, crc);

  if (UseCRC32) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;

      // Dispatch on length: >= 64 bytes, >= 4 bytes, >= 1 byte, or none.
      subs(len, len, 64);
      br(Assembler::GE, CRC_by64_loop);
      adds(len, len, 64-4);
      br(Assembler::GE, CRC_by4_loop);
      adds(len, len, 4);
      br(Assembler::GT, CRC_by1_loop);
      b(L_exit);

    // Four bytes per iteration.
    BIND(CRC_by4_loop);
      ldrw(tmp, Address(post(buf, 4)));
      subs(len, len, 4);
      crc32w(crc, crc, tmp);
      br(Assembler::GE, CRC_by4_loop);
      adds(len, len, 4);
      br(Assembler::LE, L_exit);
    // One byte per iteration.
    BIND(CRC_by1_loop);
      ldrb(tmp, Address(post(buf, 1)));
      subs(len, len, 1);
      crc32b(crc, crc, tmp);
      br(Assembler::GT, CRC_by1_loop);
      b(L_exit);

      align(CodeEntryAlignment);
    // 64 bytes per iteration, as four 16-byte ldp loads.
    BIND(CRC_by64_loop);
      subs(len, len, 64);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      br(Assembler::GE, CRC_by64_loop);
      // Fewer than 64 bytes left: hand over to the smaller loops.
      adds(len, len, 64-4);
      br(Assembler::GE, CRC_by4_loop);
      adds(len, len, 4);
      br(Assembler::GT, CRC_by1_loop);
    BIND(L_exit);
      // Invert the crc again before returning it.
      ornw(crc, zr, crc);
      return;
  }

    // Table-driven paths: table0 gets the address of the CRC table and
    // table1..table3 the three 256-entry tables that follow it.
    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
    if (offset) add(table0, table0, offset);
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      // Inputs shorter than 64 bytes skip the vector code altogether.
      cmp(len, 64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      // Load the first 32 bytes of data and replicate the four 64-bit
      // constants into v4..v7.
      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      // v16 was zeroed above; put the crc in its lane 0 and fold it into
      // the first data block.
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    // Each iteration multiplies v0 and v1 by the constants with
    // pmull/pmull2, reduces the partial products, and xors the results
    // into the next 32 bytes of input.
    BIND(L_fold);
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);
    
      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);
    
      uzp1(v24, v20, v22, T8H);
      uzp2(v25, v20, v22, T8H);
      eor(v20, T16B, v24, v25);
    
      uzp1(v26, v16, v18, T8H);
      uzp2(v27, v16, v18, T8H);
      eor(v16, T16B, v26, v27);
    
      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);
    
      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);
    
      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);
    
      uzp1(v17, v16, v20, T2D);
      uzp2(v21, v16, v20, T2D);
      eor(v17, T16B, v17, v21);
    
      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);
    
      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);
    
      uzp1(v17, v20, v16, T2D);
      uzp2(v21, v20, v16, T2D);
      // v28 holds the reduced result for v0 while v1 is processed.
      eor(v28, T16B, v17, v21);
    
      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);
    
      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);
    
      // Both old blocks have been consumed by the multiplies above, so
      // the next 32 bytes can be loaded over them here.
      ld1(v0, v1, T2D, post(buf, 32));
    
      uzp1(v24, v20, v22, T8H);
      uzp2(v25, v20, v22, T8H);
      eor(v20, T16B, v24, v25);
    
      uzp1(v26, v16, v18, T8H);
      uzp2(v27, v16, v18, T8H);
      eor(v16, T16B, v26, v27);
    
      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);
    
      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);
    
      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);
    
      uzp1(v17, v16, v20, T2D);
      uzp2(v21, v16, v20, T2D);
      eor(v16, T16B, v17, v21);
    
      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);
    
      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);
    
      uzp1(v17, v20, v16, T2D);
      uzp2(v21, v20, v16, T2D);
      eor(v20, T16B, v17, v21);
    
      shl(v16, v28, T2D, 1);
      shl(v17, v20, T2D, 1);
    
      // Fold the two reduced results into the freshly loaded blocks.
      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Finish the 32 bytes still held in v0/v1 with the table-driven
      // code, starting from a zero crc: each 64-bit lane is moved to tmp
      // and processed as a lower and an upper 32-bit word.
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      // Undo the last subtraction so len is the remaining byte count.
      add(len, len, 32);
  }

  // Table-driven dispatch on the remaining length: >= 16 bytes,
  // >= 4 bytes, >= 1 byte, or none.
  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  // Four bytes per iteration.
  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  // One byte per iteration.
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  // Sixteen bytes per iteration: one ldp, then each 64-bit register is
  // processed as a lower and an upper 32-bit word.
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    // Fewer than 16 bytes left: hand over to the smaller loops.
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    // Invert the crc again before returning it.
    ornw(crc, zr, crc);
}

// Scoped helper: the constructor emits a test of the byte at flag_addr
// and a branch over the code generated while this object is alive; the
// destructor binds the branch target.  The guarded code is skipped when
// the flag equals `value`, i.e. when it is zero for value == false and
// when it is non-zero for value == true.  Clobbers rscratch1.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  // Load the flag byte through its page address plus page offset.
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  // Honour `value`: pick the branch that fires when flag == value.
  if (value) {
    _masm->cbnzw(rscratch1, _label);
  } else {
    _masm->cbzw(rscratch1, _label);
  }
}

// Binds the skip target at the current code position.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// Emits a comparison of src1 with the value stored at the address src2.
// The value is loaded into rscratch1 first, so rscratch1 is clobbered.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long page_offset;
  // adrp yields the page address and reports the remaining offset,
  // which is then applied in the load.
  adrp(rscratch1, src2, page_offset);
  ldr(rscratch1, Address(rscratch1, page_offset));
  cmp(src1, rscratch1);
}

// Emits a store check for the oop in register obj by running both
// halves of the check back to back.  The content of register obj is
// destroyed afterwards.
void MacroAssembler::store_check(Register obj) {
  store_check_part_1(obj);
  store_check_part_2(obj);
}

// Overload taking the destination address as well; dst is not used and
// the check is performed on obj alone.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}


// The store check is split in two so that other instructions can be
// scheduled in between.  Part 1 shifts obj right by the card shift.
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* barrier_set = Universe::heap()->barrier_set();
  assert(barrier_set->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  lsr(obj, obj, CardTableModRefBS::card_shift);
}

// Part 2 of the store check: stores a zero byte at byte_map_base plus
// the value left in obj by part 1.  Clobbers rscratch1.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* barrier_set = Universe::heap()->barrier_set();
  assert(barrier_set->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* card_table = (CardTableModRefBS*)barrier_set;
  assert(sizeof(*card_table->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and
  // it will never need to be relocated.

  // FIXME: It's not likely that the displacement will fit into an
  // offset so we don't bother to check, but it could save an instruction.
  intptr_t map_base = (intptr_t) card_table->byte_map_base;
  mov(rscratch1, map_base);
  strb(zr, Address(obj, rscratch1));
}

// Emits a load of the klass field of the object in src into dst.  With
// UseCompressedOops the field is loaded as a 32-bit value and decoded
// as a non-null heap oop.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (!UseCompressedOops) {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    return;
  }
  ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  decode_heap_oop_not_null(dst);
}

// !!! FIXME AARCH64 -- check this is correct !!!

// Emits a comparison of trial_klass with the klass field of the object
// in oop.  tmp receives the loaded (and, if necessary, decoded) klass.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (!UseCompressedOops) {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    cmp(trial_klass, tmp);
    return;
  }

  ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  if (Universe::narrow_oop_base() == NULL) {
    // With no heap base, decoding is just a shift, which is folded
    // into the compare instruction itself.
    cmp(trial_klass, tmp, LSL, Universe::narrow_oop_shift());
    return;
  }
  decode_heap_oop_not_null(tmp);
  cmp(trial_klass, tmp);
}

// !!! FIXME AARCH64 -- check this is correct !!!

// Emits code loading into dst the prototype header of the klass of the
// object in src: first the klass via load_klass, then the field at
// Klass::prototype_header_offset() within it.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  Address prototype_header(dst, Klass::prototype_header_offset());
  ldr(dst, prototype_header);
}

// Emits a store of the klass in src to the klass field of the object in
// dst.  With UseCompressedOops src is encoded in place first, so its
// original value is lost.
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (!UseCompressedOops) {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
    return;
  }
  encode_heap_oop_not_null(src);
  strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
}

// Emits a 32-bit store of src to the klass gap of the object in dst.
// Emits nothing unless UseCompressedOops is set.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (!UseCompressedOops) {
    return;
  }
  strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
}

// Emits code compressing the (possibly null) oop in s into d.
// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");

  if (Universe::narrow_oop_base() != NULL) {
    // Subtract the heap base; if s was below the base the HS condition
    // fails and zero is selected instead.  Then apply the shift.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);
    return;
  }

  // No heap base: the encoding is at most a shift.
  if (Universe::narrow_oop_shift() == 0) {
    mov(d, s);
    return;
  }
  assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  lsr(d, s, LogMinObjAlignmentInBytes);
}

// Emits code compressing, in place, the oop in r, which the caller
// guarantees is not null.  In ASSERT builds with CheckCompressedOops
// the emitted code stops if r is in fact null.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label not_null;
    cbnz(r, not_null);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(not_null);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");

  const bool has_base  = (Universe::narrow_oop_base() != NULL);
  const bool has_shift = (Universe::narrow_oop_shift() != 0);

  if (has_base) {
    sub(r, r, rheapbase);
  }
  if (has_shift) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}

// Emits code compressing the oop in src, which the caller guarantees is
// not null, into dst.  In ASSERT builds with CheckCompressedOops the
// emitted code stops if src is in fact null.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label not_null;
    cbnz(src, not_null);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(not_null);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // `current` tracks the register holding the value so far: src until
  // some instruction has produced a result in dst.
  Register current = src;

  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, current, rheapbase);
    current = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, current, LogMinObjAlignmentInBytes);
    current = dst;
  }
  // Neither step applied: the encoding is the identity, so just copy.
  if (current == src) {
    mov(dst, src);
  }
}

// Emits code decompressing the (possibly null) narrow oop in s into d.
void  MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() != NULL) {
    // A zero narrow oop must stay zero, so the base is only added when
    // s is non-zero; d is preset to s to cover the zero case.
    Label is_null;
    if (d != s) {
      mov(d, s);
    }
    cbz(s, is_null);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(is_null);
  } else {
    // No heap base: one shift both decodes and moves the value.  It is
    // only omitted when it would be a no-op (zero shift and d == s).
    const bool is_noop = (Universe::narrow_oop_shift() == 0) && (d == s);
    if (!is_noop) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}

// Emits code decompressing, in place, the narrow oop in r, which the
// caller guarantees is not null.  Emits a single add when the narrow
// oop shift is non-zero and nothing at all otherwise.
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() == 0) {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    return;
  }

  assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  // Add the shifted value to the heap base, or to zr when there is none.
  if (Universe::narrow_oop_base() == NULL) {
    add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
  } else {
    add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
  }
}

// Emit code that decodes the narrow oop in src into dst.  No null check
// is emitted.  dst and src may be the same register.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // No code-emitting assertions here: the unverified entry point (see the
  // .ad file) and vtableStubs (pd_code_size_limit) count instructions.
  // verify_oop is not used either because verify_oop calls this routine.
  if (Universe::narrow_oop_shift() == 0) {
    // Without a shift the base is asserted to be NULL, so decoding is
    // at most a register move.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      mov(dst, src);
    }
    return;
  }

  assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
  // dst := base + (src << shift); zr stands in when there is no heap base.
  const Register base = (Universe::narrow_oop_base() != NULL) ? rheapbase : zr;
  add(dst, base, src, Assembler::LSL, LogMinObjAlignmentInBytes);
}

// Emit a movz/movk pair into dst tagged with an oop relocation for obj.
// The immediates emitted here are the placeholder pattern 0xDEADBEEF.
// NOTE(review): presumably the real narrow oop is patched in later via
// the relocation -- confirm against the relocation code.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed oops");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int index = oop_recorder()->find_index(obj);
  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");

  // The relocation is recorded at the address of the first instruction.
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), oop_Relocation::spec(index));
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Emit a load of an oop field at src into dst.  With compressed oops the
// field is loaded as 32 bits and then decoded in place.
void MacroAssembler::load_heap_oop(Register dst, Address src)
{
  if (!UseCompressedOops) {
    ldr(dst, src);
    return;
  }
  ldrw(dst, src);
  decode_heap_oop(dst);
}

// Emit a load of an oop field at src into dst, using the not-null
// decoder when compressed oops are enabled.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
{
  if (!UseCompressedOops) {
    ldr(dst, src);
    return;
  }
  ldrw(dst, src);
  decode_heap_oop_not_null(dst);
}

// Emit a store of the oop in src to the field at dst.  With compressed
// oops, src is encoded in place before a 32-bit store, so src is
// overwritten and must not be used by the dst address.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
  if (!UseCompressedOops) {
    str(src, dst);
    return;
  }
  assert(!dst.uses(src), "not enough registers");
  encode_heap_oop(src);
  strw(src, dst);
}

// Used for storing NULLs: writes zr to the oop field at dst, as 32 bits
// when compressed oops are enabled and as a full word otherwise.
void MacroAssembler::store_heap_oop_null(Address dst) {
  if (!UseCompressedOops) {
    str(zr, dst);
    return;
  }
  strw(zr, dst);
}

#ifndef SERIALGC
// Emit the G1 pre-write barrier.
//
//   obj        - if not noreg, the previous value is loaded from
//                Address(obj, 0) into pre_val; if noreg, pre_val is
//                used as passed in
//   pre_val    - register holding (or receiving) the previous value
//   thread     - must be rthread on LP64
//   tmp        - scratch register, clobbered
//   tosca_live - when true r0 is saved/restored around the runtime call
//   expand_call- see below
//
// The emitted code does nothing if the thread's SATB queue is not active
// or the previous value is null.  Otherwise the previous value is stored
// into the thread's SATB buffer, or handed to SharedRuntime::g1_wb_pre
// when the buffer index is zero.  rscratch1 is also clobbered.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {
  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == rthread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg)
    assert_different_registers(obj, pre_val, tmp);

  // Fields of the thread's SATB mark queue.
  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?
  // The width of the load follows the width of the 'active' field.
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    ldrw(tmp, in_progress);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    ldrb(tmp, in_progress);
  }
  cbzw(tmp, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?
  cbz(pre_val, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  ldr(tmp, index);                      // tmp := *index_adr
  cbz(tmp, runtime);                    // tmp == 0?
                                        // If yes, goto runtime

  sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
  str(tmp, index);                      // *index_adr := tmp
  ldr(rscratch1, buffer);
  add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr

  // Record the previous value
  str(pre_val, Address(tmp, 0));
  b(done);

  bind(runtime);
  // save the live input values
  // (r0 only if tosca_live, obj only if it is a real register)
  push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then rfp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    // arg1 is set up first; the assert above guards against that
    // overwriting pre_val before it is moved into arg0.
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  // restore the values saved above
  pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);

  bind(done);
}

// Emit the G1 post-write barrier for a store of new_val at store_addr.
//
//   store_addr - address that was stored to (preserved)
//   new_val    - value that was stored (preserved)
//   thread     - must be rthread on LP64
//   tmp, tmp2  - scratch registers, clobbered
//
// The emitted code does nothing if the store does not cross heap regions,
// if new_val is null, or if the card is young or already dirty.
// Otherwise it dirties the card and records the card address in the
// thread's dirty card queue, calling SharedRuntime::g1_wb_post when the
// queue index is zero.  rscratch1 is also clobbered.
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == rthread, "must be");
#endif // _LP64

  // Fields of the thread's dirty card queue.
  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;

  Label done;
  Label runtime;

  // Does store cross heap regions?
  // (The addresses are in the same region when they agree in all bits
  // above LogOfHRGrainBytes.)

  eor(tmp, store_addr, new_val);
  lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
  cbz(tmp, done);

  // crosses regions, storing NULL?

  cbz(new_val, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
  const Register card_addr = tmp;

  lsr(card_addr, store_addr, CardTableModRefBS::card_shift);

  // adrp yields the page of byte_map_base in tmp2 and the remaining
  // low-order bits in 'offset'; 'offset' is applied at each access below.
  unsigned long offset;
  adrp(tmp2, cardtable, offset);

  // get the address of the card
  add(card_addr, card_addr, tmp2);
  ldrb(tmp2, Address(card_addr, offset));
  cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
  br(Assembler::EQ, done);

  // cbzw below relies on the dirty value being zero.
  assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");

  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));

  // Re-read the card after the barrier.
  ldrb(tmp2, Address(card_addr, offset));
  cbzw(tmp2, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  strb(zr, Address(card_addr, offset));

  // NOTE(review): card_addr, as enqueued below and as passed to
  // g1_wb_post, does not include 'offset'.  That looks correct only when
  // adrp returned offset == 0 -- confirm byte_map_base alignment.
  ldr(rscratch1, queue_index);
  cbz(rscratch1, runtime);
  sub(rscratch1, rscratch1, wordSize);
  str(rscratch1, queue_index);

  ldr(tmp2, buffer);
  str(card_addr, Address(tmp2, rscratch1));
  b(done);

  bind(runtime);
  // save the live input values
  push(store_addr->bit(true) | new_val->bit(true), sp);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(store_addr->bit(true) | new_val->bit(true), sp);

  bind(done);
}

#endif // SERIALGC

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  // !!! FIXME AARCH64 -- because of how jdk7 does reloc verification
  // we need to use movoop when planting Universe::non_oop_word (-1L)
  // at a call site under ic_call (the verification routine wants an
  // oop reloc entry). so, that's why we have two special cases here.
  const bool is_special =
    (obj == NULL) || (obj == (jobject)Universe::non_oop_word());

  int oop_index;
  if (is_special) {
    // NULL and the non-oop word get a freshly allocated index.
    oop_index = oop_recorder()->allocate_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }

  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (immediate) {
    mov(dst, Address((address)obj, rspec));
  } else {
    // Load via ldr_constant, using a nearby aligned address as target.
    address dummy = address(uintptr_t(pc()) & -wordSize);
    ldr_constant(dst, Address(dummy, rspec));
  }
}

// Build an Address whose target is obj and which carries an oop
// relocation for obj's index in the oop recorder.
Address MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
  const int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
//
// Emits a bump-pointer allocation from the current thread's TLAB.  The
// size is con_size_in_bytes when var_size_in_bytes is noreg, otherwise
// the value of var_size_in_bytes.  Branches to slow_case when the new
// top would be above the TLAB end.  Clobbers t2 and rscratch1; t1 is not
// used.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t2);
  assert_different_registers(obj, var_size_in_bytes);
  const Register new_top = t2;

  // verify_tlab();

  // obj := tlab top; new_top := obj + size
  ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
  if (var_size_in_bytes != noreg) {
    lea(new_top, Address(obj, var_size_in_bytes));
  } else {
    lea(new_top, Address(obj, con_size_in_bytes));
  }

  // Does the object fit below the TLAB end?
  ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
  cmp(new_top, rscratch1);
  br(Assembler::HI, slow_case);

  // update the tlab top pointer
  str(new_top, Address(rthread, JavaThread::tlab_top_offset()));

  // When the size register doubles as new_top it now holds obj + size;
  // subtract obj again to recover the size.
  if (var_size_in_bytes == new_top) {
    sub(var_size_in_bytes, var_size_in_bytes, obj);
  }
  // verify_tlab();
}

// Preserves r19, and r3.
//
// Emit the TLAB refill slow path.  Uses r0 (top), r2 (t1), r4 (t2) and
// rscratch1 as working registers.
//
//   retry     - branched to after a new TLAB has been installed
//   try_eden  - branched to when the current TLAB is retained (its free
//               space exceeds the refill waste limit)
//   slow_case - branched to when inline eden allocation is unavailable or
//               fails
//
// Returns rthread.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = r0;
  Register t1  = r2;
  Register t2  = r4;
  assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    // Note: the rest of the routine is still emitted after this
    // unconditional branch.
    b(slow_case);
  }

  ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
  ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space
  // (t1 := (end - top) >> LogHeapWordSize, i.e. in heap words)
  sub(t1, t1, top);
  lsr(t1, t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.

  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  cmp(t1, rscratch1);
  br(Assembler::LE, discard_tlab);

  // Retain
  // rscratch1 still holds the waste limit loaded above; bump it by the
  // increment and store it back.
  // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  add(rscratch1, rscratch1, t2);
  str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));

  if (TLABStats) {
    // increment number of slow_allocations
    addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
	 1, rscratch1);
  }
  b(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
	 rscratch1);
    // accumulate wastage -- t1 is amount free in tlab
    addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
	 rscratch1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  cbz(top, do_refill);

  // set up the mark word
  mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
  // set the length to the remaining space
  // (free words, minus the int-array header, plus the alignment reserve,
  // converted from heap words to jint elements)
  sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
  add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
  strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
  // set klass to intArrayKlass
  {
    unsigned long offset;
    // dubious reloc why not an oop reloc?
    adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
	 offset);
    ldr(t1, Address(rscratch1, offset));
  }
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // Account for the bytes used in the discarded tlab: top - start.
  mov(t1, top);
  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
  sub(t1, t1, rscratch1);
  incr_allocated_bytes(rthread, t1, 0, rscratch1);

  // refill the tlab with an eden allocation
  bind(do_refill);
  // t1 := tlab size in bytes
  ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
  lsl(t1, t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = r4;
    assert_different_registers(tsize, rthread, t1);
    // r4 is saved on the stack around the check and restored afterwards.
    str(tsize, Address(pre(sp, -16)));
    ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
    lsl(tsize, tsize, LogHeapWordSize);
    cmp(t1, tsize);
    br(Assembler::EQ, ok);
    STOP("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    ldr(tsize, Address(post(sp, 16)));
  }
#endif
  // Install the new tlab: start = top = new block,
  // end = new block + size - alignment reserve.
  str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
  str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
  add(top, top, t1);
  sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
  verify_tlab();
  b(retry);

  return rthread; // for use by caller
}

// Defines obj, preserves var_size_in_bytes
//
// Emit an inline allocation from the shared eden using a load-exclusive /
// store-exclusive retry loop on the heap top pointer.
//
//   obj               - receives the address of the new object
//   var_size_in_bytes - size register, or noreg to use con_size_in_bytes
//   con_size_in_bytes - constant size, used when var_size_in_bytes is noreg
//   t1                - scratch, receives the new heap top
//   slow_case         - branched to when inline allocation is not
//                       supported, the address computation wraps, or the
//                       new top would exceed the heap end
//
// Clobbers rscratch1 and rscratch2.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    b(slow_case);
  } else {
    Register end = t1;
    Register heap_end = rscratch2;
    Label retry;
    bind(retry);
    {
      unsigned long offset;
      adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
      ldr(heap_end, Address(rscratch1, offset));
    }

    ExternalAddress heap_top((address) Universe::heap()->top_addr());

    // Get the current top of the heap
    {
      unsigned long offset;
      adrp(rscratch1, heap_top, offset);
      // Use add() here after ADRP, rather than lea().
      // lea() does not generate anything if its offset is zero.
      // However, relocs expect to find either an ADD or a load/store
      // insn after an ADRP.  add() always generates an ADD insn, even
      // for add(Rn, Rn, 0).
      add(rscratch1, rscratch1, offset);
      ldaxr(obj, rscratch1);
    }

    // Adjust it by the size of our new object
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes));
    }

    // if end < obj then we wrapped around high memory
    cmp(end, obj);
    br(Assembler::LO, slow_case);

    cmp(end, heap_end);
    br(Assembler::HI, slow_case);

    // If heap_top hasn't been changed by some other thread, update it.
    // The status register must differ from the base address register
    // (rscratch1): a store-exclusive with Rs == Rn is CONSTRAINED
    // UNPREDICTABLE.  rscratch2 (heap_end) is no longer needed at this
    // point and is reloaded at the top of the loop on retry.
    stlxr(rscratch2, end, rscratch1);
    cbnzw(rscratch2, retry);
  }
}

// Debug-only check (ASSERT builds with UseTLAB and VerifyOops) that the
// current thread's TLAB satisfies start <= top <= end.  rscratch1 and
// rscratch2 are saved on the stack and restored, so no registers are
// clobbered on the passing path.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (!UseTLAB || !VerifyOops) {
    return;
  }

  Label top_ge_start, top_le_end;
  const Address tlab_start(rthread, in_bytes(JavaThread::tlab_start_offset()));
  const Address tlab_top(rthread, in_bytes(JavaThread::tlab_top_offset()));
  const Address tlab_end(rthread, in_bytes(JavaThread::tlab_end_offset()));

  stp(rscratch2, rscratch1, Address(pre(sp, -16)));

  // top >= start (unsigned) ?
  ldr(rscratch2, tlab_top);
  ldr(rscratch1, tlab_start);
  cmp(rscratch2, rscratch1);
  br(Assembler::HS, top_ge_start);
  STOP("assert(top >= start)");
  should_not_reach_here();

  bind(top_ge_start);
  // end >= top (unsigned) ?
  ldr(rscratch2, tlab_end);
  ldr(rscratch1, tlab_top);
  cmp(rscratch2, rscratch1);
  br(Assembler::HS, top_le_end);
  STOP("assert(top <= end)");
  should_not_reach_here();

  bind(top_le_end);
  ldp(rscratch2, rscratch1, Address(post(sp, 16)));
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
// The emitted loop also decrements size in place and uses rscratch1.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  const int page_size = os::vm_page_size();

  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label next_page;
  mov(rscratch1, page_size);
  bind(next_page);
  lea(tmp, Address(tmp, -page_size));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  // The branch tests the flags set by subsw above.
  br(Assembler::GT, next_page);

  // Bang down shadow pages too.
  // One page fewer than StackShadowPages because the loop above already
  // went one page past the requested size.
  for (intx remaining = StackShadowPages - 1; remaining > 0; remaining--) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -page_size));
    str(size, Address(tmp));
  }
}


// Emit a read of the polling page at the given address: adrp
// materialises the page address in r, then a 32-bit load into zr is
// emitted with a relocation of type rtype.  Returns the address of that
// load instruction.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  unsigned long page_offset;
  adrp(r, Address(page, rtype), page_offset);

  // Mark the load so the relocation and return value refer to it.
  InstructionMark im(this);
  address poll_insn = inst_mark();
  code_section()->relocate(poll_insn, rtype);
  ldrw(zr, Address(r, page_offset));
  return poll_insn;
}

// Emit a read of the polling page whose address is already in r: a
// 32-bit load from [r, 0] into zr with a relocation of type rtype.
// Returns the address of the load instruction.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  address poll_insn = inst_mark();
  code_section()->relocate(poll_insn, rtype);
  ldrw(zr, Address(r, 0));
  return poll_insn;
}

// Materialise the page address of dest in reg1 and return the low 12
// bits of the target in byte_offset.  When the target is 4GB or more
// away from the current pc, the full address is moved into reg1 instead
// and byte_offset is set to zero.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  const bool in_range = uabs(pc() - dest.target()) < (1LL << 32);

  if (in_range) {
    InstructionMark im(this);
    code_section()->relocate(inst_mark(), dest.rspec());
    byte_offset = (uint64_t)dest.target() & 0xfff;
    _adrp(reg1, dest.target());
    return;
  }

  // Out of range.  This doesn't happen very often, but we have to
  // handle it
  guarantee(rtype == relocInfo::none
            || rtype == relocInfo::external_word_type
            || rtype == relocInfo::poll_type
            || rtype == relocInfo::poll_return_type,
            "can only use a fixed address with an ADRP");
  mov(reg1, dest);
  byte_offset = 0;
}

// Emit a frame prologue that saves rfp and lr and lowers sp.
//   framesize == 0 : only the rfp/lr pair is pushed.
//   small frames   : sp is lowered by framesize in one step and the pair
//                    is stored at the top of the new frame.
//   larger frames  : the pair is pushed first, then sp is lowered by the
//                    remainder, via rscratch1 when it does not fit the
//                    immediate form used here.
void MacroAssembler::build_frame(int framesize) {
  const int pair_bytes = 2 * wordSize;   // rfp + lr

  if (framesize == 0) {
    // Is this even possible?
    stp(rfp, lr, Address(pre(sp, -pair_bytes)));
    return;
  }

  if (framesize < ((1 << 9) + pair_bytes)) {
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - pair_bytes));
    return;
  }

  stp(rfp, lr, Address(pre(sp, -pair_bytes)));
  const int remainder = framesize - pair_bytes;
  if (framesize < ((1 << 12) + pair_bytes)) {
    sub(sp, sp, remainder);
  } else {
    mov(rscratch1, remainder);
    sub(sp, sp, rscratch1);
  }
}

// Emit a frame epilogue that restores rfp and lr and raises sp, using
// the same three size classes as build_frame.  For the largest frames
// rscratch1 is clobbered.
void MacroAssembler::remove_frame(int framesize) {
  const int pair_bytes = 2 * wordSize;   // rfp + lr

  if (framesize == 0) {
    ldp(rfp, lr, Address(post(sp, pair_bytes)));
    return;
  }

  if (framesize < ((1 << 9) + pair_bytes)) {
    ldp(rfp, lr, Address(sp, framesize - pair_bytes));
    add(sp, sp, framesize);
    return;
  }

  // Raise sp to the saved pair first, then pop the pair.
  const int remainder = framesize - pair_bytes;
  if (framesize < ((1 << 12) + pair_bytes)) {
    add(sp, sp, remainder);
  } else {
    mov(rscratch1, remainder);
    add(sp, sp, rscratch1);
  }
  ldp(rfp, lr, Address(post(sp, pair_bytes)));
}

// Search for str1 in str2 and return index or -1
//
//   str2, cnt2 - source string and its length in 16-bit chars
//   str1, cnt1 - pattern string and its length in 16-bit chars
//   tmp1..tmp4 - scratch registers
//   icnt1      - -1 emits the general code, which reads the pattern
//                length from cnt1 at run time; 1, 2, 3 or 4 emits only
//                the loop specialised for that pattern length
//   result     - receives the char index of the match, or -1
//
// str1, str2, cnt1, cnt2, rscratch1 and rscratch2 are overwritten by the
// emitted code.
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    int icnt1, Register result) {
  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;

  // Register aliases.  Several names share one physical register
  // (e.g. cnt1_neg/cnt1, cnt2_neg/cnt2, cnt2tmp/tmp2).
  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

// We have two strings, a source string in str2, cnt2 and a pattern string
// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

// For larger pattern and source we use a simplified Boyer Moore algorithm.
// With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
    br(LO, LINEARSEARCH);       // a byte array.
    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
    br(HS, LINEARSEARCH);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// #define ASIZE 128
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = 0;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          if (c < ASIZE) bc[c] = i;
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          if (c < ASIZE)
//            j = j - bc[y[j+m-1]] + m;
//          else
//            j += 1; // Advance by 1 only if char >= ASIZE
//       }
//    }

  if (icnt1 == -1) {
    BIND(BM);

    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
    Label BMADV, BMMATCH, BMCHECKEND;

    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // Restrict ASIZE to 128 to reduce stack space/initialisation.
    // The presence of chars >= ASIZE in the target string does not affect
    // performance, but we must be careful not to initialise them in the stack
    // array.
    // The presence of chars >= ASIZE in the source string may adversely affect
    // performance since we can only advance by one when we encounter one.

      // Allocate and zero the 128-byte bad-character table on the stack.
      stp(zr, zr, pre(sp, -128));
      for (int i = 1; i < 8; i++)
          stp(zr, zr, Address(sp, i*16));

      // Preprocessing: for each of the first cnt1-1 pattern chars that is
      // below 128, store its (index + 1) in the table.
      mov(cnt1tmp, 0);
      sub(cnt1end, cnt1, 1);
    BIND(BCLOOP);
      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
      cmp(ch1, 128);
      add(cnt1tmp, cnt1tmp, 1);
      br(HS, BCSKIP);
      strb(cnt1tmp, Address(sp, ch1));
    BIND(BCSKIP);
      cmp(cnt1tmp, cnt1end);
      br(LT, BCLOOP);

      // Remember the start of the source so the index can be computed
      // from the final str2 on a match.
      mov(result_tmp, str2);

      // str2end := address of the last position at which a match can start
      sub(cnt2, cnt2, cnt1);
      add(str2end, str2, cnt2, LSL, 1);
    BIND(BMLOOPSTR2);
      // Compare the last pattern char first; skipch keeps the source char
      // aligned with it for the skip computation.
      sub(cnt1tmp, cnt1, 1);
      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
      cmp(ch1, skipch);
      br(NE, BMSKIP);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMMATCH);
    BIND(BMLOOPSTR1);
      // Compare the remaining chars from back to front.
      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
      cmp(ch1, ch2);
      br(NE, BMSKIP);
      subs(cnt1tmp, cnt1tmp, 1);
      br(GE, BMLOOPSTR1);
    BIND(BMMATCH);
      // result := (str2 - original str2) / 2; release the table.
      sub(result_tmp, str2, result_tmp);
      lsr(result, result_tmp, 1);
      add(sp, sp, 128);
      b(DONE);
    BIND(BMADV);
      // skipch >= 128: advance by a single char.
      add(str2, str2, 2);
      b(BMCHECKEND);
    BIND(BMSKIP);
      cmp(skipch, 128);
      br(HS, BMADV);
      // Advance by cnt1 - bc[skipch] chars.
      ldrb(ch2, Address(sp, skipch));
      add(str2, str2, cnt1, LSL, 1);
      sub(str2, str2, ch2, LSL, 1);
    BIND(BMCHECKEND);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, 128);
      b(NOMATCH);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;

        // Patterns shorter than 4 chars are dispatched to DO1/DO2/DO3.
        cmp(cnt1, 4);
        br(LT, DOSHORT);

        sub(cnt2, cnt2, cnt1);
        sub(cnt1, cnt1, 4);
        mov(result_tmp, cnt2);

        // Point str1/str2 at their last candidate 8-byte word and index
        // them with negative byte offsets that count up towards zero.
        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt1_neg, zr, cnt1, LSL, 1);
        sub(cnt2_neg, zr, cnt2, LSL, 1);
        ldr(first, Address(str1, cnt1_neg));

      BIND(FIRST_LOOP);
        // Look for a source position whose first 8 bytes match the
        // pattern's first 8 bytes.
        ldr(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        // The branch below tests the flags from adds; the add in between
        // does not set flags.
        adds(cnt1tmp, cnt1_neg, 8);
        add(cnt2tmp, cnt2_neg, 8);
        br(GE, LAST_WORD);

      BIND(STR1_NEXT);
        ldr(ch1, Address(str1, cnt1tmp));
        ldr(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, 8);
        add(cnt2tmp, cnt2tmp, 8);
        br(LT, STR1_NEXT);

      BIND(LAST_WORD);
        // Compare the final 8 bytes of the pattern.
        ldr(ch1, Address(str1));
        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);

      BIND(DOSHORT);
        // cnt1 < 2 -> DO1, cnt1 > 2 -> DO3, otherwise fall through.
        cmp(cnt1, 2);
        br(LT, DO1);
        br(GT, DO3);
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        // The whole 4-char pattern fits in one 8-byte load.
        ldr(ch1, str1);
        sub(cnt2, cnt2, 4);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if (icnt1 == -1 || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        // The 2-char pattern fits in one 4-byte load.
        ldrw(ch1, str1);
        sub(cnt2, cnt2, 2);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

      BIND(CH1_LOOP);
        ldrw(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if (icnt1 == -1 || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        // first := pattern chars 0-1, ch1 := pattern char 2
        ldrw(first, str1);
        ldrh(ch1, Address(str1, 4));

        sub(cnt2, cnt2, 3);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

      BIND(FIRST_LOOP);
        ldrw(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        // First two chars matched; check the third.
        add(cnt2tmp, cnt2_neg, 4);
        ldrh(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO;
      Label DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        ldrh(ch1, str1);
        cmp(cnt2, 4);
        br(LT, DO1_SHORT);

        // Replicate the pattern char into all four 16-bit lanes of ch1.
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);

        sub(cnt2, cnt2, 4);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

        mov(tmp3, 0x0001000100010001);
      BIND(CH1_LOOP);
        // After the eor a matching char shows up as a zero 16-bit lane.
        // (x - 0x0001...) & ~(x | 0x7fff...) is non-zero when some lane
        // of x is zero.
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Fewer than 8 bytes past the last full step: run the loop once
        // more on the final word (offset 0).  The mov does not alter the
        // flags set by cmp.
        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Convert the position of the first flagged lane into a byte
        // offset and add it to cnt2_neg.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        // Source shorter than 4 chars: compare one char at a time.
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);
      BIND(DO1_LOOP);
        ldrh(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LT, DO1_LOOP);
        // falls through to NOMATCH
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // cnt2_neg is a negative byte offset; halve it (arithmetic shift) to
    // get chars and add it to the saved count.
    add(result, result_tmp, cnt2_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
//
//   str1, str2 - addresses of the first 16-bit char of each string
//   cnt1, cnt2 - lengths in chars
//   result     - receives the difference of the first pair of chars that
//                differ, or cnt1 - cnt2 when the strings agree over the
//                shorter length
//   tmp1       - scratch, holds the length difference
//
// str1, str2, cnt1, cnt2, tmp1 and rscratch2 are overwritten by the
// emitted code.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    Register tmp1) {
  Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
    NEXT_WORD, DIFFERENCE;

  BLOCK_COMMENT("string_compare {");

  // Compute the minimum of the string lengths and save the difference.
  subsw(tmp1, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, 4);
  br(Assembler::LT, SHORT_STRING);

  // Check if the strings start at the same location.
  cmp(str1, str2);
  br(Assembler::EQ, LENGTH_DIFF);

  // Compare longwords
  {
    subw(cnt2, cnt2, 4); // The last longword is a special case

    // Move both string pointers to the last longword of their
    // strings, negate the remaining count, and convert it to bytes.
    lea(str1, Address(str1, cnt2, Address::uxtw(1)));
    lea(str2, Address(str2, cnt2, Address::uxtw(1)));
    sub(cnt2, zr, cnt2, LSL, 1);

    // Loop, loading longwords and comparing them into rscratch2.
    // The br(LT) tests the flags set by adds; eor and cbnz in between
    // leave the flags untouched.
    bind(NEXT_WORD);
    ldr(result, Address(str1, cnt2));
    ldr(cnt1, Address(str2, cnt2));
    adds(cnt2, cnt2, wordSize);
    eor(rscratch2, result, cnt1);
    cbnz(rscratch2, DIFFERENCE);
    br(Assembler::LT, NEXT_WORD);

    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.

    ldr(result, Address(str1));
    ldr(cnt1, Address(str2));
    eor(rscratch2, result, cnt1);
    cbz(rscratch2, LENGTH_DIFF);

    // Find the first different characters in the longwords and
    // compute their difference.
    // rev + clz give the bit position of the first differing byte;
    // masking with -16 rounds that down to a 16-bit char boundary, which
    // is then used to shift the differing chars down to bits 0..15.
    bind(DIFFERENCE);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, -16);
    lsrv(result, result, rscratch2);
    uxthw(result, result);
    lsrv(cnt1, cnt1, rscratch2);
    uxthw(cnt1, cnt1);
    subw(result, result, cnt1);
    b(DONE);
  }

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, LENGTH_DIFF);

  // Fewer than 4 chars: compare one char at a time, post-incrementing
  // both string pointers.
  bind(SHORT_LOOP);
  load_unsigned_short(result, Address(post(str1, 2)));
  load_unsigned_short(cnt1, Address(post(str2, 2)));
  subw(result, result, cnt1);
  cbnz(result, DONE);
  sub(cnt2, cnt2, 1);
  cbnz(cnt2, SHORT_LOOP);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF);
  mov(result, tmp1);

  // That's it
  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}


// Emit code that tests two character sequences of the same length for
// equality.
//
//   str1, str2  addresses of the first character of each sequence
//   cnt         number of 2-byte characters to compare
//   result      receives true if every character matches, zero (false)
//               otherwise
//   tmp1        scratch register
//
// rscratch1 (as tmp2) and rscratch2 are used as additional scratch
// registers.  str1, str2 and cnt are overwritten by the emitted code.
void MacroAssembler::string_equals(Register str1, Register str2,
				   Register cnt, Register result,
				   Register tmp1) {
  Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
    NEXT_WORD;

  const Register tmp2 = rscratch1;
  // All registers touched below must be distinct, including the two
  // scratch registers that are not passed in by the caller.
  assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);

  BLOCK_COMMENT("string_equals {");

  // Start by assuming that the strings are not equal.  Every mismatch
  // path below branches straight to DONE and leaves this value in place.
  mov(result, zr);

  // A very short string: fewer than 4 characters cannot fill one
  // longword load, so it is handled one character at a time.
  cmpw(cnt, 4);
  br(Assembler::LT, SHORT_STRING);

  // Check if the strings start at the same location.  If so there is
  // nothing to compare.
  cmp(str1, str2);
  br(Assembler::EQ, SAME_CHARS);

  // Compare longwords
  {
    subw(cnt, cnt, 4); // The last longword is a special case

    // Move both string pointers to the last longword of their
    // strings, negate the remaining count, and convert it to bytes.
    // After this, cnt is a non-positive byte offset from the last
    // longword back to the start of the string.
    lea(str1, Address(str1, cnt, Address::uxtw(1)));
    lea(str2, Address(str2, cnt, Address::uxtw(1)));
    sub(cnt, zr, cnt, LSL, 1);

    // Loop, loading a longword from each string and XORing the pair
    // into rscratch2; any non-zero bit means the strings differ.
    bind(NEXT_WORD);
    ldr(tmp1, Address(str1, cnt));
    ldr(tmp2, Address(str2, cnt));
    // Advance the offset.  The flags set here are the ones tested by
    // the br(LT) at the bottom of the loop.
    adds(cnt, cnt, wordSize);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DONE);
    // Keep looping while the offset is still negative, i.e. while we
    // have not yet reached the last longword.
    br(Assembler::LT, NEXT_WORD);

    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.

    ldr(tmp1, Address(str1));
    ldr(tmp2, Address(str2));
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, SAME_CHARS);
    // Mismatch in the last longword: result is still zero.
    b(DONE);
  }

  bind(SHORT_STRING);
  // Is the length zero?
  // NOTE(review): the length test above uses the 32-bit cmpw, whereas
  // cbz/sub/cbnz on cnt below are the non-"w" forms; this presumably
  // relies on the upper half of cnt being zero -- confirm.
  cbz(cnt, SAME_CHARS);

  // Compare one character per iteration, post-incrementing both
  // pointers by the 2-byte character size.
  bind(SHORT_LOOP);
  load_unsigned_short(tmp1, Address(post(str1, 2)));
  load_unsigned_short(tmp2, Address(post(str2, 2)));
  subw(tmp1, tmp1, tmp2);
  cbnz(tmp1, DONE);
  sub(cnt, cnt, 1);
  cbnz(cnt, SHORT_LOOP);

  // Strings are equal.
  bind(SAME_CHARS);
  mov(result, true);

  // That's it
  bind(DONE);

  BLOCK_COMMENT("} string_equals");
}


// Compare char[] arrays aligned to 4 bytes
//
// Emits code that sets result to true when the two arrays are equal and
// to false otherwise.
//
//   ary1, ary2  the array references; both are overwritten
//   result      receives the boolean outcome
//   tmp1        scratch register
//
// rscratch1 and rscratch2 are used as additional scratch registers.
//
// Two identical references compare equal (this check comes before the
// null checks, so it also covers the case where both are null).
// Otherwise a null reference or a length mismatch yields false, and the
// element data is compared 8 bytes at a time, followed by an optional
// 4-byte and an optional 2-byte tail.
void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
                                        Register result, Register tmp1)
{
  Register cnt1 = rscratch1;
  // cnt2 is only needed for the length comparison, so the same scratch
  // register is reused as tmp2 for the data comparison afterwards.
  Register cnt2 = rscratch2;
  Register tmp2 = rscratch2;

  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  BLOCK_COMMENT("char_arrays_equals  {");

    // different until proven equal
    mov(result, false);

    // same array?
    cmp(ary1, ary2);
    br(Assembler::EQ, SAME);

    // ne if either null
    cbz(ary1, DIFFER);
    cbz(ary2, DIFFER);

    // lengths ne?
    ldrw(cnt1, Address(ary1, length_offset));
    ldrw(cnt2, Address(ary2, length_offset));
    cmp(cnt1, cnt2);
    br(Assembler::NE, DIFFER);

    // Step both pointers past the array header to the first element.
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));

    // From here on cnt1 holds (#chars left - 4); it goes negative as
    // soon as fewer than 4 chars remain.
    subs(cnt1, cnt1, 4);
    br(LT, TAIL03);

  // Main loop: compare 4 chars (8 bytes) per iteration.
  BIND(NEXT);
    ldr(tmp1, Address(post(ary1, 8)));
    ldr(tmp2, Address(post(ary2, 8)));
    // The flags set here are the ones tested by the br(GE) below.
    subs(cnt1, cnt1, 4);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DIFFER);
    br(GE, NEXT);

  // Subtracting multiples of 4 leaves the low two bits of the count
  // unchanged, so they can still be tested directly on cnt1.
  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
    tst(cnt1, 0b10);
    br(EQ, TAIL01);
    // Bit 1 set: compare 2 chars (4 bytes).
    ldrw(tmp1, Address(post(ary1, 4)));
    ldrw(tmp2, Address(post(ary2, 4)));
    cmp(tmp1, tmp2);
    br(NE, DIFFER);
  BIND(TAIL01);  // 0-1 chars left
    tst(cnt1, 0b01);
    br(EQ, SAME);
    // Bit 0 set: compare the final char (2 bytes).
    ldrh(tmp1, ary1);
    ldrh(tmp2, ary2);
    cmp(tmp1, tmp2);
    br(NE, DIFFER);

  BIND(SAME);
    mov(result, true);
  BIND(DIFFER);	// result already set
  
  BLOCK_COMMENT("} char_arrays_equals");
}