view hotspot/src/cpu/mips/vm/assembler_mips.cpp @ 27:b7ec29b378c9

Update codes to support deoptimization.
author Ao Qi <aoqi@loongson.cn>
date Thu, 11 Nov 2010 19:59:55 +0800
parents 85b046e5468b
children 15f398a44411
line wrap: on
line source

/*
 * Copyright 1997-2008 Sun Microsystems, Inc.  All Rights Reserved.
 * Copyright 2010 Lemote, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 */

#include "incls/_precompiled.incl"
#include "incls/_assembler_mips.cpp.incl"

// Storage for the static register arrays declared in MacroAssembler: 32 int
// slots and 32 float slots. print() dumps them, and save_registers() /
// restore_registers() reach them through i_offset() / f_offset().
int MacroAssembler::i[32] = {0,};
float MacroAssembler::f[32] = {0.0,};

// Dumps the i[] and f[] arrays to the stream s, one entry per line, with an
// empty line after each group.
void MacroAssembler::print(outputStream *s) {
	unsigned int k;
	for(k=0; k<sizeof(i)/sizeof(i[0]); k++) {
		// k is unsigned and i[k] is an int, so pass arguments whose types
		// really match "%u" and "%lx" instead of relying on int and long
		// having the same size. The value is zero-extended, which keeps the
		// printed text unchanged where int and long are both 32 bits.
		s->print_cr("i%u = 0x%.16lx", k, (unsigned long)(unsigned int)i[k]);
	}
	s->cr();

	for(k=0; k<sizeof(f)/sizeof(f[0]); k++) {
		// f[k] is promoted to double by the varargs call, matching "%f".
		s->print_cr("f%u = %f", k, f[k]);
	}
	s->cr();
}


int MacroAssembler::i_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->i[k]; }
int MacroAssembler::f_offset(unsigned int k) { return (intptr_t)&((MacroAssembler*)0)->f[k]; }
	
// Emits, through masm, one sw per general register 0..31 followed by one swc1
// per floating-point register 0..31. Every store uses A0 as the base register
// and i_offset(k) / f_offset(k) as the displacement.
void MacroAssembler::save_registers(MacroAssembler *masm) {
	int reg;

	// General-purpose registers first ...
	for (reg = 0; reg < 32; ++reg) {
		masm->sw(as_Register(reg), A0, i_offset(reg));
	}

	// ... then the floating-point registers.
	for (reg = 0; reg < 32; ++reg) {
		masm->swc1(as_FloatRegister(reg), A0, f_offset(reg));
	}
}

// Emits, through masm, one lw per general register 0..31 followed by one lwc1
// per floating-point register 0..31. Every load uses A0 as the base register
// and i_offset(k) / f_offset(k) as the displacement.
// NOTE(review): A0 itself is among the registers reloaded by the first loop
// while it is still used as the base of the later loads -- confirm.
void MacroAssembler::restore_registers(MacroAssembler *masm) {
	int reg;

	// General-purpose registers first ...
	for (reg = 0; reg < 32; ++reg) {
		masm->lw(as_Register(reg), A0, i_offset(reg));
	}

	// ... then the floating-point registers.
	for (reg = 0; reg < 32; ++reg) {
		masm->lwc1(as_FloatRegister(reg), A0, f_offset(reg));
	}
}


// Implementation of AddressLiteral

// Builds a non-lval literal for target and selects the relocation spec that
// corresponds to rtype. Relocation types not listed below are a programming
// error (ShouldNotReachHere).
AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;
  _target = target;

  switch (rtype) {
  // Types that leave _rspec untouched.
  case relocInfo::none:
  case relocInfo::oop_type:
    // Oops are a special case. Normally they would be their own section
    // but in cases like icBuffer they are literals in the code stream that
    // we don't have a section for. We use none so that we get a literal address
    // which is always patchable.
    break;

  // Types whose spec is derived from the target address.
  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;
  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;

  // Call types: the spec does not depend on the target.
  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;
  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;
  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;

  // Safepoint polls use the simple spec for their own type.
  case relocInfo::poll_return_type:
  case relocInfo::poll_type:
    _rspec = Relocation::spec_simple(rtype);
    break;

  default:
    ShouldNotReachHere();
    break;
  }
}

// Implementation of Address

#ifdef _LP64

// _LP64 variant: converting an ArrayAddress into an Address is not supported
// here. Reaching this function is treated as a bug (ShouldNotReachHere); the
// default-constructed Address is returned only to satisfy the signature.
Address Address::make_array(ArrayAddress adr) {
  // Not implementable on 64bit machines
  // Should have been handled higher up the call chain.
  ShouldNotReachHere();
  return Address();
}

// Exceedingly dangerous constructor (_LP64 variant): builds an address with no
// base and no index register, only the displacement disp, and attaches the
// relocation spec selected by rtype (derived from loc where the type needs it).
// Relocation types not listed below are a programming error.
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;

  switch (rtype) {
    case relocInfo::none:
      // no relocation: _rspec is left as it is
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::poll_return_type:
    case relocInfo::poll_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    default:
      ShouldNotReachHere();
  }
}
#else // LP64

// 32-bit variant: folds an ArrayAddress into a single Address. The result
// keeps the base register, index register and scale of adr.index(), uses the
// literal target of adr.base() as its displacement, and inherits the literal's
// relocation spec. The index part must not carry a displacement of its own.
Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  // The absolute address of the array base becomes the displacement.
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}

// Exceedingly dangerous constructor (32-bit variant): builds an absolute
// address. There is no base or index register; the pointer value loc itself is
// stored as the displacement and spec is recorded as the relocation.
Address::Address(address loc, RelocationHolder spec) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = (intptr_t) loc;
  _rspec = spec;
}

#endif // _LP64


/*
// Convert the raw encoding form into the form expected by the constructor for
// Address.  An index of 4 (rsp) corresponds to having no index, so convert
// that to noreg for the Address constructor.
Address Address::make_raw(int base, int index, int scale, int disp) {
  bool valid_index = index != rsp->encoding();
  if (valid_index) {
    Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
    return madr;
  } else {
    Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
    return madr;
  }
}
*/

// Implementation of Assembler
// Mnemonics indexed by the 6-bit primary opcode field (64 entries, eight per
// row). An empty string marks an opcode that has no name in this table.
// Entry 22 (0x16) is BLEZL; it used to be misspelled "bleql".
const char *Assembler::ops_name[] = {
	"special",  "regimm",   "j",      "jal",    "beq",      "bne",      "blez",   "bgtz",
	"addi",     "addiu",    "slti",   "sltiu",  "andi",     "ori",      "xori",   "lui",
	"cop0",     "cop1",     "cop2",   "cop3",   "beql",     "bnel",     "blezl",  "bgtzl",
	"daddi",    "daddiu",   "ldl",    "ldr",    "",         "",         "",       "",
	"lb",       "lh",       "lwl",    "lw",     "lbu",      "lhu",      "lwr",    "lwu",
	"sb",       "sh",       "swl",    "sw",     "sdl",      "sdr",      "swr",    "cache",
	"ll",       "lwc1",     "",       "",       "lld",      "ldc1",     "",       "ld",
	"sc",       "swc1",     "",       "",       "scd",      "sdc1",     "",       "sd"
};

// Mnemonics for SPECIAL-opcode instructions, indexed by the 6-bit function
// field (64 entries, eight per row). An empty string marks a function code
// that has no name in this table.
// Entries 20, 22 and 23 (0x14, 0x16, 0x17) are the variable-shift forms
// DSLLV / DSRLV / DSRAV; they used to duplicate the names of the
// immediate-shift forms, which live at 56, 58 and 59.
const char* Assembler::special_name[] = {
	"sll",      "",         "srl",      "sra",      "sllv",     "",         "srlv",     "srav",
	"jr",       "jalr",     "",         "",         "syscall",  "break",    "",         "sync",
	"mfhi",     "mthi",     "mflo",     "mtlo",     "dsllv",    "",         "dsrlv",    "dsrav",
	"mult",     "multu",    "div",      "divu",     "dmult",    "dmultu",   "ddiv",     "ddivu",
	"add",      "addu",     "sub",      "subu",     "and",      "or",       "xor",      "nor",
	"",         "",         "slt",      "sltu",     "dadd",     "daddu",    "dsub",     "dsubu",
	"tge",      "tgeu",     "tlt",      "tltu",     "teq",      "",         "tne",      "",
	"dsll",     "",         "dsrl",     "dsra",     "dsll32",   "",         "dsrl32",   "dsra32"
};

// Mnemonics for REGIMM-opcode instructions, laid out eight per row. An empty
// string marks a slot without a name.
// Note: the table has only 20 entries (indices 0..19), so an index taken from
// a 5-bit field must be range-checked before it is used here.
const char* Assembler::regimm_name[] = {
	"bltz",     "bgez",     "bltzl",    "bgezl",    "",         "",         "",         "",
	"tgei",     "tgeiu",    "tlti",     "tltiu",    "teqi",     "",         "tnei",     "",
	"bltzal",   "bgezal",   "bltzall",  "bgezall"
};
	
// Mnemonics for floating-point operations (16 entries, eight per row).
// NOTE(review): presumably indexed by the low function-code bits of a COP1
// arithmetic instruction -- no user of this table is visible here; confirm.
const char* Assembler::float_name[] = {
	"add",			"sub",			"mul",			"div",			"sqrt",			"abs",			"mov",			"neg",
	"round.l",	"trunc.l",	"ceil.l",		"floor.l",	"round.w",  "trunc.w",	"ceil.w",		"floor.w"
};

//misleading name, print only branch/jump instruction 
void Assembler::print_instruction(int inst) {
	const char *s;
	switch( opcode(inst) ) {
	default:
		s = ops_name[opcode(inst)];
		break;
	case special_op:
		s = special_name[special(inst)];
		break;
	case regimm_op:
		s = special_name[rt(inst)];
		break;
	}

	::tty->print("%s", s);
}

//without check, maybe fixed
int Assembler::patched_branch(int dest_pos, int inst, int inst_pos) {
	int v = (dest_pos - inst_pos - 4)>>2;
	switch(opcode(inst)) {
	case j_op:
	case jal_op:
		assert(false, "should not use j/jal here");
		break;
	default:
		v = low16(v);
		inst &= 0xffff0000;
		break;
	}

	return inst | v;
}

// Decodes the target position of the branch instruction inst located at pos.
// The low 16 bits of inst are sign-extended and taken as a word offset
// relative to the instruction following the branch (pos + 4). An offset of
// zero is reported as 0. j/jal are not supported: they trip an assert, and
// where asserts are compiled out they also yield 0.
int Assembler::branch_destination(int inst, int pos) {
	// Initialised so that the unsupported j/jal path below never reads an
	// indeterminate value when the assert is compiled out.
	int off = 0;
	
	switch(opcode(inst)) {
	case j_op:
	case jal_op:
		assert(false, "should not use j/jal here");
		break;
	default:
		off = expand(low16(inst), 15);
		break;
	}
	
	return off ? pos + 4 + (off<<2) : 0;
}

// Byte value used to fill unused code space.
// Note: a word of four such bytes is 0x00000000, which decodes as opcode
// "special" / function "sll" per the tables in this file (i.e. the MIPS nop
// encoding), not an illegal instruction as the original comment claimed.
int AbstractAssembler::code_fill_byte() {
	  return 0x00;                  // fill word is 0x00000000
}

// Now the Assembler instruction (identical for 32/64 bits)

// Address-operand form of lb: forwards to lb(rt, base, disp) using src.base()
// and src.disp(). Any index or scale in src is not consulted.
void Assembler::lb(Register rt, Address src) {
	lb(rt, src.base(), src.disp());
}

// Address-operand form of lbu: forwards to lbu(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lbu(Register rt, Address src) {
	lbu(rt, src.base(), src.disp());
}

// Address-operand form of ld: forwards to ld(rt, base, disp) using src.base()
// and src.disp(). Any index or scale in src is not consulted.
void Assembler::ld(Register rt, Address src){
	ld(rt, src.base(), src.disp());
}

// Address-operand form of ldl: forwards to ldl(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::ldl(Register rt, Address src){
	ldl(rt, src.base(), src.disp());
}

// Address-operand form of ldr: forwards to ldr(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::ldr(Register rt, Address src){
	ldr(rt, src.base(), src.disp());
}

// Address-operand form of lh: forwards to lh(rt, base, disp) using src.base()
// and src.disp(). Any index or scale in src is not consulted.
void Assembler::lh(Register rt, Address src){
	lh(rt, src.base(), src.disp());
}

// Address-operand form of lhu: forwards to lhu(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lhu(Register rt, Address src){
	lhu(rt, src.base(), src.disp());
}

// Address-operand form of ll: forwards to ll(rt, base, disp) using src.base()
// and src.disp(). Any index or scale in src is not consulted.
void Assembler::ll(Register rt, Address src){
	ll(rt, src.base(), src.disp());
}

// Address-operand form of lld: forwards to lld(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lld(Register rt, Address src){
	lld(rt, src.base(), src.disp());
}

// Address-operand form of lw: forwards to lw(rt, base, disp) using src.base()
// and src.disp(). Any index or scale in src is not consulted.
void Assembler::lw(Register rt, Address src){
	lw(rt, src.base(), src.disp());
}
// Load effective address: emits a single add-immediate that puts
// src.base() + src.disp() into rt. Any index or scale in src is not
// consulted.
// addiu is used instead of addi: addi raises an Integer Overflow exception
// when the signed sum overflows, which is never wanted for address
// arithmetic, while addiu produces the same 32-bit sum without trapping.
void Assembler::lea(Register rt, Address src) {
	addiu(rt, src.base(), src.disp());
}

// Address-operand form of lwl: forwards to lwl(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lwl(Register rt, Address src){
	lwl(rt, src.base(), src.disp());
}

// Address-operand form of lwr: forwards to lwr(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lwr(Register rt, Address src){
	lwr(rt, src.base(), src.disp());
}

// Address-operand form of lwu: forwards to lwu(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lwu(Register rt, Address src){
	lwu(rt, src.base(), src.disp());
}

// Address-operand form of sb: forwards to sb(rt, base, disp) using dst.base()
// and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sb(Register rt, Address dst) {
	sb(rt, dst.base(), dst.disp());
}

// Address-operand form of sc: forwards to sc(rt, base, disp) using dst.base()
// and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sc(Register rt, Address dst) {
	sc(rt, dst.base(), dst.disp());
}

// Address-operand form of scd: forwards to scd(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::scd(Register rt, Address dst) {
	scd(rt, dst.base(), dst.disp());
}

// Address-operand form of sd: forwards to sd(rt, base, disp) using dst.base()
// and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sd(Register rt, Address dst) {
	sd(rt, dst.base(), dst.disp());
}

// Address-operand form of sdl: forwards to sdl(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sdl(Register rt, Address dst) {
	sdl(rt, dst.base(), dst.disp());
}

// Address-operand form of sdr: forwards to sdr(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sdr(Register rt, Address dst) {
	sdr(rt, dst.base(), dst.disp());
}

// Address-operand form of sh: forwards to sh(rt, base, disp) using dst.base()
// and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sh(Register rt, Address dst) {
	sh(rt, dst.base(), dst.disp());
}

// Address-operand form of sw: forwards to sw(rt, base, disp) using dst.base()
// and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sw(Register rt, Address dst) {
	sw(rt, dst.base(), dst.disp());
}

// Address-operand form of swl: forwards to swl(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::swl(Register rt, Address dst) {
	swl(rt, dst.base(), dst.disp());
}

// Address-operand form of swr: forwards to swr(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::swr(Register rt, Address dst) {
	swr(rt, dst.base(), dst.disp());
}

// Address-operand form of lwc1: forwards to lwc1(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::lwc1(FloatRegister rt, Address src) {
	lwc1(rt, src.base(), src.disp());
}

// Address-operand form of ldc1: forwards to ldc1(rt, base, disp) using
// src.base() and src.disp(). Any index or scale in src is not consulted.
void Assembler::ldc1(FloatRegister rt, Address src) {
	ldc1(rt, src.base(), src.disp());
}

// Address-operand form of swc1: forwards to swc1(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::swc1(FloatRegister rt, Address dst) {
	swc1(rt, dst.base(), dst.disp());
}

// Address-operand form of sdc1: forwards to sdc1(rt, base, disp) using
// dst.base() and dst.disp(). Any index or scale in dst is not consulted.
void Assembler::sdc1(FloatRegister rt, Address dst) {
	sdc1(rt, dst.base(), dst.disp());
}

// Emits a j instruction to entry. The encoded field is the word distance of
// entry from the start of the 256MB region (upper four address bits) that
// contains the instruction following the jump.
// Note: nothing here checks that entry lies in that region; an out-of-region
// target yields a field wider than 26 bits, which is OR-ed into the opcode
// bits.
void Assembler::j(address entry) {
	// int & unsigned literal: the region base is an unsigned quantity, so the
	// subtraction and the shift below are unsigned as well.
	const unsigned int region_base = ((int)pc() + 4) & 0xf0000000;
	int dest = ((int)entry - region_base)>>2;
	emit_long((j_op<<26) | dest);
	has_delay_slot();
}

// Emits a jal instruction to entry. The encoded field is the word distance of
// entry from the start of the 256MB region (upper four address bits) that
// contains the instruction following the jump.
// Note: nothing here checks that entry lies in that region; an out-of-region
// target yields a field wider than 26 bits, which is OR-ed into the opcode
// bits.
void Assembler::jal(address entry) {
	// int & unsigned literal: the region base is an unsigned quantity, so the
	// subtraction and the shift below are unsigned as well.
	const unsigned int region_base = ((int)pc() + 4) & 0xf0000000;
	int dest = ((int)entry - region_base)>>2;
	emit_long((jal_op<<26) | dest);
	has_delay_slot();
}









// Implementation of MacroAssembler

// First all the versions that have distinct versions depending on 32/64 bit
// Unless the difference is trivial (1 line or so).

//#ifndef _LP64

// 32bit versions

// Register-indexed ld_ptr: computes base + offset into AT and then performs
// ld_ptr(rt, 0, AT). Clobbers AT.
void MacroAssembler::ld_ptr(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  ld_ptr(rt, 0, AT);
}

// Register-indexed st_ptr: computes base + offset into AT and then performs
// st_ptr(rt, 0, AT). Clobbers AT.
void MacroAssembler::st_ptr(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  st_ptr(rt, 0, AT);
}

// Register-indexed ld_long: computes base + offset into AT and then performs
// ld_long(rt, 0, AT). Clobbers AT.
void MacroAssembler::ld_long(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  ld_long(rt, 0, AT);
}

// Register-indexed st_long: computes base + offset into AT and then performs
// st_long(rt, 0, AT). Clobbers AT.
void MacroAssembler::st_long(Register rt, Register offset, Register base) {
  addu_long(AT, base, offset);
  st_long(rt, 0, AT);
}

// Converts a literal into an Address built from its target and its relocation
// spec, via the Address(address, RelocationHolder) constructor.
// NOTE(review): in this file that constructor is only defined in the non-_LP64
// branch -- confirm this function is never compiled for _LP64.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// Converts an ArrayAddress into an Address by delegating to
// Address::make_array.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

// Emits the biased-locking part of a monitor enter.
//
//   lock_reg  - base register of the slot in which the mark word is parked
//               (saved_mark_addr is Address(lock_reg, 0)); when tmp_reg is
//               noreg it also serves as the scratch register and is then
//               pushed/popped around each use
//   obj_reg   - the object whose header is inspected and CAS-ed
//   swap_reg  - holds the mark word (loaded here unless swap_reg_contains_mark)
//               and is the compare value handed to cmpxchg
//   tmp_reg   - scratch register, or noreg to borrow lock_reg
//   swap_reg_contains_mark - true if the caller already loaded the mark word
//   done      - target taken when the xor test shows the bias is already ours
//               and after the acquire / rebias attempts
//   slow_case - if non-NULL, target taken when AT is zero after a cmpxchg
//   counters  - not referenced by this implementation
//
// AT is clobbered. The emitted code falls out at the bottom (cas_label) when
// the header does not carry the bias pattern or after a revoke attempt, so the
// caller is expected to continue with its regular locking code.
//
// Returns the code offset recorded immediately before the first load through
// obj_reg (the mark load, or the klass load when swap_reg_contains_mark).
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
	assert(UseBiasedLocking, "why call this otherwise?");
	//assert(swap_reg == eax, "swap_reg must be eax for cmpxchg");
	assert_different_registers(lock_reg, obj_reg, swap_reg);
	bool need_tmp_reg = false;
	if (tmp_reg == noreg) {
		// No scratch register supplied: borrow lock_reg and preserve it with
		// push/pop around every use below.
		need_tmp_reg = true;
		tmp_reg = lock_reg;
	} else {
		assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
	}
	assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
	Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
	Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
	Address saved_mark_addr(lock_reg, 0);

	// Biased locking
	// See whether the lock is currently biased toward our thread and
	// whether the epoch is still valid
	// Note that the runtime guarantees sufficient alignment of JavaThread
	// pointers to allow age to be placed into low bits
	// First check to see whether biasing is even enabled for this object
	Label cas_label;
	int null_check_offset = -1;
	if (!swap_reg_contains_mark) {
		// The mark load is the first access through obj_reg.
		null_check_offset = offset();
		//   movl(swap_reg, mark_addr);
		lw(swap_reg, mark_addr);
	}

	if (need_tmp_reg) {
		// pushl(tmp_reg);
		push(tmp_reg);
	}
	//movl(tmp_reg, swap_reg);
	move(tmp_reg, swap_reg);
	//andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
	andi(tmp_reg,tmp_reg, markOopDesc::biased_lock_mask_in_place);
	//cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
	// AT = biased_lock_pattern - (mark & biased_lock_mask); zero means the
	// bias pattern is present.
	addi(AT, ZERO,markOopDesc::biased_lock_pattern);
	sub(AT, AT, tmp_reg);
	if (need_tmp_reg) {
		// popl(tmp_reg);
		pop(tmp_reg);
	}

	//jcc(Assembler::notEqual, cas_label);
	bne(AT,ZERO,cas_label);
	delayed()->nop();

	// The bias pattern is present in the object's header. Need to check
	// whether the bias owner and the epoch are both still current.
	// Note that because there is no current thread register on x86 we
	// need to store off the mark word we read out of the object to
	// avoid reloading it and needing to recheck invariants below. This
	// store is unfortunate but it makes the overall code shorter and
	// simpler.
	// (Comment inherited from the x86 version this code was ported from.)
	// movl(saved_mark_addr, swap_reg);
	sw(swap_reg,saved_mark_addr); 
	if (need_tmp_reg) {
		//pushl(tmp_reg);
		push(tmp_reg);
	}
	get_thread(tmp_reg);
	//xorl(swap_reg, tmp_reg);
	xorr(swap_reg,swap_reg, tmp_reg);
	if (swap_reg_contains_mark) {
		// The caller loaded the mark, so the klass load below is the first
		// access through obj_reg.
		null_check_offset = offset();
	}
	// movl(tmp_reg, klass_addr);
	lw(tmp_reg,klass_addr); 
	// xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() 
	// + klassOopDesc::klass_part_offset_in_bytes()));
	//xori(swap_reg, swap_reg,Address(tmp_reg, Klass::prototype_header_offset_in_bytes() 
	//+ klassOopDesc::klass_part_offset_in_bytes()));

	// swap_reg = (mark ^ thread ^ prototype_header) with the age bits masked
	// out; zero means the object is biased to this thread in the current epoch.
	lw(AT, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() 
				+ klassOopDesc::klass_part_offset_in_bytes()));
	xorr(swap_reg,swap_reg,AT); 
	// andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
        move(AT, ~((int) markOopDesc::age_mask_in_place)); 
	andr(swap_reg,swap_reg,AT);

	if (need_tmp_reg) {
		//popl(tmp_reg);
		pop(tmp_reg);
	}
	if (PrintBiasedLockingStatistics) {
		//FIXME
		//cond_incl(ZERO, Address((int) BiasedLocking::biased_lock_entry_count_addr(), 
		//relocInfo::none));
	}
	//  jcc(Assembler::equal, done);
	// The x86 version branched on the flags set by the andl above; here the
	// masked xor result in swap_reg is tested directly.
	//beq(tmp_reg,ZERO, done);
	beq(swap_reg,ZERO, done);
	delayed()->nop();

	Label try_revoke_bias;
	Label try_rebias;

	// At this point we know that the header has the bias pattern and
	// that we are not the bias owner in the current epoch. We need to
	// figure out more details about the state of the header in order to
	// know what operations can be legally performed on the object's
	// header.

	// If the low three bits in the xor result aren't clear, that means
	// the prototype header is no longer biased and we have to revoke
	// the bias on this object.

	//testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
	//jcc(Assembler::notZero, try_revoke_bias);
        move(AT, markOopDesc::biased_lock_mask_in_place); 	
	andr(AT,swap_reg,AT );
	bne(AT,ZERO,try_revoke_bias); 
	delayed()->nop();

	// Biasing is still enabled for this data type. See whether the
	// epoch of the current bias is still valid, meaning that the epoch
	// bits of the mark word are equal to the epoch bits of the
	// prototype header. (Note that the prototype header's epoch bits
	// only change at a safepoint.) If not, attempt to rebias the object
	// toward the current thread. Note that we must be absolutely sure
	// that the current epoch is invalid in order to do this because
	// otherwise the manipulations it performs on the mark word are
	// illegal.

	// testl(swap_reg, markOopDesc::epoch_mask_in_place);
	//jcc(Assembler::notZero, try_rebias);
	move(AT, markOopDesc::epoch_mask_in_place);
	andr(AT,swap_reg,AT);
	bne(AT,ZERO,try_rebias);
	delayed()->nop();

	// The epoch of the current bias is still valid but we know nothing
	// about the owner; it might be set or it might be clear. Try to
	// acquire the bias of the object using an atomic operation. If this
	// fails we will go in to the runtime to revoke the object's bias.
	// Note that we first construct the presumed unbiased header so we
	// don't accidentally blow away another thread's valid bias.

	//movl(swap_reg, saved_mark_addr);
	lw(swap_reg, saved_mark_addr);

	//  andl(swap_reg,markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
	move(AT, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);  
	andr(swap_reg,swap_reg,AT);

	if (need_tmp_reg) {
		// pushl(tmp_reg);
		push(tmp_reg);
	}
	get_thread(tmp_reg);
	//orl(tmp_reg, swap_reg);
	orr(tmp_reg,tmp_reg, swap_reg);
	//if (os::is_MP()) {
	// lock();
	//}    
	//cmpxchg(tmp_reg, Address(obj_reg));
	// swap_reg plays the role eax has on x86: it carries the expected value.
	cmpxchg(tmp_reg, Address(obj_reg, 0),swap_reg);
	if (need_tmp_reg) {
		//popl(tmp_reg);
		pop(tmp_reg);
	}
	// If the biasing toward our thread failed, this means that
	// another thread succeeded in biasing it toward itself and we
	// need to revoke that bias. The revocation will occur in the
	// interpreter runtime in the slow case.
	if (PrintBiasedLockingStatistics) {
		//FIXME 
		// cond_incl(ZERO, Address((int) BiasedLocking::anonymously_biased_lock_entry_count_addr(), relocInfo::none));
	}
	if (slow_case != NULL) {
		//jcc(Assembler::notZero, *slow_case);
		// NOTE(review): this relies on cmpxchg leaving AT == 0 on failure;
		// cmpxchg is not visible here -- confirm.
		beq(AT,ZERO, *slow_case);
		delayed()->nop();
	}
	//jmp(done);
	b(done);
	delayed()->nop();

	bind(try_rebias);
	// At this point we know the epoch has expired, meaning that the
	// current "bias owner", if any, is actually invalid. Under these
	// circumstances _only_, we are allowed to use the current header's
	// value as the comparison value when doing the cas to acquire the
	// bias in the current epoch. In other words, we allow transfer of
	// the bias from one thread to another directly in this situation.
	//
	// FIXME: due to a lack of registers we currently blow away the age
	// bits in this situation. Should attempt to preserve them.
	if (need_tmp_reg) {
		// pushl(tmp_reg);
		push(tmp_reg);
	}
	get_thread(tmp_reg);
	//movl(swap_reg, klass_addr);
	lw(swap_reg, klass_addr);
	// orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset_in_bytes() 
	// + klassOopDesc::klass_part_offset_in_bytes()));
	lw(AT,Address(swap_reg, Klass::prototype_header_offset_in_bytes() 
				+ klassOopDesc::klass_part_offset_in_bytes()));
	orr(tmp_reg,tmp_reg,AT);  
	// movl(swap_reg, saved_mark_addr);
	// NOTE(review): when need_tmp_reg is set, tmp_reg aliases lock_reg, which
	// get_thread/orr above have already overwritten, yet saved_mark_addr is
	// based on lock_reg -- confirm this load reads the intended slot.
	lw(swap_reg, saved_mark_addr);

	// if (os::is_MP()) {
	//  lock();
	//}    
	// cmpxchg(tmp_reg, Address(obj_reg));
	cmpxchg(tmp_reg, Address(obj_reg, 0),swap_reg);
	if (need_tmp_reg) {
		//    popl(tmp_reg);
		pop(tmp_reg);
	}
	// If the biasing toward our thread failed, then another thread
	// succeeded in biasing it toward itself and we need to revoke that
	// bias. The revocation will occur in the runtime in the slow case.
	if (PrintBiasedLockingStatistics) {
		//FIXME 
		//cond_incl(ZERO, Address((int) BiasedLocking::rebiased_lock_entry_count_addr(), 
		//relocInfo::none));
	}
	if (slow_case != NULL) {
		//jcc(Assembler::notZero, *slow_case);
		beq(AT,ZERO, *slow_case);
		delayed()->nop();
	}
	//jmp(done);

	b(done);
	delayed()->nop();
	bind(try_revoke_bias);
	// The prototype mark in the klass doesn't have the bias bit set any
	// more, indicating that objects of this data type are not supposed
	// to be biased any more. We are going to try to reset the mark of
	// this object to the prototype value and fall through to the
	// CAS-based locking scheme. Note that if our CAS fails, it means
	// that another thread raced us for the privilege of revoking the
	// bias of this particular object, so it's okay to continue in the
	// normal locking code.
	//
	// FIXME: due to a lack of registers we currently blow away the age
	// bits in this situation. Should attempt to preserve them.
	// movl(swap_reg, saved_mark_addr);
	lw(swap_reg, saved_mark_addr);

	if (need_tmp_reg) {
		//pushl(tmp_reg);
		push(tmp_reg);
	}
	//movl(tmp_reg, klass_addr);
	lw(tmp_reg, klass_addr);
	//movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
	lw(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() 
				+ klassOopDesc::klass_part_offset_in_bytes()));
	//if (os::is_MP()) {
	// lock();
	//}    
	//cmpxchg(tmp_reg, Address(obj_reg));
	cmpxchg(tmp_reg, Address(obj_reg, 0),swap_reg);
	if (need_tmp_reg) {
		//popl(tmp_reg);
		pop(tmp_reg);
	}
	// Fall through to the normal CAS-based lock, because no matter what
	// the result of the above CAS, some thread must have succeeded in
	// removing the bias bit from the object's header.
	if (PrintBiasedLockingStatistics) {
		//FIXME
		//cond_incl(ZERO, Address((int) BiasedLocking::revoked_lock_entry_count_addr(), relocInfo::none));
	}

	bind(cas_label);
	return null_check_offset;
}

// Emits the biased-locking check of a monitor exit: loads the mark word of
// obj_reg into temp_reg, masks it with biased_lock_mask_in_place and branches
// to done if the result equals biased_lock_pattern. Otherwise the emitted code
// falls through. Clobbers temp_reg and AT.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
	// Note: we do not have to check the thread ID for two reasons.
	// First, the interpreter checks for IllegalMonitorStateException at
	// a higher level. Second, if the bias was revoked while we held the
	// lock, the object could not be rebiased toward another thread, so
	// the bias bit would be clear.
	//movl(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
	lw(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
	//andl(temp_reg, markOopDesc::biased_lock_mask_in_place);
	andi(temp_reg, temp_reg,markOopDesc::biased_lock_mask_in_place);
	// cmpl(temp_reg, markOopDesc::biased_lock_pattern);
	// AT holds the pattern to compare the masked mark bits against.
	addi(AT,ZERO,markOopDesc::biased_lock_pattern);
	//jcc(Assembler::equal, done);

	beq(AT,temp_reg,done);
	delayed()->nop();
}

// NOTE: unlike the x86 version we don't increment SP after the call; this may be a problem. FIXME.
// by yjl 6/27/2005
// The stack pointer adjustment is needed; see InterpreterMacroAssembler::super_call_VM_leaf.
// by yjl 7/11/2005
// This method now handles the stack adjustment itself, so callers no longer need to reserve
// stack space for the arguments.
// by yjl 8/1/2005
//
// Emits a call to entry_point with the stack prepared here:
//   1. SP is saved in the thread's last_Java_sp slot,
//   2. number_of_arguments words are reserved below SP (at most 4 arguments),
//   3. SP is rounded down to a multiple of 8,
//   4. entry_point is called with a nop in the delay slot,
//   5. SP is reloaded from the thread's last_Java_sp slot.
// Clobbers AT; without OPT_THREAD it also clobbers T8, which is used to hold
// the current thread and is re-fetched after the call.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  //call(RuntimeAddress(entry_point));
  //increment(rsp, number_of_arguments * wordSize);
#ifndef OPT_THREAD
  Register java_thread = T8;
  get_thread(java_thread);
#else
  Register java_thread = TREG;
#endif

// save stack pointer
  assert(number_of_arguments <= 4, "just check");
	sw(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));

	// reserve space for the arguments, then align SP down to 8 bytes
	if (number_of_arguments)
	  addi(SP, SP, - number_of_arguments * wordSize);
	move(AT, -8); 
	andr(SP, SP, AT); 
												  
	call(entry_point, relocInfo::runtime_call_type);
	delayed()->nop();
														  
#ifndef OPT_THREAD
	// java_thread is fetched again rather than assumed to survive the call
	get_thread(java_thread);
#endif
	// restore the stack pointer saved above
	lw(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset()));
}


// FIXME: not sure which register should be used for jr; T9 is used for now.
// by yjl 6/27/2005
//
// Emits an unconditional jump to entry: the address is loaded into T9 and
// followed by jr(T9). Clobbers T9. No instruction is emitted after the jr
// here.
// NOTE(review): presumably the caller fills the delay slot -- confirm.
void MacroAssembler::jmp(address entry) {
	//if (fit_in_jal((entry - pc() - 4))/4) {
	//	j(entry);
	//} else {
		move(T9, (int)entry);
		jr(T9);
	//}
}

// FIXME: not sure which register should be used for jr; T9 is used for now.
// by yjl 6/27/2005
//
// Emits an unconditional jump to entry with relocation type rtype.
// For runtime_call_type and none this is the plain jmp(entry). For every other
// type a relocation is recorded and the address is materialised with a fixed
// lui/addiu pair into T9 before the jr. Clobbers T9.
void MacroAssembler::jmp(address entry, relocInfo::relocType rtype) {
	switch (rtype) {
		case relocInfo::runtime_call_type:
		case relocInfo::none:
			jmp(entry);
			break;
		default:
			{
			InstructionMark im(this);
			relocate(rtype);
			// explicit lui/addiu pair instead of move(T9, (int)entry)
			//move(T9, (int)entry);
			lui(T9, Assembler::split_high((int)entry));
			addiu(T9, T9, Assembler::split_low((int)entry));
			jr(T9);
			}
			break;
	}
}

// Emits a call to entry: the address is loaded into T9 and followed by jalr().
// Clobbers T9. No delay-slot instruction is emitted here; the callers in this
// file follow the call with delayed()->nop().
void MacroAssembler::call(address entry) {
	// C/C++ code assumes T9 holds its entry point, so the entry is always moved to T9.
	// Maybe there is a more graceful way to handle this. FIXME
	// by yjl 6/27/2005
	move(T9, (int)entry);
	jalr();
}

// Emits a call to entry with relocation type rtype.
// For runtime_call_type and none the address is loaded into T9 with move() and
// no relocation is recorded. For every other type a relocation is recorded and
// the address is materialised with a fixed lui/addiu pair. Clobbers T9. No
// delay-slot instruction is emitted here.
void MacroAssembler::call(address entry, relocInfo::relocType rtype) {
	switch (rtype) {
	case relocInfo::runtime_call_type:
	case relocInfo::none:
	//	call(entry);
		// same sequence as call(address), written out inline
		move(T9, (int)entry);
		jalr();	
		break;
	default:
		{
			InstructionMark im(this);
			relocate(rtype);
			// explicit lui/addiu pair instead of move(T9, (int)entry)
			//move(T9, (int)entry);
			lui(T9, Assembler::split_high((int)entry));
			addiu(T9, T9, Assembler::split_low((int)entry));
			jalr();
		}
		break;
	}
}

// Emits a call to entry using the relocation described by rh.
// If rh's type is runtime_call_type or none this is the plain call(entry) and
// rh is not recorded. For every other type rh is recorded and the address is
// materialised with a fixed lui/addiu pair. Clobbers T9. No delay-slot
// instruction is emitted here.
void MacroAssembler::call(address entry, RelocationHolder& rh)
{
	switch (rh.type()) {
	case relocInfo::runtime_call_type:
	case relocInfo::none:
		call(entry);
		break;
	default:
		{
			InstructionMark im(this);
			relocate(rh);
			// explicit lui/addiu pair instead of move(T9, (int)entry)
			//move(T9, (int)entry);
			lui(T9, Assembler::split_high((int)entry));
			addiu(T9, T9, Assembler::split_low((int)entry));
			jalr();
		}
		break;
	}
}

// Emits code that normalises r to 0 or 1: a zero value is left alone (the
// branch skips the move), any other value is replaced by 1.
void MacroAssembler::c2bool(Register r) {
  Label L;
  Assembler::beq(r, ZERO, L);
  delayed()->nop();
  move(r, 1);
  bind(L);
}

// File-local helper: passes argument 0 by pushing arg through masm.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

// File-local helper: passes argument 1 by pushing arg through masm.
static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

// File-local helper: passes argument 2 by pushing arg through masm.
static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

// File-local helper: passes argument 3 by pushing arg through masm.
static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Debug hook: runs with the current thread temporarily switched to
// _thread_in_vm and switched back to its previous state on return.
//
// With ShowMessageBoxOnError set it optionally prints the bytecode counters,
// shows a message box with msg and, if the user agrees, prints eip (plus
// findpc output in non-product builds) and the eight register values it was
// handed, then hits BREAKPOINT. Otherwise it prints msg as a debug message and
// asserts.
//
// The parameter names are the x86 register names; they are just the values
// the caller passes in.
void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake a in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    // The state switch above already covers this branch. The original
    // repeated it here with shadowing locals whose saved state was never used.
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
      tty->cr();
      findpc(eip);
      tty->cr();
#endif
      tty->print_cr("rax, = 0x%08x", rax);
      tty->print_cr("rbx, = 0x%08x", rbx);
      tty->print_cr("rcx = 0x%08x", rcx);
      tty->print_cr("rdx = 0x%08x", rdx);
      tty->print_cr("rdi = 0x%08x", rdi);
      tty->print_cr("rsi = 0x%08x", rsi);
      tty->print_cr("rbp, = 0x%08x", rbp);
      tty->print_cr("rsp = 0x%08x", rsp);
      BREAKPOINT;
    }
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
    assert(false, "DEBUG MESSAGE");
  }
  // Return the thread to the state it had on entry.
  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
}

void MacroAssembler::debug(char* msg/*, RegistersForDebugging* regs*/) {
	if ( ShowMessageBoxOnError ) {
		JavaThreadState saved_state = JavaThread::current()->thread_state();
		JavaThread::current()->set_thread_state(_thread_in_vm);
		{
			// In order to get locks work, we need to fake a in_VM state
			ttyLocker ttyl;
			::tty->print_cr("EXECUTION STOPPED: %s\n", msg);
			if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
				BytecodeCounter::print();
			}
																						
//			if (os::message_box(msg, "Execution stopped, print registers?"))
//				regs->print(::tty);
		}
		ThreadStateTransition::transition(JavaThread::current(), _thread_in_vm, saved_state);
	}
	else
		::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
}


// Emits code that reports msg through MacroAssembler::debug and then executes
// brk(17). The address of msg is embedded in the generated code as an
// immediate, so the string must stay valid for as long as that code can run.
// Clobbers A0 and whatever the call sequence uses.
void MacroAssembler::stop(const char* msg) {
	move(A0, (int)msg);
	// reserve space for the argument. added by yjl 7/10/2005
	addiu(SP, SP, - 1 * wordSize);
	call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
	delayed()->nop();
	// release the space reserved for the argument
	addiu(SP, SP, 1 * wordSize);
		brk(17);
}

// Emits code that reports msg through MacroAssembler::debug and then
// continues. The whole sequence is bracketed by pushad()/popad(). The address
// of msg is embedded in the generated code as an immediate, so the string must
// stay valid for as long as that code can run.
void MacroAssembler::warn(const char* msg) {
/*
	x86 original, kept for reference:

	push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
*/
	
	//save_registers(this);
	pushad();
	// Park A0 one word below the lowered SP ...
	// NOTE(review): assuming wordSize == 4, SP points exactly at this saved
	// word during the call below -- confirm the callee cannot overwrite it.
	addi(SP, SP, -4);
	sw(A0, SP, -1 * wordSize);
	move(A0, (int)msg);
	// ... reserve one word for the argument and call debug(msg)
	addi(SP, SP, -1 * wordSize);
	call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
	delayed()->nop();
	// undo the stack adjustments in reverse order and reload A0
	addi(SP, SP, 1 * wordSize);
	lw(A0, SP, -1 * wordSize);
	addi(SP, SP, 4);
	popad();
	//restore_registers(this);
}

#ifndef PRODUCT
//FIXME cannot print sp
// Debug helper: emits three runtime calls that print "<name>: 0x", then the
// value of reg (via SharedRuntime::print_int), then " pc: <address>\n".
// Each call is wrapped in pushad()/push(FP) ... pop(FP)/popad().
// The two text buffers are allocated with new[] and never freed here, since the
// emitted code refers to them by address.
void MacroAssembler::print_reg(Register reg) {
  const address print_str_entry = CAST_FROM_FN_PTR(address, SharedRuntime::print_str);
  const address print_int_entry = CAST_FROM_FN_PTR(address, SharedRuntime::print_int);

  // part 1: register name prefix
  pushad();
  push(FP);
  char* name_text = new char[50];
  sprintf(name_text, "%s: 0x",reg->name());
  move(A0, (int)name_text);
  call(print_str_entry, relocInfo::runtime_call_type);
  delayed()->nop();
  pop(FP);
  popad();

  // part 2: register contents
  pushad();
  push(FP);
  move(A0, reg);
  call(print_int_entry, relocInfo::runtime_call_type);
  delayed()->nop();
  pop(FP);
  popad();

  // part 3: code position; pc() is sampled here, after the two pushes above
  pushad();
  push(FP);
  char* pc_text = new char[50];
  sprintf(pc_text, " pc: %p\n",pc());
  move(A0, (int)pc_text);
  call(print_str_entry, relocInfo::runtime_call_type);
  delayed()->nop();
  pop(FP);
  popad();
}
#endif

// Emits reg += imm. Nothing is emitted for imm == 0. Immediates that fit a
// signed 16-bit field use a single addiu; larger ones are first materialized
// in AT and added with addu.
void MacroAssembler::increment(Register reg, int imm) {
  if (imm == 0) {
    return;
  }

  if (!is_simm16(imm)) {
    move(AT, imm);
    addu(reg, reg, AT);
    return;
  }

  addiu(reg, reg, imm);
}

// Emits reg -= imm by delegating to increment() with the negated immediate.
void MacroAssembler::decrement(Register reg, int imm) {
	increment(reg, -imm);
}


// call_VM with no register arguments: forwards to call_VM_helper with an
// argument count of 0.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
	call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

// call_VM with one register argument: arg_1 is placed in A1 (unless it is
// already there) and the call is forwarded to call_VM_helper with count 1.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  const bool arg_1_in_place = (arg_1 == A1);
  if (!arg_1_in_place) {
    move(A1, arg_1);
  }
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

// call_VM with two register arguments: arg_1 goes to A1 and arg_2 to A2
// (moves are skipped when a value is already in place), then the call is
// forwarded to call_VM_helper with count 2.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  if (arg_1 != A1) {
    move(A1, arg_1);
  }
  if (arg_2 != A2) {
    move(A2, arg_2);
  }
  // arg_2 must not have been living in A1, which was written above
  assert(arg_2 != A1, "smashed argument");

  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

// call_VM with three register arguments: arg_1..arg_3 are placed in A1..A3
// (moves are skipped when a value is already in place), then the call is
// forwarded to call_VM_helper with count 3.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  if (arg_1 != A1) {
    move(A1, arg_1);
  }

  if (arg_2 != A2) {
    move(A2, arg_2);
  }
  // arg_2 must not have been living in A1, which was written above
  assert(arg_2 != A1, "smashed argument");

  if (arg_3 != A3) {
    move(A3, arg_3);
  }
  // likewise arg_3 must not have been living in A1 or A2
  assert(arg_3 != A1 && arg_3 != A2, "smashed argument");

  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// call_VM variant with an explicit last_java_sp and an argument count.
// No thread register is supplied (NOREG is passed), so call_VM_base chooses one.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  const Register unspecified_thread = NOREG;
  call_VM_base(oop_result,
               unspecified_thread,
               last_java_sp,
               entry_point,
               number_of_arguments,
               check_exceptions);
}

// call_VM with an explicit last_java_sp and one register argument: arg_1 is
// placed in A1 (unless already there) before delegating to the counted
// call_VM overload with a count of 1.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  const bool arg_1_in_place = (arg_1 == A1);
  if (!arg_1_in_place) {
    move(A1, arg_1);
  }
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

// call_VM with an explicit last_java_sp and two register arguments: arg_1 is
// placed in A1 and arg_2 in A2 (moves are skipped when a value is already in
// place) before delegating to the counted call_VM overload with a count of 2.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
	if (arg_1 != A1) 
		move(A1, arg_1);
	if (arg_2 != A2) 
		move(A2, arg_2); 
	// A1 was written above, so arg_2 must not have been living there.
	// (Same check as the other multi-argument call_VM overloads.)
	assert(arg_2 != A1, "smashed argument");
	call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

// call_VM with an explicit last_java_sp and three register arguments:
// arg_1..arg_3 are placed in A1..A3 (moves are skipped when a value is already
// in place) before delegating to the counted call_VM overload with a count of 3.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  if (arg_1 != A1) {
    move(A1, arg_1);
  }

  if (arg_2 != A2) {
    move(A2, arg_2);
  }
  // arg_2 must not have been living in A1, which was written above
  assert(arg_2 != A1,                "smashed argument");

  if (arg_3 != A3) {
    move(A3, arg_3);
  }
  // likewise arg_3 must not have been living in A1 or A2
  assert(arg_3 != A1 && arg_3 != A2, "smashed argument");

  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

// Common emitter behind the call_VM family.
//
//   oop_result          - if valid, receives the thread's vm_result after the
//                         call (the field is cleared afterwards)
//   java_thread         - register holding the current JavaThread; if invalid,
//                         T2 is used and loaded via get_thread() (or TREG when
//                         OPT_THREAD is defined)
//   last_java_sp        - register to record as last Java sp; SP if invalid
//   entry_point         - address of the runtime routine to call
//   number_of_arguments - number of arguments, must be in [0, 4]
//   check_exceptions    - if true, emit a pending-exception check that jumps to
//                         StubRoutines::forward_exception_entry()
//
// Emitted sequence: record the last Java frame, pass the thread in A0, call,
// re-establish the thread register, reload SP from the thread's last_Java_sp,
// clear the last Java frame, then the optional exception check and result fetch.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {

	address before_call_pc;
	// determine java_thread register
	if (!java_thread->is_valid()) {
#ifndef OPT_THREAD
		java_thread = T2;
		get_thread(java_thread);
#else
		java_thread = TREG;
#endif
	}
	// determine last_java_sp register
	if (!last_java_sp->is_valid()) {
		last_java_sp = SP;
	}
	// debugging support
	assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
	assert(number_of_arguments <= 4   , "cannot have more than 4 arguments");
	assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
	assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
	
	assert(last_java_sp != FP, "this code doesn't work for last_java_sp == fp, which currently can't portably work anyway since C2 doesn't save ebp");
	
	// set last Java frame before call; the current code position is recorded
	// as the last Java pc and reused below for the exception path
	before_call_pc = (address)pc();
	set_last_Java_frame(java_thread, last_java_sp, FP, before_call_pc);
	
	// do the call, passing the thread as the first argument
	move(A0, java_thread);
	call(entry_point, relocInfo::runtime_call_type);
	delayed()->nop();
	
	// restore the thread (cannot use the pushed argument since arguments
	// may be overwritten by C code generated by an optimizing compiler);
	// however can use the register value directly if it is callee saved.
#ifndef OPT_THREAD
	if (java_thread >=S0 && java_thread <=S7) {
#ifdef ASSERT
		// debug builds verify that the S-register still holds the thread
		{ Label L;
			get_thread(AT);
			beq(java_thread, AT, L);
			delayed()->nop();
			stop("MacroAssembler::call_VM_base: java_thread not callee saved?");
			bind(L);
		}
#endif
	} else {
		get_thread(java_thread);
	}
#endif

	// discard thread and arguments: SP is reloaded from the value recorded
	// in the thread before the call
	lw(SP, java_thread, in_bytes(JavaThread::last_Java_sp_offset())); 
	// reset last Java frame (fp is left alone, pc is cleared)
	reset_last_Java_frame(java_thread, false, true);

	check_and_handle_popframe(java_thread);
        check_and_handle_earlyret(java_thread);
	if (check_exceptions) {
		// check for pending exceptions (java_thread is set upon return)
		Label L;
		lw(AT, java_thread, in_bytes(Thread::pending_exception_offset()));
		beq(AT, ZERO, L);
		delayed()->nop();
		// an exception is pending: push the recorded call pc and forward
		move(AT, (int)before_call_pc);
		push(AT);
		jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
		delayed()->nop();
		bind(L);
	}

	// get oop result if there is one and reset the value in the thread
	if (oop_result->is_valid()) {
		lw(oop_result, java_thread, in_bytes(JavaThread::vm_result_offset()));
		sw(ZERO, java_thread, in_bytes(JavaThread::vm_result_offset()));
		verify_oop(oop_result);
	}
}

// Shared front end for the call_VM overloads that do not take a last_java_sp.
// Saves the incoming SP in V0 (passed on as last_java_sp), drops SP by one word
// per argument plus one word for the thread, masks SP with 0xfffffff8, and
// then hands over to call_VM_base with no thread register specified.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  // one slot per argument, plus one for java_thread (negative: stack grows down)
  const int stack_delta = (1 + number_of_arguments) * (- wordSize);

  move(V0, SP);
  addi(SP, SP, stack_delta);

  move(AT, 0xfffffff8);
  andr(SP, SP, AT);

  call_VM_base(oop_result, NOREG, V0, entry_point, number_of_arguments, check_exceptions);
}

// Leaf call with an argument count only: forwards directly to call_VM_leaf_base.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
	call_VM_leaf_base(entry_point, number_of_arguments);
}

// Leaf call with one register argument: arg_0 is placed in A0 (unless it is
// already there) and the counted call_VM_leaf overload is used with count 1.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  const bool arg_0_in_place = (arg_0 == A0);
  if (!arg_0_in_place) {
    move(A0, arg_0);
  }
  call_VM_leaf(entry_point, 1);
}

// Leaf call with two register arguments: arg_0 is placed in A0 and arg_1 in A1
// (moves are skipped when a value is already in place), then the counted
// call_VM_leaf overload is used with count 2.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
	if (arg_0 != A0) 
		move(A0, arg_0);
	if (arg_1 != A1) 
		move(A1, arg_1); 
	// A0 was written above, so arg_1 must not have been living in A0.
	// (arg_1 being A1 is the normal, already-in-place case and must be allowed.)
	assert(arg_1 != A0, "smashed argument");
	call_VM_leaf(entry_point, 2);
}

// Leaf call with three register arguments: arg_0..arg_2 are placed in A0..A2
// (moves are skipped when a value is already in place), then the counted
// call_VM_leaf overload is used with count 3.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
	if (arg_0 != A0) 
			move(A0, arg_0);
	if (arg_1 != A1) 
			move(A1, arg_1); 
	// A0 was written above, so arg_1 must not have been living in A0.
	assert(arg_1 != A0, "smashed argument");
	if (arg_2 != A2) 
			move(A2, arg_2); 
	// A0 and A1 were written above, so arg_2 must not have been living in either.
	// (arg_2 being A2 is the normal, already-in-place case and must be allowed.)
	assert(arg_2 != A0 && arg_2 != A1, "smashed argument");
	call_VM_leaf(entry_point, 3);
}

// Currently a no-op: the body is empty, so no code is emitted and
// java_thread is not used.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}

// Currently a no-op: the body is empty, so no code is emitted and
// java_thread is not used.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}

// Emits an explicit null check for reg when needs_explicit_null_check(offset)
// says one is required: a load from M[reg] into AT followed by three nops.
// Otherwise nothing is emitted and the later access of M[reg + offset] is
// relied upon to provoke the OS NULL exception.
void MacroAssembler::null_check(Register reg, int offset) {
  if (!needs_explicit_null_check(offset)) {
    // implicit check is sufficient, emit nothing
    return;
  }

  // provoke the OS NULL exception if reg == NULL by touching M[reg];
  // only AT is overwritten
  lw(AT, reg, 0);
  for (int pad = 0; pad < 3; pad++) {
    nop();
  }
}

// Emits a frame prologue: push2(RA, FP) followed by FP = SP.
// leave() is the matching epilogue.
void MacroAssembler::enter() {
  push2(RA, FP);
  move(FP, SP);
}
 
// Emits a frame epilogue: SP is set two words above FP, then RA and FP are
// reloaded from the two words just below the new SP.
void MacroAssembler::leave() {
  const int frame_bytes = 2 * wordSize;
  const int ra_slot     = - 1 * wordSize;
  const int fp_slot     = - 2 * wordSize;

  addi(SP, FP, frame_bytes);
  lw(RA, SP, ra_slot);
  lw(FP, SP, fp_slot);
}
/*
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
*/
// Emits code that clears the thread's last-Java-frame record.
// last_Java_sp is always zeroed; last_Java_fp and last_Java_pc are zeroed only
// when clear_fp / clear_pc are set. If java_thread is not a valid register, T1
// is used and loaded via get_thread() (or TREG when OPT_THREAD is defined).
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // pick the thread register
  if (!java_thread->is_valid()) {
#ifndef OPT_THREAD
    java_thread = T1;
    get_thread(java_thread);
#else
    java_thread = TREG;
#endif
  }

  const int sp_slot = in_bytes(JavaThread::last_Java_sp_offset());
  const int fp_slot = in_bytes(JavaThread::last_Java_fp_offset());
  const int pc_slot = in_bytes(JavaThread::last_Java_pc_offset());

  // a zero sp marks the frame as cleared
  sw(ZERO, java_thread, sp_slot);

  // fp is cleared on request so that compiled frames are not confused;
  // possibly only needed for debugging
  if (clear_fp) {
    sw(ZERO, java_thread, fp_slot);
  }

  if (clear_pc) {
    sw(ZERO, java_thread, pc_slot);
  }
}

// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
// Emits a store into the memory serialization page. The offset within the page
// is derived from the thread pointer: it is shifted right by
// os::get_serialize_page_shift_count() and masked with
// (os::vm_page_size() - sizeof(int)). tmp and AT are overwritten.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  const int page_base = (int)os::get_memory_serialize_page();

  // tmp = (thread >> shift) & mask
  move(tmp, thread);
  srl(tmp, tmp, os::get_serialize_page_shift_count());
  move(AT, (os::vm_page_size() - sizeof(int)));
  andr(tmp, tmp, AT);

  // write tmp at page_base + tmp
  sw(tmp, Address(tmp, page_base));
}

// Calls to C land
//
// When entering C land, the fp and sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Emits code that records the last Java frame in the JavaThread.
//   java_thread  - thread register; if invalid, T2 is used and loaded via
//                  get_thread() (or TREG when OPT_THREAD is defined)
//   last_java_sp - register stored as last_Java_sp; SP if invalid
//   last_java_fp - stored as last_Java_fp only if it is a valid register
//   last_java_pc - stored as last_Java_pc only if non-NULL; the constant is
//                  built in AT with lui/addiu under an internal_pc relocation
// The sp is stored last.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // pick the thread register
  if (!java_thread->is_valid()) {
#ifndef OPT_THREAD
    java_thread = T2;
    get_thread(java_thread);
#else
    java_thread = TREG;
#endif
  }

  // default the sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = SP;
  }

  const int fp_slot = in_bytes(JavaThread::last_Java_fp_offset());
  const int pc_slot = in_bytes(JavaThread::last_Java_pc_offset());
  const int sp_slot = in_bytes(JavaThread::last_Java_sp_offset());

  // optional: frame pointer
  if (last_java_fp->is_valid()) {
    sw(last_java_fp, java_thread, fp_slot);
  }

  // optional: pc
  if (last_java_pc != NULL) {
    const int pc_value = (int)last_java_pc;
    relocate(relocInfo::internal_pc_type);
    lui(AT, split_high(pc_value));
    addiu(AT, AT, split_low(pc_value));
    sw(AT, java_thread, pc_slot);
  }

  sw(last_java_sp, java_thread, sp_slot);
}
//////////////////////////////////////////////////////////////////////////////////
#ifndef SERIALGC
/*
void MacroAssembler::g1_write_barrier_pre(Register obj,
#ifndef _LP64
                                          Register thread,
#endif
                                          Register tmp,
                                          Register tmp2,
                                          bool tosca_live) {
  LP64_ONLY(Register thread = r15_thread;)
  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));

  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  Label done;
  Label runtime;

  // if (!marking_in_progress) goto done;
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // if (x.f == NULL) goto done;
  cmpptr(Address(obj, 0), NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?

  LP64_ONLY(movslq(tmp, index);)
  movptr(tmp2, Address(obj, 0));
#ifdef _LP64
  cmpq(tmp, 0);
#else
  cmpl(index, 0);
#endif
  jcc(Assembler::equal, runtime);
#ifdef _LP64
  subq(tmp, wordSize);
  movl(index, tmp);
  addq(tmp, buffer);
#else
  subl(index, wordSize);
  movl(tmp, buffer);
  addl(tmp, index);
#endif
  movptr(Address(tmp, 0), tmp2);
  jmp(done);
  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);
  push(obj);
#ifdef _LP64
  movq(c_rarg0, Address(obj, 0));
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), c_rarg0, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), tmp2, thread);
  pop(thread);
#endif
  pop(obj);
  if(tosca_live) pop(rax);
  bind(done);

}

void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
#ifndef _LP64
                                           Register thread,
#endif
                                           Register tmp,
                                           Register tmp2) {

  LP64_ONLY(Register thread = r15_thread;)
  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));
  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  Label done;
  Label runtime;

  // Does store cross heap regions?

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
#ifdef _LP64
  const Register card_addr = tmp;

  movq(card_addr, store_addr);
  shrq(card_addr, CardTableModRefBS::card_shift);

  lea(tmp2, cardtable);

  // get the address of the card
  addq(card_addr, tmp2);
#else
  const Register card_index = tmp;

  movl(card_index, store_addr);
  shrl(card_index, CardTableModRefBS::card_shift);

  Address index(noreg, card_index, Address::times_1);
  const Register card_addr = tmp;
  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
#endif
  cmpb(Address(card_addr, 0), 0);
  jcc(Assembler::equal, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  movb(Address(card_addr, 0), 0);

  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_index);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);

}
*/
#endif // SERIALGC
//////////////////////////////////////////////////////////////////////////////////


// Emits the complete store check for the oop in obj by running
// store_check_part_1 followed by store_check_part_2.
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  store_check_part_1(obj);
  store_check_part_2(obj);
}

// Overload taking a destination address. dst is not used here; the call is
// forwarded to store_check(obj).
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}


// Split the store check operation so that other instructions can be scheduled in between.
// First half of the store check: shifts obj right by
// CardTableModRefBS::card_shift, in place. Asserts that the heap's barrier set
// is of kind CardTableModRef.
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* barrier = Universe::heap()->barrier_set();
  assert(barrier->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");

  shr(obj, CardTableModRefBS::card_shift);
}

// Second half of the store check: obj is expected to hold the value produced
// by store_check_part_1. Emits AT = byte_map_base + obj and stores a zero byte
// at that address. AT is overwritten.
//
// byte_map_base is embedded as a plain 32-bit constant; per the original note,
// byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift), so adding
// the shifted address yields the card's location directly.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* barrier = Universe::heap()->barrier_set();
  assert(barrier->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");

  CardTableModRefBS* card_table = (CardTableModRefBS*)barrier;
  assert(sizeof(*card_table->byte_map_base) == sizeof(jbyte), "adjust this code");

  const int table_base = (int)card_table->byte_map_base;
  move(AT, table_base);
  add(AT, AT, obj);
  sb(ZERO, AT, 0);
}
/*
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}

void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}

void MacroAssembler::test32(Register src1, AddressLiteral src2) {
  // src2 must be rval

  if (reachable(src2)) {
    testl(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    testl(src1, Address(rscratch1, 0));
  }
}

// C++ bool manipulation
void MacroAssembler::testbool(Register dst) {
  if(sizeof(bool) == 1)
    testb(dst, 0xff);
  else if(sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if(sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}


*/

// Emits a TLAB bump-pointer allocation.
// Defines obj (the old tlab top) and leaves var_size_in_bytes untouched.
// obj, var_size_in_bytes, t1, t2 and AT must all be different registers
// (asserted below); t1, t2 and AT are overwritten.
// The size is var_size_in_bytes, or the constant con_size_in_bytes when
// var_size_in_bytes is NOREG (the constant must fit a signed 16-bit immediate).
// Branches to slow_case when the tlab end is below the new top.
void MacroAssembler::tlab_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
                                   Register t1, Register t2, Label& slow_case) {
  assert_different_registers(obj, var_size_in_bytes, t1, t2, AT);

  const Register thread  = t1;
  const Register new_top = t2;
  const int top_slot = in_bytes(JavaThread::tlab_top_offset());
  const int end_slot = in_bytes(JavaThread::tlab_end_offset());

  verify_tlab(t1, t2);    // overwrites t1 and t2

  // obj = current top
  get_thread(thread);
  lw(obj, thread, top_slot);

  // new_top = obj + size
  if (var_size_in_bytes != NOREG) {
    add(new_top, obj, var_size_in_bytes);
  } else {
    assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
    addi(new_top, obj, con_size_in_bytes);
  }

  // slow path if tlab_end < new_top (unsigned)
  lw(AT, thread, end_slot);
  sltu(AT, AT, new_top);
  bne(AT, ZERO, slow_case);
  delayed()->nop();

  // commit the new top
  sw(new_top, thread, top_slot);

  verify_tlab(t1, t2);
}

// Defines obj, preserves var_size_in_bytes
// Emits an allocation from the shared eden using a compare-and-exchange retry loop.
//   obj               - receives the address of the allocated object
//   var_size_in_bytes - size register, or NOREG to use con_size_in_bytes
//   con_size_in_bytes - constant size; must fit a signed 16-bit immediate
//   t1                - holds the high half of the heap-top address
//   t2                - holds the prospective new top ("end")
//   slow_case         - taken when inline allocation is not possible
// When CMSIncrementalMode is set or the heap does not support inline contiguous
// allocation, only an unconditional branch to slow_case is emitted.
// NOTE(review): t2 is written as "end" but is not part of the
// assert_different_registers list below — confirm callers never alias it with
// obj or var_size_in_bytes.
void MacroAssembler::eden_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes,
		Register t1, Register t2, Label& slow_case) {
  assert_different_registers(obj, var_size_in_bytes, t1, AT);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
    // No allocation in the shared eden.
    b(slow_case);
    delayed()->nop();
  } else {

    // heap_top addresses the heap's top pointer as t1 (high half) + low-half displacement
    Address heap_top(t1, Assembler::split_low((int)Universe::heap()->top_addr()));
    lui(t1, split_high((int)Universe::heap()->top_addr()));
    // obj = current heap top (loaded once, before the retry label)
    lw(obj, heap_top);

    Register end = t2;
    Label retry;
    
    bind(retry);
    // end = obj + size
    if (var_size_in_bytes == NOREG) {
    	// The constant is used directly as an immediate rather than being moved
    	// to a register first (by yjl 8/17/2005).
      assert(is_simm16(con_size_in_bytes), "fixme by moving imm to a register first");
      addi(end, obj, con_size_in_bytes);
    } else {
      add(end, obj, var_size_in_bytes);
    }
    // if end < obj then we wrapped around => object too long => slow case
    // NOTE(review): if add/addi map to the overflow-trapping MIPS forms, a
    // signed overflow would trap before this check — confirm.
    sltu(AT, end, obj);
    bne(AT, ZERO, slow_case);
    delayed()->nop();
    
    // slow case if the heap end is below the prospective new top (unsigned)
    lui(AT, split_high((int)Universe::heap()->end_addr()));
    lw(AT, AT, split_low((int)Universe::heap()->end_addr()));
    sltu(AT, AT, end);
    bne(AT, ZERO, slow_case);
    delayed()->nop();
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer.
    // The MP-specific step is currently disabled (the lock() call is commented out).
    if (os::is_MP()) {
    	///lock();
    }
    
    // if someone beat us on the allocation, try again, otherwise continue
    // (presumably cmpxchg leaves its success indication in AT, zero meaning
    // failure — TODO confirm against cmpxchg's implementation)
    cmpxchg(end, heap_top, obj);
    beq(AT, ZERO, retry);    //by yyq
    delayed()->nop();

  }
}

// Emits the TLAB refill slow path.
//   retry     - branched to after a new tlab has been installed
//   try_eden  - branched to when the current tlab is retained
//   slow_case - branched to when inline allocation is not possible
// Uses T0 (top), T1, T5, T6, T3 (thread) and AT as scratch registers.
// Outline: compute the free space in the current tlab; if it exceeds the
// refill waste limit, bump the limit and go to try_eden; otherwise fill the
// remainder of the old tlab with an int array header, allocate a new tlab from
// eden via eden_allocate, install start/top/end in the thread and go to retry.
void MacroAssembler::tlab_refill(Label& retry, Label& try_eden, Label& slow_case) {
	Register top = T0;
	Register t1  = T1;
	Register t2  = T5;
	Register t3  = T6;
	Register thread_reg = T3;
	Label do_refill, discard_tlab;
	if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) { //by yyq
		// No allocation in the shared eden.
		// (code emission continues below; there is no early return here)
		b(slow_case);
		delayed()->nop();
	}

	get_thread(thread_reg);

	lw(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
	lw(t1, thread_reg, in_bytes(JavaThread::tlab_end_offset()));

	// calculate amount of free space (end - top, then shifted by LogHeapWordSize)
	sub(t1, t1, top);
	shr(t1, LogHeapWordSize);

	// Retain tlab and allocate object in shared space if
	// the amount free in the tlab is too large to discard.
	// (discard when !(waste_limit < free), i.e. free <= waste_limit)
	lw(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
	slt(AT, t2, t1);
	beq(AT, ZERO, discard_tlab);
	delayed()->nop();

	// Retain: raise the waste limit by refill_waste_limit_increment()
	
	move(AT, ThreadLocalAllocBuffer::refill_waste_limit_increment());
	add(t2, t2, AT);
	sw(t2, thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()));
	
	if (TLABStats) {
		// increment number of slow_allocations
		lw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
		addiu(AT, AT, 1);
		sw(AT, thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset()));
	}
	b(try_eden);
	delayed()->nop();

  bind(discard_tlab);
	if (TLABStats) {
		// increment number of refills
		lw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
		addi(AT, AT, 1);
		sw(AT, thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset()));
		// accumulate wastage -- t1 is amount free in tlab
		lw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
		add(AT, AT, t1);
		sw(AT, thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset()));
	}

	// if tlab is currently allocated (top or end != null) then
	// fill [top, end + alignment_reserve) with array object
	beq(top, ZERO, do_refill);
	delayed()->nop();

	// set up the mark word
	move(AT, (int)markOopDesc::prototype()->copy_set_hash(0x2));
	sw(AT, top, oopDesc::mark_offset_in_bytes());

	// set the length to the remaining space
	// (free words minus the int-array header, plus the alignment reserve,
	// then scaled by HeapWordSize/sizeof(jint))
	addi(t1, t1, - typeArrayOopDesc::header_size(T_INT));
	addi(t1, t1, ThreadLocalAllocBuffer::alignment_reserve());
	shl(t1, log2_intptr(HeapWordSize/sizeof(jint)));
	sw(t1, top, arrayOopDesc::length_offset_in_bytes());

	// set klass to intArrayKlass
	lui(AT, split_high((int)Universe::intArrayKlassObj_addr()));
	lw(t1, AT, split_low((int)Universe::intArrayKlassObj_addr())); 
	sw(t1, top, oopDesc::klass_offset_in_bytes());

	// refill the tlab with an eden allocation
	bind(do_refill);
	// t1 = tlab size shifted left by LogHeapWordSize
	lw(t1, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
	shl(t1, LogHeapWordSize);
	// add object_size ??
	eden_allocate(top, t1, 0, t2, t3, slow_case);

	// Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
	if (UseTLAB) {
		Label ok;
		assert_different_registers(thread_reg, t1);
		lw(AT, thread_reg, in_bytes(JavaThread::tlab_size_offset()));
		shl(AT, LogHeapWordSize);
		beq(AT, t1, ok);
		delayed()->nop();
		stop("assert(t1 != tlab size)");
		should_not_reach_here();

		bind(ok);
	}
#endif
	// install the new tlab: start = top = allocated address,
	// end = start + size - alignment_reserve_in_bytes()
	sw(top, thread_reg, in_bytes(JavaThread::tlab_start_offset()));
	sw(top, thread_reg, in_bytes(JavaThread::tlab_top_offset()));
	add(top, top, t1);	
	addi(top, top, - ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
	sw(top, thread_reg, in_bytes(JavaThread::tlab_end_offset()));
	verify_tlab(t1, t2);
	b(retry);
	delayed()->nop();
}

// pi / 4 as a double-precision constant.
static const double     pi_4 =  0.7853981633974483;

// The x86 version is too clumsy; I don't think we need that fuss. Maybe I'm wrong, FIXME.
// The argument (a double) must be in F12/F13.
//void MacroAssembler::trigfunc(char trig, bool preserve_cpu_regs, int num_fpu_regs_in_use) {
// We need to preserve the registers which may be modified during the call @Jerome
// Emits a call to the runtime sine/cosine/tangent routine.
//   trig                - 's' (SharedRuntime::dsin), 'c' (SharedRuntime::dcos)
//                         or 't' (SharedRuntime::dtan); anything else asserts
//   num_fpu_regs_in_use - not used by this implementation
// The call is bracketed by pushad()/popad(), and two words of stack are
// reserved around it.
// FIXME: per the disassembly of the callee only V0, V1, T9, SP and RA are
// touched, so saving just V0, V1 and T9 might be enough.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
	// save all registers that may be modified by the call
	pushad();
	// reserve stack space before the call
	addi(SP, SP, -wordSize * 2);
	switch (trig) {
		case 's':
			call( CAST_FROM_FN_PTR(address, SharedRuntime::dsin), relocInfo::runtime_call_type );
			delayed()->nop();
			break;
		case 'c':
			call( CAST_FROM_FN_PTR(address, SharedRuntime::dcos), relocInfo::runtime_call_type );
			delayed()->nop();
			break;
		case 't':
			call( CAST_FROM_FN_PTR(address, SharedRuntime::dtan), relocInfo::runtime_call_type );
			delayed()->nop();
			break;
		default:
			// terminated with ';' so the statement is well-formed however
			// the assert macro expands
			assert(false, "bad intrinsic");
			break;
	}

	// release the reserved stack space and restore the registers
	addi(SP, SP, wordSize * 2);
	popad();
}
/*

void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
  ucomisd(dst, as_Address(src));
}

void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
  ucomiss(dst, as_Address(src));
}

void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    xorpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    xorpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    xorps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    xorps(dst, Address(rscratch1, 0));
  }
}
*/

// Emits code that loads the 32-bit constant imm into reg.
// A constant that fits a signed 16-bit immediate takes a single addiu from
// ZERO; otherwise lui loads the high part and an addiu adds the low part,
// which is omitted when the low part is zero.
void MacroAssembler::move(Register reg, int imm) {
  if (!is_simm16(imm)) {
    lui(reg, split_high(imm));
    if (split_low(imm) != 0) {
      addiu(reg, reg, split_low(imm));
    }
    return;
  }

  addiu(reg, ZERO, imm);
}

// NOTE: i dont push eax as i486.
// the x86 save eax for it use eax as the jump register
// Emits a call to the verify_oop subroutine for the oop in reg.
// Nothing is emitted unless VerifyOops is set.
//   reg - register holding the oop to verify
//   s   - description included in the error message
// T5, T6, RA, A0, A1, AT and T9 are saved in the seven words below SP before
// the call and reloaded afterwards. The subroutine receives the message in A0
// and the oop in A1; it is called indirectly through
// StubRoutines::verify_oop_subroutine_entry_address().
// The message buffer is allocated with new[] and not freed here, since the
// emitted code refers to it by address.
void MacroAssembler::verify_oop(Register reg, const char* s) {
	if (!VerifyOops) return;

	// Build the error message naming the register.
	char* b = new char[strlen(s) + 50];
	sprintf(b, "verify_oop: %s: %s", reg->name(), s);

	// save the registers used for the call below the stack pointer
	sw(T5, SP, - wordSize);
	sw(T6, SP, - 2*wordSize);
	sw(RA, SP, - 3*wordSize);
	sw(A0, SP ,- 4*wordSize);	
	sw(A1, SP ,- 5*wordSize);	
	sw(AT, SP ,- 6*wordSize);	
	sw(T9, SP ,- 7*wordSize);	
	addiu(SP, SP, - 7 * wordSize);

	// Copy the oop into A1 BEFORE A0 is overwritten with the message pointer;
	// otherwise verify_oop(A0) would hand the message address to the verifier.
	move(A1, reg);
	move(A0, (int)b);

	// call indirectly to solve generation ordering problem
	move(AT, (int)StubRoutines::verify_oop_subroutine_entry_address());        	
	lw(T9, AT, 0);
	jalr(T9);
	delayed()->nop();

	// reload the saved registers (offsets are now relative to the lowered SP)
	lw(T5, SP, 6* wordSize);
	lw(T6, SP, 5* wordSize);
	lw(RA, SP, 4* wordSize);
	lw(A0, SP, 3* wordSize);
	lw(A1, SP, 2* wordSize);
	lw(AT, SP, 1* wordSize);
	lw(T9, SP, 0* wordSize);
	addiu(SP, SP, 7 * wordSize);
}


// Emits a call to the verify_oop subroutine for the oop stored at addr.
// When VerifyOops is off a single nop is emitted instead.
//   addr - memory location of the oop to verify
//   s    - description included in the error message
// T5, T6, RA, A0, A1, AT and T9 are saved in the seven words below SP before
// the call and reloaded afterwards. The oop is loaded into A1 before SP is
// changed because addr may be SP-relative. The message buffer is allocated
// with new[] and not freed here.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) {
    nop();
    return;
  }

  char* message = new char[strlen(s) + 50];
  sprintf(message, "verify_oop_addr: %s",  s);

  // registers preserved around the call, in save order
  const int saved_count = 7;
  Register saved[saved_count] = { T5, T6, RA, A0, A1, AT, T9 };

  // store each one in the next free word below SP
  for (int i = 0; i < saved_count; i++) {
    sw(saved[i], SP, -(i + 1) * wordSize);
  }
  lw(A1, addr);   // must happen before SP moves
  addiu(SP, SP, - saved_count * wordSize);

  move(A0, (int)message);
  // indirect call, because the subroutine may be generated later
  move(AT, (int)StubRoutines::verify_oop_subroutine_entry_address());
  lw(T9, AT, 0);
  jalr(T9);
  delayed()->nop();

  // reload in the same order; slots are now at positive offsets from SP
  for (int i = 0; i < saved_count; i++) {
    lw(saved[i], SP, (saved_count - 1 - i) * wordSize);
  }
  addiu(SP, SP, saved_count * wordSize);
}

// Emits the body of the oop verification routine.
//
// Inputs expected in registers:
//   RA: return address
//   A0: char* error message
//   A1: oop   object to verify
//
// Registers written by the emitted code: T5, T6 and AT (verify_oop_addr
// spills all three before calling this routine).
//
// The emitted code bumps the verify_oop counter, accepts a NULL oop, and
// otherwise checks the oop, its klass and the klass' klass against the
// Universe verify masks/bits. On a failed check it saves registers with
// pushad(), calls MacroAssembler::debug, restores and returns.
void MacroAssembler::verify_oop_subroutine() {
	Label exit, error;

	// increment counter
	// addiu, not addi: addi raises an integer-overflow exception when the
	// counter wraps, addiu simply wraps.
	move(T5, (int)StubRoutines::verify_oop_count_addr());
	lw(AT, T5, 0);
	addiu(AT, AT, 1);
	sw(AT, T5, 0);

	// make sure object is 'reasonable'
	beq(A1, ZERO, exit);         // if obj is NULL it is ok
	delayed()->nop();

	// Check if the oop is in the right area of memory
	const int oop_mask = Universe::verify_oop_mask();
	const int oop_bits = Universe::verify_oop_bits();
	move(AT, oop_mask);
	andr(T5, A1, AT);
	move(AT, oop_bits);
	bne(T5, AT, error);
	delayed()->nop();

	// make sure klass is 'reasonable'
	lw(T5, A1, oopDesc::klass_offset_in_bytes()); // get klass
	beq(T5, ZERO, error);                        // if klass is NULL it is broken
	delayed()->nop();
	// Check if the klass is in the right area of memory
	const int klass_mask = Universe::verify_klass_mask();
	const int klass_bits = Universe::verify_klass_bits();

	move(AT, klass_mask);
	andr(T6, T5, AT);
	move(AT, klass_bits);
	bne(T6, AT, error);
	delayed()->nop();
	// make sure klass' klass is 'reasonable'
	lw(T5, T5, oopDesc::klass_offset_in_bytes()); // get klass' klass
	beq(T5, ZERO, error);  // if klass' klass is NULL it is broken
	delayed()->nop();

	move(AT, klass_mask);
	andr(T6, T5, AT);
	move(AT, klass_bits);
	bne(T6, AT, error);
	delayed()->nop();     // if klass not in right area of memory it is broken too.

	// return if everything seems ok
	bind(exit);

	jr(RA);
	delayed()->nop();

	// handle errors
	bind(error);
	pushad();
	// Reserve one extra word around the call. addiu matches the restore
	// below and cannot trap on overflow.
	addiu(SP, SP, (-1) * wordSize);
	call(CAST_FROM_FN_PTR(address, MacroAssembler::debug), relocInfo::runtime_call_type);
	delayed()->nop();
	addiu(SP, SP, 1 * wordSize);
	popad();
	jr(RA);
	delayed()->nop();
}

// Debug-build (ASSERT) check of the current thread's TLAB pointers.
//
// When both UseTLAB and VerifyOops are set, emits code that stops with
// "assert(top >= start)" or "assert(top <= end)" if the thread's tlab top lies
// below tlab start or above tlab end (unsigned comparisons).
//
// t1 and t2 are scratch registers and must differ from each other and from AT;
// the emitted code overwrites t1, t2 and AT. Nothing is emitted otherwise.
void MacroAssembler::verify_tlab(Register t1, Register t2) {
#ifdef ASSERT
	assert_different_registers(t1, t2, AT);
	if (!(UseTLAB && VerifyOops)) {
		return;
	}

	Label start_checked, end_checked;

	get_thread(t1);

	// top >= start ?
	lw(t2, t1, in_bytes(JavaThread::tlab_top_offset()));
	lw(AT, t1, in_bytes(JavaThread::tlab_start_offset()));
	sltu(AT, t2, AT);                 // AT = (top < start)
	beq(AT, ZERO, start_checked);
	delayed()->nop();

	stop("assert(top >= start)");

	// top <= end ?
	bind(start_checked);
	lw(AT, t1, in_bytes(JavaThread::tlab_end_offset()));
	sltu(AT, AT, t2);                 // AT = (end < top)
	beq(AT, ZERO, end_checked);
	delayed()->nop();

	stop("assert(top <= end)");

	bind(end_checked);
#endif
}

// Emits code that swaps the two low bytes of reg and sign-extends the swapped
// halfword (the arithmetic shift replicates the new bit 15 upwards).
// Clobbers AT.
// Bits 16 and above of the incoming reg are not masked off (the andi below is
// disabled), so they are shifted into AT and OR-ed into the result.
void MacroAssembler::hswap(Register reg) {
	//andi(reg, reg, 0xffff);
	// AT = reg >> 8: original byte 1 moves down to byte 0
	srl(AT, reg, 8);
	// move original byte 0 to the top byte ...
	sll(reg, reg, 24);
	// ... then bring it down to byte 1 with sign extension
	sra(reg, reg, 16);
	// combine both halves
	orr(reg, reg, AT);
}

// Emits code that swaps the two low bytes of reg; the swapped halfword is
// zero-extended (logical shift right). Clobbers AT.
// Bits 16 and above of the incoming reg are not masked off (the andi below is
// disabled), so they are shifted into AT and OR-ed into the result.
void MacroAssembler::huswap(Register reg) {
	//andi(reg, reg, 0xffff);
	// AT = reg >> 8: original byte 1 moves down to byte 0
	srl(AT, reg, 8);
	// move original byte 0 to the top byte ...
	sll(reg, reg, 24);
	// ... then bring it down to byte 1, filling the upper half with zeros
	srl(reg, reg, 16);
	// combine both halves
	orr(reg, reg, AT);
}

// Reverses the four bytes of reg using only one extra register, AT.
// by yjl 6/29/2005
//
// In the comments below the bytes of the incoming reg are numbered 1 2 3 4
// from most to least significant. AT is clobbered.
void MacroAssembler::swap(Register reg) {
	// AT = 0 1 2 3
	srl(AT, reg, 8);
	// reg = 4 0 0 0
	sll(reg, reg, 24);
	orr(reg, reg, AT);
	//reg : 4 1 2 3
	// AT = 0 0 0 1
	srl(AT, AT, 16);
	xorr(AT, AT, reg);
	andi(AT, AT, 0xff);
	//AT : 0 0 0 1^3
	xorr(reg, reg, AT);
	//reg : 4 1 2 1
	// AT = 0 1^3 0 0
	sll(AT, AT, 16);
	xorr(reg, reg, AT);
	//reg : 4 3 2 1
}

// Emits a word-sized compare-and-exchange loop built from ll/sc.
//
//   x_reg : value to store
//   dest  : memory location
//   c_reg : expected value; overwritten with the loaded value when the
//           comparison fails
//
// AT is clobbered. On the path where the store succeeds AT is non-zero (the
// loop only exits when sc did not leave zero in AT); on the compare-failed
// path AT is set to zero.
void MacroAssembler::cmpxchg(Register x_reg, Address dest, Register c_reg) {
	Label retry, mismatch, finished;

	bind(retry);
	ll(AT, dest);                    // load-linked current value
	bne(AT, c_reg, mismatch);        // not the expected value -> give up
	delayed()->nop();

	move(AT, x_reg);
	sc(AT, dest);                    // store-conditional, result left in AT
	beq(AT, ZERO, retry);            // store did not go through -> try again
	delayed()->nop();
	b(finished);
	delayed()->nop();

	// Not exchanged: report the value seen and clear AT.
	bind(mismatch);
	move(c_reg, AT);
	move(AT, ZERO);

	bind(finished);
}

// Emits a 64-bit compare-and-exchange loop built from lld/scd.
//
// The new value and the expected value each arrive as a Lo/Hi register pair.
// Each pair is first packed into its Lo register (Hi shifted left by 32, Lo
// zero-extended to 64 bits, then OR-ed), so all four input registers are
// modified.
//
// AT is clobbered: non-zero on the path where the store succeeded, zero when
// the comparison failed. On a failed comparison the full 64-bit value loaded
// from dest is written to c_regLo only; c_regHi keeps its shifted content.
void MacroAssembler::cmpxchg8(Register x_regLo, Register x_regHi, Address dest, Register c_regLo, Register c_regHi) {
	Label retry, mismatch, finished;

	// Pack the new value into x_regLo.
	Register new_val = x_regLo;
	dsll32(x_regHi, x_regHi, 0);
	dsll32(x_regLo, x_regLo, 0);
	dsrl32(x_regLo, x_regLo, 0);
	orr(new_val, x_regLo, x_regHi);

	// Pack the expected value into c_regLo.
	Register expected = c_regLo;
	dsll32(c_regHi, c_regHi, 0);
	dsll32(c_regLo, c_regLo, 0);
	dsrl32(c_regLo, c_regLo, 0);
	orr(expected, c_regLo, c_regHi);

	bind(retry);
	lld(AT, dest);                   // load-linked doubleword
	bne(AT, expected, mismatch);
	delayed()->nop();

	dadd(AT, new_val, ZERO);         // 64-bit copy of the new value into AT
	scd(AT, dest);                   // store-conditional doubleword
	beq(AT, ZERO, retry);            // store did not go through -> try again
	delayed()->nop();
	b(finished);
	delayed()->nop();

	// Not exchanged: hand back the value seen and clear AT (64-bit moves).
	bind(mismatch);
	dadd(expected, AT, ZERO);
	dadd(AT, ZERO, ZERO);
	bind(finished);
}

// Single-precision remainder: fd = fs - trunc(fs / ft) * ft.
// tmp, fs and ft must be different registers (asserted below); tmp is
// clobbered. fd is not part of the assert and is written only by the final
// instruction.
void MacroAssembler::rem_s(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {    
  assert_different_registers(tmp, fs, ft); 
	// tmp = fs / ft
	div_s(tmp, fs, ft); 
	// truncate the quotient to an integer ...
	trunc_l_s(tmp, tmp); 
	// ... and convert it back to single precision
	cvt_s_l(tmp, tmp); 
	// tmp = trunc(fs / ft) * ft
	mul_s(tmp, tmp, ft); 
	// fd = fs - tmp
	sub_s(fd, fs, tmp); 
}

// Double-precision remainder: fd = fs - trunc(fs / ft) * ft.
// tmp, fs and ft must be different registers (asserted below); tmp is
// clobbered. fd is not part of the assert and is written only by the final
// instruction.
void MacroAssembler::rem_d(FloatRegister fd, FloatRegister fs, FloatRegister ft, FloatRegister tmp) {    
	assert_different_registers(tmp, fs, ft); 
	// tmp = fs / ft
	div_d(tmp, fs, ft); 
	// truncate the quotient to an integer ...
	trunc_l_d(tmp, tmp); 
	// ... and convert it back to double precision
	cvt_d_l(tmp, tmp); 
	// tmp = trunc(fs / ft) * ft
	mul_d(tmp, tmp, ft); 
	// fd = fs - tmp
	sub_d(fd, fs, tmp); 
}

// Decoder for an FPU control word kept in the low 16 bits of _value.
// NOTE(review): field names and bit positions look like the x87 control word,
// presumably carried over from the x86 port - confirm before relying on it.
class ControlWord {
 public:
  int32_t _value;

  // Two-bit fields.
  int  rounding_control() const        { return field(10, 3); }
  int  precision_control() const       { return field(8, 3); }

  // Single-bit flags, bit 5 down to bit 0.
  bool precision() const               { return field(5, 1) != 0; }
  bool underflow() const               { return field(4, 1) != 0; }
  bool overflow() const                { return field(3, 1) != 0; }
  bool zero_divide() const             { return field(2, 1) != 0; }
  bool denormalized() const            { return field(1, 1) != 0; }
  bool invalid() const                 { return field(0, 1) != 0; }

  // Prints the low 16 bits in hex followed by the flag letters (upper case
  // when the bit is set) and the rounding / precision control names.
  void print() const {
    static const char* const rounding_names[4] = {
      "round near", "round down", "round up  ", "chop      "
    };
    static const char* const precision_names[4] = {
      "24 bits ", "reserved", "53 bits ", "64 bits "
    };
    const char* rc = rounding_names[rounding_control()];
    const char* pc = precision_names[precision_control()];

    const char flags[9] = {
      ' ',
      ' ',
      precision   () ? 'P' : 'p',
      underflow   () ? 'U' : 'u',
      overflow    () ? 'O' : 'o',
      zero_divide () ? 'Z' : 'z',
      denormalized() ? 'D' : 'd',
      invalid     () ? 'I' : 'i',
      '\x0'
    };

    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, flags, rc, pc);
  }

 private:
  // Extracts (_value >> shift) & mask.
  int field(int shift, int mask) const { return (_value >> shift) & mask; }
};

// Decoder for an FPU status word kept in the low 16 bits of _value.
// NOTE(review): field names and bit positions look like the x87 status word,
// presumably carried over from the x86 port - confirm before relying on it.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return bit(15); }
  bool C3() const                      { return bit(14); }
  bool C2() const                      { return bit(10); }
  bool C1() const                      { return bit(9); }
  bool C0() const                      { return bit(8); }
  // Three-bit field in bits 13..11.
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return bit(7); }
  bool stack_fault() const             { return bit(6); }
  bool precision() const               { return bit(5); }
  bool underflow() const               { return bit(4); }
  bool overflow() const                { return bit(3); }
  bool zero_divide() const             { return bit(2); }
  bool denormalized() const            { return bit(1); }
  bool invalid() const                 { return bit(0); }

  // Prints the low 16 bits in hex, then one letter per set flag ('-' when
  // clear), the condition codes and the top field.
  void print() const {
    const char cc[5] = {
      C3() ? '3' : '-',
      C2() ? '2' : '-',
      C1() ? '1' : '-',
      C0() ? '0' : '-',
      '\x0'
    };
    const char flags[9] = {
      error_status() ? 'E' : '-',
      stack_fault () ? 'S' : '-',
      precision   () ? 'P' : '-',
      underflow   () ? 'U' : '-',
      overflow    () ? 'O' : '-',
      zero_divide () ? 'Z' : '-',
      denormalized() ? 'D' : '-',
      invalid     () ? 'I' : '-',
      '\x0'
    };
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, flags, cc, top());
  }

 private:
  // True when bit n of _value is set.
  bool bit(int n) const { return ((_value >> n) & 1) != 0; }
};

// FPU tag word: a sequence of two-bit tags packed into _value, tag i
// occupying bits 2*i+1 .. 2*i.
class TagWord {
 public:
  int32_t _value;

  // Returns the two-bit tag with index i (0..3).
  int tag_at(int i) const {
    const int shift = i * 2;
    return (_value >> shift) & 3;
  }

  // Prints the low 16 bits of the word in hex.
  void print() const {
    const int low16 = _value & 0xFFFF;
    printf("%04x", low16);
  }

};

// One saved FPU register: two 32-bit mantissa words (_m0, _m1) and a 16-bit
// sign/exponent word (_ex).
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True only for the exact pattern _ex == -1, _m1 == 0xC0000000, _m0 == 0.
  bool is_indefinite() const {
    if (_ex != -1) return false;
    if (_m1 != (int32_t)0xC0000000) return false;
    return _m0 == 0;
  }

  // Prints sign, exponent word and both mantissa words in hex, followed by
  // "NaN" when the exponent word is 0x7FFF or all ones.
  void print() const {
    const bool negative = _ex < 0;
    const bool nan_like = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    const char  sign = negative ? '-' : '+';
    const char* kind = nan_like ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  }

};

// Saved FPU state: control/status/tag words, four 32-bit error/data words and
// a raw byte image of number_of_registers registers, register_size bytes each.
class FPU_State {
 public:
  enum {
    register_size       = 10,
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag of stack slot ST(i): the tag index is (top + i) wrapped by register_mask.
  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }

  // View of the i-th register image inside _register.
  // NOTE(review): the images are register_size (10) bytes apart, so the
  // returned pointer is generally not aligned for FPU_Register's int32_t
  // members - confirm this is acceptable on the target.
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  // Name for a two-bit tag value (0..3).
  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    // Terminated explicitly so the statement does not depend on how the
    // ShouldNotReachHere() macro happens to expand.
    ShouldNotReachHere();
    return NULL;
  }

  // Prints every register with its stack position and tag, then the
  // control, status and tag words.
  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};

// Decoder for a saved flags register held in _value.
// NOTE(review): flag names and bit positions look like x86 EFLAGS, presumably
// carried over from the x86 port - confirm before relying on it.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return bit(11); }
  bool direction() const               { return bit(10); }
  bool sign() const                    { return bit(7); }
  bool zero() const                    { return bit(6); }
  bool auxiliary_carry() const         { return bit(4); }
  bool parity() const                  { return bit(2); }
  bool carry() const                   { return bit(0); }

  // Prints the raw value in hex followed by one letter per set flag
  // ('-' when the flag is clear).
  void print() const {
    const char flags[8] = {
      overflow       () ? 'O' : '-',
      direction      () ? 'D' : '-',
      sign           () ? 'S' : '-',
      zero           () ? 'Z' : '-',
      auxiliary_carry() ? 'A' : '-',
      parity         () ? 'P' : '-',
      carry          () ? 'C' : '-',
      '\x0'
    };
    printf("%08x  flags = %s", _value, flags);
  }

 private:
  // True when bit n of _value is set.
  bool bit(int n) const { return ((_value >> n) & 1) != 0; }
};

// One saved 32-bit integer register.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value as 8 hex digits followed by the signed decimal value
  // right-aligned in 11 columns.
  void print() const {
    const int32_t v = _value;
    printf("%08x  %11d", v, v);
  }

};

// Saved integer-unit state: a flags register followed by eight
// general-purpose registers, in the member order below.
// NOTE(review): member names are x86 register names, presumably inherited
// from the x86 port - confirm they still describe what is stored here.
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  // Prints every register on its own line, then the flags.
  // All labels share the "name  = " form so the value columns line up; the
  // rax, rbx and rbp labels previously carried a stray comma.
  void print() const {
    // computation registers
    printf("rax  = "); _rax.print(); printf("\n");
    printf("rbx  = "); _rbx.print(); printf("\n");
    printf("rcx  = "); _rcx.print(); printf("\n");
    printf("rdx  = "); _rdx.print(); printf("\n");
    printf("rdi  = "); _rdi.print(); printf("\n");
    printf("rsi  = "); _rsi.print(); printf("\n");
    printf("rbp  = "); _rbp.print(); printf("\n");
    printf("rsp  = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};


// Complete saved CPU state: the FPU state followed by the integer-unit state.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  // Prints the integer state, a blank line and the FPU state, framed by two
  // separator lines.
  void print() const {
    static const char separator[] =
        "--------------------------------------------------\n";
    printf("%s", separator);
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("%s", separator);
  }

};


// Plain-function entry point that prints the given CPU state.
static void _print_CPU_state(CPU_State* state) {
  CPU_State* const cpu = state;
  cpu->print();
}

/*
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
*/

// Emits nops until offset() is a multiple of modulus.
// modulus must be non-zero (it is used as a divisor).
void MacroAssembler::align(int modulus) {
	for (;;) {
		if (offset() % modulus == 0) {
			break;
		}
		nop();
	}
}

// Checks a saved FPU state against an expected register-stack depth.
//
//   stack_depth : expected number of occupied stack slots; a negative value
//                 selects the leaf-call check, which only requires ST7 to be
//                 tagged empty (3)
//   s           : message prefix used in the diagnostics
//   state       : saved CPU state to inspect
//
// Returns true when the state is acceptable. On a violation it prints a
// diagnostic plus the whole state, asserts, and returns false.
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;   // number of calls so far; not read by the code
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");

  // compute stack depth: count leading non-empty slots, then require every
  // remaining slot to be empty
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth.
  // stack_depth is non-negative here (negative depths returned above), so
  // only the exact-match check applies.
  if (d != stack_depth) {
    // wrong stack depth
    printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
    state->print();
    assert(false, "error");
    return false;
  }
  // everything is cool
  return true;
}


// FPU stack verification hook. Currently emits nothing: both arguments are
// ignored and the body consists only of the disabled reference code below.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
	//FIXME aoqi
	// %%%%% need to implement this
	//Unimplemented();
	// Disabled reference implementation (uses x86 registers and helpers):
	/*
	if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
	*/
}
//We preserve all caller-saved register
void  MacroAssembler::pushad(){

	push(AT);
	push(A0);
	push(A1);
	push(A2);
	push(A3);
	push(V0);
	push(V1);
	push(T0);
	push(T1);
	push(T2);
	push(T3);
	push(T4);
	push(T5);
	push(T6);
	push(T7);
	push(T8); 
	push(T9); 
	push(GP); 
	push(RA); 
	push(FP); 
	
};

void  MacroAssembler::popad(){
	pop(FP);
	pop(RA);
	pop(GP);
	pop(T9);
	pop(T8);
	pop(T7);
	pop(T6);
	pop(T5);
	pop(T4);
	pop(T3);
	pop(T2);
	pop(T1);
	pop(T0);
	pop(V1);
	pop(V0);
	pop(A3);
	pop(A2);
	pop(A1);
	pop(A0); 
	pop(AT); 
};

// Pushes two registers with a single SP adjustment: SP is lowered by 8 bytes,
// then reg2 is stored at SP+0 and reg1 at SP+4.
void MacroAssembler::push2(Register reg1, Register reg2) {
  addi(SP, SP, -8);
	sw(reg2, SP, 0);
	sw(reg1, SP, 4);
}   

// Pops two registers with a single SP adjustment: reg1 is loaded from SP+0 and
// reg2 from SP+4, then SP is raised by 8 bytes.
// Note the slot order: push2(a, b) leaves b at SP+0 and a at SP+4, so the
// matching call is pop2(b, a).
void MacroAssembler::pop2(Register reg1, Register reg2) {
	lw(reg1, SP, 0);
	lw(reg2, SP, 4);
	addi(SP, SP, 8);
}

// Loads the unsigned 16-bit value at BCP + offset into reg.
//
// Even offset: a single lhu. Odd offset: two byte loads combined as
// byte[offset] + (byte[offset + 1] << 8); tmp is clobbered on this path only.
// NOTE(review): only the parity of the constant offset is examined, which
// presumably relies on BCP itself being halfword-aligned - confirm.
void MacroAssembler::load_two_bytes_from_at_bcp(Register reg, Register tmp, int offset)
{
	const bool odd_offset = (offset & 1) != 0;
	if (!odd_offset) {
		lhu(reg, BCP, offset);
		return;
	}

	lbu(reg, BCP, offset + 1);   // high byte
	lbu(tmp, BCP, offset);       // low byte
	sll(reg, reg, 8);
	addu(reg, tmp, reg);
}

// Stores the low 16 bits of reg at BCP + offset.
//
// Even offset: a single sh. Odd offset: the low byte goes to offset and the
// next byte to offset + 1; reg is shifted right by 8 in the process, so it is
// modified on this path only.
// NOTE(review): tmp is never used - confirm whether the shifted byte was meant
// to go through tmp so that reg stays intact.
void MacroAssembler::store_two_byts_to_at_bcp(Register reg, Register tmp, int offset)
{
	const bool odd_offset = (offset & 1) != 0;
	if (!odd_offset) {
		sh(reg, BCP, offset);
		return;
	}

	sb(reg, BCP, offset);        // low byte
	srl(reg, reg, 8);
	sb(reg, BCP, offset + 1);    // high byte
}

/*
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}

void MacroAssembler::load_prototype_header(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
  } else
#endif
    {
      movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
      movptr(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
    }
}

void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    encode_heap_oop_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}

#ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedOops) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}

void MacroAssembler::load_heap_oop(Register dst, Address src) {
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else {
    movq(dst, src);
  }
}

void MacroAssembler::store_heap_oop(Address dst, Register src) {
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else {
    movq(dst, src);
  }
}

// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register r) {
  assert (UseCompressedOops, "should be compressed");
#ifdef ASSERT
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::heap_base_addr()));
    jcc(Assembler::equal, ok);
    stop("MacroAssembler::encode_heap_oop: heap base corrupted?");
    bind(ok);
    pop(rscratch1);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}

void MacroAssembler::encode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should be compressed");
#ifdef ASSERT
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should be compressed");
#ifdef ASSERT
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  subq(dst, r12_heapbase);
  shrq(dst, LogMinObjAlignmentInBytes);
}

void  MacroAssembler::decode_heap_oop(Register r) {
  assert (UseCompressedOops, "should be compressed");
#ifdef ASSERT
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1);
    cmpptr(r12_heapbase,
           ExternalAddress((address)Universe::heap_base_addr()));
    jcc(Assembler::equal, ok);
    stop("MacroAssembler::decode_heap_oop: heap base corrupted?");
    bind(ok);
    pop(rscratch1);
  }
#endif

  Label done;
  shlq(r, LogMinObjAlignmentInBytes);
  jccb(Assembler::equal, done);
  addq(r, r12_heapbase);
#if 0
   // alternate decoding probably a wash.
   testq(r, r);
   jccb(Assembler::equal, done);
   leaq(r, Address(r12_heapbase, r, Address::times_8, 0));
#endif
  bind(done);
  verify_oop(r, "broken oop in decode_heap_oop");
}

void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  assert(Address::times_8 == LogMinObjAlignmentInBytes, "decode alg wrong");
  leaq(r, Address(r12_heapbase, r, Address::times_8, 0));
}

void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  assert(Address::times_8 == LogMinObjAlignmentInBytes, "decode alg wrong");
  leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
}

void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_literal32(dst, oop_index, rspec, narrow_oop_operand);
}

void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    movptr(r12_heapbase, ExternalAddress((address)Universe::heap_base_addr()));
  }
}
#endif // _LP64
*/
// Emits a test of the byte at flag_addr and a branch to _label that is taken
// when the byte equals `value`. The destructor binds _label, so the code
// emitted between construction and destruction is skipped in that case.
// Clobbers AT.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;

  // Immediate that turns "flag == value" into "AT == 0" after the add.
  const int minus_value = -value;

  _masm->move(AT, (int32_t)flag_addr);   // AT = address of the flag
  _masm->lb(AT, AT, 0);                  // AT = flag byte
  _masm->addi(AT, AT, minus_value);      // AT = flag - value
  _masm->beq(AT, ZERO, _label);          // equal -> skip
  _masm->delayed()->nop();
}

// Binds _label at the current position, i.e. the target of the branch emitted
// by the constructor.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}