view hotspot/src/cpu/mips/vm/templateTable_mips.cpp @ 18:d2a6a000ff33

Fix a bug in FrameMap::nr2floatreg. In FrameMap::nr2floatreg, it is wrong to multiply the argument rnr by 2.
author YANG Yongqiang <yangyongqiang@loongson.cn>
date Sat, 30 Oct 2010 17:47:17 +0800
parents 85b046e5468b
children

/*
 * Copyright 2003-2008 Sun Microsystems, Inc.  All Rights Reserved.
 * Copyright 2010 Lemote, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 */

#include "incls/_precompiled.incl"
#include "incls/_templateTable_mips.cpp.incl"

#ifndef CC_INTERP

#define __ _masm->

// Platform-dependent initialization

void TemplateTable::pd_initialize() {
  // No mips specific initialization
}

// Address computation: local variables
// we use t8 as the local variables pointer register, by yjl 6/27/2005
static inline Address iaddress(int n) {
  //return Address(r14, Interpreter::local_offset_in_bytes(n));
	return Address(LVP, Interpreter::local_offset_in_bytes(n));
}

static inline Address laddress(int n) {
  return iaddress(n + 1);
}

static inline Address faddress(int n) {
  return iaddress(n);
}

static inline Address daddress(int n) {
  return laddress(n);
}

static inline Address aaddress(int n) {
  return iaddress(n);
}
static inline Address haddress(int n)            { return iaddress(n + 0); }

//FIXME , can not use add and sll
/*
static inline Address iaddress(Register r) {
  return Address(r14, r, Address::times_8, Interpreter::value_offset_in_bytes());
}

static inline Address laddress(Register r) {
  return Address(r14, r, Address::times_8, Interpreter::local_offset_in_bytes(1));
}

static inline Address faddress(Register r) {
  return iaddress(r);
}

static inline Address daddress(Register r) {
  return laddress(r);
}

static inline Address aaddress(Register r) {
  return iaddress(r);
}
*/

static inline Address at_sp()             { return Address(SP,  0); }
static inline Address at_sp_p1()          { return Address(SP,  1 * wordSize); }
static inline Address at_sp_p2()          { return Address(SP,  2 * wordSize); }

// At the top of the Java expression stack, which may be different from SP.
// It isn't for category 1 values.
static inline Address at_tos   () {
  //return Address(rsp,  Interpreter::expr_offset_in_bytes(0));
	Address tos = Address(SP,  Interpreter::expr_offset_in_bytes(0));
	return tos;
}

static inline Address at_tos_p1() {
  //return Address(rsp,  Interpreter::expr_offset_in_bytes(1));
	return Address(SP,  Interpreter::expr_offset_in_bytes(1));
}

static inline Address at_tos_p2() {
  //return Address(rsp,  Interpreter::expr_offset_in_bytes(2));
	return Address(SP,  Interpreter::expr_offset_in_bytes(2));
}

static inline Address at_tos_p3() {
  //return Address(rsp,  Interpreter::expr_offset_in_bytes(3));
	return Address(SP,  Interpreter::expr_offset_in_bytes(3));
}
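
// A rough sketch of the expression-stack addressing used above, assuming the
// expression stack grows towards lower addresses and SP points at the
// top-of-stack word:
//
//   at_tos_p2() : SP + expr_offset_in_bytes(2)   (third word from the top)
//   at_tos_p1() : SP + expr_offset_in_bytes(1)   (second word from the top)
//   at_tos()    : SP + expr_offset_in_bytes(0)   (top of the expression stack)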
/*
// Condition conversion
static Assembler::Condition j_not(TemplateTable::Condition cc) {
  switch (cc) {
  case TemplateTable::equal        : return Assembler::notEqual;
  case TemplateTable::not_equal    : return Assembler::equal;
  case TemplateTable::less         : return Assembler::greaterEqual;
  case TemplateTable::less_equal   : return Assembler::greater;
  case TemplateTable::greater      : return Assembler::lessEqual;
  case TemplateTable::greater_equal: return Assembler::less;
  }
  ShouldNotReachHere();
  return Assembler::zero;
}
*/

// Miscellaneous helper routines
// Store an oop (or NULL) at the address described by obj.
// If val == noreg this means store a NULL
/*
static void do_oop_store(InterpreterMacroAssembler* _masm,
                         Address obj,
                         Register val,
                         BarrierSet::Name barrier,
                         bool precise) {
  assert(val == noreg || val == rax, "parameter is just for looks");
  switch (barrier) {
#ifndef SERIALGC
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
      {
        // flatten object address if needed
        if (obj.index() == noreg && obj.disp() == 0) {
          if (obj.base() != rdx) {
            __ movq(rdx, obj.base());
          }
        } else {
          __ leaq(rdx, obj);
        }
        __ g1_write_barrier_pre(rdx, r8, rbx, val != noreg);
        if (val == noreg) {
          __ store_heap_oop(Address(rdx, 0), NULL_WORD);
        } else {
          __ store_heap_oop(Address(rdx, 0), val);
          __ g1_write_barrier_post(rdx, val, r8, rbx);
        }

      }
      break;
#endif // SERIALGC
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
      {
        if (val == noreg) {
          __ store_heap_oop(obj, NULL_WORD);
        } else {
          __ store_heap_oop(obj, val);
          // flatten object address if needed
          if (!precise || (obj.index() == noreg && obj.disp() == 0)) {
            __ store_check(obj.base());
          } else {
            __ leaq(rdx, obj);
            __ store_check(rdx);
          }
        }
      }
      break;
    case BarrierSet::ModRef:
    case BarrierSet::Other:
      if (val == noreg) {
        __ store_heap_oop(obj, NULL_WORD);
      } else {
        __ store_heap_oop(obj, val);
      }
      break;
    default      :
      ShouldNotReachHere();

  }
}
*/
// we use S1 as bcp; be sure bcp is in S1 before you call any of the template generators
Address TemplateTable::at_bcp(int offset) {
  assert(_desc->uses_bcp(), "inconsistent uses_bcp information");
  return Address(BCP, offset);
}

#define callee_saved_register(R) assert((R>=S0 && R<=S7), "should use callee saved registers!")

// bytecode folding
void TemplateTable::patch_bytecode(Bytecodes::Code bytecode, Register bc,
                                   Register scratch,
                                   bool load_bc_into_scratch/*=true*/) {
  if (!RewriteBytecodes) {
    return;
  }
  // the pair bytecodes have already done the load.
  if (load_bc_into_scratch) {
    __ move(bc, bytecode);
  }
  Label patch_done;
  if (JvmtiExport::can_post_breakpoint()) {
    Label fast_patch;
    // if a breakpoint is present we can't rewrite the stream directly
		__ lbu(scratch, at_bcp(0));
		__ move(AT, Bytecodes::_breakpoint);
		__ bne(scratch, AT, fast_patch);
		__ delayed()->nop();

		__ get_method(scratch);
		// Let breakpoint table handling rewrite to quicker bytecode 
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
				InterpreterRuntime::set_original_bytecode_at), scratch, BCP, bc);

		__ b(patch_done);
		__ delayed()->nop();
		__ bind(fast_patch);
	}

#ifdef ASSERT
	Label okay;
	__ lbu(scratch, at_bcp(0));
	__ move(AT, (int)Bytecodes::java_code(bytecode));
	__ beq(scratch, AT, okay);
	__ delayed()->nop();
	__ beq(scratch, bc, patch_done);
	__ delayed()->nop();
	__ stop("patching the wrong bytecode");
	__ bind(okay);
#endif

	// patch bytecode
	__ sb(bc, at_bcp(0));
	__ bind(patch_done);
}
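
// Example of how the folding templates below use patch_bytecode (a sketch of
// the existing call in TemplateTable::iload): once the fast variant has been
// selected into a register, the current bytecode is overwritten in place,
//
//   patch_bytecode(Bytecodes::_iload, T3, T2, false);
//
// so that the next time this bcp is reached the fast bytecode is dispatched directly.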


// Individual instructions

void TemplateTable::nop() {
  transition(vtos, vtos);
  // nothing to do
}

void TemplateTable::shouldnotreachhere() {
  transition(vtos, vtos);
  __ stop("shouldnotreachhere bytecode");
}

void TemplateTable::aconst_null() {
  transition(vtos, atos);
	__ move(FSR, ZERO);
}

void TemplateTable::iconst(int value) {
  transition(vtos, itos);
  if (value == 0) {
    //__ xorl(rax, rax);
		__ move(FSR, ZERO);
  } else {
    //__ movl(rax, value);
		__ move(FSR, value);
  }
}

void TemplateTable::lconst(int value) {
  transition(vtos, ltos);
  if (value == 0) {
		__ move(FSR, ZERO);
  } else {
		__ move(FSR, value);
  }
	assert(value >= 0, "check this code");
	__ move(SSR, ZERO);
}

const static float  _f0 = 0.0, _f1 = 1.0, _f2 = 2.0;

const static double _d0 = 0.0, _d1 = 1.0;

void TemplateTable::fconst(int value) {
  transition(vtos, ftos);
	if (value == 0) {
		__ lui(AT, Assembler::split_high((int)&_f0));
		__ lwc1(FSF, AT, Assembler::split_low((int)&_f0));
	} else if (value == 1) {
		__ lui(AT, Assembler::split_high((int)&_f1));
		__ lwc1(FSF, AT, Assembler::split_low((int)&_f1));
	} else if (value == 2) {
		__ lui(AT, Assembler::split_high((int)&_f2));
		__ lwc1(FSF, AT, Assembler::split_low((int)&_f2));
	} else { 
		ShouldNotReachHere();
	}
}

void TemplateTable::dconst(int value) {
  transition(vtos, dtos);
	if (value == 0) { 
		__ lui(AT, Assembler::split_high((int)&_d0));
		__ lwc1(FSF, AT, Assembler::split_low((int)&_d0));
		__ lwc1(SSF, AT, Assembler::split_low((int)&_d0)+4);
	} else if (value == 1) {
		__ lui(AT, Assembler::split_high((int)&_d1));
		__ lwc1(FSF, AT, Assembler::split_low((int)&_d1));
		__ lwc1(SSF, AT, Assembler::split_low((int)&_d1)+4);
	} else { 
		ShouldNotReachHere();
	}
}
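
// Note: MIPS has no floating-point immediates, so fconst/dconst above fetch the
// constants from the small static pools (_f0/_f1/_f2 and _d0/_d1), forming the
// 32-bit address with a lui(split_high) + lwc1(split_low) pair.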

void TemplateTable::bipush() {
	transition(vtos, itos);
	__ lb(FSR, at_bcp(1));
}

void TemplateTable::sipush() {
	transition(vtos, itos);
	__ load_two_bytes_from_at_bcp(FSR, AT, 1);
	__ hswap(FSR);
}
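
// Note: the bytecode stream stores multi-byte operands big-endian, so the
// 2-byte operand is loaded and then byte-swapped; judging by their uses in this
// file, hswap sign-extends (signed sipush immediate) while huswap zero-extends
// (unsigned constant-pool and local indices).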

// used register : T2, T3, T4
// T2 : index
// T3 : cpool
// T4 : tag
void TemplateTable::ldc(bool wide) {
  transition(vtos, vtos);
  Label call_ldc, notFloat, notClass, Done;
	// get index in cpool
  if (wide) {
		__ load_two_bytes_from_at_bcp(T2, AT, 1);
		__ huswap(T2);
  } else {
		__ lbu(T2, at_bcp(1));
  }

	__ get_cpool_and_tags(T3, T4);

  const int base_offset = constantPoolOopDesc::header_size() * wordSize;
  const int tags_offset = typeArrayOopDesc::header_size(T_BYTE) * wordSize;

  // get type
	__ add(AT, T4, T2);
	__ lb(T4, AT, tags_offset);
	//now T4 is the tag

  // unresolved string - get the resolved string
	__ addiu(AT, T4, - JVM_CONSTANT_UnresolvedString);
	__ beq(AT, ZERO, call_ldc);
	__ delayed()->nop();

	// unresolved class - get the resolved class
	__ addiu(AT, T4, - JVM_CONSTANT_UnresolvedClass);
	__ beq(AT, ZERO, call_ldc);
	__ delayed()->nop();
	// unresolved class in error (resolution failed) - call into runtime
	// so that the same error from first resolution attempt is thrown.
	//  __ cmpl(edx, JVM_CONSTANT_UnresolvedClassInError);
	__ addiu(AT, T4, -JVM_CONSTANT_UnresolvedClassInError); 
	//	__ jccb(Assembler::equal, call_ldc);

	__ beq(AT, ZERO, call_ldc);
	__ delayed()->nop();

	// resolved class - need to call vm to get java mirror of the class
	__ addiu(AT, T4, - JVM_CONSTANT_Class);
	__ bne(AT, ZERO, notClass);
	__ delayed()->sll(T2, T2, 2);

	__ bind(call_ldc);

	__ move(A1, wide);
	call_VM(FSR, CAST_FROM_FN_PTR(address, InterpreterRuntime::ldc), A1);
//	__ sw(FSR, SP, - 1 * wordSize);
	__ push(atos);	
	__ b(Done);
//	__ delayed()->addi(SP, SP, - 1 * wordSize);
	__ delayed()->nop();
	__ bind(notClass);

	__ addiu(AT, T4, -JVM_CONSTANT_Float);
	__ bne(AT, ZERO, notFloat);
	__ delayed()->nop();
	// ftos
	__ add(AT, T3, T2);
	__ lwc1(FSF, AT, base_offset);
	__ swc1(FSF, SP, - 1 * wordSize);
	__ b(Done);
	__ delayed()->addi(SP, SP, - 1 * wordSize);

	__ bind(notFloat);
#ifdef ASSERT
	{ 
		Label L;
		__ addiu(AT, T4, -JVM_CONSTANT_Integer);
		__ beq(AT, ZERO, L);
		__ delayed()->addiu(AT, T4, -JVM_CONSTANT_String);
		__ beq(AT, ZERO, L);
		__ delayed()->nop();
		__ stop("unexpected tag type in ldc");
		__ bind(L);
	}
#endif
	// atos and itos
	Label isOop;
	__ add(AT, T3, T2);
	__ lw(FSR, AT, base_offset);
	// String is only oop type we will see here
	__ addiu(AT, T4, -JVM_CONSTANT_String);
	//__ bne(AT, ZERO, Done);
	__ beq(AT,ZERO,isOop);	
	__ delayed()->nop();
	__ push(itos);
	__ b(Done);
	__ delayed()->nop(); 
	__ bind(isOop);
	__ push(atos);


	if (VerifyOops) {
		__ verify_oop(FSR);
	}

	__ bind(Done);
}

// used register: T2, T3, T4
// T2 : index
// T3 : cpool
// T4 : tag
void TemplateTable::ldc2_w() {
  transition(vtos, vtos);
  Label Long, Done;

	// get index in cpool
	__ load_two_bytes_from_at_bcp(T2, AT, 1);
	__ huswap(T2);

	__ get_cpool_and_tags(T3, T4);

	const int base_offset = constantPoolOopDesc::header_size() * wordSize;
	const int tags_offset = typeArrayOopDesc::header_size(T_BYTE) * wordSize;

	// get type in T4
	__ add(AT, T4, T2);
	__ lb(T4, AT, tags_offset);

	__ addiu(AT, T4, - JVM_CONSTANT_Double);
	__ bne(AT, ZERO, Long);
	__ delayed()->sll(T2, T2, 2);
	// dtos	
	__ addu(AT, T3, T2);
	__ lwc1(FSF, AT, base_offset + 0 * wordSize);
	__ lwc1(SSF, AT, base_offset + 1 * wordSize);
	__ swc1(FSF, SP, - 2*wordSize);
	__ swc1(SSF, SP, - 1*wordSize);
	__ b(Done);
	__ delayed()->addi(SP, SP, -8);

	// ltos
	__ bind(Long);
	__ add(AT, T3, T2);	
	__ lw(FSR, AT, base_offset + 0 * wordSize);
	__ lw(SSR, AT, base_offset + 1 * wordSize);
	__ push(ltos);

	__ bind(Done);
}

// we compute the actual local variable address here
// x86 does not need to, since it has a scaled-index memory addressing mode; we do not, so we do it here
//FIXME
void TemplateTable::locals_index(Register reg, int offset) {
	__ lbu(reg, at_bcp(offset));
	__ sll(reg, reg, 2);
	__ sub(reg, LVP, reg);
}
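
// Note: locals_index leaves the absolute address of the local in 'reg',
// roughly LVP - 4 * index, since the locals area grows towards lower addresses.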

// this method will do bytecode folding of the two forms:
//   iload iload      iload caload
// used register : T2, T3
// T2 : bytecode
// T3 : folded code
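//
// Concretely, a sketch of the rewriting below: an iload whose successor is a
// plain _iload is left alone for now; one whose successor is already
// _fast_iload becomes _fast_iload2; one followed by _caload becomes
// _fast_icaload; anything else becomes _fast_iload.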
void TemplateTable::iload() {
	transition(vtos, itos);
	if (RewriteFrequentPairs) { 
		Label rewrite, done;
		// get the next bytecode in T2
		__ lbu(T2, at_bcp(Bytecodes::length_for(Bytecodes::_iload)));
		// if _iload, wait to rewrite to iload2.  We only want to rewrite the
		// last two iloads in a pair.  Comparing against fast_iload means that
		// the next bytecode is neither an iload nor a caload, and therefore
		// an iload pair.
		__ move(AT, Bytecodes::_iload);
		__ beq(AT, T2, done);
		__ delayed()->nop();

		__ move(AT, Bytecodes::_fast_iload);
		__ beq(AT, T2, rewrite);
		__ delayed();
		__ move(T3, Bytecodes::_fast_iload2);

		// if _caload, rewrite to fast_icaload
		__ move(AT, Bytecodes::_caload);
		__ beq(AT, T2, rewrite);
		__ delayed();
		__ move(T3, Bytecodes::_fast_icaload);

		// rewrite so iload doesn't check again.
		__ move(T3, Bytecodes::_fast_iload);

		// rewrite
		// T3 : fast bytecode
		__ bind(rewrite);
		patch_bytecode(Bytecodes::_iload, T3, T2, false);
		__ bind(done);
	}

	// Get the local value into tos
	locals_index(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}

// used register T2
// T2 : index
void TemplateTable::fast_iload2() {
	transition(vtos, itos);
	locals_index(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
	__ push(itos);
	locals_index(T2, 3);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}
  
// used register T2
// T2 : index
void TemplateTable::fast_iload() {
	transition(vtos, itos);
	locals_index(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}

// used register T2
// T2 : index
void TemplateTable::lload() {

	transition(vtos, ltos);
	locals_index(T2);
	__ lw(FSR, T2, -4);
	__ lw(SSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}

// used register T2
// T2 : index
void TemplateTable::fload() {
	transition(vtos, ftos);
	locals_index(T2);
	__ lwc1(FSF, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}

// used register T2
// T2 : index
void TemplateTable::dload() {

	transition(vtos, dtos);
	locals_index(T2);
	if (TaggedStackInterpreter) {
		// Get double out of locals array, onto temp stack and load with
		// float instruction into ST0
		//    __ movl(eax, laddress(ebx));
		__ sll(AT,T2,Interpreter::stackElementScale());
		__ add(AT, LVP, AT);
		__ lwc1(FSF, AT, Interpreter::local_offset_in_bytes(1)); 
		// __ movl(edx, haddress(ebx));
		__ lwc1(SSF, AT, Interpreter::local_offset_in_bytes(0)); 

		//   __ pushl(edx);  // push hi first
		// __ pushl(eax);
		//    __ fld_d(Address(esp));
		//   __ addl(esp, 2*wordSize);
		debug_only(__ verify_local_tag(frame::TagCategory2, T2));
	} else {
		__ lwc1(FSF, T2, -4);
		__ lwc1(SSF, T2, 0);
	}
}

// used register T2
// T2 : index
void TemplateTable::aload() 
{
	transition(vtos, atos);

	locals_index(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

void TemplateTable::locals_index_wide(Register reg) {
	__ load_two_bytes_from_at_bcp(reg, AT, 2);
	__ huswap(reg);
	__ sll(reg, reg, 2);
	__ sub(reg, LVP, reg);
}

// used register T2
// T2 : index
void TemplateTable::wide_iload() {
	transition(vtos, itos);
	locals_index_wide(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

// used register T2
// T2 : index
void TemplateTable::wide_lload() {
	transition(vtos, ltos);
	locals_index_wide(T2);
	__ lw(FSR, T2, -4);
	__ lw(SSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

// used register T2
// T2 : index
void TemplateTable::wide_fload() {
	transition(vtos, ftos);
	locals_index_wide(T2);
	__ lwc1(FSF, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

// used register T2
// T2 : index
void TemplateTable::wide_dload() {
	transition(vtos, dtos);
	locals_index_wide(T2);
	if (TaggedStackInterpreter) {
		// Get double out of locals array, onto temp stack and load with
		// float instruction into ST0
		//   __ movl(eax, laddress(ebx));
		//  __ movl(edx, haddress(ebx));
		__ sll(AT,T2,Interpreter::stackElementScale());
		__ add(AT, LVP, AT);
		__ lwc1(FSF, AT, Interpreter::local_offset_in_bytes(1)); 
		// __ movl(edx, haddress(ebx));
		__ lwc1(SSF, AT, Interpreter::local_offset_in_bytes(0)); 

		//  __ pushl(edx);  // push hi first
		//  __ pushl(eax);
		//  __ fld_d(Address(esp));
		//  __ addl(esp, 2*wordSize);
		debug_only(__ verify_local_tag(frame::TagCategory2, T2));
	} else {
		__ lwc1(FSF, T2, -4);
		__ lwc1(SSF, T2, 0);
	}
}

// used register T2
// T2 : index
void TemplateTable::wide_aload() {
	transition(vtos, atos);
	locals_index_wide(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

// we use A2 as the register for the index, BE CAREFUL!
// we don't use the tgeu trap (code 29) for now; it is kept for later optimization (see OPT_RANGECHECK below)
void TemplateTable::index_check(Register array, Register index) {
	// Pop ptr into array
	__ pop_ptr(array);
	index_check_without_pop(array, index);
}

void TemplateTable::index_check_without_pop(Register array, Register index) {
	// destroys AT (and A2 on the out-of-bounds path)
	// check array
	__ null_check(array, arrayOopDesc::length_offset_in_bytes());

	// check index
	Label ok;
	__ lw(AT, array, arrayOopDesc::length_offset_in_bytes());
#ifndef OPT_RANGECHECK
	__ sltu(AT, index, AT);
	__ bne(AT, ZERO, ok);
	__ delayed()->nop(); 

	//throw_ArrayIndexOutOfBoundsException assumes the aberrant index is in A2
	if (A2!=index) __ move(A2, index);		
	__ jmp(Interpreter::_throw_ArrayIndexOutOfBoundsException_entry);
	__ delayed()->nop();
	__ bind(ok);
#else
	__ lw(AT, array, arrayOopDesc::length_offset_in_bytes());
	__ move(A2, index);
	__ tgeu(A2, AT, 29);
#endif
}
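
// Under OPT_RANGECHECK the bounds check above collapses into a single
// conditional trap: tgeu traps (trap code 29) when index >= length as unsigned
// values, and the trap handler is then assumed to raise the
// ArrayIndexOutOfBoundsException; the default path above does it explicitly.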

void TemplateTable::iaload() {
	transition(itos, itos);
	//  __ pop(SSR);
	index_check(SSR, FSR);
	__ shl(FSR, 2);
	__ add(FSR, SSR, FSR);
	//FSR: index
	__ lw(FSR, FSR, arrayOopDesc::base_offset_in_bytes(T_INT));

}


void TemplateTable::laload() {
	transition(itos, ltos);
	//  __ pop(SSR);
	index_check(SSR, FSR); 
	__ sll(AT, FSR, 3);
	__ add(AT, SSR, AT);
	__ lw(FSR, AT, arrayOopDesc::base_offset_in_bytes(T_LONG) + 0 * wordSize);
	__ lw(SSR, AT, arrayOopDesc::base_offset_in_bytes(T_LONG) + 1 * wordSize);
}

void TemplateTable::faload() {
	transition(itos, ftos);
	// __ pop(SSR);
	index_check(SSR, FSR);  
	__ shl(FSR, 2);
	__ add(FSR, SSR, FSR);
	__ lwc1(FSF, FSR, arrayOopDesc::base_offset_in_bytes(T_FLOAT));
}

void TemplateTable::daload() {
	transition(itos, dtos);
	//__ pop(SSR);
	index_check(SSR, FSR);  
	__ sll(AT, FSR, 3);
	__ add(AT, SSR, AT);
	__ lwc1(FSF, AT, arrayOopDesc::base_offset_in_bytes(T_DOUBLE) + 0 * wordSize);
	__ lwc1(SSF, AT, arrayOopDesc::base_offset_in_bytes(T_DOUBLE) + 1 * wordSize);
}

void TemplateTable::aaload() {
	transition(itos, atos);
	//__ pop(SSR);
	index_check(SSR, FSR);
	__ shl(FSR, 2);
	__ add(FSR, SSR, FSR);
	__ lw(FSR, FSR, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
}

void TemplateTable::baload() {
	transition(itos, itos);
	//__ pop(SSR);
	index_check(SSR, FSR); 
	__ add(FSR, SSR, FSR);
	__ lb(FSR, FSR, arrayOopDesc::base_offset_in_bytes(T_BYTE));
}

void TemplateTable::caload() {
	transition(itos, itos);
	// __ pop(SSR);
	index_check(SSR, FSR);
	__ shl(FSR, 1);
	__ add(FSR, SSR, FSR);
	__ lhu(FSR, FSR,  arrayOopDesc::base_offset_in_bytes(T_CHAR));
}

// iload followed by caload frequent pair
// used register : T2
// T2 : index
void TemplateTable::fast_icaload() {
	transition(vtos, itos);
	// load index out of locals
	locals_index(T2);
	__ lw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagValue, T2));
//	__ pop(SSR);
	index_check(SSR, FSR);
	__ shl(FSR, 1);
	__ add(FSR, SSR, FSR);
	__ lhu(FSR, FSR,  arrayOopDesc::base_offset_in_bytes(T_CHAR));
}

void TemplateTable::saload() {
	transition(itos, itos);
	// __ pop(SSR);
	index_check(SSR, FSR);  
	__ shl(FSR, 1);
	__ add(FSR, SSR, FSR);
	__ lh(FSR, FSR,  arrayOopDesc::base_offset_in_bytes(T_SHORT));
}

void TemplateTable::iload(int n) {
	transition(vtos, itos);
	__ lw(FSR, iaddress(n));
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}

void TemplateTable::lload(int n) {
	transition(vtos, ltos);
	__ lw(FSR, laddress(n));
	__ lw(SSR, haddress(n));
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}

void TemplateTable::fload(int n) {
	transition(vtos, ftos);
	__ lwc1(FSF, faddress(n));
	debug_only(__ verify_local_tag(frame::TagValue, T2));
}
//FIXME here
void TemplateTable::dload(int n) {
	transition(vtos, dtos);
	if (TaggedStackInterpreter) {
		// Get double out of locals array, onto temp stack and load with
		// float instruction into ST0
		//__ movl(eax, laddress(n));
		//__ movl(edx, haddress(n));
		//__ pushl(edx);  // push hi first
		//__ pushl(eax);
		//  __ fld_d(Address(esp));
		// __ addl(esp, 2*wordSize);  // reset esp
		__ lwc1(FSF, laddress(n));
		__ lwc1(SSF, haddress(n));
		debug_only(__ verify_local_tag(frame::TagCategory2, T2));
	} else {
		__ lwc1(FSF, laddress(n));
		__ lwc1(SSF, haddress(n));
	}
}

void TemplateTable::aload(int n) {
	transition(vtos, atos);
	__ lw(FSR, aaddress(n));
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

// used register : T2, T3
// T2 : bytecode
// T3 : folded code
void TemplateTable::aload_0() {
	transition(vtos, atos);
	// According to bytecode histograms, the pairs:
	//
	// _aload_0, _fast_igetfield
	// _aload_0, _fast_agetfield
	// _aload_0, _fast_fgetfield
	//
	// occur frequently. If RewriteFrequentPairs is set, the (slow) _aload_0
	// bytecode checks if the next bytecode is either _fast_igetfield, 
	// _fast_agetfield or _fast_fgetfield and then rewrites the
	// current bytecode into a pair bytecode; otherwise it rewrites the current
	// bytecode into _fast_aload_0 that doesn't do the pair check anymore.
	//
	// Note: If the next bytecode is _getfield, the rewrite must be delayed,
	//       otherwise we may miss an opportunity for a pair.
	//
	// Also rewrite frequent pairs
	//   aload_0, aload_1
	//   aload_0, iload_1
	// These bytecodes with a small amount of code are most profitable to rewrite
	if (RewriteFrequentPairs) {
		Label rewrite, done;
		// get the next bytecode in T2
		__ lbu(T2, at_bcp(Bytecodes::length_for(Bytecodes::_aload_0)));

		// do actual aload_0
		aload(0);

		// if _getfield then wait with rewrite
		__ move(AT, Bytecodes::_getfield);
		__ beq(AT, T2, done);
		__ delayed()->nop();

		// if _igetfield then rewrite to _fast_iaccess_0
		assert(Bytecodes::java_code(Bytecodes::_fast_iaccess_0) == 
				Bytecodes::_aload_0, "fix bytecode definition");
		__ move(AT, Bytecodes::_fast_igetfield);
		__ beq(AT, T2, rewrite);
		__ delayed();
		__ move(T3, Bytecodes::_fast_iaccess_0);

		// if _agetfield then rewrite to _fast_aaccess_0
		assert(Bytecodes::java_code(Bytecodes::_fast_aaccess_0) == 
				Bytecodes::_aload_0, "fix bytecode definition");
		__ move(AT, Bytecodes::_fast_agetfield);
		__ beq(AT, T2, rewrite);
		__ delayed();
		__ move(T3, Bytecodes::_fast_aaccess_0);

		// if _fgetfield then rewrite to _fast_faccess_0
		assert(Bytecodes::java_code(Bytecodes::_fast_faccess_0) == 
				Bytecodes::_aload_0, "fix bytecode definition");
		__ move(AT, Bytecodes::_fast_fgetfield);
		__ beq(AT, T2, rewrite);
		__ delayed();
		__ move(T3, Bytecodes::_fast_faccess_0);

		// else rewrite to _fast_aload_0
		assert(Bytecodes::java_code(Bytecodes::_fast_aload_0) == 
				Bytecodes::_aload_0, "fix bytecode definition");
		__ move(T3, Bytecodes::_fast_aload_0);

		// rewrite
		__ bind(rewrite);
		patch_bytecode(Bytecodes::_aload_0, T3, T2, false);

		__ bind(done);
	} else {
		aload(0);
	}
}

void TemplateTable::istore() {
	transition(itos, vtos);
	locals_index(T2);
	__ sw(FSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

void TemplateTable::lstore() {
	transition(ltos, vtos);
	locals_index(T2);
	__ sw(FSR, T2, -4);
	__ sw(SSR, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

void TemplateTable::fstore() {
	transition(ftos, vtos);
	locals_index(T2);
	__ swc1(FSF, T2, 0);
	debug_only(__ verify_local_tag(frame::TagCategory2, T2));
}

void TemplateTable::dstore() {
	transition(dtos, vtos);
	locals_index(T2);
	if (TaggedStackInterpreter) {
		// Store double on stack and reload into locals nonadjacently
		//    __ subl(esp, 2 * wordSize);
		//   __ fstp_d(Address(esp));
		//  __ popl(eax);
		// __ popl(edx);
		//__ movl(laddress(ebx), eax);
		//__ movl(haddress(ebx), edx);
		// __ swc1(FSF, laddress(T2));
		//__ swc1(SSF,  haddress(T2));
		__ sll(AT,T2,Interpreter::stackElementScale());
		__ add(AT, LVP, AT);
		__ lwc1(FSF, AT, Interpreter::local_offset_in_bytes(1)); 
		// __ movl(edx, haddress(ebx));
		__ lwc1(SSF, AT, Interpreter::local_offset_in_bytes(0)); 


		__ tag_local(frame::TagCategory2, T2);
	} else {
		__ swc1(FSF, T2, -4);
		__ swc1(SSF, T2, 0);
	}
}

void TemplateTable::astore() {
	transition(vtos, vtos);
	//  __ pop(FSR);
	__ pop_ptr(FSR, SSR);
	locals_index(T2);
	__ sw(FSR, T2, 0);
	__ tag_local(SSR, T2);    // need to store the same tag in the local; it may be a returnAddress

}

void TemplateTable::wide_istore() {
	transition(vtos, vtos);
	//  __ pop(FSR);
	__ pop_i(FSR);
	locals_index_wide(T2);
	__ sw(FSR, T2, 0);
	__ tag_local(frame::TagValue, T2);
}

void TemplateTable::wide_lstore() {
	transition(vtos, vtos);
	//__ pop2(FSR, SSR);
	//__ pop_l(FSR, SSR); 
	__ pop_l(FSR); //aoqi:FIXME Is this right?
	locals_index_wide(T2);
	__ sw(FSR, T2, -4);
	__ sw(SSR, T2, 0);
	__ tag_local(frame::TagCategory2, T2);
}

void TemplateTable::wide_fstore() {
	wide_istore();
}

void TemplateTable::wide_dstore() {
	wide_lstore();
}

void TemplateTable::wide_astore() {
	//  wide_istore();
	transition(vtos, vtos);
	//  __ pop_ptr(eax, edx);
	__ pop_ptr(FSR, SSR);
	// locals_index_wide(ebx);
	locals_index_wide(T2);
	//__ movl(aaddress(ebx), eax);
	//  __ sw(FSR, aaddress(T2)); 
	__ sll(AT,T2,Interpreter::stackElementScale());
	__ add(AT, LVP, AT);
	__ addi(AT, AT, Interpreter::value_offset_in_bytes());
	__ tag_local(SSR,AT );

}

// used register : T2
void TemplateTable::iastore() {
	transition(itos, vtos);
	/* 
	   __ pop2(SSR, T2);
	   index_check(T2, SSR);
	   __ shl(SSR, 2);
	   __ add(T2, T2, SSR);
	   __ sw(FSR, T2, arrayOopDesc::base_offset_in_bytes(T_INT));
	   */
	// __ pop_i(ebx);
	__ pop_i(SSR);
	index_check(T2, SSR);  // prefer index in SSR
	__ shl(SSR, Address::times_4);
	__ add(T2, T2, SSR);
	__ sw(FSR, T2, arrayOopDesc::base_offset_in_bytes(T_INT));
}



// used register T2, T3
void TemplateTable::lastore() {
	transition(ltos, vtos);
	//	__ pop2(T2, T3);
	__ pop_i (T2); 
	index_check(T3, T2);
	__ shl(T2, 3);
	__ add(T3, T3, T2);
	__ sw(FSR, T3, arrayOopDesc::base_offset_in_bytes(T_LONG) + 0 * wordSize);
	__ sw(SSR, T3, arrayOopDesc::base_offset_in_bytes(T_LONG) + 1 * wordSize);
}

// used register T2
void TemplateTable::fastore() {
	transition(ftos, vtos);
	//__ pop2(SSR, T2);
	__ pop_i(SSR);
	index_check(T2, SSR); 
	__ shl(SSR, 2);
	__ add(T2, T2, SSR);
	__ swc1(FSF, T2, arrayOopDesc::base_offset_in_bytes(T_FLOAT));
}

// used register T2, T3
void TemplateTable::dastore() {
	transition(dtos, vtos);
	//__ pop2(T2, T3);
	__ pop_i (T2); 
	index_check(T3, T2);  
	__ shl(T2, Address::times_8);
	__ addu(T3, T3, T2);
	__ swc1(FSF, T3, arrayOopDesc::base_offset_in_bytes(T_DOUBLE) + 0 * wordSize);
	__ swc1(SSF, T3, arrayOopDesc::base_offset_in_bytes(T_DOUBLE) + 1 * wordSize);

}

// used registers : T2, T3, T4
// T2 : array
// T3 : subklass
// T4 : superklass
void TemplateTable::aastore() {
	Label is_null, ok_is_subtype, done;
	transition(vtos, vtos);
	// stack: ..., array, index, value
	//  __ lw(FSR, at_sp());     // Value
	//  __ lw(SSR, at_sp_p1());  // Index
	//  __ lw(T2, at_sp_p2());  // Array
	__ lw(FSR, at_tos());     // Value
	__ lw(SSR, at_tos_p1());  // Index
	__ lw(T2, at_tos_p2());  // Array

	// index_check(T2, SSR);
	index_check_without_pop(T2, SSR);
	// do array store check - check for NULL value first
	__ beq(FSR, ZERO, is_null);
	__ delayed()->nop();
	__ profile_checkcast(false, T3); // Blows T3

	// Move subklass into T3
	__ lw(T3,  Address(FSR, oopDesc::klass_offset_in_bytes()));
	// Move superklass into T4
	__ lw(T4, Address(T2, oopDesc::klass_offset_in_bytes()));
	__ lw(T4, Address(T4, sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes()));
	// Compress array+index*4+12 into a single register. T2
	__ sll(AT, SSR, 2);
	__ add(T2, T2, AT);
	__ addi(T2, T2, arrayOopDesc::base_offset_in_bytes(T_OBJECT));

	// Generate subtype check.
	// Superklass in T4.  Subklass in T3.
	__ gen_subtype_check(T4, T3, ok_is_subtype);
	// Come here on failure
	// object is at FSR
	__ jmp(Interpreter::_throw_ArrayStoreException_entry);
	__ delayed()->nop();
	// Come here on success
	__ bind(ok_is_subtype);
	__ sw(FSR, T2, 0);
	__ store_check(T2);
	__ b(done);
	__ delayed()->nop();

	// Have a NULL in FSR, array in T2, index in SSR.  Store NULL at ary[idx]
	__ bind(is_null);
	__ profile_checkcast(true, T3);	//blows T3
	__ sll(AT, SSR, 2);
	__ add(T2, T2, AT);
	__ sw(FSR, T2, arrayOopDesc::base_offset_in_bytes(T_OBJECT));

	__ bind(done);
	__ addi(SP, SP, 3 * Interpreter::stackElementSize());

}

void TemplateTable::bastore() {
	transition(itos, vtos);
	//__ pop2(SSR, T2);
	__ pop_i (SSR); 
	index_check(T2, SSR);
	__ add(SSR, T2, SSR);
	__ sb(FSR, SSR, arrayOopDesc::base_offset_in_bytes(T_BYTE));
}

void TemplateTable::castore() {
	transition(itos, vtos);
	//__ pop2(SSR, T2);
	__ pop_i(SSR); 
	index_check(T2, SSR); 
	__ shl(SSR, 1);
	__ add(SSR, T2, SSR);
	__ sh(FSR, SSR, arrayOopDesc::base_offset_in_bytes(T_CHAR));
}

void TemplateTable::sastore() {
	castore();
}

void TemplateTable::istore(int n) {
	transition(itos, vtos);
	__ sw(FSR, iaddress(n));
	__ tag_local(frame::TagValue, n);
}

void TemplateTable::lstore(int n) {
	transition(ltos, vtos);
	__ sw(FSR, laddress(n));
	__ sw(SSR, haddress(n));
	__ tag_local(frame::TagCategory2, n);
}

void TemplateTable::fstore(int n) {
	transition(ftos, vtos);
	__ swc1(FSF, faddress(n));
	__ tag_local(frame::TagValue, n);
}
//FIXME,
void TemplateTable::dstore(int n) {
	transition(dtos, vtos);
	if (TaggedStackInterpreter) {
		/*  __ subl(esp, 2 * wordSize);
		    __ fstp_d(Address(esp));
		    __ popl(eax);
		    __ popl(edx);
		    __ movl(laddress(n), eax);
		    __ movl(haddress(n), edx);
		    */ 
		__ swc1(FSF, laddress(n));
		__ swc1(SSF, haddress(n));
		__ tag_local(frame::TagCategory2, n);
	} else {
		__ swc1(FSF, laddress(n));
		__ swc1(SSF, haddress(n));
	}
}

void TemplateTable::astore(int n) {
	transition(vtos, vtos);
	//__ pop(FSR);
	__ pop_ptr(FSR, SSR);
	__ sw(FSR, aaddress(n));
	__ tag_local(SSR, n);
}

void TemplateTable::pop() {
	transition(vtos, vtos);
	//  __ pop();
	__ addi(SP, SP, Interpreter::stackElementSize());
}

void TemplateTable::pop2() {
	transition(vtos, vtos);
	//__ pop2();
	__ addi(SP, SP, 2*Interpreter::stackElementSize());
}

void TemplateTable::dup() {
	transition(vtos, vtos);
	// stack: ..., a
	//	__ lw(AT, SP, 0);
	//	__ push(AT);
	__ load_ptr_and_tag(0, FSR, SSR);
	__ push_ptr(FSR, SSR);
	// stack: ..., a, a
}

// blows FSR
void TemplateTable::dup_x1() {
	transition(vtos, vtos);
	// stack: ..., a, b
	__ load_ptr_and_tag(0, FSR, SSR);  // load b
	__ load_ptr_and_tag(1, T5, T4);  // load a
	__ store_ptr_and_tag(1, FSR, SSR); // store b
	__ store_ptr_and_tag(0, T5, T4); // store a
	__ push_ptr(FSR, SSR);             // push b
	// stack: ..., b, a, b
}

// blows FSR
void TemplateTable::dup_x2() {
	transition(vtos, vtos);
	// stack: ..., a, b, c
	__ load_ptr_and_tag(0, FSR, SSR);  // load c
	__ load_ptr_and_tag(2, T5, T4);  // load a
	__ store_ptr_and_tag(2, FSR, SSR); // store c in a
	__ push_ptr(FSR, SSR);             // push c
	// stack: ..., c, b, c, c
	__ load_ptr_and_tag(2, FSR, SSR);  // load b
	__ store_ptr_and_tag(2, T5, T4); // store a in b
	// stack: ..., c, a, c, c
	__ store_ptr_and_tag(1, FSR, SSR); // store b in c
	// stack: ..., c, a, b, c
}

// blows FSR
void TemplateTable::dup2() {
	transition(vtos, vtos);
	// stack: ..., a, b
	__ load_ptr_and_tag(1, FSR, SSR);  // load a
	__ push_ptr(FSR, SSR);             // push a
	__ load_ptr_and_tag(1, FSR, SSR);  // load b
	__ push_ptr(FSR, SSR);             // push b
	// stack: ..., a, b, a, b
}

// blows FSR
void TemplateTable::dup2_x1() {
	transition(vtos, vtos);
	// stack: ..., a, b, c
	__ load_ptr_and_tag(0, T5, T4);  // load c
	__ load_ptr_and_tag(1, FSR, SSR);  // load b
	__ push_ptr(FSR, SSR);             // push b
	__ push_ptr(T5, T4);             // push c
	// stack: ..., a, b, c, b, c
	__ store_ptr_and_tag(3, T5, T4); // store c in b
	// stack: ..., a, c, c, b, c
	__ load_ptr_and_tag(4, T5, T4);  // load a
	__ store_ptr_and_tag(2, T5, T4); // store a in 2nd c
	// stack: ..., a, c, a, b, c
	__ store_ptr_and_tag(4, FSR, SSR); // store b in a
	// stack: ..., b, c, a, b, c

	// stack: ..., b, c, a, b, c
}

// blows FSR, SSR
void TemplateTable::dup2_x2() {
	transition(vtos, vtos);
	// stack: ..., a, b, c, d
	// stack: ..., a, b, c, d
	__ load_ptr_and_tag(0, T5, T4);  // load d
	__ load_ptr_and_tag(1, FSR, SSR);  // load c
	__ push_ptr(FSR, SSR);             // push c
	__ push_ptr(T5, T4);             // push d
	// stack: ..., a, b, c, d, c, d
	__ load_ptr_and_tag(4, FSR, SSR);  // load b
	__ store_ptr_and_tag(2, FSR, SSR); // store b in d
	__ store_ptr_and_tag(4, T5, T4); // store d in b
	// stack: ..., a, d, c, b, c, d
	__ load_ptr_and_tag(5, T5, T4);  // load a
	__ load_ptr_and_tag(3, FSR, SSR);  // load c
	__ store_ptr_and_tag(3, T5, T4); // store a in c
	__ store_ptr_and_tag(5, FSR, SSR); // store c in a
	// stack: ..., c, d, a, b, c, d

	// stack: ..., c, d, a, b, c, d
}

// blows FSR
void TemplateTable::swap() {
	transition(vtos, vtos);
	// stack: ..., a, b

	__ load_ptr_and_tag(1, T5, T4);  // load a
	__ load_ptr_and_tag(0, FSR, SSR);  // load b
	__ store_ptr_and_tag(0, T5, T4); // store a in b
	__ store_ptr_and_tag(1, FSR, SSR); // store b in a

	// stack: ..., b, a
}

void TemplateTable::iop2(Operation op) {
	transition(itos, itos);
	switch (op) {
		case add  :                    
			__ pop_i(SSR); 
			__ addu(FSR, SSR, FSR); 
			break;
		case sub  :  
			__ pop_i(SSR); 
			__ subu(FSR, SSR, FSR); 
			break;
		case mul  :                    
			__ lw(SSR, SP, 0);
			__ mult(SSR, FSR);
			__ addi(SP, SP, wordSize);
			__ nop();
			__ mflo(FSR);
			break;
		case _and :                    
			__ pop_i(SSR); 
			__ andr(FSR, SSR, FSR); 
			break;
		case _or  :                    
			__ pop_i(SSR); 
			__ orr(FSR, SSR, FSR); 
			break;
		case _xor :                    
			__ pop_i(SSR); 
			__ xorr(FSR, SSR, FSR); 
			break;
		case shl  : 
			__ pop_i(SSR); 
			__ sllv(FSR, SSR, FSR);      
			break; // the lower 5 bits are implicitly masked, as by the Intel shift instructions; MIPS does the same
		case shr  : 
			__ pop_i(SSR); 
			__ srav(FSR, SSR, FSR);      
			break; // the lower 5 bits are implicitly masked, as by the Intel shift instructions; MIPS does the same
		case ushr : 
			__ pop_i(SSR); 
			__ srlv(FSR, SSR, FSR);     
			break; // the lower 5 bits are implicitly masked, as by the Intel shift instructions; MIPS does the same
		default   : ShouldNotReachHere();
	}
}

// the result stored in FSR, SSR,
// used registers : T2, T3
void TemplateTable::lop2(Operation op) {
	transition(ltos, ltos);
	//__ pop2(T2, T3);
	__ pop_l(T2, T3);
	switch (op) {
		case add : 
			__ addu(FSR, T2, FSR);
			__ sltu(AT, FSR, T2);
			__ addu(SSR, T3, SSR);
			__ addu(SSR, SSR, AT); 
			break;
		case sub :
			__ subu(FSR, T2, FSR);
			__ sltu(AT, T2, FSR);
			__ subu(SSR, T3, SSR);
			__ subu(SSR, SSR, AT);
			break;
		case _and: 
			__ andr(FSR, T2, FSR); 
			__ andr(SSR, T3, SSR); 
			break;
		case _or : 
			__ orr(FSR, T2, FSR); 
			__ orr(SSR, T3, SSR); 
			break;
		case _xor: 
			__ xorr(FSR, T2, FSR); 
			__ xorr(SSR, T3, SSR); 
			break;
		default : ShouldNotReachHere();
	}
}

// Java requires that this bytecode handle 0x80000000 / -1 without raising an overflow exception;
// the result must be 0x80000000.
// The Godson-2 CPU behaves the same way, so we need not handle this specially as x86 does.
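// e.g. Integer.MIN_VALUE / -1 must evaluate to Integer.MIN_VALUE (0x80000000)
// with no exception; per the comment above this relies on the div behaviour of
// the Godson-2 CPU.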
void TemplateTable::idiv() {
	transition(itos, itos);
	Label not_zero;
	//__ pop(SSR);
	__ pop_i(SSR);
	__ div(SSR, FSR);

	__ bne(FSR, ZERO, not_zero);
	__ delayed()->nop();
	//__ brk(7);
	__ jmp(Interpreter::_throw_ArithmeticException_entry); 
	__ delayed()->nop();

	__ bind(not_zero);
	__ mflo(FSR);
}

void TemplateTable::irem() {
	transition(itos, itos);
	Label not_zero;
	//__ pop(SSR);
	__ pop_i(SSR);
	__ div(SSR, FSR);

	__ bne(FSR, ZERO, not_zero);
	__ delayed()->nop();
	//__ brk(7);
	__ jmp(Interpreter::_throw_ArithmeticException_entry);
	__ delayed()->nop();

	__ bind(not_zero);
	__ mfhi(FSR);
}

// the multiplier is in SSR||FSR, the multiplicand is on the stack
// the result is in SSR||FSR
// used registers : T2, T3
void TemplateTable::lmul() {
	transition(ltos, ltos);
	Label zero, quick, done;

	//__ lw(T2, SP, 0);
	//__ lw(T3, SP, 4);
	__ pop_l(T2, T3);
	__ orr(AT, T2, FSR);
	__ beq(AT, ZERO, zero);
	//__ delayed()->addi(SP, SP, 2 * wordSize);
	__ delayed()->nop();

	__ orr(AT, T3, SSR);
	__ beq(AT, ZERO, quick);
	__ delayed()->nop();

	__ multu(T2, SSR);
	__ nop();
	__ nop();
	__ mflo(SSR);

	__ multu(T3, FSR);
	__ nop();
	__ nop();
	__ mflo(T3);

	__ bind(quick);
	__ multu(T2, FSR);
	__ addu(SSR, SSR, T3);
	__ nop();
	__ mflo(FSR);
	__ mfhi(T2);
	__ b(done);
	__ delayed()->addu(SSR, SSR, T2);

	__ bind(zero);
	__ move(SSR, ZERO);
	__ bind(done);
}
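
// Sketch of the 64-bit multiply above, using 32-bit limbs (one operand in
// T3:T2, the other in SSR:FSR, result in SSR:FSR):
//   lo(result) = lo32(T2 * FSR)
//   hi(result) = hi32(T2 * FSR) + lo32(T2 * SSR) + lo32(T3 * FSR)
// The 'zero' and 'quick' paths skip partial products that are known to be zero.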

// NOTE: the zero-divisor case below jumps to Interpreter::_throw_ArithmeticException_entry
void TemplateTable::ldiv() {
	transition(ltos, ltos);
	Label normal;

	__ orr(AT, FSR, SSR);
	__ bne(AT, ZERO, normal);
	__ delayed()->nop();

	//__ brk(7);		//generate FPE
	__ jmp(Interpreter::_throw_ArithmeticException_entry);
	__ delayed()->nop();

	__ bind(normal);
	__ move(A0, FSR);
	__ move(A1, SSR);
	//__ lw(A2, SP, 0);
	//__ lw(A3, SP, 4);
	//__ addi(SP, SP, 2 * wordSize);
	__ pop_l (A2, A3); 
	__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::ldiv), 4);
}

// NOTE: the zero-divisor case below jumps to Interpreter::_throw_ArithmeticException_entry
void TemplateTable::lrem() {
	transition(ltos, ltos);
	Label normal;

	__ orr(AT, FSR, SSR);
	__ bne(AT, ZERO, normal);
	__ delayed()->nop();

	__ jmp(Interpreter::_throw_ArithmeticException_entry);
	__ delayed()->nop();

	__ bind(normal);
	__ move(A0, FSR);
	__ move(A1, SSR);
	__ pop_l (A2, A3); 
	__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::lrem), 4);
}

// result in SSR||FSR
// used registers : T2, T3
void TemplateTable::lshl() {
	transition(itos, ltos);
	Label normal, done, notZero;
	__ pop_l(T2, T3);	
	__ andi(FSR, FSR, 0x3f);				// the shift count (only the low 6 bits are used)

	__ bne(FSR, ZERO, notZero);
	__ delayed()-> nop();

	__ move(FSR, T2);
	__ b(done);
	__ delayed(); __ move(SSR, T3);

	__ bind(notZero);
	__ sltiu(AT, FSR, BitsPerWord);
	__ bne(AT, ZERO, normal);				// <BitsPerWord?
	__ delayed()->nop();

	__ addi(AT, FSR, - BitsPerWord);
	__ sllv(SSR, T2, AT);
	__ b(done);
	__ delayed(); __ move(FSR, ZERO);

	__ bind(normal);
	__ sllv(SSR, T3, FSR);
	__ move(AT, BitsPerWord);
	__ sub(AT, AT, FSR);
	__ srlv(AT, T2, AT);
	__ orr(SSR, SSR, AT);

	__ sllv(FSR, T2, FSR);

	__ bind(done);
}
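
// Sketch of the 64-bit left shift above, with s = shift & 0x3f and the value in
// T3:T2 (hi:lo):
//   s == 0      : result = value (copied through unchanged)
//   s >= 32     : hi = lo << (s - 32),  lo = 0
//   0 < s < 32  : hi = (hi << s) | (lo >> (32 - s)),  lo = lo << s
// lshr/lushr below follow the same pattern with arithmetic/logical right shifts.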

// used registers : T2, T3
void TemplateTable::lshr() {
  transition(itos, ltos);
	Label normal, done, notZero;
	__ pop_l(T2, T3);
	__ andi(FSR, FSR, 0x3f);	// the shift count (only the low 6 bits are used)

	__ bne(FSR, ZERO, notZero);
	__ delayed()-> nop();

	__ move(FSR, T2);	      // zero shift must be handled specially
	__ b(done);
	__ delayed(); __ move(SSR, T3);

	__ bind(notZero);
	__ sltiu(AT, FSR, BitsPerWord);
	__ bne(AT, ZERO, normal);	// shift < BitsPerWord?
	__ delayed()->nop();

	__ addi(AT, FSR, -BitsPerWord);	// quick
	__ srav(FSR, T3, AT);
	__ b(done);
	__ delayed()->sra(SSR, T3, BitsPerWord-1);

	__ bind(normal);
	__ srav(SSR, T3, FSR);		// normal
	__ move(AT, BitsPerWord);
	__ sub(AT, AT, FSR);
	__ srlv(FSR, T2, FSR);
	__ sllv(AT, T3, AT);
	__ orr(FSR, FSR, AT);
	
	__ bind(done);
}

// used registers : T2, T3
void TemplateTable::lushr() {
	transition(itos, ltos);
	Label normal, done, notZero;
	__ pop_l(T2, T3);	
	__ andi(FSR, FSR, 0x3f);	// the shift count (only the low 6 bits are used)

	__ bne(FSR, ZERO, notZero);
	__ delayed()->nop();

	__ move(FSR, T2);		// zero shift must be handled specially
	__ b(done);
	__ delayed(); __ move(SSR, T3);

	__ bind(notZero);
	__ sltiu(AT, FSR, BitsPerWord);
	__ bne(AT, ZERO, normal);	// shift < BitsPerWord?
	__ delayed()->nop();

	__ addi(AT, FSR, - BitsPerWord);	// quick
	__ srlv(FSR, T3, AT);
	__ b(done);
	__ delayed(); __ move(SSR, ZERO);

	__ bind(normal);		// normal
	__ srlv(SSR, T3, FSR);
	__ move(AT, BitsPerWord);
	__ sub(AT, AT, FSR);
	__ srlv(FSR, T2, FSR);
	__ sllv(AT, T3, AT);
	__ orr(FSR, FSR, AT);

	__ bind(done);
}

// result in FSF
void TemplateTable::fop2(Operation op) {
	transition(ftos, ftos);
	__ pop_ftos_to_esp();  // pop ftos into esp
	switch (op) {
		case add:
			__ lwc1(FTF, at_sp());
			__ add_s(FSF, FTF, FSF);
			break;
		case sub: 
			__ lwc1(FTF, at_sp());
			__ sub_s(FSF, FTF, FSF);
			break;
		case mul: 
			__ lwc1(FTF, at_sp());
			__ mul_s(FSF, FTF, FSF);
			break;
		case div: 
			__ lwc1(FTF, at_sp());
			__ div_s(FSF, FTF, FSF);
			break;
		case rem: 
			__ mfc1(FSR, FSF);
			__ mtc1(FSR, F12);
			__ lwc1(FTF, at_sp());
			__ rem_s(FSF, FTF, F12, FSF);
			break;
		default : ShouldNotReachHere();
	}

	__ addi(SP, SP, 1 * wordSize);
}

// result in SSF||FSF
// we do not handle the strictfp flag
void TemplateTable::dop2(Operation op) {
	transition(dtos, dtos);
	__ pop_dtos_to_esp();  // pop dtos into esp
	switch (op) {
		case add: 
			__ lwc1(FTF, at_sp());
			__ lwc1(STF, at_sp_p1());
			__ add_d(FSF, FTF, FSF);
			break;
		case sub: 
			__ lwc1(FTF, at_sp());
			__ lwc1(STF, at_sp_p1());
			__ sub_d(FSF, FTF, FSF);
			break;
		case mul: 
			__ lwc1(FTF, at_sp());
			__ lwc1(STF, at_sp_p1());
			__ mul_d(FSF, FTF, FSF);
			break;
		case div:
			__ lwc1(FTF, at_sp());
			__ lwc1(STF, at_sp_p1());
			__ div_d(FSF, FTF, FSF);
			break;
		case rem:
			__ mfc1(FSR, FSF);
			__ mfc1(SSR, SSF);
			__ mtc1(FSR, F12);
			__ mtc1(SSR, F13);
			__ lwc1(FTF, at_sp());
			__ lwc1(STF, at_sp_p1());
			__ rem_d(FSF, FTF, F12, FSF);
			break;
		default : ShouldNotReachHere();
	}

	__ addi(SP, SP, 2 * wordSize);
}

void TemplateTable::ineg() {
	transition(itos, itos);
	__ neg(FSR);
}

void TemplateTable::lneg() {
	transition(ltos, ltos);
	__ nor(FSR, ZERO, FSR);
	__ addiu(FSR, FSR, 1);
	__ sltiu(AT, FSR, 1);
	__ nor(SSR, ZERO, SSR);
	__ addu(SSR, SSR, AT);
}
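
// Sketch: the 64-bit negation above is two's complement done in 32-bit halves:
//   lo = ~lo + 1;  hi = ~hi + (lo == 0 ? 1 : 0)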
/*
// Note: 'double' and 'long long' have 32-bits alignment on x86.
static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
  // Use the expression (adr)&(~0xF) to provide 128-bits aligned address
  // of 128-bits operands for SSE instructions.
  jlong *operand = (jlong*)(((intptr_t)adr)&((intptr_t)(~0xF)));
  // Store the value to a 128-bits operand.
  operand[0] = lo;
  operand[1] = hi;
  return operand;
}

// Buffer for 128-bits masks used by SSE instructions.
static jlong float_signflip_pool[2*2];
static jlong double_signflip_pool[2*2];
*/
void TemplateTable::fneg() {
	transition(ftos, ftos);
	__ neg_s(FSF, FSF);
}

void TemplateTable::dneg() {
	transition(dtos, dtos);
	__ neg_d(FSF, FSF);
}

// used registers : T2
void TemplateTable::iinc() {
	transition(vtos, vtos);
	locals_index(T2);
	__ lw(FSR, T2, 0);
	__ lb(AT, at_bcp(2));           // get constant
	__ addu(FSR, FSR, AT);
	__ sw(FSR, T2, 0);
}

// used register : T2
void TemplateTable::wide_iinc() {
	transition(vtos, vtos);
	locals_index_wide(T2);
	__ load_two_bytes_from_at_bcp(FSR, AT, 4);
	__ hswap(FSR);
	__ lw(AT, T2, 0);
	__ addu(FSR, AT, FSR);
	__ sw(FSR, T2, 0);
}

void TemplateTable::convert() {
	// Checking
#ifdef ASSERT
	{ TosState tos_in  = ilgl;
		TosState tos_out = ilgl;
		switch (bytecode()) {
			case Bytecodes::_i2l: // fall through
			case Bytecodes::_i2f: // fall through
			case Bytecodes::_i2d: // fall through
			case Bytecodes::_i2b: // fall through
			case Bytecodes::_i2c: // fall through
			case Bytecodes::_i2s: tos_in = itos; break;
			case Bytecodes::_l2i: // fall through
			case Bytecodes::_l2f: // fall through
			case Bytecodes::_l2d: tos_in = ltos; break;
			case Bytecodes::_f2i: // fall through
			case Bytecodes::_f2l: // fall through
			case Bytecodes::_f2d: tos_in = ftos; break;
			case Bytecodes::_d2i: // fall through
			case Bytecodes::_d2l: // fall through
			case Bytecodes::_d2f: tos_in = dtos; break;
			default             : ShouldNotReachHere();
		}
		switch (bytecode()) {
			case Bytecodes::_l2i: // fall through
			case Bytecodes::_f2i: // fall through
			case Bytecodes::_d2i: // fall through
			case Bytecodes::_i2b: // fall through
			case Bytecodes::_i2c: // fall through
			case Bytecodes::_i2s: tos_out = itos; break;
			case Bytecodes::_i2l: // fall through
			case Bytecodes::_f2l: // fall through
			case Bytecodes::_d2l: tos_out = ltos; break;
			case Bytecodes::_i2f: // fall through
			case Bytecodes::_l2f: // fall through
			case Bytecodes::_d2f: tos_out = ftos; break;
			case Bytecodes::_i2d: // fall through
			case Bytecodes::_l2d: // fall through
			case Bytecodes::_f2d: tos_out = dtos; break;
			default             : ShouldNotReachHere();
		}
		transition(tos_in, tos_out);
	}
#endif // ASSERT

	// Conversion
	// (Note: use pushl(ecx)/popl(ecx) for 1/2-word stack-ptr manipulation)
	switch (bytecode()) {
		case Bytecodes::_i2l:
			__ extend_sign(SSR, FSR);
			break;
		case Bytecodes::_i2f:
			__ mtc1(FSR, FSF);
			__ cvt_s_w(FSF, FSF);
			break;
		case Bytecodes::_i2d:
			__ mtc1(FSR, FSF);
			__ cvt_d_w(FSF, FSF);
			break;
		case Bytecodes::_i2b:
			__ shl(FSR, 24);
			__ sar(FSR, 24);
			break;
		case Bytecodes::_i2c:
			__ andi(FSR, FSR, 0xFFFF);  // truncate upper 16 bits
			break;
		case Bytecodes::_i2s:
			__ shl(FSR, 16);
			__ sar(FSR, 16);
			break;
		case Bytecodes::_l2i:
			/* nothing to do */
			break;
		case Bytecodes::_l2f:
			__ mtc1(FSR, FSF);
			__ mtc1(SSR, SSF);
			__ cvt_s_l(FSF, FSF);
			break;
		case Bytecodes::_l2d:
			__ mtc1(FSR, FSF);
			__ mtc1(SSR, SSF);
			__ cvt_d_l(FSF, FSF);
			break;
		case Bytecodes::_f2i:
			{
				Label L;
				__ c_un_s(FSF, FSF);		//NaN?
				__ bc1t(L);
				__ delayed();
				__ move(FSR, ZERO);

				__ trunc_w_s(FSF, FSF);
				__ mfc1(FSR, FSF);
				__ bind(L);
			}
			break;
		case Bytecodes::_f2l:
			{
				Label L;
				__ move(SSR, ZERO);
				__ c_un_s(FSF, FSF);		//NaN?
				__ bc1t(L);
				__ delayed();
				__ move(FSR, ZERO);

				__ trunc_l_s(FSF, FSF);
				__ mfc1(FSR, FSF);
				__ mfc1(SSR, SSF);
				__ bind(L);
			}
			break;
		case Bytecodes::_f2d:
			__ cvt_d_s(FSF, FSF);
			break;
		case Bytecodes::_d2i:
			{
				Label L;
				__ c_un_d(FSF, FSF);		//NaN?
				__ bc1t(L);
				__ delayed();
				__ move(FSR, ZERO);

				__ trunc_w_d(FSF, FSF);
				__ mfc1(FSR, FSF);
				__ bind(L);
			}
			break;
		case Bytecodes::_d2l:
			{
				Label L;
				__ move(SSR, ZERO);
				__ c_un_d(FSF, FSF);		//NaN?
				__ bc1t(L);
				__ delayed();
				__ move(FSR, ZERO);

				__ trunc_l_d(FSF, FSF);
				__ mfc1(FSR, FSF);
				__ mfc1(SSR, SSF);
				__ bind(L);
			}
			break;
		case Bytecodes::_d2f:
			__ cvt_s_d(FSF, FSF);
			break;
		default             :
			ShouldNotReachHere();
	}
}
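
// Note: the _f2i/_f2l/_d2i/_d2l cases above test for NaN (c.un) first and
// return 0 in that case, as the Java spec requires NaN to convert to zero;
// trunc.w/trunc.l alone would not guarantee that.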

void TemplateTable::lcmp() {
	transition(ltos, itos);

	Label low, high, done;
	__ lw(T3, SP, 4);
//	__ pop_l(T2, T3);
	__ slt(AT, T3, SSR);
	__ bne(AT, ZERO, low);
	__ delayed()->addi(SP, SP, 8);
//	__ delayed()->nop();

	__ slt(AT, SSR, T3);
	__ bne(AT, ZERO, high);
	__ delayed()->nop();

	__ lw(T2, SP, -8);
	__ sltu(AT, T2, FSR);
	__ bne(AT, ZERO, low);
	__ delayed();

	__ sltu(AT, FSR, T2);
	__ bne(AT, ZERO, high);
	__ delayed()->nop();

	__ b(done);
	__ delayed(); __ move(FSR, 0);

	__ bind(low);
	__ b(done);
	__ delayed(); __ move(FSR, -1);

	__ bind(high);
	__ b(done);
	__ delayed(); __ move(FSR, 1);

	__ bind(done);
}
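
// Sketch of lcmp above: compare the high words as signed integers first, and
// only when they are equal compare the low words as unsigned integers; the
// result left in FSR is -1, 0 or +1.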

void TemplateTable::float_cmp(bool is_float, int unordered_result) {
	Label less, done;

	__ move(FSR, ZERO);

	if (is_float) {
		__ pop_ftos_to_esp();
		__ lwc1(FTF, at_sp());
		__ c_eq_s(FTF, FSF);
		__ bc1t(done);
		__ delayed()->addi(SP, SP, 1 * wordSize);

		if (unordered_result<0)
			__ c_ult_s(FTF, FSF);
		else
			__ c_olt_s(FTF, FSF);
	} else {
		__ pop_dtos_to_esp();
		__ lwc1(FTF, at_sp());
		__ lwc1(STF, at_sp_p1());
		__ c_eq_d(FTF, FSF);
		__ bc1t(done);
		__ delayed()->addi(SP, SP, 2 * wordSize);

		if (unordered_result<0)
			__ c_ult_d(FTF, FSF);
		else
			__ c_olt_d(FTF, FSF);
	}
	__ bc1t(less);
	__ delayed()->nop();
	__ b(done);
	__ delayed(); __ move(FSR, 1);
	__ bind(less);
	__ move(FSR, -1);
	__ bind(done);
}
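
// float_cmp leaves -1, 0 or +1 in FSR. unordered_result selects the fcmpl/fcmpg
// behaviour: with unordered_result < 0 a NaN operand compares as "less" (c.ult),
// otherwise the ordered compare (c.olt) fails for NaN and the result falls
// through to +1.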


// used registers : T3, T4, T7
// FSR : return bci, this is defined by the vm specification
// T3 : method
// T4 : offset
// T7 : next bytecode, this is required by dispatch_base
void TemplateTable::branch(bool is_jsr, bool is_wide) {
	__ get_method(T3);
	__ profile_taken_branch(T4, T7);		// only C2 meaningful 

#ifndef CORE
	const ByteSize be_offset = methodOopDesc::backedge_counter_offset() 
		+ InvocationCounter::counter_offset();
	const ByteSize inv_offset = methodOopDesc::invocation_counter_offset() 
		+ InvocationCounter::counter_offset();
	const int method_offset = frame::interpreter_frame_method_offset * wordSize;
#endif // CORE

	// Load up T4 with the branch displacement
	if (!is_wide) {
		__ load_two_bytes_from_at_bcp(T4, AT, 1);
		__ hswap(T4);
	} else {
		__ lw(T4, at_bcp(1));
		__ swap(T4);
	}

	// Handle all the JSR stuff here, then exit.
	// It's much shorter and cleaner than intermingling with the
	// non-JSR normal-branch stuff occurring below.
	if (is_jsr) {
		// Pre-load the next target bytecode into T7
		__ add(AT, BCP, T4);
		__ lbu(T7, AT, 0);

		// compute return address as bci in FSR
		__ addi(FSR, BCP, (is_wide?5:3) - in_bytes(constMethodOopDesc::codes_offset()));
		__ lw(AT, T3, in_bytes(methodOopDesc::const_offset()));
		__ sub(FSR, FSR, AT);
		// Adjust the bcp in BCP by the displacement in T4
		__ add(BCP, BCP, T4);
		// jsr returns atos that is not an oop
		// __ dispatch_only_noverify(atos);
		// Push return address
		//   __ push_i(eax);
		__ push_i(FSR);
		// jsr returns vtos
		__ dispatch_only_noverify(vtos);

		return;
	}

	// Normal (non-jsr) branch handling

	// Adjust the bcp in S0 by the displacement in T4
	__ add(BCP, BCP, T4);

#ifdef CORE
	// Pre-load the next target bytecode into EBX
	__ lbu(T7, BCP, 0);
	// continue with the bytecode @ target
	__ dispatch_only(vtos);
#else
	assert(UseLoopCounter || !UseOnStackReplacement, "on-stack-replacement requires loop counters");
	Label backedge_counter_overflow;
	Label profile_method;
	Label dispatch;
	if (UseLoopCounter) {
		// increment backedge counter for backward branches
		// eax: MDO
		// ebx: MDO bumped taken-count
		// T3: method
		// T4: target offset
		// BCP: target bcp
		// LVP: locals pointer
		__ bgtz(T4, dispatch);	// check if forward or backward branch
		__ delayed()->nop();

		// increment back edge counter 
		__ lw(T0, T3, in_bytes(be_offset));
		__ increment(T0, InvocationCounter::count_increment);
		__ sw(T0, T3, in_bytes(be_offset));

		// load invocation counter
		__ lw(T1, T3, in_bytes(inv_offset));
		// buffer bit added, so no mask is needed
		// by yjl 10/24/2005
		//__ move(AT, InvocationCounter::count_mask_value);
		//__ andr(T1, T1, AT);

		// add backedge counter & invocation counter
		__ add(T1, T1, T0);

		if (ProfileInterpreter) {
			// Test to see if we should create a method data oop
			__ lui(AT, Assembler::split_high(int(&InvocationCounter::InterpreterProfileLimit)));
			__ lw(AT, AT, Assembler::split_low(int(&InvocationCounter::InterpreterProfileLimit)));
			__ slt(AT, T1, AT);
			__ bne(AT, ZERO, dispatch);
			__ delayed()->nop();

			// if no method data exists, go to profile method
			__ test_method_data_pointer(T1, profile_method);

			if (UseOnStackReplacement) {
				// check for overflow against ebx which is the MDO taken count
				__ lui(AT, Assembler::split_high(int(&InvocationCounter::InterpreterBackwardBranchLimit)));
				__ lw(AT, AT, Assembler::split_low(int(&InvocationCounter::InterpreterBackwardBranchLimit)));
				// the value in T7 comes from profile_taken_branch at the beginning
				__ slt(AT, T7, AT);
				__ bne(AT, ZERO, dispatch);
				__ delayed()->nop();

				// When ProfileInterpreter is on, the backedge_count comes
				// from the methodDataOop, whose value does not get reset on
				// the call to frequency_counter_overflow().
				// To avoid excessive calls to the overflow routine while 
				// the method is being compiled, add a second test to make 
				// sure the overflow function is called only once every 
				// overflow_frequency.
				const int overflow_frequency = 1024;
				__ andi(T7, T7, overflow_frequency-1);
				__ beq(T7, ZERO, backedge_counter_overflow);
				__ delayed()->nop();
			}
		} else {
			if (UseOnStackReplacement) {
				// check for overflow against eax, which is the sum of the counters
				__ lui(AT, Assembler::split_high(int(&InvocationCounter::InterpreterBackwardBranchLimit)));
				__ lw(AT, AT, Assembler::split_low(int(&InvocationCounter::InterpreterBackwardBranchLimit)));
				__ slt(AT, T1, AT);
				__ beq(AT, ZERO, backedge_counter_overflow);
				__ delayed()->nop();
			}
		}
		__ bind(dispatch);
	}

	// Pre-load the next target bytecode into T7
	__ lbu(T7, BCP, 0);

	// continue with the bytecode @ target
	// FSR: return bci for jsr's, unused otherwise
	// T7: target bytecode
	// BCP: target bcp
	__ dispatch_only(vtos);

	if (UseLoopCounter) {
		if (ProfileInterpreter) {
			// Out-of-line code to allocate method data oop.
			__ bind(profile_method);
			__ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method), BCP);
			__ lbu(T2, BCP, 0);
			__ lw(T3, FP, method_offset);
			__ lw(T3, T3, in_bytes(methodOopDesc::method_data_offset()));
			__ sw(T3, FP, frame::interpreter_frame_mdx_offset * wordSize);
			__ test_method_data_pointer(T3, dispatch);
			// offset non-null mdp by MDO::data_offset() + IR::profile_method()
			__ addi(T3, T3, in_bytes(methodDataOopDesc::data_offset()));
			__ add(T3, T3, T1);
			__ sw(T3, FP, frame::interpreter_frame_mdx_offset * wordSize);
			__ b(dispatch);
			__ delayed()->nop();
		}

		if (UseOnStackReplacement) {
			// invocation counter overflow
			__ bind(backedge_counter_overflow);
			__ sub(T4, BCP, T4);	// branch bcp
			call_VM(NOREG, CAST_FROM_FN_PTR(address, 
						InterpreterRuntime::frequency_counter_overflow), T4);
			__ lbu(T7, BCP, 0);

			// V0: osr nmethod (osr ok) or NULL (osr not possible)
			// V1: osr adapter frame return address
			// T7: target bytecode
			// LVP: locals pointer
			// BCP: bcp
			__ beq(V0, ZERO, dispatch);
			__ delayed()->nop();
			// nmethod may have been invalidated (VM may block upon call_VM return)
			__ lw(T3, V0, nmethod::entry_bci_offset());
			__ move(AT, InvalidOSREntryBci);
			__ beq(AT, T3, dispatch);
			__ delayed()->nop();
			// We need to prepare to execute the OSR method. First we must
			// migrate the locals and monitors off of the stack.
			//eax V0: osr nmethod (osr ok) or NULL (osr not possible)
			//ebx V1: osr adapter frame return address
			//edx  T7: target bytecode
			//edi  LVP: locals pointer
			//esi  BCP: bcp
			//__ movl(esi, eax);          // save the nmethod
			__ move(BCP, V0); 
			// const Register thread = ecx;
			const Register thread = T8;
			__ get_thread(thread);
			call_VM(noreg, CAST_FROM_FN_PTR(address, 
						SharedRuntime::OSR_migration_begin));
			// eax is OSR buffer, move it to expected parameter location
			//refer to osrBufferPointer in c1_LIRAssembler_mips.cpp	
			// __ movl(ecx, eax);
			__ move(T0, V0);

			// pop the interpreter frame
			//  __ movl(edx, Address(ebp, frame::interpreter_frame_sender_sp_offset 
			//  * wordSize)); // get sender sp
			__ lw(T8, Address(FP, 
				frame::interpreter_frame_sender_sp_offset * wordSize)); 
			//FIXME, shall we keep the return address on the stack?
			__ leave();                                // remove frame anchor
			// __ popl(edi);                          // get return address
			//__ addi(SP, SP, wordSize);               // get return address
			//__ pop(LVP);
			__ move(LVP, RA);
			// __ movl(esp, edx);                     // set sp to sender sp
			__ move(SP, T8);

			Label skip;
			Label chkint;

			// The interpreter frame we have removed may be returning to
			// either the call stub or the interpreter. Since we will
			// now be returning from a compiled (OSR) nmethod we must
			// adjust the return address to one where the caller can handle
			// compiled results and clean the fpu stack. This is very similar
			// to what an i2c adapter must do.

			// Are we returning to the call stub?
#if 0	
			// __ cmpl(edi, (int)StubRoutines::_call_stub_return_address);
			__ addi(AT, LVP, -(int)StubRoutines::_call_stub_return_address); 
			//  __ jcc(Assembler::notEqual, chkint);
			__ bne(AT, ZERO, chkint);
			__ delayed()->nop();      
			// yes adjust to the specialized call stub  return.
			// assert(StubRoutines::i486::get_call_stub_compiled_return() != NULL,
			// "must be set");
			assert(StubRoutines::gs2::get_call_stub_compiled_return() != NULL, 
					"must be set");
			// __ movl(edi, (intptr_t) StubRoutines::i486::get_call_stub_compiled_return());
			__ move(LVP, (intptr_t) StubRoutines::gs2::get_call_stub_compiled_return()); 
			//  __ jmp(skip);
			__ b(skip);
			__ delayed()->nop();
			__ bind(chkint);

			// Are we returning to the interpreter? Look for sentinel

			//__ cmpl(Address(edi, -8), Interpreter::return_sentinel);
			__ lw(AT, LVP , -8); 
			__ addi(AT, AT, -Interpreter::return_sentinel); 
			//__ jcc(Assembler::notEqual, skip);
			__ bne(AT, ZERO, skip);
			__ delayed()->nop(); 
			// Adjust to compiled return back to interpreter

			// __ movl(edi, Address(edi, -4));
			__ lw(LVP, LVP, -4); 

			__ bind(skip);
#endif
			// Align stack pointer for compiled code (note that caller is
			// responsible for undoing this fixup by remembering the old SP
			// in an FP-relative location)
			//  __ andl(esp, -(StackAlignmentInBytes));
			__ move(AT, -(StackAlignmentInBytes));
			__ andr(SP, SP, AT);
			// push the (possibly adjusted) return address
			//  __ pushl(edi);
			//__ push(LVP);
//			__ move(RA, LVP);	
			// and begin the OSR nmethod
			//  __ jmp(Address(esi, nmethod::osr_entry_point_offset()));
		//refer to osr_entry in c1_LIRAssembler_mips.cpp	
			__ lw(AT, BCP, nmethod::osr_entry_point_offset()); 
			__ jr(AT); 
			__ delayed()->nop(); 
		}
	}
#endif // not CORE
}

void TemplateTable::if_0cmp(Condition cc) {
	transition(itos, vtos);
	// assume branch is more often taken than not (loops use backward branches)
	Label not_taken;
	switch(cc) {
		case not_equal:
			__ beq(FSR, ZERO, not_taken);
			break;
		case equal:
			__ bne(FSR, ZERO, not_taken);
			break;
		case less:
			__ bgez(FSR, not_taken);
			break;
		case less_equal:
			__ bgtz(FSR, not_taken);
			break;
		case greater:
			__ blez(FSR, not_taken);
			break;
		case greater_equal:
			__ bltz(FSR, not_taken);
			break;
	}
	__ delayed()->nop();

	branch(false, false);

	__ bind(not_taken);
	__ profile_not_taken_branch(FSR);
}


void TemplateTable::if_icmp(Condition cc) {
	transition(itos, vtos);
	// assume branch is more often taken than not (loops use backward branches)
	Label not_taken;
	//__ lw(SSR, SP, 0);
	
	__ pop_i(SSR);	
	switch(cc) {
		case not_equal:
			__ beq(SSR, FSR, not_taken);
			break;
		case equal:
			__ bne(SSR, FSR, not_taken);
			break;
		case less:
			__ slt(AT, SSR, FSR);
			__ beq(AT, ZERO, not_taken);
			break;
		case less_equal:
			__ slt(AT, FSR, SSR);
			__ bne(AT, ZERO, not_taken);
			break;
		case greater:
			__ slt(AT, FSR, SSR);
			__ beq(AT, ZERO, not_taken);
			break;
		case greater_equal:
			__ slt(AT, SSR, FSR);
			__ bne(AT, ZERO, not_taken);
			break;
	}
	//	__ delayed()->addi(SP, SP, 1 * wordSize);
	__ delayed()->nop();

	branch(false, false);

	__ bind(not_taken);
	__ profile_not_taken_branch(FSR);
}


void TemplateTable::if_nullcmp(Condition cc) {
	transition(atos, vtos);
	// assume branch is more often taken than not (loops use backward branches)
	Label not_taken;
	switch(cc) {
		case not_equal:
			__ beq(FSR, ZERO, not_taken);
			break;
		case equal:
			__ bne(FSR, ZERO, not_taken);
			break;
		default:
			ShouldNotReachHere();
	}
	__ delayed()->nop();

	branch(false, false);

	__ bind(not_taken);
	__ profile_not_taken_branch(FSR);
}


void TemplateTable::if_acmp(Condition cc) {
	transition(atos, vtos);
	// assume branch is more often taken than not (loops use backward branches)
	Label not_taken;
	//	__ lw(SSR, SP, 0);
	__ pop_ptr(SSR);
	switch(cc) {
		case not_equal:
			__ beq(SSR, FSR, not_taken);
			break;
		case equal:
			__ bne(SSR, FSR, not_taken);
			break;
		default:
			ShouldNotReachHere();
	}
	//	__ delayed()->addi(SP, SP, 4);
	__ delayed()->nop();

	branch(false, false);

	__ bind(not_taken);
	__ profile_not_taken_branch(FSR);
}

// used registers : T1, T2, T3
// T1 : method
// T2 : return bci
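//
// A minimal sketch (not emitted code) of what ret computes; 'return_bci'
// is just an illustrative name for the value read from the local slot:
//
//   return_bci = locals[index];                                       // T2
//   BCP = method->constMethod() + return_bci
//           + constMethodOopDesc::codes_offset();
//   dispatch_next(vtos);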
void TemplateTable::ret() {
	transition(vtos, vtos);

	locals_index(T2);
	__ lw(T2, T2, 0);
	__ profile_ret(T2, T3);

	__ get_method(T1);
	__ lw(BCP, T1, in_bytes(methodOopDesc::const_offset()));
	__ add(BCP, BCP, T2);
	__ addi(BCP, BCP, in_bytes(constMethodOopDesc::codes_offset()));

	__ dispatch_next(vtos);
}

// used registers : T1, T2, T3
// T1 : method
// T2 : return bci
void TemplateTable::wide_ret() {
	transition(vtos, vtos);

	locals_index_wide(T2);
	__ lw(T2, T2, 0);                   // get return bci, compute return bcp
	__ profile_ret(T2, T3);

	__ get_method(T1);
	__ lw(BCP, T1, in_bytes(methodOopDesc::const_offset()));
	__ add(BCP, BCP, T2);
	__ addi(BCP, BCP, in_bytes(constMethodOopDesc::codes_offset()));

	__ dispatch_next(vtos);
}

// used register T2, T3, T4, T7
// T2 : bytecode pointer
// T3 : low
// T4 : high
// T7 : dest bytecode, required by dispatch_base
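//
// A hedged sketch of the tableswitch data walked below; T2 is BCP+1
// rounded up to a word boundary and all words are stored big-endian:
//
//   T2 + 0*wordSize : default offset
//   T2 + 1*wordSize : low
//   T2 + 2*wordSize : high
//   T2 + 3*wordSize : jump offsets[0 .. high-low]
//
// For an in-range key in FSR: target bcp = BCP + swap(offsets[FSR - low]).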
void TemplateTable::tableswitch() {
	Label default_case, continue_execution;
	transition(itos, vtos);

	// align BCP
	__ addi(T2, BCP, wordSize);
	__ move(AT, -wordSize);
	__ andr(T2, T2, AT);

	// load lo & hi
	__ lw(T3, T2, 1*wordSize);
	__ swap(T3);
	__ lw(T4, T2, 2*wordSize);
	__ swap(T4);

	// check against lo & hi
	__ slt(AT, FSR, T3);
	__ bne(AT, ZERO, default_case);
	__ delayed()->nop();

	__ slt(AT, T4, FSR);
	__ bne(AT, ZERO, default_case);
	__ delayed()->nop();

	// lookup dispatch offset into T4 (still big-endian)
	__ sub(FSR, FSR, T3);
	__ sll(AT, FSR, 2);
	__ add(AT, T2, AT);
	__ lw(T4, AT, 3*wordSize);
	__ profile_switch_case(FSR, T2, T3);

	__ bind(continue_execution);
	__ swap(T4);
	__ add(BCP, BCP, T4);
	__ lbu(T7, BCP, 0);
	__ dispatch_only(vtos);

	// handle default
	__ bind(default_case);
	__ profile_switch_default(FSR);
	__ lw(T4, T2, 0);
	__ b(continue_execution);
	__ delayed()->nop();
}

void TemplateTable::lookupswitch() {
	transition(itos, itos);
	__ stop("lookupswitch bytecode should have been rewritten");
}

// used registers : T2, T3, T4, T7
// T2 : bytecode pointer
// T3 : pair index
// T4 : offset
// T7 : dest bytecode
// the data after the opcode is the same as for lookupswitch
// see Rewriter::rewrite_method for more information
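//
// A hedged sketch of the lookupswitch data scanned below; T2 is BCP+1
// rounded up to a word boundary and all words are stored big-endian:
//
//   T2 + 0*wordSize         : default offset
//   T2 + 1*wordSize         : npairs
//   T2 + (2 + 2*i)*wordSize : match[i]
//   T2 + (3 + 2*i)*wordSize : offset[i]
//
// The key (FSR) is byte-swapped once up front so each match[i] can be
// compared without swapping it; pairs are scanned from the last one down.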
void TemplateTable::fast_linearswitch() {
	transition(itos, vtos);
	Label loop_entry, loop, found, continue_execution;  

	// byte-swap the key in FSR so we can avoid swapping the table entries
	__ swap(FSR);

	// align BCP
	__ addi(T2, BCP, wordSize);
	__ move(AT, -wordSize);
	__ andr(T2, T2, AT);

	// set counter
	__ lw(T3, T2, wordSize);
	__ swap(T3);
	__ b(loop_entry);
	__ delayed()->nop();

	// table search
	__ bind(loop);
	// get the entry value
	__ sll(AT, T3, 3);
	__ add(AT, T2, AT);
	__ lw(AT, AT, 2 * wordSize);

	// found?
	__ beq(FSR, AT, found);
	__ delayed()->nop();

	__ bind(loop_entry);
	__ bgtz(T3, loop);
	__ delayed()->addiu(T3, T3, -1);

	// default case
	__ profile_switch_default(FSR);
	__ lw(T4, T2, 0);
	__ b(continue_execution);
	__ delayed()->nop();

	// entry found -> get offset
	__ bind(found);
	__ sll(AT, T3, 3);
	__ add(AT, T2, AT);
	__ lw(T4, AT, 3 * wordSize);
	__ profile_switch_case(T3, FSR, T2);

	// continue execution
	__ bind(continue_execution);  
	__ swap(T4);
	__ add(BCP, BCP, T4);
	__ lbu(T7, BCP, 0);
	__ dispatch_only(vtos);
}

// used registers : T0, T1, T2, T3, T4, T7
// T2 : pairs address(array)
// T7 : dest bytecode
// the data after the opcode is the same as for lookupswitch
// see Rewriter::rewrite_method for more information
void TemplateTable::fast_binaryswitch() {
	transition(itos, vtos);
	// Implementation using the following core algorithm:
	//
	// int binary_search(int key, LookupswitchPair* array, int n) {
	//   // Binary search according to "Methodik des Programmierens" by
	//   // Edsger W. Dijkstra and W.H.J. Feijen, Addison Wesley Germany 1985.
	//   int i = 0;
	//   int j = n;
	//   while (i+1 < j) {
	//     // invariant P: 0 <= i < j <= n and (a[i] <= key < a[j] or Q)
	//     // with      Q: for all i: 0 <= i < n: key < a[i]
	//     // where a stands for the array and assuming that the (non-existing)
	//     // element a[n] is infinitely big.
	//     int h = (i + j) >> 1;
	//     // i < h < j
	//     if (key < array[h].fast_match()) {
	//       j = h;
	//     } else {
	//       i = h;
	//     }
	//   }
	//   // R: a[i] <= key < a[i+1] or Q
	//   // (i.e., if key is within array, i is the correct index)
	//   return i;
	// }

	// register allocation
	const Register array = T2;
	const Register i=T3, j=T4;
	const Register h=T1;
	const Register temp=T0;
	const Register key=FSR;

	// setup array
	__ addi(array, BCP, 3*wordSize);
	__ move(AT, -wordSize);
	__ andr(array, array, AT);

	// initialize i & j
	__ move(i, ZERO);
	__ lw(j, array, - 1 * wordSize);
	// Convert j into native byte ordering
	__ swap(j);

	// and start
	Label entry;
	__ b(entry);
	__ delayed()->nop();

	// binary search loop
	{ 
		Label loop;
		__ bind(loop);
		// int h = (i + j) >> 1;
		__ add(h, i, j);
		__ shr(h, 1);
		// if (key < array[h].fast_match()) {
		//   j = h;
		// } else {
		//   i = h;
		// }
		// Convert array[h].match to native byte-ordering before compare
		__ sll(AT, h, 3);
		__ add(AT, array, AT);
		__ lw(temp, AT, 0*wordSize);
		__ swap(temp);
		
		{
			Label set_i, end_of_if;
			__ slt(AT, key, temp);
			__ beq(AT, ZERO, set_i);
			__ delayed()->nop(); 

			__ b(end_of_if);
			__ delayed(); __ move(j, h);

			__ bind(set_i);
			__ move(i, h);

			__ bind(end_of_if);
		}
		// while (i+1 < j)
		__ bind(entry);
		__ addi(h, i, 1);
		__ slt(AT, h, j);
		__ bne(AT, ZERO, loop);
		__ delayed()->nop();
	}

	// end of binary search, result index is i (must check again!)
	Label default_case;
	// Convert array[i].match to native byte-ordering before compare
	__ sll(AT, i, 3);
	__ add(AT, array, AT);
	__ lw(temp, AT, 0 * wordSize);
	__ swap(temp);
	__ bne(key, temp, default_case);
	__ delayed()->nop();

	// entry found -> j = offset
	__ sll(AT, i, 3);
	__ add(AT, array, AT);
	__ lw(j, AT, 1 * wordSize);
	__ profile_switch_case(i, key, array);
	__ swap(j);

	__ add(BCP, BCP, j);
	__ lbu(T7, BCP, 0);
	__ dispatch_only(vtos);

	// default case -> j = default offset
	__ bind(default_case);
	__ profile_switch_default(i);
	__ lw(j, array, - 2 * wordSize);
	__ swap(j);
	__ add(BCP, BCP, j);
	__ lbu(T7, BCP, 0);
	__ dispatch_only(vtos);
}

void TemplateTable::_return(TosState state) {
	transition(state, state);
	assert(_desc->calls_vm(), "inconsistent calls_vm information"); // call in remove_activation
	if (_desc->bytecode() == Bytecodes::_return_register_finalizer) {
		assert(state == vtos, "only valid state");
		//__ movl(eax, aaddress(0));
		__ lw(T4, aaddress(0));
		//__ movl(edi, Address(eax, oopDesc::klass_offset_in_bytes()));
		__ lw(LVP, T4, oopDesc::klass_offset_in_bytes());
		//__ movl(edi, Address(edi, Klass::access_flags_offset_in_bytes() 
		//+ sizeof(oopDesc)));
		__ lw(LVP, LVP, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
		//__ testl(edi, JVM_ACC_HAS_FINALIZER);
		__ move(AT, JVM_ACC_HAS_FINALIZER); 
		__ andr(AT, AT, LVP);//by_css
		//__ andi(AT, LVP, JVM_ACC_HAS_FINALIZER); 
		Label skip_register_finalizer;
		//__ jcc(Assembler::zero, skip_register_finalizer);
		__ beq(AT, ZERO, skip_register_finalizer);
		__ delayed()->nop(); 
		//__ call_VM(noreg, CAST_FROM_FN_PTR(address, 
		//InterpreterRuntime::register_finalizer), eax);
		__ call_VM(noreg, CAST_FROM_FN_PTR(address, 
					InterpreterRuntime::register_finalizer), T4);
		__ bind(skip_register_finalizer);
	}
	__ remove_activation(state, T9);

	__ jr(T9);
	__ delayed()->nop();
}

// ----------------------------------------------------------------------------
// Volatile variables demand their effects be made known to all CPU's
// in order.  Store buffers on most chips allow reads & writes to
// reorder; the JMM's ReadAfterWrite.java test fails in -Xint mode
// without some kind of memory barrier (i.e., it's not sufficient that
// the interpreter does not reorder volatile references, the hardware
// also must not reorder them).
//
// According to the new Java Memory Model (JMM):
// (1) All volatiles are serialized wrt to each other.  ALSO reads &
//     writes act as acquire & release, so:
// (2) A read cannot let unrelated NON-volatile memory refs that
//     happen after the read float up to before the read.  It's OK for
//     non-volatile memory refs that happen before the volatile read to
//     float down below it.
// (3) Similarly, a volatile write cannot let unrelated NON-volatile
//     memory refs that happen BEFORE the write float down to after the
//     write.  It's OK for non-volatile memory refs that happen after the
//     volatile write to float up before it.
//
// We only put in barriers around volatile refs (they are expensive),
// not _between_ memory refs (that would require us to track the
// flavor of the previous memory refs).  Requirements (2) and (3)
// require some barriers before volatile stores and after volatile
// loads.  These nearly cover requirement (1) but miss the
// volatile-store-volatile-load case.  This final case is placed after
// volatile-stores although it could just as well go before
// volatile-loads.
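//
// A hedged sketch of what this file actually emits (volatile_barrier()
// below expands to a full MIPS 'sync'): the barrier is currently placed
// only after volatile stores, e.g. in putfield_or_static/fast_storefield:
//
//   store value to field;
//   if (field is volatile)  sync;   // also covers the store-load case above
//
// Volatile loads in getfield_or_static do not emit a barrier at present
// (the call there is commented out).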
//void TemplateTable::volatile_barrier(Assembler::Membar_mask_bits
//                                     order_constraint) {
void TemplateTable::volatile_barrier( ) {
  // Helper function to insert an is-volatile test and memory barrier
  //if (os::is_MP()) { // Not needed on single CPU
  //  __ membar(order_constraint);
  //}
	if( !os::is_MP() ) return;	// Not needed on single CPU
	__ sync();
}

// We don't shift the index left by 2 bits in get_cache_and_index_at_bcp,
// because we always need to scale the index before using it. A
// ConstantPoolCacheEntry is 16 bytes long and index is the index into
// constantPoolCacheOopDesc, so cache + base_offset() + index * 16 is
// the address of the corresponding ConstantPoolCacheEntry.
// used registers : T2
// NOTE : the returned index must still be shifted left by 4 to form the address!
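//
// A minimal sketch (not emitted code) of the resolution check performed
// below; 'entry' is just an illustrative name:
//
//   entry    = cache + in_bytes(constantPoolCacheOopDesc::base_offset()) + (index << 4);
//   indices  = entry->indices();
//   resolved = (indices >> ((1 + byte_no) * BitsPerByte)) & 0xff;
//   if (resolved != bytecode())  call InterpreterRuntime::resolve_get_put / resolve_invoke;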
void TemplateTable::resolve_cache_and_index(int byte_no,
                                            Register Rcache,
                                            Register index) {
	assert(byte_no == 1 || byte_no == 2, "byte_no out of range");

	Register temp = T2;

	assert_different_registers(Rcache, index, temp);

	const int shift_count = (1 + byte_no)*BitsPerByte;
	Label resolved;
	__ get_cache_and_index_at_bcp(Rcache, index, 1);

	// is resolved?
	__ sll(AT, index, 4); 
	__ add(AT, Rcache, AT);
	__ lw(AT, AT, in_bytes(constantPoolCacheOopDesc::base_offset() 
				+ ConstantPoolCacheEntry::indices_offset()));
	__ shr(AT, shift_count);
	__ andi(AT, AT, 0xff);
	__ addi(AT, AT, - bytecode());
	__ beq(AT, ZERO, resolved);
	__ delayed()->nop();
	// resolve first time through
	address entry;
	switch (bytecode()) {
		case Bytecodes::_getstatic      : // fall through
		case Bytecodes::_putstatic      : // fall through
		case Bytecodes::_getfield       : // fall through
		case Bytecodes::_putfield       : 
			entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_get_put); 
			break;
		case Bytecodes::_invokevirtual  : // fall through
		case Bytecodes::_invokespecial  : // fall through
		case Bytecodes::_invokestatic   : // fall through
		case Bytecodes::_invokeinterface: 
			entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_invoke);  
			break;
		default                      		: 
			ShouldNotReachHere();
	}
	
	__ move(A1, (int)bytecode());
	__ call_VM(NOREG, entry, A1);

	// Update registers with resolved info
	__ get_cache_and_index_at_bcp(Rcache, index, 1);
	__ bind(resolved);
}

// The Rcache and index registers must be set before call
void TemplateTable::load_field_cp_cache_entry(Register obj,
                                              Register cache,
                                              Register index,
                                              Register off,
                                              Register flags,
                                              bool is_static = false) {
	assert_different_registers(cache, index, flags, off);
	ByteSize cp_base_offset = constantPoolCacheOopDesc::base_offset();
	// Field offset
	__ shl(index, 4);
	__ add(index, cache, index);
	__ lw(off, index, in_bytes(cp_base_offset + ConstantPoolCacheEntry::f2_offset()));
	// Flags    
	__ lw(flags, index, in_bytes(cp_base_offset + ConstantPoolCacheEntry::flags_offset()));

	// klass (overwrites the obj register; only needed for static fields)
	if (is_static) {
		__ lw(obj, index, in_bytes(cp_base_offset + 
					ConstantPoolCacheEntry::f1_offset())); 
		__ verify_oop(obj);	
	}
}

// get the method, itable_index and flags of the current invoke
void TemplateTable::load_invoke_cp_cache_entry(int byte_no,
                                               Register method,
                                               Register itable_index,
                                               Register flags,
                                               bool is_invokevirtual,
                                               bool is_invokevfinal /*unused*/) {
	// setup registers
	///const Register cache = ecx;
	///const Register index = edx;
	const Register cache = T3;
	const Register index = T4;
	assert_different_registers(method, flags);
	assert_different_registers(method, cache, index);
	assert_different_registers(itable_index, flags);
	assert_different_registers(itable_index, cache, index);
	// determine constant pool cache field offsets
	const int method_offset = in_bytes(
			constantPoolCacheOopDesc::base_offset() +
			(is_invokevirtual
			 ? ConstantPoolCacheEntry::f2_offset()
			 : ConstantPoolCacheEntry::f1_offset()
			)
			);
	const int flags_offset = in_bytes(constantPoolCacheOopDesc::base_offset() +
			ConstantPoolCacheEntry::flags_offset());
	// access constant pool cache fields
	const int index_offset = in_bytes(constantPoolCacheOopDesc::base_offset() +
			ConstantPoolCacheEntry::f2_offset());
 
	resolve_cache_and_index(byte_no, cache, index);

	assert(wordSize == 4, "adjust code below");
	// note we shift by 4, not 2, because what we have is the true index
	// of the ConstantPoolCacheEntry, not the 2-bit-shifted index used by the x86 version
	__ sll(AT, index, 4);
	__ add(AT, cache, AT);
	__ lw(method, AT, method_offset);


	if (itable_index != NOREG) {
		//__ sll(AT, index, 4);
		//__ addu(AT, cache, AT);
		__ lw(itable_index, AT, index_offset);
	}
	__ lw(flags, AT, flags_offset);
}


// The registers cache and index expected to be set before call.
// Correct values of the cache and index registers are preserved.
void TemplateTable::jvmti_post_field_access(Register cache, Register index,
                                            bool is_static, bool has_tos) {
  // do the JVMTI work here to avoid disturbing the register state below
  // We use c_rarg registers here because we want to use the register used in
  // the call to the VM
	if (JvmtiExport::can_post_field_access()) {
		// Check to see if a field access watch has been set before we take
		// the time to call into the VM.
		Label L1;
		assert_different_registers(cache, index, FSR);
	//	__ movl(eax, Address((int)JvmtiExport::get_field_access_count_addr(), relocInfo::none));
		__ lui(AT, Assembler::split_high((int)JvmtiExport::get_field_access_count_addr()));
		__ lw(FSR, AT, Assembler::split_low((int)JvmtiExport::get_field_access_count_addr()));
		//	__ testl(eax,eax);
		//	__ beq(T3, ZERO, L1);
		__ beq(FSR, ZERO, L1);
		__ delayed()->nop();

		// We rely on the bytecode being resolved and the cpCache entry filled in.
		// cache entry pointer
		__ addi(cache, cache, in_bytes(constantPoolCacheOopDesc::base_offset()));
		__ shl(index, 4);
		__ add(cache, cache, index);
		if (is_static) {
			__ move(FSR, ZERO);
		} else {
			__ lw(FSR, SP, 0);
			__ verify_oop(FSR);
		}
		// FSR: object pointer or NULL
		// cache: cache entry pointer
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
					InterpreterRuntime::post_field_access), FSR, cache);
		__ get_cache_and_index_at_bcp(cache, index, 1);
		__ bind(L1);
	} 
}

void TemplateTable::pop_and_check_object(Register r) {
  __ pop_ptr(r);
  __ null_check(r);  // for field access must check obj.
  __ verify_oop(r);
}

// used registers : T1, T2, T3, T4
// T1 : flags
// T2 : off
// T3 : obj
// T4 : field address
// The flag bits 31, 30, 29, 28 together build a 4-bit number 0 to 8 with the
// following mapping to the TosState states:
// btos: 0
// ctos: 1
// stos: 2
// itos: 3
// ltos: 4
// ftos: 5
// dtos: 6
// atos: 7
// vtos: 8
// see ConstantPoolCacheEntry::set_field for more info
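//
// A minimal sketch (not emitted code) of the decode performed below:
//
//   tos_state  = (flags >> ConstantPoolCacheEntry::tosBits) & 0xf;  // 0..8 as listed above
//   field_addr = obj + off;                                         // T4 = T3 + T2
//
// The chain of compares that follows simply branches on tos_state and
// loads the field with the matching width.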
void TemplateTable::getfield_or_static(int byte_no, bool is_static) {
  transition(vtos, vtos);

	// const Register cache = ecx;
	const Register cache = T3;
	// const Register index = edx;
	const Register index = T4;

	const Register obj   = T3;
	const Register off   = T2;
	const Register flags = T1;
	resolve_cache_and_index(byte_no, cache, index);
	jvmti_post_field_access(cache, index, is_static, false);
	load_field_cp_cache_entry(obj, cache, index, off, flags, is_static);

	if (!is_static) pop_and_check_object(obj);
	__ add(T4, obj, off);


	Label Done, notByte, notInt, notShort, notChar, notLong, notFloat, notObj, notDouble;

	assert(btos == 0, "change code, btos != 0");
	__ shr(flags, ConstantPoolCacheEntry::tosBits);
	__ andi(flags, flags, 0xf);
	__ bne(flags, ZERO, notByte);
	__ delayed()->nop();

	// btos
	__ lb(FSR, T4, 0);	
	__ sw(FSR, SP, - wordSize);

	// Rewrite bytecode to be faster
	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_bgetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - wordSize);

	__ bind(notByte);
	__ move(AT, itos);
	__ bne(T1, AT, notInt);
	__ delayed()->nop();

	// itos
	__ lw(FSR, T4, 0);
	__ sw(FSR, SP, - wordSize);

	// Rewrite bytecode to be faster
	if (!is_static) {
		// patch_bytecode(Bytecodes::_fast_igetfield, T3, T2);
		patch_bytecode(Bytecodes::_fast_igetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - wordSize);

	__ bind(notInt);
	__ move(AT, atos);
	__ bne(T1, AT, notObj);
	__ delayed()->nop();

	// atos
	__ lw(FSR, T4, 0);
	__ sw(FSR, SP, - wordSize);

	if (!is_static) {
		//patch_bytecode(Bytecodes::_fast_agetfield, T3, T2);
		patch_bytecode(Bytecodes::_fast_agetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - wordSize);

	__ bind(notObj);
	__ move(AT, ctos);
	__ bne(T1, AT, notChar);
	__ delayed()->nop();

	// ctos
	__ lhu(FSR, T4, 0);
	__ sw(FSR, SP, - wordSize);

	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_cgetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - wordSize);

	__ bind(notChar);
	__ move(AT, stos);
	__ bne(T1, AT, notShort);
	__ delayed()->nop();

	// stos
	__ lh(FSR, T4, 0);
	__ sw(FSR, SP, - wordSize);

	if (!is_static) {
		// patch_bytecode(Bytecodes::_fast_sgetfield, T3, T2);
		patch_bytecode(Bytecodes::_fast_sgetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - wordSize);

	__ bind(notShort);
	__ move(AT, ltos);
	__ bne(T1, AT, notLong);
	__ delayed()->nop();

	// FIXME : the load/store should be atomic, we have no simple method to do this in mips32
	// ltos
	__ lw(FSR, T4, 0*wordSize);
	__ lw(SSR, T4, 1*wordSize);
	__ sw(FSR, SP, - 2*wordSize);
	__ sw(SSR, SP, - 1*wordSize);

	// Don't rewrite to _fast_lgetfield for potential volatile case.
	__ b(Done);
	__ delayed()->addi(SP, SP, - 2 * wordSize);

	__ bind(notLong);
	__ move(AT, ftos);
	__ bne(T1, AT, notFloat);
	__ delayed()->nop();

	// ftos
	__ lwc1(FSF, T4, 0);
	__ swc1(FSF, SP, - wordSize);

	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_fgetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - wordSize);

	__ bind(notFloat);
	__ move(AT, dtos);
	__ bne(T1, AT, notDouble);
	__ delayed()->nop();

	// dtos
	__ lwc1(FSF, T4, 0 * wordSize);
	__ lwc1(SSF, T4, 1 * wordSize);
	__ swc1(FSF, SP, - 2 * wordSize);
	__ swc1(SSF, SP, - 1 * wordSize);

	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_dgetfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->addi(SP, SP, - 2 * wordSize);

	__ bind(notDouble);

	__ stop("Bad state");

	__ bind(Done);
	// Doug Lea believes this is not needed with current Sparcs (TSO) and Intel (PSO).
 	//volatile_barrier( );
      //jerome_for_debug 
  /*      __ nop(); 
        __ nop(); 
        __ nop(); 
        __ nop(); 
        __ nop(); 
        __ nop(); 
        __ nop(); 
        __ nop(); 
        __ nop(); 
	__ verify_oop(FSR);*/
}

void TemplateTable::getfield(int byte_no) {
	getfield_or_static(byte_no, false);
}

void TemplateTable::getstatic(int byte_no) {
	getfield_or_static(byte_no, true);
}
/*
// used registers : T1, T2, T3, T4
// T1 : cache & cp entry
// T2 : obj
// T3 : flags & value pointer
// T4 : index
// see ConstantPoolCacheEntry::set_field for more info
void TemplateTable::jvmti_post_field_mod(int byte_no, bool is_static) {
 */

// The registers cache and index expected to be set before call.
// The function may destroy various registers, just not the cache and index registers.
void TemplateTable::jvmti_post_field_mod(Register cache, Register index, bool is_static) {
	ByteSize cp_base_offset = constantPoolCacheOopDesc::base_offset();

	if (JvmtiExport::can_post_field_modification()) {
		// Check to see if a field modification watch has been set before we take
		// the time to call into the VM.
		Label L1;
		assert_different_registers(cache, index, T3);

		__ lui(AT, Assembler::split_high((int)JvmtiExport::get_field_modification_count_addr()));
		__ lw(FSR, AT, Assembler::split_low((int)JvmtiExport::get_field_modification_count_addr()));
		__ beq(FSR, ZERO, L1);
		__ delayed()->nop();

		/* // We rely on the bytecode being resolved and the cpCache entry filled in.
		   resolve_cache_and_index(byte_no, T1, T4);
		   */
		// The cache and index registers have been already set.
		// This allows to eliminate this call but the cache and index
		// registers have to be correspondingly used after this line.
		// __ get_cache_and_index_at_bcp(eax, edx, 1);
		__ get_cache_and_index_at_bcp(T1, T4, 1);

		if (is_static) {
			__ move(T2, ZERO);
		} else {
			// Life is harder. The stack holds the value on top, 
			// followed by the object.
			// We don't know the size of the value, though; 
			// it could be one or two words
			// depending on its type. As a result, we must find 
			// the type to determine where the object is.
			Label two_word, valsize_known;
			__ sll(AT, T4, 4); 
			__ add(AT, T1, AT);
			__ lw(T3, AT, in_bytes(cp_base_offset 
						+ ConstantPoolCacheEntry::flags_offset()));
			__ move(T2, SP);
			__ shr(T3, ConstantPoolCacheEntry::tosBits);

			// Make sure we don't need to mask T3 for tosBits
			// after the above shift
			ConstantPoolCacheEntry::verify_tosBits();
			__ move(AT, ltos);
			__ beq(T3, AT, two_word);
			__ delayed()->nop();
			__ move(AT, dtos);
			__ beq(T3, AT, two_word);
			__ delayed()->nop();
			__ b(valsize_known);
			//__ delayed()->addi(T2, T2, wordSize*1);
			__ delayed()->addi(T2, T2,Interpreter::expr_offset_in_bytes(1) );

			__ bind(two_word);
			//	__ addi(T2, T2, wordSize*2);
			__ addi(T2, T2,Interpreter::expr_offset_in_bytes(2));

			__ bind(valsize_known);
			// setup object pointer
			__ lw(T2, T2, 0*wordSize);
		}
		// cache entry pointer
		__ addi(T1, T1, in_bytes(cp_base_offset));
		__ shl(T4, 4); 
		__ addu(T1, T1, T4);
		// object (tos)
		__ move(T3, SP);
		// T2: object pointer set up above (NULL if static)
		// T1: cache entry pointer
		// T3: jvalue object on the stack
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
				InterpreterRuntime::post_field_modification), T2, T1, T3);
		__ get_cache_and_index_at_bcp(cache, index, 1);
		__ bind(L1);
	}
}

// used registers : T1, T2, T3, T4
// T1 : flags
// T2 : off
// T3 : obj
// T4 : volatile bit
// see ConstantPoolCacheEntry::set_field for more info
void TemplateTable::putfield_or_static(int byte_no, bool is_static) {
	transition(vtos, vtos);

	const Register cache = T3;
	const Register index = T4;
	const Register obj   = T3;
	const Register off   = T2;
	const Register flags = T1;

	resolve_cache_and_index(byte_no, cache, index);
	jvmti_post_field_mod(cache, index, is_static);
	load_field_cp_cache_entry(obj, cache, index, off, flags, is_static);
	// Doug Lea believes this is not needed with current Sparcs (TSO) and Intel (PSO).
	// volatile_barrier( );

	Label notVolatile, Done;
	__ move(AT, 1<<ConstantPoolCacheEntry::volatileField);
	__ andr(T4, T1, AT);

	Label notByte, notInt, notShort, notChar, notLong, notFloat, notObj, notDouble;

	assert(btos == 0, "change code, btos != 0");
	// btos
	__ shr(T1, ConstantPoolCacheEntry::tosBits);
	__ andi(T1, T1, 0xf);
	__ bne(T1, ZERO, notByte);
	__ delayed()->nop();

	__ pop(btos);
	if (!is_static) {
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	__ sb(FSR, AT, 0);

	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_bputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();

	__ bind(notByte);
	// itos
	__ move(AT, itos);
	__ bne(T1, AT, notInt);
	__ delayed()->nop();

	__ pop(itos);
	if (!is_static) {
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	__ sw(FSR, AT, 0);

	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_iputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();  
	__ bind(notInt);
	// atos
	__ move(AT, atos);
	__ bne(T1, AT, notObj);
	__ delayed()->nop();

	__ pop(atos);
	if (!is_static) {
		pop_and_check_object(T3); 
	}

	__ add(AT, T3, T2);
	__ sw(FSR, AT, 0);
	__ store_check(T3);

	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_aputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();
	__ bind(notObj);
	// ctos
	__ move(AT, ctos);
	__ bne(T1, AT, notChar);
	__ delayed()->nop();

//	__ lhu(FSR, SP, 0);
//	__ addi(SP, SP, wordSize);
	__ pop(ctos);
	if (!is_static) {
		//		__ lw(T3, SP, addent);
		//		addent += 1 * wordSize;
		//		__ verify_oop(T3);
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	__ sh(FSR, AT, 0);
	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_cputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();
	__ bind(notChar);
	// stos
	__ move(AT, stos);
	__ bne(T1, AT, notShort);
	__ delayed()->nop();

//	__ lh(FSR, SP, 0);
//	__ addi(SP, SP, wordSize);
	__ pop(stos);
	if (!is_static) {
		//		__ lw(T3, SP, addent);
		//		addent += 1 * wordSize;
		//		__ verify_oop(T3);
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	__ sh(FSR, AT, 0);
	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_sputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();
	__ bind(notShort);
	// ltos
	__ move(AT, ltos);
	__ bne(T1, AT, notLong);
	__ delayed()->nop();

	// FIXME: there is no simple way to load/store 64-bit data in an atomic operation
	// we just ignore the volatile flag.
	//Label notVolatileLong;
	//__ beq(T4, ZERO, notVolatileLong);
	//__ delayed()->nop();

	//addent = 2 * wordSize;
	// no need
	//__ lw(FSR, SP, 0);
	//__ lw(SSR, SP, 1 * wordSize);
	//if (!is_static) {
	//	__ lw(T3, SP, addent);
	//	addent += 1 * wordSize;
	//	__ verify_oop(T3);
	//}

	//__ addu(AT, T3, T2);

	// Replace with real volatile test
	// NOTE : we assume that sdc1&ldc1 operate in 32-bit, this is true for Godson2 even in 64-bit kernel
	// last modified by yjl 7/12/2005
	//__ ldc1(FSF, SP, 0); 
	//__ sdc1(FSF, AT, 0);
	//volatile_barrier();

	// Don't rewrite volatile version
	//__ b(notVolatile);
	//__ delayed()->addiu(SP, SP, addent);

	//__ bind(notVolatileLong);

	//__ pop(ltos);  // overwrites edx
//	__ lw(FSR, SP, 0 * wordSize);
//	__ lw(SSR, SP, 1 * wordSize);
//	__ addi(SP, SP, 2*wordSize);
	__ pop(ltos);
	if (!is_static) {
		//		__ lw(T3, SP, addent);
		//		addent += 1 * wordSize;
		//		__ verify_oop(T3);
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	__ sw(FSR, AT, 0);
	__ sw(SSR, AT, 4);
	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_lputfield, T3, T2);
	}
	__ b(notVolatile);
	__ delayed()->nop();

	__ bind(notLong);
	// ftos
	__ move(AT, ftos);
	__ bne(T1, AT, notFloat);
	__ delayed()->nop();

//	__ lwc1(FSF, SP, 0);
//	__ addi(SP, SP, wordSize);
	__ pop(ftos);
	if (!is_static) {
		//		__ lw(T3, SP, addent);
		//		addent += 1 * wordSize;
		//		__ verify_oop(T3);
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	__ swc1(FSF, AT, 0);
	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_fputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();
	__ bind(notFloat);
	// dtos
	__ move(AT, dtos);
	__ bne(T1, AT, notDouble);
	__ delayed()->nop();

	//__ ldc1(FSF, SP, 0);
//	__ lwc1(FSF, SP, 0);
//	__ lwc1(SSF, SP, wordSize);
//	__ addi(SP, SP, 2*wordSize);
	__ pop(dtos);
	if (!is_static) {
		//		__ lw(T3, SP, addent);
		//		addent += 1 * wordSize;
		//		__ verify_oop(T3);
		pop_and_check_object(T3); 
	}
	__ add(AT, T3, T2);
	//__ sdc1(F12, AT, 0);
	__ swc1(FSF, AT, 0);
	__ swc1(SSF, AT, wordSize);
	if (!is_static) {
		patch_bytecode(Bytecodes::_fast_dputfield, T3, T2);
	}
	__ b(Done);
	__ delayed()->nop();
	__ bind(notDouble);

	__ stop("Bad state");

	__ bind(Done);

	// Check for volatile store
	__ beq(T4, ZERO, notVolatile);
	__ delayed()->nop();
	volatile_barrier( );
	__ bind(notVolatile);
}

void TemplateTable::putfield(int byte_no) {
	putfield_or_static(byte_no, false);
}

void TemplateTable::putstatic(int byte_no) {
	putfield_or_static(byte_no, true);
}

// used registers : T1, T2, T3
// T1 : cp_entry
// T2 : obj
// T3 : value pointer
void TemplateTable::jvmti_post_fast_field_mod() {
	if (JvmtiExport::can_post_field_modification()) {
		// Check to see if a field modification watch has been set before we take
		// the time to call into the VM.
		Label L2;
		__ lui(AT, Assembler::split_high((int)JvmtiExport::get_field_modification_count_addr()));
		__ lw(T3, AT, Assembler::split_low((int)JvmtiExport::get_field_modification_count_addr()));
		__ beq(T3, ZERO, L2);
		__ delayed()->nop();
		//__ pop(T2);
		__ pop_ptr(T2);
		//__ lw(T2, SP, 0);
		__ verify_oop(T2);
		__ push_ptr(T2);	
		__ addiu(SP, SP, -sizeof(jvalue));
		__ move(T3, SP);
		//__ push(T2);
		//__ move(T2, ZERO);

		switch (bytecode()) {          // load values into the jvalue object
			case Bytecodes::_fast_bputfield: 
				__ sb(FSR, SP, 0); 

				break;
			case Bytecodes::_fast_sputfield: 
				__ sh(FSR, SP, 0);
				break;
			case Bytecodes::_fast_cputfield: 
				__ sh(FSR, SP, 0);
				break;
			case Bytecodes::_fast_iputfield: 
				__ sw(FSR, SP, 0);
				break;							 
			case Bytecodes::_fast_lputfield: 
				__ sw(FSR, SP, 0);
				__ sw(SSR, SP, 4);
				break;
			case Bytecodes::_fast_fputfield: 
				__ swc1(FSF, SP, 0);
				break;
			case Bytecodes::_fast_dputfield: 
				__ swc1(FSF, SP, 0);
				__ swc1(SSF, SP, 4);
				break;
			case Bytecodes::_fast_aputfield: 
				__ sw(FSR, SP, 0);
				break;
			default:  ShouldNotReachHere();
		}

		//__ pop(T2);  // restore copy of object pointer

		// Save FSR and sometimes SSR because call_VM() will clobber them,
		// then use them for JVMTI purposes
		__ push(FSR);
		if (bytecode() == Bytecodes::_fast_lputfield) __ push(SSR);
		// access constant pool cache entry
		__ get_cache_entry_pointer_at_bcp(T1, T4, 1);
		// no need, verified ahead
		__ verify_oop(T2);

		// T2: object pointer copied above
		// T1: cache entry pointer
		// T3: jvalue object on the stack
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
					InterpreterRuntime::post_field_modification), T2, T1, T3);
		if (bytecode() == Bytecodes::_fast_lputfield) __ pop(SSR);  // restore high value
		//__ pop(FSR);     // restore lower value   
		//__ addi(SP, SP, sizeof(jvalue));  // release jvalue object space
		__ lw(FSR, SP, 0);
		__ addiu(SP, SP, sizeof(jvalue) + 1 * wordSize);
		__ bind(L2);
	}
}

// used registers : T2, T3, T4
// T2 : index & off & field address
// T3 : cache & obj
// T4 : flags
void TemplateTable::fast_storefield(TosState state) {
	transition(state, vtos);

	ByteSize base = constantPoolCacheOopDesc::base_offset();

	jvmti_post_fast_field_mod();

	// access constant pool cache
	__ get_cache_and_index_at_bcp(T3, T2, 1);

	// load the flags into T4 so we can test for a volatile store
	__ sll(AT, T2, 4); 
	__ add(AT, T3, AT);
	__ lw(T4, AT, in_bytes(base + ConstantPoolCacheEntry::flags_offset()));

	// replace index with field offset from cache entry
	__ lw(T2, AT, in_bytes(base + ConstantPoolCacheEntry::f2_offset()));

	// Doug Lea believes this is not needed with current Sparcs (TSO) and Intel (PSO).
	// volatile_barrier( );

	Label notVolatile, Done;
	// Check for volatile store
	__ move(AT, 1<<ConstantPoolCacheEntry::volatileField);
	__ andr(AT, T4, AT);
	__ beq(AT, ZERO, notVolatile);
	__ delayed()->nop();


	// Get object from stack
	// NOTE : the value is already in FSR/FSF at this point
	//	__ pop(T3);
	//	__ verify_oop(T3);
	pop_and_check_object(T3);
	// field addresses
	__ add(T2, T3, T2);

	// access field
	switch (bytecode()) {
		case Bytecodes::_fast_bputfield: 
			__ sb(FSR, T2, 0);
			break;
		case Bytecodes::_fast_sputfield: // fall through
		case Bytecodes::_fast_cputfield: 
			__ sh(FSR, T2, 0);
			break;
		case Bytecodes::_fast_iputfield: 
			__ sw(FSR, T2, 0);
			break;
		case Bytecodes::_fast_lputfield: 
			__ sw(FSR, T2, 0 * wordSize);
			__ sw(SSR, T2, 1 * wordSize);
			break;
		case Bytecodes::_fast_fputfield: 
			__ swc1(FSF, T2, 0);
			break;
		case Bytecodes::_fast_dputfield: 
			__ swc1(FSF, T2, 0 * wordSize);
			__ swc1(SSF, T2, 1 * wordSize);
			break;
		case Bytecodes::_fast_aputfield: 
			__ sw(FSR, T2, 0);
			__ store_check(T3);
			break;
		default:
			ShouldNotReachHere();
	}

	Label done;
	volatile_barrier( );
	__ b(done);
	__ delayed()->nop();

	// Same code as above, but without the volatile test.
	__ bind(notVolatile);

	// Get object from stack
	//	__ pop(T3);
	//	__ verify_oop(T3);
	pop_and_check_object(T3);
	//get the field address
	__ add(T2, T3, T2);

	// access field
	switch (bytecode()) {
		case Bytecodes::_fast_bputfield: 
			__ sb(FSR, T2, 0); 
			break;
		case Bytecodes::_fast_sputfield: // fall through
		case Bytecodes::_fast_cputfield: 
			__ sh(FSR, T2, 0);
			break;
		case Bytecodes::_fast_iputfield: 
			__ sw(FSR, T2, 0);
			break;
		case Bytecodes::_fast_lputfield: 
			__ sw(FSR, T2, 0 * wordSize);
			__ sw(SSR, T2, 1 * wordSize);
			break;
		case Bytecodes::_fast_fputfield: 
			__ swc1(FSF, T2, 0);
			break;
		case Bytecodes::_fast_dputfield: 
			__ swc1(FSF, T2, 0 * wordSize);
			__ swc1(SSF, T2, 1 * wordSize);
			break;
		case Bytecodes::_fast_aputfield: 
			__ sw(FSR, T2, 0);
			__ store_check(T3);
			break;
		default:
			ShouldNotReachHere();
	}
	__ bind(done);
}

// used registers : T2, T3, T4
// T3 : cp_entry & cache
// T2 : index & offset
void TemplateTable::fast_accessfield(TosState state) {
	transition(atos, state);

	// do the JVMTI work here to avoid disturbing the register state below
	if (JvmtiExport::can_post_field_access()) {
		// Check to see if a field access watch has been set before we take
		// the time to call into the VM.
		Label L1;
		__ lui(AT, Assembler::split_high((int)JvmtiExport::get_field_access_count_addr()));
		__ lw(T3, AT, Assembler::split_low((int)JvmtiExport::get_field_access_count_addr()));
		__ beq(T3, ZERO, L1);
		__ delayed()->nop();
		// access constant pool cache entry
		__ get_cache_entry_pointer_at_bcp(T3, T4, 1);
		__ move(TSR, FSR);
		__ verify_oop(FSR);
		// FSR: object pointer copied above
		// T3: cache entry pointer
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_field_access),
				FSR, T3);
		__ move(FSR, TSR);
		__ bind(L1);
	}

	// access constant pool cache
	__ get_cache_and_index_at_bcp(T3, T2, 1);
	// replace index with field offset from cache entry
	__ sll(AT, T2, 4);
	__ add(AT, T3, AT);
	__ lw(T2, AT, in_bytes(constantPoolCacheOopDesc::base_offset() 
				+ ConstantPoolCacheEntry::f2_offset()));

	// FSR: object
	__ verify_oop(FSR);
	// __ null_check(FSR, 0);
	__ null_check(FSR);
	// field addresses
	__ add(FSR, FSR, T2);

	// access field
	switch (bytecode()) {
		case Bytecodes::_fast_bgetfield: 
			__ lb(FSR, FSR, 0);
			break;
		case Bytecodes::_fast_sgetfield: 
			__ lh(FSR, FSR, 0);
			break;
		case Bytecodes::_fast_cgetfield: 
			__ lhu(FSR, FSR, 0);
			break;
		case Bytecodes::_fast_igetfield: 
			__ lw(FSR, FSR, 0);
			break;
		case Bytecodes::_fast_lgetfield: 
			__ stop("should not be rewritten");  
			break;
		case Bytecodes::_fast_fgetfield: 
			__ lwc1(FSF, FSR, 0);
			break;
		case Bytecodes::_fast_dgetfield: 
			__ lwc1(FSF, FSR, 0);
			__ lwc1(SSF, FSR, 4);
			break;
		case Bytecodes::_fast_agetfield: 
			__ lw(FSR, FSR, 0);
			__ verify_oop(FSR);
			break;
		default:
			ShouldNotReachHere();
	}

	// Doug Lea believes this is not needed with current Sparcs(TSO) and Intel(PSO)
	// volatile_barrier( );
}

// generator for _fast_iaccess_0, _fast_aaccess_0, _fast_faccess_0
// used registers : T1, T2, T3, T4
// T1 : obj & field address
// T2 : off
// T3 : cache
// T4 : index
void TemplateTable::fast_xaccess(TosState state) {
	transition(vtos, state);
	// get receiver
	__ lw(T1, aaddress(0));
	debug_only(__ verify_local_tag(frame::TagReference, 0));
	// access constant pool cache
	__ get_cache_and_index_at_bcp(T3, T4, 2);
	__ sll(AT, T4, 4);
	__ add(AT, T3, AT);
	__ lw(T2, AT, in_bytes(constantPoolCacheOopDesc::base_offset() 
				+ ConstantPoolCacheEntry::f2_offset()));

	// make sure exception is reported in correct bcp range (getfield is next instruction)
	__ addi(BCP, BCP, 1);
	//	__ null_check(T1, 0);
	__ null_check(T1);
	__ add(T1, T1, T2);

	if (state == itos) {
		__ lw(FSR, T1, 0);
	} else if (state == atos) {
		__ lw(FSR, T1, 0);
		__ verify_oop(FSR);
	} else if (state == ftos) {
		__ lwc1(FSF, T1, 0);
	} else {
		ShouldNotReachHere();
	}
	__ addi(BCP, BCP, -1);
}

//---------------------------------------------------
//-------------------------------------------------
// Calls

void TemplateTable::count_calls(Register method, Register temp) {  
	// implemented elsewhere
	ShouldNotReachHere();
}

// method, index, recv, flags: T1, T2, T3, T4
// byte_no = 2 for _invokevirtual, 1 otherwise
// T0 : return address
// Get the method & index of the invoke, and load the return address of
// the invoke (the first word in the frame); this address is where the
// return code jumps to.
// NOTE : this method will set T3 & T4 to recv & flags
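//
// A rough sketch (hedged) of how the receiver is located below; the
// parameter size is encoded in the low byte of flags:
//
//   param_words = flags & 0xff;
//   recv = *(SP + (param_words << Interpreter::stackElementScale())
//               - Interpreter::expr_offset_in_bytes(1));
//
// i.e. the receiver is the bottom-most argument on the expression stack.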
void TemplateTable::prepare_invoke(Register method, Register index, 
		                  int byte_no, Bytecodes::Code code) {
	// determine flags
	const bool is_invokeinterface  = code == Bytecodes::_invokeinterface;
	const bool is_invokevirtual    = code == Bytecodes::_invokevirtual;
	const bool is_invokespecial    = code == Bytecodes::_invokespecial;
	const bool load_receiver       = code != Bytecodes::_invokestatic;
	const bool receiver_null_check = is_invokespecial;
	// const bool save_flags = is_invokeinterface || is_invokevirtual;
	// setup registers & access constant pool cache
	const Register recv   = T3;
	const Register flags  = T4;

	assert_different_registers(method, index, recv, flags);

	// save 'interpreter return address'
	__ save_bcp();

	load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual);

	// load receiver if needed (note: no return address pushed yet)
	if (load_receiver) {
		__ andi(AT, flags, 0xff);
		__ shl(AT, Interpreter::stackElementScale());
		__ add(AT, SP, AT);
		//__ move(T8, AT);	
		__ lw(recv, AT, - Interpreter::expr_offset_in_bytes(1));
                __ verify_oop(recv);	
	}
/*	
	if (load_receiver) {
	Label mmm;
	__ move(AT, 0xf0000000);	
	__ andr(AT, AT, recv);	
	__ srl(AT, AT, 28);	
	__ addi(AT, AT, -1);	
	__ bne(AT, ZERO, mmm);	
	__ delayed()->nop();	
//	__ move(AT,  (int)&jerome6);	
//	__ lw(AT, AT, 0);	
//	__ beq(AT, ZERO, mmm);	
//	__ delayed()->nop();	
	__ move(AT, (int)&jerome1 );
	__ sw(SP, AT, 0); 	
	__ move(AT, (int)&jerome2 );
	__ sw(FP, AT, 0); 	
	__ move(AT, (int)&jerome3 );
	__ sw(BCP, AT, 0); 	
	__ move(AT, (int)&jerome4 );
	__ sw(recv, AT, 0); 	
	__ move(AT, (int)&jerome5 );
	__ sw(V0, AT, 0); 	


	__ move(AT, (int)&jerome6 );
	__ lw(flags, T8, -4);	
	__ sw(flags , AT, 0);
	__ move(AT, (int)&jerome7 );
	__ lw(flags, T8, 0);	
	__ sw(flags , AT, 0);
	
	__ move(AT, (int)&jerome8 );
	__ lw(flags, T8, 4);	
	__ sw(flags , AT, 0);
	
	__ move(AT, (int)&jerome9 );
	__ lw(flags, recv, oopDesc::klass_offset_in_bytes());
	__ sw(flags , AT, 0);
	__ move(AT, (int)&jerome10 );
	__ lbu(flags, BCP, -1);	
	__ sw(flags , AT, 0);


	__ move(AT, (int)&jerome5 );
	__ lw(flags, AT, 0); 	


	__ pushad();
//	__ enter();
	__ call(CAST_FROM_FN_PTR(address, SharedRuntime::print_call_statistics), 
				relocInfo::runtime_call_type);
	__ delayed()->nop();
//	__ leave();
	__ popad();
      
	__ bind(mmm);
	}
*/	
	// do null check if needed
	if (receiver_null_check) {
		__ null_check(recv);
	}
        //FIXME, why not save flags here?
	// compute return type
	__ srl(T0, flags, ConstantPoolCacheEntry::tosBits);

	// Make sure we don't need to mask flags for tosBits after the above shift
	ConstantPoolCacheEntry::verify_tosBits();
	// load return address
	{ 
		const int table =
			is_invokeinterface
			? (int)Interpreter::return_5_addrs_by_index_table()
			: (int)Interpreter::return_3_addrs_by_index_table();
		__ lui(AT, Assembler::split_high(table));
		__ shl(T0, 2);
		__ add(AT, AT, T0);
		__ lw(RA, AT, Assembler::split_low(table));
	}

  // push return address, see generate_fixed_frame for more info
//		__ push(T0);
}

// used registers : T0, T3, T4, T7, T9
// T9 : entry
// T3 : recv; the convention for these two registers is set by prepare_invoke
// T4 : flags, klass
// T7 : method; the index must be in T7
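//
// A hedged sketch of the dispatch below: vfinal calls use the methodOop
// stored directly in the cache entry, otherwise the vtable is indexed:
//
//   klass  = recv->klass();                                          // T4
//   method = *(klass + instanceKlass::vtable_start_offset()*wordSize
//                    + index * vtableEntry::size()*wordSize
//                    + vtableEntry::method_offset_in_bytes());
//   jump_from_interpreted(method);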
void TemplateTable::invokevirtual_helper(Register index, Register recv,
		Register flags) {

	assert_different_registers(index, recv, T1, T4);

	// Test for an invoke of a final method
	Label notFinal;
	__ move(AT, (1 << ConstantPoolCacheEntry::vfinalMethod));
	__ andr(AT, flags, AT);
	__ beq(AT, ZERO, notFinal);
	__ delayed()->nop();

	Register method = index;  // method must be T7
	assert(method == T7, "methodOop must be T7 for interpreter calling convention");

	// do the call - the index is actually the method to call:
	// for a vfinal call the index field holds the methodOop itself,
	// see ConstantPoolCacheEntry::set_method for more info

	__ verify_oop(method);

	// It's final, need a null check here!
	__ null_check(recv);

	// profile this call
	__ profile_final_call(T0);
	__ move(T0, recv);
	__ jump_from_interpreted(method, T4);

	__ bind(notFinal);

	// get receiver klass
	__ null_check(recv, oopDesc::klass_offset_in_bytes());
	// keep recv intact; the callee expects the receiver (it is moved to T0 below)
	__ lw(T4, recv, oopDesc::klass_offset_in_bytes());
	__ verify_oop(T4);
	// profile this call
	__ profile_virtual_call(T1, T0, T4);

	// get target methodOop & entry point
	const int base = instanceKlass::vtable_start_offset() * wordSize;    
	assert(vtableEntry::size() * wordSize == 4, "adjust the scaling in the code below");
	__ sll(AT, index, 2);
	__ add(AT, T4, AT);
	// this is an unaligned read
	__ lw(method, AT, base + vtableEntry::method_offset_in_bytes());
	__ move(T0, recv);
	__ jump_from_interpreted(method, T4);
	
}

void TemplateTable::invokevirtual(int byte_no) {
	transition(vtos, vtos);
	prepare_invoke(T7, NOREG, byte_no, bytecode());
	// now recv & flags in T3, T4

	invokevirtual_helper(T7, T3, T4);
}

// used registers : T9, T7
// T9 : entry
// T7 : method
void TemplateTable::invokespecial(int byte_no) {
	transition(vtos, vtos);
	// prepare_invoke(method, index, byte_no, bytecode());
	prepare_invoke(T7, NOREG, byte_no, bytecode());
	// do the call
	// now recv & flags in T3, T4
	__ verify_oop(T7);
	__ profile_call(T9);
	__ jump_from_interpreted(T7, T9);
	__ move(T0, T3);
}

void TemplateTable::invokestatic(int byte_no) {
	transition(vtos, vtos);
	prepare_invoke(T7, NOREG, byte_no, bytecode());
	__ verify_oop(T7);
	__ profile_call(T9);
	__ jump_from_interpreted(T7, T9);
}

// nothing to do here for now; left for a future change. FIXME.
void TemplateTable::fast_invokevfinal(int byte_no) {
	transition(vtos, vtos);
	__ stop("fast_invokevfinal not used on x86");
}

// used registers : T0, T1, T2, T3, T4, T7
// T0 : itable, vtable, entry
// T1 : interface
// T3 : receiver
// T4 : flags, klass
// T7 : index, method, this is required by interpreter_entry
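//
// A rough pseudocode sketch of the itable walk performed below, in the
// style of the binary_search comment earlier in this file (names are
// illustrative only):
//
//   entry = klass + vtable_start + vtable_length*4, rounded up to a long;
//   while (true) {
//     if (entry->interface() == NULL)  throw IncompatibleClassChangeError;
//     if (entry->interface() == resolved_interface) break;
//     entry += itableOffsetEntry::size() * wordSize;
//   }
//   method = *(klass + entry->offset() + itable_index*4);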
void TemplateTable::invokeinterface(int byte_no) {
	transition(vtos, vtos);
	//this method will use T1-T4 and T0
	prepare_invoke(T1, T7, byte_no, bytecode());
	// T1: interface
	// T7: index
	// T3: receiver
	// T4: flags
	Label notMethod;
	__ move(AT, (1 << ConstantPoolCacheEntry::methodInterface));
	__ andr(AT, T4, AT);
	__ beq(AT, ZERO, notMethod);
	__ delayed()->nop();

	// Special case of invokeinterface called for virtual method of
	// java.lang.Object.  See cpCacheOop.cpp for details.
	// This code isn't produced by javac, but could be produced by
	// another compliant java compiler.
	invokevirtual_helper(T7, T3, T4);

	__ bind(notMethod);
	// Get receiver klass into T4 - also a null check
	__ lw(T4, T3, oopDesc::klass_offset_in_bytes());
	__ verify_oop(T4);

	// profile this call
	__ profile_virtual_call(T4, T0, FSR);

	// Compute start of first itableOffsetEntry (which is at the end of the vtable)
	const int base = instanceKlass::vtable_start_offset() * wordSize;    
	assert(vtableEntry::size() * wordSize == 4, "adjust the scaling in the code below");
	__ lw(AT, T4, instanceKlass::vtable_length_offset() * wordSize); 
	__ shl(AT, 2);
	__ add(T0, T4, AT);
	__ addi(T0, T0, base);
	if (HeapWordsPerLong > 1) {
		// Round up to align_object_offset boundary
		__ round_to(T0, BytesPerLong);
	}
	// now T0 is the begin of the itable

	Label entry, search, interface_ok;

	///__ jmp(entry);   
	__ b(entry);
	__ delayed()->nop();

	__ bind(search);
	__ increment(T0, itableOffsetEntry::size() * wordSize);

	__ bind(entry);

	// Check that the entry is non-null.  A null entry means that the receiver
	// class doesn't implement the interface, and wasn't the same as the
	// receiver class checked when the interface was resolved.
	__ lw(AT, T0, itableOffsetEntry::interface_offset_in_bytes());
	__ bne(AT, ZERO, interface_ok);
	__ delayed()->nop();
	// throw exception
	// the call_VM checks for exception, so we should never return here.

	//__ pop();//FIXME here,			
	// pop return address (pushed by prepare_invoke). 
	// no need now, we just save the value in RA now

	__ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_IncompatibleClassChangeError));
	__ should_not_reach_here();

	__ bind(interface_ok);
	// NOTE: unlike the x86 version, there is no pop here
	//__ lw(AT, T0, itableOffsetEntry::interface_offset_in_bytes());
	__ bne(AT, T1, search);
	__ delayed()->nop();

	// now we get vtable of the interface
	__ lw(T0, T0, itableOffsetEntry::offset_offset_in_bytes());
	__ addu(T0, T4, T0);
	assert(itableMethodEntry::size() * wordSize == 4, "adjust the scaling in the code below");
	__ sll(AT, T7, 2);
	__ addu(AT, T0, AT);
	// now we get the method
	__ lw(T7, AT, 0);
	// T7: methodOop to call
	// T3: receiver
	// Check for abstract method error
	// Note: This should be done more efficiently via a throw_abstract_method_error
	//       interpreter entry point and a conditional jump to it in case of a null
	//       method.
	{ 
		Label L;
		///__ testl(ebx, ebx);
		///__ jcc(Assembler::notZero, L);
		__ bne(T7, ZERO, L);
		__ delayed()->nop();
		// throw exception
		// note: must restore interpreter registers to canonical
		//       state for exception handling to work correctly!
		///__ popl(ebx);          // pop return address (pushed by prepare_invoke)
		//__ restore_bcp();      // esi must be correct for exception handler   
		//(was destroyed)
		//__ restore_locals();   // make sure locals pointer 
		//is correct as well (was destroyed)
		///__ call_VM(noreg, CAST_FROM_FN_PTR(address, 
		//InterpreterRuntime::throw_AbstractMethodError));
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_AbstractMethodError));
		// the call_VM checks for exception, so we should never return here.
		__ should_not_reach_here();
		__ bind(L);
	}
	__ jump_from_interpreted(T7, T9);
}

//----------------------------------------------------------------------------------------------------
// Allocation
// T1 : tags & buffer end & thread
// T2 : object end
// T3 : klass
// T4 : object size
// A1 : cpool
// A2 : cp index
// return object in FSR
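//
// A hedged sketch of the fast allocation path below (the cmpxchg macro is
// assumed to flag success in AT):
//
//   obj = thread->tlab_top;  end = obj + instance_size;
//   if (end <= thread->tlab_end) {
//     thread->tlab_top = end;                        // TLAB bump
//   } else if (inline contiguous allocation is allowed) {
//     retry: obj = *heap_top;  end = obj + instance_size;
//            if (end > *heap_end) goto slow_case;
//            if (!cmpxchg(heap_top: obj -> end)) goto retry;
//   } else {
//     goto slow_case;                                // InterpreterRuntime::_new
//   }
//   zero the fields, then install the mark word and klass pointer.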
void TemplateTable::_new() {
	transition(vtos, atos);
	__ load_two_bytes_from_at_bcp(A2, AT, 1);
	__ huswap(A2);

	Label slow_case;
	Label done;
	Label initialize_header;
	Label initialize_object;  // including clearing the fields
	Label allocate_shared;

	// get instanceKlass in T3
	__ get_cpool_and_tags(A1, T1);
	__ sll(AT, A2, 2);
	__ add(AT, A1, AT);
	__ lw(T3, AT, sizeof(constantPoolOopDesc));

	// make sure the class we're about to instantiate has been resolved. 
	// Note: slow_case does a pop of stack, which is why we loaded class/pushed above
	const int tags_offset = typeArrayOopDesc::header_size(T_BYTE) * wordSize;
	__ add(T1, T1, A2);
	__ lb(AT, T1, tags_offset);
	//__ addiu(AT, AT, - (int)JVM_CONSTANT_UnresolvedClass);
	__ addiu(AT, AT, - (int)JVM_CONSTANT_Class);
	//__ beq(AT, ZERO, slow_case);
	__ bne(AT, ZERO, slow_case);
	__ delayed()->nop();

	/*make sure klass is initialized & doesn't have finalizer*/

	// make sure klass is fully initialized
	__ lw(T1, T3, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc));
	__ addiu(AT, T1, - (int)instanceKlass::fully_initialized);
	__ bne(AT, ZERO, slow_case);
	__ delayed()->nop();

	// has_finalizer
	//__ lw(T1, T3, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
	//__ move(AT, JVM_ACC_CAN_BE_FASTPATH_ALLOCATED);
	//__ andr(AT, T1, AT);
	//FIXME need confirmation and test. aoqi
	__ lw(T1, T3, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc));
	__ andi(AT, T1, Klass::_lh_instance_slow_path_bit);
	__ bne(AT, ZERO, slow_case);
	__ delayed()->nop();

	// get instance_size in instanceKlass (already aligned) in T4, 
	// be sure to preserve this value 
	//__ lw(T4, T3, Klass::size_helper_offset_in_bytes() + sizeof(oopDesc));
	//Klass::_size_helper is renamed Klass::_layout_helper. aoqi 
	__ lw(T4, T3, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc));

	// 
	// Allocate the instance
	// 1) Try to allocate in the TLAB
	// 2) if fail and the object is large allocate in the shared Eden
	// 3) if the above fails (or is not applicable), go to a slow case
	// (creates a new TLAB, etc.)

	const bool allow_shared_alloc =
		Universe::heap()->supports_inline_contig_alloc() && !CMSIncrementalMode;

	if (UseTLAB) {
#ifndef OPT_THREAD
		const Register thread = T1;
		__ get_thread(thread);
#else
		const Register thread = TREG;
#endif
		// get tlab_top
		__ lw(FSR, thread, in_bytes(JavaThread::tlab_top_offset()));
		__ add(T2, FSR, T4);
		// get tlab_end
		__ lw(AT, thread, in_bytes(JavaThread::tlab_end_offset()));
		__ slt(AT, AT, T2);
		//		__ bne(AT, ZERO, allocate_shared);
		__ bne(AT, ZERO, allow_shared_alloc ? allocate_shared : slow_case);
		__ delayed()->nop();
		__ sw(T2, thread, in_bytes(JavaThread::tlab_top_offset()));

		if (ZeroTLAB) {
			// the fields have been already cleared
			__ b(initialize_header);
		} else {
			// initialize both the header and fields
			__ b(initialize_object);
		}
		__ delayed()->nop();
		/*
		if (CMSIncrementalMode) {
			// No allocation in shared eden.
			//__ jmp(slow_case);
			__ b(slow_case);
			__ delayed()->nop();
		}
		*/
	}

	// Allocation in the shared Eden, if allowed
	// T4 : instance size in bytes
	if (allow_shared_alloc) {
		__ bind(allocate_shared);
		Label retry;
		Address heap_top(T1, Assembler::split_low((int)Universe::heap()->top_addr()));
		__ lui(T1, Assembler::split_high((int)Universe::heap()->top_addr()));

		__ lw(FSR, heap_top);
		__ bind(retry);
		__ add(T2, FSR, T4);
		__ lui(AT, Assembler::split_high((int)Universe::heap()->end_addr()));
		__ lw(AT, AT, Assembler::split_low((int)Universe::heap()->end_addr()));
		__ slt(AT, AT, T2);
		__ bne(AT, ZERO, slow_case);
		__ delayed()->nop();

		// Compare FSR with the current heap top and, if still equal, store the new
		// top (T2) through the top-address pointer. cmpxchg leaves AT == 0 when the
		// exchange failed because another thread updated the top first.
		//
		// FSR: object begin
		// T2: object end
		// T4: instance size in bytes

		// if someone beat us to the allocation, try again, otherwise continue
		//__ lui(T1, Assembler::split_high((int)Universe::heap()->top_addr()));
		__ cmpxchg(T2, heap_top, FSR);
		__ beq(AT, ZERO, retry);
		__ delayed()->nop();
	}

	if (UseTLAB || Universe::heap()->supports_inline_contig_alloc()) {
		// The object fields are initialized before the header.  If the object
		// (body) size is zero, go directly to the header initialization.
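		// Here T4 still holds the total object size in bytes (layout helper
		// value); stripping the header leaves the number of field bytes to clear.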
		__ bind(initialize_object);
		__ addiu(T4, T4, - sizeof(oopDesc));
		__ beq(T4, ZERO, initialize_header);
		__ delayed()->nop();


		// T4 must be a multiple of 2
#ifdef ASSERT
		// make sure T4 is a multiple of 2
		Label L;
		__ andi(AT, T4, 1);
		__ beq(AT, ZERO, L);
		__ delayed()->nop();
		__ stop("object size is not multiple of 2 - adjust this code");
		__ bind(L);
		// T4 must be > 0, no extra check needed here
#endif

		// initialize remaining object fields: T4 is a multiple of 2
		{ 
			Label loop;
			__ add(T1, FSR, T4);
			__ addi(T1, T1, -8);

			__ bind(loop);
			__ sw(ZERO, T1, sizeof(oopDesc) + 0*oopSize);
			__ sw(ZERO, T1, sizeof(oopDesc) + 1*oopSize);
			__ bne(T1, FSR, loop); // don't clear the header
			__ delayed()->addi(T1, T1, -8);
			// since sizeof(oopDesc) == 8, the decrement fits in the delay slot
			// and T1 can be compared against FSR directly
		}
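		// In effect, the loop above zeroes the object body two 4-byte words per
		// iteration, working from the end of the object down to the first word
		// after the 8-byte header, which is left untouched.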
		// klass in T3
		// initialize object header only.
		__ bind(initialize_header);
		if (UseBiasedLocking) {
			// __ popl(ecx);   // get saved klass back in the register.
			// __ movl(ebx, Address(ecx, Klass::prototype_header_offset_in_bytes() 
			// + klassOopDesc::klass_part_offset_in_bytes()));
			__ lw(AT, T3, Klass::prototype_header_offset_in_bytes()
					+ klassOopDesc::klass_part_offset_in_bytes());
			// __ movl(Address(eax, oopDesc::mark_offset_in_bytes ()), ebx);
			__ sw(AT, FSR, oopDesc::mark_offset_in_bytes());
		} else {
			__ move(AT, (int)markOopDesc::prototype());
			__ sw(AT, FSR, oopDesc::mark_offset_in_bytes());
		}

		__ sw(T3, FSR, oopDesc::klass_offset_in_bytes());

		{
			SkipIfEqual skip_if(_masm, &DTraceAllocProbes, 0);
			// Trigger dtrace event for fastpath
			__ push(atos);
			__ call_VM_leaf(
				CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_object_alloc), FSR);
			__ pop(atos);
		}
		__ b(done);
		__ delayed()->nop();
	}	
	// slow case
	__ bind(slow_case);
	// call_VM(result, InterpreterRuntime::_new, cpool, index)
	call_VM(FSR, CAST_FROM_FN_PTR(address, InterpreterRuntime::_new), A1, A2);

	// continue
	__ bind(done);
}

void TemplateTable::newarray() {
	transition(itos, atos);
	__ lbu(A1, at_bcp(1));
	//type, count
	call_VM(FSR, CAST_FROM_FN_PTR(address, InterpreterRuntime::newarray), A1, FSR);
}

void TemplateTable::anewarray() {
	transition(itos, atos);
	__ load_two_bytes_from_at_bcp(A2, AT, 1);
	__ huswap(A2);
	__ get_constant_pool(A1);
	// cp, index, count
	call_VM(FSR, CAST_FROM_FN_PTR(address, InterpreterRuntime::anewarray), A1, A2, FSR);
}

void TemplateTable::arraylength() {
	transition(atos, itos);
	__ null_check(FSR, arrayOopDesc::length_offset_in_bytes());
	__ lw(FSR, FSR, arrayOopDesc::length_offset_in_bytes());
}

// We use T2 as ebx, T3 as ecx, T4 as edx.
// When invoking gen_subtype_check, the superklass is in T4, the subklass in T2,
// and the object is always in FSR.
// T2 : sub klass
// T3 : cpool
// T4 : super klass
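//
// Roughly: if (obj != NULL && !obj->klass->is_subtype_of(resolved_klass))
//            throw ClassCastException;
//          a null reference always passes the check.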
void TemplateTable::checkcast() {
	transition(atos, atos);
	Label done, is_null, ok_is_subtype, quicked, resolved;
	__ beq(FSR, ZERO, is_null);
	__ delayed()->nop();

	__ profile_checkcast(false, T3);

	// Get cpool & tags index
	__ get_cpool_and_tags(T3, T4);
	__ load_two_bytes_from_at_bcp(T2, AT, 1);
	__ huswap(T2);

	// See if bytecode has already been quicked
	__ add(AT, T4, T2);
	__ lb(AT, AT, typeArrayOopDesc::header_size(T_BYTE) * wordSize);
	__ addiu(AT, AT, - (int)JVM_CONSTANT_Class);
	__ beq(AT, ZERO, quicked);
	__ delayed()->nop();

	__ move(TSR, FSR);	// call_VM blows (clobbers) FSR
	call_VM(T4, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc));
	__ b(resolved);
	__ delayed();	__ move(FSR, TSR);

	// klass already in cp, get superklass in T4
	__ bind(quicked);
	__ sll(AT, T2, 2);
	__ add(AT, T3, AT);
	__ lw(T4, AT, sizeof(constantPoolOopDesc));

	__ bind(resolved);

	// get subklass in T2
	__ lw(T2, FSR, oopDesc::klass_offset_in_bytes());
	// Superklass in T4.  Subklass in T2.
	__ gen_subtype_check(T4, T2, ok_is_subtype);

	// Come here on failure
	// object is at FSR
	__ jmp(Interpreter::_throw_ClassCastException_entry);
	__ delayed()->nop();

	// Come here on success
	__ bind(ok_is_subtype);

	// Collect counts on whether this check-cast sees NULLs a lot or not.
	if (ProfileInterpreter) {
		__ b(done);
		__ delayed()->nop();
	}
	__ bind(is_null);
	__ profile_checkcast(true, T3);
	__ bind(done);
}

// We use T3 as cpool, T4 as tags, T2 as index.
// The object is always in FSR; superklass in T4, subklass in T2.
void TemplateTable::instanceof() {
	transition(atos, itos);
	Label done, ok_is_subtype, quicked, resolved;

	__ beq(FSR, ZERO, done);
	__ delayed()->nop();

	// Get cpool & tags index
	__ get_cpool_and_tags(T3, T4);
	// get index
	__ load_two_bytes_from_at_bcp(T2, AT, 1);
	__ hswap(T2);

	// See if bytecode has already been quicked
	// quicked
	__ addu(AT, T4, T2);
	__ lb(AT, AT, typeArrayOopDesc::header_size(T_BYTE) * wordSize);
	__ addiu(AT, AT, - (int)JVM_CONSTANT_Class);
	__ beq(AT, ZERO, quicked);
	__ delayed()->nop();

	// get superklass in T4
	//__ move(TSR, FSR);
	// sometimes S2 may be changed during the call,
	// so be careful if you use TSR as a save location
	//__ push(FSR);
	__ push(atos);
	call_VM(T4, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc));
	//__ lw(FSR, SP, 0);
	__ pop_ptr(FSR);	
	__ b(resolved);
	__ delayed()->nop();
	//__ move(FSR, TSR);

	// get superklass in T4, subklass in T2
	__ bind(quicked);
	__ sll(AT, T2, 2);
	__ addu(AT, T3, AT);
	__ lw(T4, AT, sizeof(constantPoolOopDesc)); 

	__ bind(resolved);
	// get subklass in T2
	__ lw(T2, FSR, oopDesc::klass_offset_in_bytes());

	// Superklass in T4.  Subklass in T2.
	__ gen_subtype_check(T4, T2, ok_is_subtype);
	// Come here on failure
	__ b(done);
	__ delayed(); __ move(FSR, ZERO);

	// Come here on success
	__ bind(ok_is_subtype);
	__ move(FSR, 1);

	__ bind(done);
	// FSR = 0: obj == NULL or  obj is not an instanceof the specified klass
	// FSR = 1: obj != NULL and obj is     an instanceof the specified klass
}

//----------------------------------------------------------------------------------------------------
// Breakpoints
void TemplateTable::_breakpoint() {

	// Note: We get here even if we are single stepping.
	// jbug insists on setting breakpoints at every bytecode
	// even if we are in single step mode.

	transition(vtos, vtos);

	// get the unpatched byte code
	///__ get_method(ecx);
	///__ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::get_original_bytecode_at)
	//, ecx, esi);
	///__ movl(ebx, eax);
	__ get_method(A1);
	__ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::get_original_bytecode_at), 
			A1, BCP);
	__ move(T2, V0);

	// post the breakpoint event
	///__ get_method(ecx);
	///__ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::_breakpoint), ecx, esi);
	__ get_method(A1);
	__ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::_breakpoint), A1, BCP);

	// complete the execution of original bytecode
	__ dispatch_only_normal(vtos);
} 

//----------------------------------------------------------------------------------------------------
// Exceptions

void TemplateTable::athrow() {
	transition(atos, vtos);
	__ null_check(FSR);
	__ jmp(Interpreter::throw_exception_entry());
	__ delayed()->nop();
}

//----------------------------------------------------------------------------------------------------
// Synchronization
//
// Note: monitorenter & exit are symmetric routines; which is reflected
//       in the assembly code structure as well
//
// Stack layout:
//
// [expressions  ] <--- SP               = expression stack top
// ..
// [expressions  ]
// [monitor entry] <--- monitor block top = expression stack bot
// ..
// [monitor entry]
// [frame data   ] <--- monitor block bot
// ...
// [return addr  ] <--- FP
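//
// A sketch of the slot handling below: scan the monitor block for an entry
// whose obj is NULL (remember it as a free slot) or whose obj already equals
// FSR; if no free slot is found, grow the block by one entry, sliding the
// expression stack down by entry_size; finally store FSR into the chosen
// entry and lock_object it.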

// We use T2 as the monitor entry pointer, T3 as the monitor top pointer,
// and T6 as the free slot pointer.
// The object is always in FSR.
void TemplateTable::monitorenter() {
	transition(atos, vtos);
	// check for NULL object
	__ null_check(FSR);

	const Address monitor_block_top(FP, frame::interpreter_frame_monitor_block_top_offset 
			* wordSize);
	const int entry_size = (frame::interpreter_frame_monitor_size()* wordSize);
	Label allocated;

	// initialize entry pointer
	__ move(T6, ZERO);

	// find a free slot in the monitor block (result in T6)
	{ 
		Label entry, loop, exit, next;
		__ lw(T2, monitor_block_top);
		__ b(entry);
		__ delayed()->addi(T3, FP, frame::interpreter_frame_initial_sp_offset * wordSize);

		// free slot?
		__ bind(loop);
		__ lw(AT, T2, BasicObjectLock::obj_offset_in_bytes());
		__ bne(AT, ZERO, next);
		__ delayed()->nop();
		__ move(T6, T2);

		__ bind(next);
		__ beq(FSR, AT, exit);
		__ delayed()->nop();
		__ addi(T2, T2, entry_size);

		__ bind(entry);
		__ bne(T3, T2, loop);
		__ delayed()->nop();
		__ bind(exit);
	}

	__ bne(T6, ZERO, allocated);
	__ delayed()->nop();

	// allocate one if there's no free slot
	{ 
		Label entry, loop;
		// 1. compute new pointers                   // SP: old expression stack top
		__ lw(T6, monitor_block_top);
		__ addi(SP, SP, - entry_size);
		__ addi(T6, T6, - entry_size);
		__ sw(T6, monitor_block_top);
		__ b(entry);
		__ delayed();
		__ move(T3, SP);

		// 2. move expression stack contents
		__ bind(loop);
		__ lw(AT, T3, entry_size);
		__ sw(AT, T3, 0);
		__ addi(T3, T3, wordSize); 
		__ bind(entry);
		__ bne(T3, T6, loop);
		__ delayed()->nop();
	}

	__ bind(allocated);
	// Increment bcp to point to the next bytecode,
	// so exception handling for async. exceptions works correctly.
	// The object has already been popped from the stack, so the
	// expression stack looks correct.
	__ addi(BCP, BCP, 1); 
	__ sw(FSR, T6, BasicObjectLock::obj_offset_in_bytes());
	__ lock_object(T6);
	// check to make sure this monitor doesn't cause stack overflow after locking
	__ save_bcp();  // in case of exception
	__ generate_stack_overflow_check(0);
	// The bcp has already been incremented. Just need to dispatch to next instruction.

	__ dispatch_next(vtos);
}

// T2 : top
// T6 : entry
void TemplateTable::monitorexit() {
	transition(atos, vtos);

	__ null_check(FSR);

	const int entry_size =(frame::interpreter_frame_monitor_size()* wordSize);
	Label found;

	// find matching slot
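	// T6 walks the entries from monitor_block_top towards the initial SP,
	// which T2 marks as the end of the block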
	{ 
		Label entry, loop;
		__ lw(T6, FP, frame::interpreter_frame_monitor_block_top_offset * wordSize);
		__ b(entry);
		__ delayed()->addiu(T2, FP, frame::interpreter_frame_initial_sp_offset * wordSize);

		__ bind(loop);
		__ lw(AT, T6, BasicObjectLock::obj_offset_in_bytes());
		__ beq(FSR, AT, found);
		__ delayed()->nop();
		__ addiu(T6, T6, entry_size);
		__ bind(entry);
		__ bne(T2, T6, loop);
		__ delayed()->nop();
	}

	// Error handling: unlocking was not block-structured.
	Label end;
	__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
				InterpreterRuntime::throw_illegal_monitor_state_exception));
	__ should_not_reach_here();

	// call run-time routine
	// T6: points to monitor entry
	__ bind(found);
	__ move(TSR, FSR);
	__ unlock_object(T6);
	__ move(FSR, TSR);
	__ bind(end);
}

//----------------------------------------------------------------------------------------------------
// Wide instructions

void TemplateTable::wide() {
	transition(vtos, vtos);
	// Note: the BCP (esi) increment step is part of the individual wide bytecode implementations
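	// Load the wide-variant entry point from Interpreter::_wentry_point,
	// indexed by the bytecode following the wide prefix, and jump to it.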
	__ lbu(T7, at_bcp(1));
	__ sll(AT, T7, 2);
	__ lui(T9, Assembler::split_high(int(Interpreter::_wentry_point)));
	__ add(T9, T9, AT);
	__ lw(T9, T9, Assembler::split_low(int(Interpreter::_wentry_point)));
	__ jr(T9);
	__ delayed()->nop();
}

//----------------------------------------------------------------------------------------------------
// Multi arrays

void TemplateTable::multianewarray() {
	transition(vtos, atos);
	// last dim is on top of stack; we want address of first one:
	// first_addr = last_addr + (ndims - 1) * wordSize
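	// e.g. assuming 4-byte expression stack slots and ndims == 3, the counts
	// sit at SP + 8 (first dimension) down to SP + 0 (last dimension), so
	// first_addr = SP + 2 * wordSize.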
	__ lbu(A1, at_bcp(3));	// dimension
	//	__ sll(A1, A1, 2);
	__ sll(A1, A1, Interpreter::stackElementScale());
	__ addi(A1, A1, -4);	
	__ add(A1, SP, A1);		// now A1 points to the first dimension count on the stack
	call_VM(FSR, CAST_FROM_FN_PTR(address, InterpreterRuntime::multianewarray), A1);
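	// pop the 'ndims' dimension counts off the expression stack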
	__ lbu(AT, at_bcp(3));
	//	__ sll(AT, AT, 2);
	__ sll(AT, AT, Interpreter::stackElementScale());
	__ add(SP, SP, AT);
}

#endif // !CC_INTERP