hotspot/src/cpu/mips/vm/templateInterpreter_mips.cpp @ 1:c1e1428eff7c

The preliminary port to the MIPS architecture. With this commit, the interpreter passes 140/141 regression tests, 8/8 SPECjvm98 tests, and 31/37 SPECjvm2008 tests. The compiler passes 136/141 regression tests, but it cannot run the SPECjvm98 and SPECjvm2008 benchmarks.
author: LIU Qi <liuqi82@gmail.com>
date: Thu, 30 Sep 2010 13:48:16 +0800
children: d0a60cd6d61c

/*
 * Copyright 2003-2008 Sun Microsystems, Inc.  All Rights Reserved.
 * Copyright 2010 Lemote, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 */

#include "incls/_precompiled.incl"
#include "incls/_interpreter_mips.cpp.incl"

#define __ _masm->

#ifndef CC_INTERP

const int Interpreter::return_sentinel = 0xfeedbeed;
const int method_offset = frame::interpreter_frame_method_offset * wordSize;
const int bci_offset    = frame::interpreter_frame_bcx_offset    * wordSize;
const int locals_offset = frame::interpreter_frame_locals_offset * wordSize;

//-----------------------------------------------------------------------------

address TemplateInterpreterGenerator::generate_StackOverflowError_handler() {
  address entry = __ pc();

#ifdef ASSERT
  {
    Label L;
    __ addi(T1, FP, frame::interpreter_frame_monitor_block_top_offset * wordSize);
    __ sub(T1, T1, SP); // T1 = maximal sp for current fp
    __ bgez(T1, L);     // check if frame is complete
    __ delayed()->nop();
    __ stop("interpreter frame not set up");
    __ bind(L);
  }
#endif // ASSERT
  // Restore bcp under the assumption that the current frame is still
  // interpreted (S0 is the conventional register for bcp)
  // FIXME: revisit restore_bcp
  __ restore_bcp();

  // expression stack must be empty before entering the VM if an
  // exception happened
  __ empty_expression_stack();
  // throw exception
  // FIXME: why is the thread parameter not passed explicitly?
  __ call_VM(NOREG, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_StackOverflowError));
  return entry;
}
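
// A rough C-level sketch of the stub generated above (illustrative only;
// the helper names mirror the InterpreterMacroAssembler calls used, not a
// separate API):
//
//   void StackOverflowError_handler() {
//     restore_bcp();                 // re-sync S0 from the interpreter frame
//     empty_expression_stack();      // drop partially built operands
//     InterpreterRuntime::throw_StackOverflowError(thread);
//   }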

address TemplateInterpreterGenerator::generate_ArrayIndexOutOfBounds_handler(
        const char* name) {
  address entry = __ pc();
  // expression stack must be empty before entering the VM if an
  // exception happened
  __ empty_expression_stack();
  // setup parameters
  // convention: expect the aberrant index in A2
  __ move(A1, (int)name);
  __ call_VM(noreg, CAST_FROM_FN_PTR(address,
      InterpreterRuntime::throw_ArrayIndexOutOfBoundsException), A1, A2);
  return entry;
}

address TemplateInterpreterGenerator::generate_ClassCastException_handler() {
  address entry = __ pc();

  // object is at TOS
  // FIXME: not sure the object is at TOS here as on x86 (@jerome, 04/20/2007)

  // expression stack must be empty before entering the VM if an
  // exception happened
  __ empty_expression_stack();
  __ empty_FPU_stack();
  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException),  FSR);
  return entry;
}

address TemplateInterpreterGenerator::generate_exception_handler_common(
        const char* name, const char* message, bool pass_oop) {
	assert(!pass_oop || message == NULL, "either oop or message but not both");
	address entry = __ pc();

	// expression stack must be empty before entering the VM if an exception happened
	__ empty_expression_stack();
	// setup parameters
	__ move(A1, (int)name);
	if (pass_oop) {
		__ call_VM(V0, 
		CAST_FROM_FN_PTR(address, InterpreterRuntime::create_klass_exception), A1, FSR);
	} else {
		__ move(A2, (int)message);
		__ call_VM(V0, 
		CAST_FROM_FN_PTR(address, InterpreterRuntime::create_exception), A1, A2);
	}
	// throw exception
	__ jmp(Interpreter::throw_exception_entry(), relocInfo::none);
	__ delayed()->nop();
	return entry;
}


address TemplateInterpreterGenerator::generate_continuation_for(TosState state) {
  address entry = __ pc();
  // NULL last_sp until next java call
  __ sw(ZERO, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
  __ dispatch_next(state);
  return entry;
}


address TemplateInterpreterGenerator::generate_return_entry_for(TosState state,
                                                                int step) {
	Label interpreter_entry;
	address compiled_entry = __ pc();

#ifdef COMPILER2
	// FIXME: x86 leftover; UseSSE and ffree are not meaningful on MIPS
	// The FPU stack is clean if UseSSE >= 2 but must be cleaned in other cases
	if ((state == ftos && UseSSE < 1) || (state == dtos && UseSSE < 2)) {
		for (int i = 1; i < 8; i++) {
			__ ffree(i);
		}
	} else if (UseSSE < 2) {
		__ empty_FPU_stack();
	}
#endif
	if ((state == ftos && UseSSE < 1) || (state == dtos && UseSSE < 2)) {
		__ MacroAssembler::verify_FPU(1, "generate_return_entry_for compiled");
	} else {
		__ MacroAssembler::verify_FPU(0, "generate_return_entry_for compiled");
	}

	// __ jmp(interpreter_entry, relocInfo::none);
	__ b(interpreter_entry);
	__ delayed()->nop(); 
	// emit a sentinel we can test for when converting an interpreter
	// entry point to a compiled entry point.
	__ a_long(Interpreter::return_sentinel);
	__ a_long((int)compiled_entry);

	address entry = __ pc();

	__ bind(interpreter_entry);

	/*  // In SSE mode, interpreter returns FP results in xmm0 but they need
	// to end up back on the FPU so it can operate on them.
	if (state == ftos && UseSSE >= 1) {
	__ subl(esp, wordSize);
	__ movss(Address(esp, 0), xmm0);
	__ fld_s(Address(esp, 0));
	__ addl(esp, wordSize);
	} else if (state == dtos && UseSSE >= 2) {
	__ subl(esp, 2*wordSize);
	__ movsd(Address(esp, 0), xmm0);
	__ fld_d(Address(esp, 0));
	__ addl(esp, 2*wordSize);
	}
	*/
	__ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_return_entry_for in interpreter");

	// Restore stack bottom in case i2c adjusted stack
	// __ movl(esp, Address(ebp, frame::interpreter_frame_last_sp_offset * wordSize));
	__ lw(SP, FP, frame::interpreter_frame_last_sp_offset * wordSize);
	// and NULL it as marker that esp is now tos until next java call
	// __ movl(Address(ebp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
	__ sw(ZERO,FP, frame::interpreter_frame_last_sp_offset * wordSize);

	__ restore_bcp();
	__ restore_locals();
	__ get_cache_and_index_at_bcp(T7, T3, 1);
	// shift left by 4 here because get_cache_and_index_at_bcp does not scale
	// the index by the cache entry size (16 bytes)
	__ sll(T3, T3, 4);
	__ add(T3, T3, T7);
	__ lw(T7, T3, in_bytes(constantPoolCacheOopDesc::base_offset()
				+ConstantPoolCacheEntry::flags_offset()));
	__ andi(T7, T7, 0xFF);
	__ sll(T7, T7, Interpreter::stackElementScale());
	__ add(SP, SP, T7);
	__ dispatch_next(state, step);
  return entry;
}
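
// The argument-popping sequence above, restated as a hedged C sketch
// (accessor names abbreviated; 'index' is the cache index read at bcp + 1):
//
//   ConstantPoolCacheEntry* e = cache->entry_at(index);    // T7 + T3 * 16
//   int params = e->flags() & 0xFF;                        // parameter size byte
//   SP += params << Interpreter::stackElementScale();      // pop callee arguments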


address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state,
                                                               int step) {
  address entry = __ pc();
  // NULL last_sp until next java call
  //__ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD);
	__ sw(ZERO, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  __ restore_bcp();
  __ restore_locals();
  // handle exceptions
  {
    Label L;
		const Register thread = TREG;
#ifndef OPT_THREAD
		__ get_thread(thread);
#endif
		__ lw(AT, thread, in_bytes(Thread::pending_exception_offset()));
		__ beq(AT, ZERO, L);
		__ delayed()->nop();
		__ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_pending_exception));
		__ should_not_reach_here();
		__ bind(L);
  }
  __ dispatch_next(state, step);
  return entry;
}

int AbstractInterpreter::BasicType_as_index(BasicType type) {
  int i = 0;
/*
  switch (type) {
    case T_BOOLEAN: i = 0; break;
    case T_CHAR   : i = 1; break;
    case T_BYTE   : i = 2; break;
    case T_SHORT  : i = 3; break;
    case T_INT    : i = 4; break;
    case T_LONG   : i = 5; break;
    case T_VOID   : i = 6; break;
    case T_FLOAT  : i = 7; break;
    case T_DOUBLE : i = 8; break;
    case T_OBJECT : i = 9; break;
    case T_ARRAY  : i = 9; break;
    default       : ShouldNotReachHere();
  }
*/
	switch (type) {
		case T_BOOLEAN: i = 0; break;
		case T_CHAR   : i = 1; break;
		case T_BYTE   : i = 2; break;
		case T_SHORT  : i = 3; break;
		case T_INT    : // fall through
		case T_LONG   : // fall through
		case T_VOID   : i = 4; break;
		case T_FLOAT  : i = 5; break;
		case T_DOUBLE : i = 6; break;
		case T_OBJECT : // fall through
		case T_ARRAY  : i = 7; break;
		default       : ShouldNotReachHere();
	}
  assert(0 <= i && i < AbstractInterpreter::number_of_result_handlers,
         "index out of bounds");
  return i;
}


// FIXME: why are float and double not handled here? (@jerome, 12/27/06)
address TemplateInterpreterGenerator::generate_result_handler_for(
        BasicType type) {
	address entry = __ pc();
	switch (type) {
		case T_BOOLEAN: __ c2bool(V0);             break;
		case T_CHAR   : __ andi(V0, V0, 0xFFFF);   break;
		case T_BYTE   : __ sign_extend_byte (V0);  break;
		case T_SHORT  : __ sign_extend_short(V0);  break;
		case T_INT    : /* nothing to do */        break;
		case T_FLOAT  : /* nothing to do */        break;
		case T_DOUBLE : /* nothing to do */        break;
		case T_OBJECT :
		{
			// reload the result from the frame's oop temp slot, where the
			// native-entry code stored it, instead of unboxing a handle here
			__ lw(V0, FP, frame::interpreter_frame_oop_temp_offset * wordSize);
			__ verify_oop(V0);         // and verify it
		}
							   break;
		default       : ShouldNotReachHere();
	}
	__ jr(RA);                                  // return from result handler
	__ delayed()->nop();
	return entry;
}

address TemplateInterpreterGenerator::generate_safept_entry_for(
        TosState state,
        address runtime_entry) {
  address entry = __ pc();
  __ push(state);
  __ call_VM(noreg, runtime_entry);
  __ dispatch_via(vtos, Interpreter::_normal_table.table_for(vtos));
  return entry;
}



// Helpers for commoning out cases in the various type of method entries.
//


// increment invocation count & check for overflow
//
// Note: checking for negative value instead of overflow
//       so we have a 'sticky' overflow test
//
// prerequisites : method in T7, invocation counter in T3
void InterpreterGenerator::generate_counter_incr(
        Label* overflow,
        Label* profile_method,
        Label* profile_method_continue) {

	const Address invocation_counter(T7, in_bytes(methodOopDesc::invocation_counter_offset()) 
			+ in_bytes(InvocationCounter::counter_offset()));
	const Address backedge_counter  (T7, in_bytes(methodOopDesc::backedge_counter_offset()) 
			+ in_bytes(InvocationCounter::counter_offset()));

	if (ProfileInterpreter) { // %%% Merge this into methodDataOop
		__ lw(AT, T7, in_bytes(methodOopDesc::interpreter_invocation_counter_offset()));
		__ addiu(AT, AT, 1);
		__ sw(AT, T7, in_bytes(methodOopDesc::interpreter_invocation_counter_offset()));
	}
	// Update standard invocation counters
	__ lw(FSR, backedge_counter);

	__ increment(T3, InvocationCounter::count_increment);
	// a buffer bit was added to the counter, so masking is no longer needed
	// (yjl, 10/24/2005)

	__ sw(T3, invocation_counter);
	__ add(T3, T3, FSR);

	// profile_method is non-null only for interpreted method so
	// profile_method != NULL == !native_call

	if (ProfileInterpreter && profile_method != NULL) {
		// Test to see if we should create a method data oop
		__ lui(AT, Assembler::split_high(
					int(&InvocationCounter::InterpreterProfileLimit)));
		__ lw(AT, AT, Assembler::split_low(
					int(&InvocationCounter::InterpreterProfileLimit)));
		__ slt(AT, T3, AT);
		__ bne(AT, ZERO, *profile_method_continue);
		__ delayed()->nop();

		// if no method data exists, go to profile_method
		__ test_method_data_pointer(FSR, *profile_method);
	}

	__ lui(AT, Assembler::split_high(int(&InvocationCounter::InterpreterInvocationLimit)));
	__ lw(AT, AT, Assembler::split_low(int(&InvocationCounter::InterpreterInvocationLimit)));
	__ slt(AT, T3, AT);
	__ beq(AT, ZERO, *overflow);
	__ delayed()->nop();
}
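
// The counter logic above, restated as a hedged C sketch (the limits are the
// InvocationCounter globals loaded via lui/lw; label names as in the code):
//
//   invocation_counter += count_increment;
//   int sum = invocation_counter + backedge_counter;          // T3
//   if (ProfileInterpreter && profile_method != NULL) {
//     if (sum < InterpreterProfileLimit) goto profile_method_continue;
//     if (no method data oop exists)     goto profile_method;
//   }
//   if (sum >= InterpreterInvocationLimit) goto overflow;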

void InterpreterGenerator::generate_counter_overflow(Label* do_continue) {

	// Asm interpreter on entry
	// S7 - locals
	// S0 - bcp
	// T7 - method
	// FP - interpreter frame

	// On return (i.e. jump to entry_point)
	// T7 - method
	// RA - return address of interpreter caller
	// tos - the last parameter to Java method 
	// SP - sender_sp
	const Address size_of_parameters(T7,in_bytes( methodOopDesc::size_of_parameters_offset()));

	// pass a null bcp (ZERO): frequency_counter_overflow takes a null branch
	// bcp to mean an invocation-counter (method entry) overflow, not a backedge
	__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
		  InterpreterRuntime::frequency_counter_overflow), ZERO);
	__ lw(T7, FP, method_offset);
/*
  // method has been compiled - remove activation frame
  // (leave return address on stack) and continue at
  // verified entry point (eax). (eax in some past life maybe, seems to use methodoop these days)
  //
  // Note: continuation at verified entry point works if the method that has been
  //       compiled is the right one (in case of virtual calls); i.e., the inline
  //       cache check must have happened before the invocation counter overflow
  //       check.
	__ lhu(V0, size_of_parameters);
	__ move(SP, FP);
	__ lw(FP, SP, frame::interpreter_frame_sender_fp_offset * wordSize);
	__ lw(RA, SP, frame::interpreter_frame_return_addr_offset * wordSize);
	__ sll(V0, V0, 2);
	__ addi(V0, V0, - 1 * wordSize);
	__ sub(SP, LVP, V0);
//	__ lw(T0, LVP, 0);
*/
  // Preserve the invariant that BCP/LVP contain the bcp/locals of the sender frame
	__ b(*do_continue);
	__ delayed()->nop();
}

// See if we've got enough room on the stack for locals plus overhead.
// The expression stack grows down incrementally, so the normal guard
// page mechanism will work for that.
//
// NOTE: since the additional locals are always pushed (this wasn't
// obvious in generate_method_entry), the guard works for them too.
//
// Args:
//      T2: number of additional locals this frame needs (what we must check)
//      T7: methodOop
//
// Kills:
//      T1, T3, T4
void InterpreterGenerator::generate_stack_overflow_check(void) {
	// monitor entry size: see picture of stack set (generate_method_entry) and frame_mips.hpp
	const int entry_size    = frame::interpreter_frame_monitor_size() * wordSize;

	// total overhead size: entry_size + (saved fp through expr stack bottom).
	// be sure to change this if you add/subtract anything to/from the overhead area
	const int overhead_size = -(frame::interpreter_frame_initial_sp_offset*wordSize) 
		+ entry_size;

	const int page_size = os::vm_page_size();

	Label after_frame_check;

	// see if the frame is greater than one page in size. If so,
	// then we need to verify there is enough stack space remaining
	// for the additional locals.
	__ move(AT, (page_size - overhead_size) / Interpreter::stackElementSize());
	__ slt(AT, AT, T2);
	__ beq(AT, ZERO, after_frame_check);
	__ delayed()->nop();

	// compute sp as if this were going to be the last frame on
	// the stack before the red zone
#ifndef OPT_THREAD
	Register thread = T1;
	__ get_thread(thread);
#else
	Register thread = TREG;
#endif

	// locals + overhead, in bytes
	__ sll(T4, T2, Interpreter::stackElementScale());
	__ addiu(T4, T4, overhead_size); 	// locals * 4 + overhead_size --> T4

#ifdef ASSERT
	Label stack_base_okay, stack_size_okay;
	// verify that thread stack base is non-zero
	__ lw(T3, thread, in_bytes(Thread::stack_base_offset()));
	__ bne(T3, ZERO, stack_base_okay);
	__ delayed()->nop();
	__ stop("stack base is zero");
	__ bind(stack_base_okay);
	// verify that thread stack size is non-zero
	__ lw(T3, thread, in_bytes(Thread::stack_size_offset()));
	__ bne(T3, ZERO, stack_size_okay);
	__ delayed()->nop();
	__ stop("stack size is zero");
	__ bind(stack_size_okay);
#endif

	// Add stack base to locals and subtract stack size
	__ lw(T3, thread, in_bytes(Thread::stack_base_offset())); // stack_base --> T3
	__ add(T4, T4, T3); 	// locals * 4 + overhead_size + stack_base--> T4
	__ lw(T3, thread, in_bytes(Thread::stack_size_offset()));  // stack_size --> T3
	__ sub(T4, T4, T3);	// locals * 4 + overhead_size + stack_base - stack_size --> T4


	// add in the redzone and yellow size
	__ move(AT, (StackRedPages+StackYellowPages) * page_size);
	__ add(T4, T4, AT);

	// check against the current stack bottom
	__ slt(AT, T4, SP);
	__ bne(AT, ZERO, after_frame_check);
	__ delayed()->nop();
        // FIXME: the x86 version pops the saved bcp and return address here
	__ jmp(Interpreter::throw_StackOverflowError_entry(), relocInfo::runtime_call_type);
	__ delayed()->nop();

	// all done with frame size check
	__ bind(after_frame_check);
}
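
// The limit computation above, as a hedged C sketch ('guard' covers the red
// and yellow zones; thread accessor names abbreviated):
//
//   // a frame smaller than one page skips the check entirely
//   int needed = T2 * stackElementSize + overhead_size;       // locals + overhead
//   address limit = thread->stack_base() - thread->stack_size()
//                   + (StackRedPages + StackYellowPages) * page_size
//                   + needed;                                 // T4
//   if (SP <= limit) goto throw_StackOverflowError_entry;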

// Allocate monitor and lock method (asm interpreter)
// T7 - methodOop
void InterpreterGenerator::lock_method(void) {
  // synchronize method
	const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;

#ifdef ASSERT
	{ Label L;
		__ lw(T0, T7, in_bytes(methodOopDesc::access_flags_offset()));
		__ andi(T0, T0, JVM_ACC_SYNCHRONIZED);
		__ bne(T0, ZERO, L);
		__ delayed()->nop();
		__ stop("method doesn't need synchronization");
		__ bind(L);
	}
#endif // ASSERT
	// get synchronization object
	{ Label done;
		const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() 
			+ Klass::java_mirror_offset_in_bytes();
		__ lw(T0, T7, in_bytes(methodOopDesc::access_flags_offset()));
		__ andi(T2, T0, JVM_ACC_STATIC);
		__ lw(T0, LVP, Interpreter::local_offset_in_bytes(0));         
		__ beq(T2, ZERO, done);
		__ delayed()->nop();
		__ lw(T0, T7, in_bytes(methodOopDesc::constants_offset()));
		__ lw(T0, T0, constantPoolOopDesc::pool_holder_offset_in_bytes());
		__ lw(T0, T0, mirror_offset);
		__ bind(done);
	}
	// add space for monitor & lock
	__ addi(SP, SP, (-1) * entry_size);           // add space for a monitor entry
	__ sw(SP, FP, frame::interpreter_frame_monitor_block_top_offset * wordSize);	
	// set new monitor block top
	__ sw(T0, SP, BasicObjectLock::obj_offset_in_bytes());   // store object
	// FIXME: check what lock_object does and what it requires
	__ move(T6, SP);      // object address
	__ lock_object(T6);          
}
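
// Selection of the synchronization object above, as a hedged C sketch
// (accessor names abbreviated; locals[0] is the receiver):
//
//   oop lockee = (method->access_flags() & JVM_ACC_STATIC)
//       ? method->constants()->pool_holder()->java_mirror()   // class object
//       : (oop) locals[0];                                    // receiver
//   push a BasicObjectLock { lock, lockee };  lock_object(monitor);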

// Generate a fixed interpreter frame. This is identical setup for
// interpreted methods and for native methods hence the shared code.
void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {

	// [ local var m-1      ] <--- sp
	//   ...
	// [ local var 0        ]				
	// [ argument word n-1  ] <--- T5 (sender's sp)
	//   ...
	// [ argument word 0    ] <--- S7

	// initialize fixed part of activation frame
	// sender's sp in T5
	int i = 0;
	__ sw(RA, SP, (-1) * wordSize); 	// save return address
	__ sw(FP, SP, (-2) * wordSize);	// save sender's fp
	__ addiu(FP, SP, (-2) * wordSize);
	//__ sw(T0, FP, (-++i) * wordSize);	// save sender's sp
	__ sw(T5, FP, (-++i) * wordSize);	// save sender's sp
	__ sw(ZERO,FP,(-++i)*wordSize);       //save last_sp as null, FIXME aoqi 
	__ sw(LVP, FP, (-++i) * wordSize);	// save locals offset
	__ lw(BCP, T7, in_bytes(methodOopDesc::const_offset())); // get constMethodOop
	__ addiu(BCP, BCP, in_bytes(constMethodOopDesc::codes_offset())); // get codebase
	__ sw(T7, FP, (-++i) * wordSize);                              // save methodOop
#ifndef CORE
	if (ProfileInterpreter) {
		Label method_data_continue;
		__ lw(AT, T7,  in_bytes(methodOopDesc::method_data_offset())); 
		__ beq(AT, ZERO, method_data_continue); 
		__ delayed()->nop(); 
		__ addi(AT, AT, in_bytes(methodDataOopDesc::data_offset()));  
		__ bind(method_data_continue);
		__ sw(AT, FP,  (-++i) * wordSize); 
	} else {
		__ sw(ZERO, FP, (-++i) * wordSize);
	}
#endif // !CORE

	__ lw(T2, T7, in_bytes(methodOopDesc::constants_offset()));
	__ lw(T2, T2, constantPoolOopDesc::cache_offset_in_bytes());
	__ sw(T2, FP, (-++i) * wordSize);                    // set constant pool cache
	if (native_call) {
		__ sw(ZERO, FP, (-++i) * wordSize);					// no bcp
	} else {
		__ sw(BCP, FP, (-++i) * wordSize);					// set bcp
	}
	__ addiu(SP, FP, (-++i) * wordSize);
	__ sw(SP, FP, (-i) * wordSize);               // reserve word for pointer to expression stack bottom	
}
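
// Slot assignment performed above, summarized (a reading aid only; the slot
// numbering assumes the non-CORE build, so the mdx slot is present):
//
//   fp[-1] = sender's sp            fp[-2] = last_sp (NULL)
//   fp[-3] = locals (LVP)           fp[-4] = methodOop
//   fp[-5] = mdx or NULL            fp[-6] = constant pool cache
//   fp[-7] = bcp (0 if native)      fp[-8] = expression stack bottom == new sp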

// End of helpers

// Various method entries
//------------------------------------------------------------------------------------------------------------------------
//
//

// Call an accessor method (assuming it is resolved, otherwise drop
// into the vanilla (slow path) entry)
address InterpreterGenerator::generate_accessor_entry(void) {

	// T7: methodOop
	// V0: receiver (preserve for slow entry into asm interpreter)
	// T5: senderSP, must be preserved for slow path; set SP to it on fast path

	address entry_point = __ pc();
	Label xreturn_path;
	// do fastpath for resolved accessor methods
	if (UseFastAccessorMethods) {
		Label slow_path;
		__ move(T2, (int)SafepointSynchronize::address_of_state());
		__ lw(AT, T2, 0);
		__ addi(AT, AT, -(SafepointSynchronize::_not_synchronized));
		__ bne(AT, ZERO, slow_path);
		__ delayed()->nop();
		// Code: _aload_0, _(i|a)getfield, _(i|a)return or any rewrites thereof; 
		// parameter size = 1
		// Note: We can only use this code if the getfield has been resolved
		//       and if we don't have a null-pointer exception => check for
		//       these conditions first and use slow path if necessary.
		// T7: method
		// V0: receiver

		// [ receiver  ] <-- sp
		__ lw(T0, SP, 0);

		// check if local 0 != NULL and read field
		__ beq(T0, ZERO, slow_path);
		__ delayed()->nop();
		__ lw(T2, T7, in_bytes(methodOopDesc::constants_offset()));
		// read first instruction word and extract bytecode @ 1 and index @ 2
		__ lw(T3, T7, in_bytes(methodOopDesc::const_offset()));
		__ lw(T3, T3, in_bytes(constMethodOopDesc::codes_offset()));
		// Shift codes right to get the index on the right.
		// The bytecode fetched looks like <index><0xb4><0x2a>
		__ srl(T3, T3, 2*BitsPerByte);
		// FIXME: maybe it's wrong
		__ sll(T3, T3, exact_log2(in_words(ConstantPoolCacheEntry::size())));
		__ lw(T2, T2, constantPoolOopDesc::cache_offset_in_bytes());

		// T0: local 0
		// T7: method
		// V0: receiver - do not destroy since it is needed for slow path!
		// T6: scratch
		// T3: constant pool cache index
		// T2: constant pool cache
		// T5: sender's sp
		// check if getfield has been resolved and read constant pool cache entry
		// check the validity of the cache entry by testing whether _indices field
		// contains Bytecode::_getfield in b1 byte.
		assert(in_words(ConstantPoolCacheEntry::size()) == 4, "adjust shift below");
		//    __ movl(esi, 
		//	    Address(edi, 
		//		    edx, 
		//		    Address::times_4, constantPoolCacheOopDesc::base_offset() 
		//		    + ConstantPoolCacheEntry::indices_offset()));

	
		__ sll(T4, T3, Address::times_4);
		__ move(T6, in_bytes(constantPoolCacheOopDesc::base_offset() 
					+ ConstantPoolCacheEntry::indices_offset()));
		__ add(T6, T4, T6);
		__ add(T6, T6, T2);
		__ lw(T6, T6, 0);
		__ srl(T6, T6, 2*BitsPerByte);
		__ andi(T6, T6, 0xFF);
		__ addi(T6, T6, (-1) * Bytecodes::_getfield);
		__ bne(T6, ZERO, slow_path);
		__ delayed()->nop();

		//    __ shrl(esi, 2*BitsPerByte);
		//    __ andl(esi, 0xFF);
		//    __ cmpl(esi, Bytecodes::_getfield);
		//    __ jcc(Assembler::notEqual, slow_path);

		// Note: constant pool entry is not valid before bytecode is resolved

		//    __ movl(esi, 
		//	    Address(edi, 
		//		    edx, 
		//		    Address::times_4, constantPoolCacheOopDesc::base_offset() 
		//		    + ConstantPoolCacheEntry::f2_offset()));
		__ move(T6, in_bytes(constantPoolCacheOopDesc::base_offset() 
					+ ConstantPoolCacheEntry::f2_offset()));
		__ add(T6, T6, T4);
		__ add(T6, T6, T2);
		__ lw(AT, T6, 0);
		//    __ movl(edx, 
		//	    Address(edi, 
		//		    edx, 
		//		    Address::times_4, constantPoolCacheOopDesc::base_offset() 
		//		    + ConstantPoolCacheEntry::flags_offset()));


		__ move(T6, in_bytes(constantPoolCacheOopDesc::base_offset() 
					+ ConstantPoolCacheEntry::flags_offset()));
		__ add(T6, T6, T4);
		__ add(T6, T6, T2);
		__ lw(T3, T6, 0);

		Label notByte, notShort, notChar;
		//    const Address field_address (eax, esi, Address::times_1);

		// Need to differentiate between igetfield, agetfield, bgetfield etc.
		// because they are different sizes.
		// Use the type from the constant pool cache
		__ srl(T3, T3, ConstantPoolCacheEntry::tosBits);
		// Make sure we don't need to mask edx for tosBits after the above shift
		ConstantPoolCacheEntry::verify_tosBits();
		// btos = 0
		__ bne(T3, ZERO, notByte);
		__ delayed()->add(T0, T0, AT);

		__ lb(V0, T0, 0);
		__ b(xreturn_path);
		__ delayed()->nop();

		__ bind(notByte);
		__ addi(T6, T3, (-1) * stos);
		__ bne(T6, ZERO, notShort);
		__ delayed()->nop();
		__ lh(V0, T0, 0);
		__ b(xreturn_path);
		__ delayed()->nop();

		__ bind(notShort);
		__ addi(T6, T3, (-1) * ctos);
		__ bne(T6, ZERO, notChar);
		__ delayed()->nop();
		__ lhu(V0, T0, 0);
		__ b(xreturn_path);
		__ delayed()->nop();

		__ bind(notChar);
#ifdef ASSERT
		Label okay;
		__ addi(T6, T3, (-1) * atos);
		__ beq(T6, ZERO, okay);
		__ delayed()->addi(T6, T3, (-1) * itos);
		__ beq(T6, ZERO, okay);
		__ delayed()->nop();
		__ stop("what type is this?");
		__ bind(okay);
#endif // ASSERT
		// All the rest are a 32 bit wordsize
		__ lw(V0, T0, 0);

		__ bind(xreturn_path);

		// _ireturn/_areturn
		__ move(SP, T5);   // restore SP to the sender's sp
		__ jr(RA);
		__ delayed()->nop();

		// generate a vanilla interpreter entry as the slow path
		__ bind(slow_path);
		(void) generate_normal_entry(false);
	} else {
		(void) generate_normal_entry(false);
	}

	return entry_point;
}
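
// The fast path above, restated as a hedged C sketch (cache accessors
// abbreviated; 'recv' is local 0, the index comes from the getfield bytecode):
//
//   if (SafepointSynchronize::state != _not_synchronized || recv == NULL)
//     goto slow_path;
//   ConstantPoolCacheEntry* e = cpcache->entry_at(index);
//   if (((e->indices() >> 16) & 0xFF) != Bytecodes::_getfield) goto slow_path;
//   int offset = e->f2();
//   V0 = load at (recv + offset), sized by e->flags() >> tosBits; // lb/lh/lhu/lw
//   SP = sender_sp;  return;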

// Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the
// native method than the typical interpreter frame setup.
address InterpreterGenerator::generate_native_entry(bool synchronized) {
  // determine code generation flags
  bool inc_counter  = UseCompiler || CountCompiledCalls;
	// T5: sender's sp
	// T7: methodOop
	address entry_point = __ pc();

#ifndef CORE
	const Address invocation_counter(T7,in_bytes(methodOopDesc::invocation_counter_offset() + 
				InvocationCounter::counter_offset()));
#endif

	// get parameter size (always needed)
	// the size in the java stack
	__ lhu(V0, T7, in_bytes(methodOopDesc::size_of_parameters_offset()));

	// native calls don't need the stack size check since they have no expression
	// stack, the arguments are already on the stack, and we only add a handful
	// of words to the stack

	// T7: methodOop
	// V0: size of parameters
	// Layout of frame at this point
	//
	// [ argument word n-1  ] <--- sp
	//   ...
	// [ argument word 0    ]

	// for natives the size of locals is zero

	// compute beginning of parameters (S7)
	__ sll(LVP, V0, Interpreter::stackElementScale());
	__ addiu(LVP, LVP, (-1) * wordSize);
	__ add(LVP, LVP, SP);

	//__ move(T0, SP);               // remember sender sp for generate_fixed_frame


	// add 2 zero-initialized slots for native calls
	__ addi(SP, SP, (-2) * wordSize);
	__ sw(ZERO, SP, 1 * wordSize);	// slot for native oop temp offset (setup via runtime)
	__ sw(ZERO, SP, 0 * wordSize);	// slot for static native result handler (setup via runtime)

	// Layout of frame at this point
	// [ method holder mirror	] <--- sp
	// [ result type info		]
	// [ argument word n-1		] <--- sender's sp
	//   ...
	// [ argument word 0		] <--- LVP


#ifndef CORE
	if (inc_counter) __ lw(T3, invocation_counter);  // (pre-)fetch invocation count
#endif

	// initialize fixed part of activation frame
	generate_fixed_frame(true);
	// after this function, the layout of the frame is as follows
	//
	// [ monitor block top        ] <--- sp ( the top monitor entry )
	// [ byte code pointer (0)    ] (if native, bcp = 0)
	// [ constant pool cache      ]
	// [ methodOop                ]
	// [ locals offset            ]
	// [ sender's sp              ]
	// [ sender's fp              ]
	// [ return address           ] <--- fp
	// [ method holder mirror     ]
	// [ result type info         ]
	// [ argument word n-1        ] <--- sender's sp
	//   ...
	// [ argument word 0          ] <--- S7


	// make sure method is native & not abstract
#ifdef ASSERT
	__ lw(T0, T7, in_bytes(methodOopDesc::access_flags_offset()));
	{
		Label L;
		__ andi(T2, T0, JVM_ACC_NATIVE);
		__ bne(T2, ZERO, L);
		__ delayed()->nop();
		__ stop("tried to execute native method as non-native");
		__ bind(L);
	}
	{ Label L;
		__ andi(T2, T0, JVM_ACC_ABSTRACT);
		__ beq(T2, ZERO, L);
		__ delayed()->nop();
		__ stop("tried to execute abstract method in interpreter");
		__ bind(L);
	}
#endif

	// Since at this point in the method invocation the exception handler
	// would try to exit the monitor of synchronized methods which hasn't
	// been entered yet, we set the thread local variable
	// _do_not_unlock_if_synchronized to true. The remove_activation will
	// check this flag.
	Register thread = TREG;
#ifndef OPT_THREAD
	__ get_thread(thread);
#endif
	__ move(AT, (int)true);
	__ sb(AT, thread, in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));

#ifndef CORE
	// increment invocation count & check for overflow
	Label invocation_counter_overflow;
	if (inc_counter) {
		generate_counter_incr(&invocation_counter_overflow, NULL, NULL);
	}
	Label continue_after_compile;
	__ bind(continue_after_compile);
#endif // CORE

	bang_stack_shadow_pages(true);

	// reset the _do_not_unlock_if_synchronized flag
#ifndef OPT_THREAD
	__ get_thread(thread);
#endif
	__ sb(ZERO, thread, in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));

	// check for synchronized methods
	// Must happen AFTER invocation_counter check and stack overflow check,
	// so method is not locked if overflows.
	if (synchronized) {
		lock_method();
	} else {
		// no synchronization necessary
#ifdef ASSERT
		{
			Label L;
			__ lw(T0, T7, in_bytes(methodOopDesc::access_flags_offset()));
			__ andi(T2, T0, JVM_ACC_SYNCHRONIZED);
			__ beq(T2, ZERO, L);
			__ delayed()->nop();
			__ stop("method needs synchronization");
			__ bind(L);
		}
#endif
	}

	// after method_lock, the layout of the frame is as follows
	//
	// [ monitor entry            ] <--- sp
	//   ...
	// [ monitor entry            ]
	// [ monitor block top        ] ( the top monitor entry ) 
	// [ byte code pointer (0)    ] (if native, bcp = 0)
	// [ constant pool cache      ]
	// [ methodOop                ]
	// [ locals offset            ]
	// [ sender's sp              ]
	// [ sender's fp              ]
	// [ return address           ] <--- fp
	// [ method holder mirror     ]
	// [ result type info         ]
	// [ argument word n-1        ] <--- ( sender's sp )
	//   ...
	// [ argument word 0          ] <--- S7

	// start execution
#ifdef ASSERT
	{ Label L;
		__ lw(T0, FP, frame::interpreter_frame_monitor_block_top_offset * wordSize);
		__ beq(T0, SP, L);
		__ delayed()->nop();
		__ stop("broken stack frame setup in interpreter in asm");
		__ bind(L);
	}
#endif

	// jvmti/jvmpi support
	__ notify_method_entry();

	// work registers
	const Register method = T7;
	//const Register thread = T2;
	const Register t      = T3;    

	// allocate space for parameters
	__ get_method(method);

	__ verify_oop(method);
	__ lhu(t, method, in_bytes(methodOopDesc::size_of_parameters_offset()));
	// FIXME: to align long/double parameters, we reserve twice the size
	// actually needed (jdk6 allocates 2 more words here, @jerome)
	//	__ shl(t, Interpreter::logStackElementSize());
	__ shl(t, 3);
	__ addi(t, t, 2 * wordSize);	// for JNIEnv and mirror
	__ sub(SP, SP, t);
	__ move(AT, -8);
	__ andr(SP, SP, AT);	// align SP on an 8-byte boundary
	// [			      ] <--- sp
	//   ...                        size of parameters
	// [ monitor entry            ] 
	//   ...
	// [ monitor entry            ]
	// [ monitor block top        ] ( the top monitor entry ) 
	// [ byte code pointer (0)    ] (if native, bcp = 0)
	// [ constant pool cache      ]
	// [ methodOop                ]
	// [ locals offset            ]
	// [ sender's sp              ]
	// [ sender's fp              ]
	// [ return address           ] <--- fp
	// [ method holder mirror     ]
	// [ result type info         ]
	// [ argument word n-1        ] <--- ( sender's sp )
	//	 ...
	// [ argument word 0          ] <--- LVP

	// get signature handler
	{ Label L;
		__ lw(T9, method, in_bytes(methodOopDesc::signature_handler_offset()));
		__ bne(T9, ZERO, L);
		__ delayed()->nop();
		__ call_VM(NOREG, CAST_FROM_FN_PTR(address, 
					InterpreterRuntime::prepare_native_call), method);
		__ get_method(method);
		__ lw(T9, method, in_bytes(methodOopDesc::signature_handler_offset()));
		__ bind(L);
	}

	// call signature handler
	// FIXME: if the code in InterpreterRuntime changes, revisit this point
	// from: begin of parameters
	assert(InterpreterRuntime::SignatureHandlerGenerator::from() == LVP, "adjust this code");
	// to: current sp
	assert(InterpreterRuntime::SignatureHandlerGenerator::to  () == SP, "adjust this code");
	// temp: T3
	assert(InterpreterRuntime::SignatureHandlerGenerator::temp() == t  , "adjust this code");

	__ jalr(T9);
	__ delayed()->nop();
	__ get_method(method);	// the slow-path call may have clobbered the method register

	/* 
	   If the native function is static, its second parameter is a double
	   word, and its first parameter is a single word, we have to reserve
	   one word for the first parameter, per the MIPS o32 ABI. If the
	   native function is not static, the same applies to the third and
	   second parameters.
	 */
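	/*
	   Illustrative o32 example for the note above (an assumption-level
	   sketch, not code in this file): for a static native
	       jlong f(JNIEnv* env, jclass cls, jint a, jlong b)
	   env->A0, cls->A1, a->A2; 'b' must be 8-byte aligned, so A3 is
	   skipped (the reserved word) and 'b' starts in an 8-aligned stack slot.
	 */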


	// result handler is in V0
	// set result handler
	__ sw(V0, FP, (frame::interpreter_frame_result_handler_offset)*wordSize);

#define FIRSTPARA_SHIFT_COUNT 5
#define SECONDPARA_SHIFT_COUNT 9
#define THIRDPARA_SHIFT_COUNT 13
#define PARA_MASK	0xf

	// pass mirror handle if static call
	{ 
		Label L;
		const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() 
			+ Klass::java_mirror_offset_in_bytes();
		__ lw(t, method, in_bytes(methodOopDesc::access_flags_offset()));
		__ andi(t, t, JVM_ACC_STATIC);
		__ beq(t, ZERO, L);
		__ delayed()->nop();

		// get mirror
		__ lw(t, method, in_bytes(methodOopDesc:: constants_offset()));
		__ lw(t, t, constantPoolOopDesc::pool_holder_offset_in_bytes());
		__ lw(t, t, mirror_offset);
		// copy mirror into activation frame
		//__ sw(t, FP, frame::interpreter_frame_oop_temp_offset * wordSize);
		// pass handle to mirror
		__ st_ptr(t, FP, frame::interpreter_frame_oop_temp_offset * wordSize);
		__ addi(t, FP, frame::interpreter_frame_oop_temp_offset * wordSize);
//		__ ld_ptr(t,Address(SP ,wordSize));		
		__ st_ptr(t, Address(SP, wordSize));
		__ move(A1, t);
		__ bind(L);
	}

	// [ mthd holder mirror ptr   ] <--- sp  --------------------| (only for static method)
	// [                          ]                              |
	//   ...                        size of parameters(or +1)    |
	// [ monitor entry            ]                              |
	//   ...                                                     |
	// [ monitor entry            ]                              |
	// [ monitor block top        ] ( the top monitor entry )    |
	// [ byte code pointer (0)    ] (if native, bcp = 0)         |
	// [ constant pool cache      ]                              |
	// [ methodOop                ]                              |
	// [ locals offset            ]                              |
	// [ sender's sp              ]                              |
	// [ sender's fp              ]                              |
	// [ return address           ] <--- fp                      |
	// [ method holder mirror     ] <----------------------------|                             
	// [ result type info         ]
	// [ argument word n-1        ] <--- ( sender's sp )
	//	 ...
	// [ argument word 0          ] <--- S7

	// get native function entry point
	{ Label L;
		__ lw(T9, method, in_bytes(methodOopDesc::native_function_offset()));
		__ move(V1, (uintptr_t) SharedRuntime::native_method_throw_unsatisfied_link_error_entry());
		__ bne(V1, T9, L);
		__ delayed()->nop();
		__ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::prepare_native_call), method);
		__ get_method(method);
		__ verify_oop(method);
		__ lw(T9, method, in_bytes(methodOopDesc::native_function_offset()));
		__ bind(L);
	}

	// pass JNIEnv
	// native function in T9
#ifndef OPT_THREAD
	__ get_thread(thread);
#endif
	__ addi(t, thread, in_bytes(JavaThread::jni_environment_offset()));
	// FIXME: passing the JNIEnv on the stack won't work when float/double
	// parameters are involved (@jerome, 10/17/2006); pass it in A0 instead
	__ move(A0, t);
	// [ jni environment          ] <--- sp
	// [ mthd holder mirror ptr   ] ---------------------------->| (only for static method)
	// [                          ]                              |
	//   ...                        size of parameters           |
	// [ monitor entry            ]                              |
	//   ...                                                     |
	// [ monitor entry            ]                              |
	// [ monitor block top        ] ( the top monitor entry )    |
	// [ byte code pointer (0)    ] (if native, bcp = 0)         |
	// [ constant pool cache      ]                              |
	// [ methodOop                ]                              |
	// [ locals offset            ]                              |
	// [ sender's sp              ]                              |
	// [ sender's fp              ]                              |
	// [ return address           ] <--- fp                      |
	// [ method holder mirror     ] <----------------------------|                             
	// [ result type info         ]
	// [ argument word n-1        ] <--- ( sender's sp )
	//	 ...
	// [ argument word 0          ] <--- S7

	// set_last_Java_frame_before_call
	// It is enough that the pc() points into the right code segment.
	// It does not have to be the correct return pc.
	__ sw(FP, thread, in_bytes(JavaThread::last_Java_fp_offset()));
	// change thread state
#ifdef ASSERT
	{ Label L;
		__ lw(t, thread, in_bytes(JavaThread::thread_state_offset()));
		__ addi(t, t, (-1) * _thread_in_Java);
		__ beq(t, ZERO, L);
		__ delayed()->nop();
		__ stop("Wrong thread state in native stub");
		__ bind(L);
	}
#endif

	// Change state to native (we save the return address in the thread, since it might not
	// be pushed on the stack when we do a stack traversal). It is enough that the pc()
	// points into the right code segment. It does not have to be the correct return pc.
	__ move(t, (int) __ pc());
	__ sw(t, thread, in_bytes(JavaThread::last_Java_pc_offset())); 
	__ sw(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
	
	__ move(t, _thread_in_native);
	__ sw(t, thread, in_bytes(JavaThread::thread_state_offset()));

	// FIXME: the parameter passing convention has not been finalized;
	// this will be wrong when float/double parameters are passed (@jerome, 10/17/2006)
	__ lw(A1, SP, 1 * wordSize);
	__ lw(A2, SP, 2 * wordSize);
	__ lw(A3, SP, 3 * wordSize);

	// call native method
	__ jalr(T9);
	__ delayed()->nop();
	// result potentially in V0:V1 or F0:F1

	if (CheckJNICalls) {
		// FIXME: should call StubRoutines::gs2::verify_fpu_cntrl_wrd_entry()
		// via relocInfo::runtime_call_type here
	}

	// S0 is restored to a legal interpreter frame (bci == 0 <=> S0 ==
	// code_base()) further below, once it is safe to do so.

	// The return address will be found via _last_native_pc, not via _last_java_sp.
	// NOTE: the order of these saves is known to frame::interpreter_frame_result.
	// If the order changes or anything else is added to the stack, the code in
	// interpreter_frame_result will have to be changed.
	// FIXME: should modify here
	// save the return value to keep it from being destroyed by other calls
	__ move(S1, V0);
	__ move(S3, V1);
	__ mfc1(S4, F0);
	__ mfc1(S5, F1);

	// change thread state
	__ get_thread(thread); 
	__ move(t, _thread_in_native_trans);
	__ sw(t, thread, in_bytes(JavaThread::thread_state_offset()));

	if( os::is_MP() ) __ sync(); // Force this write out before the read below

	// check for safepoint operation in progress and/or pending suspend requests
	{ Label Continue;

		// Don't use call_VM as it will see a possible pending exception and forward it
		// and never return here, preventing us from clearing _last_native_pc down below.
		// Also can't use call_VM_leaf, as it will check whether BCP & LVP are
		// preserved and correspond to the bcp/locals pointers. So we do a runtime call
		// by hand.
		//
		Label L;
		__ move(T4, (int)SafepointSynchronize::address_of_state());
		__ lw(T0, T4, 0);
		__ bne(T0, ZERO, L);
		__ delayed()->nop();
		__ lw(T0, thread, in_bytes(JavaThread::suspend_flags_offset()));
		__ beq(T0, ZERO, Continue);
		__ delayed()->nop();
		__ bind(L);
		__ addi(SP, SP, (-1) * wordSize);
		__ move(A0, thread);
		__ call(CAST_FROM_FN_PTR(address, 
		             JavaThread::check_special_condition_for_native_trans), 
				  relocInfo::runtime_call_type);
		__ delayed()->nop();
		__ addi(SP, SP, wordSize);

	//	__ get_method(method);
#ifndef OPT_THREAD
		__ get_thread(thread);
#endif

		__ bind(Continue);
	}
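
	// Equivalent condition for the hand-made runtime call above (a sketch;
	// _not_synchronized is the zero state tested by the bne against ZERO):
	//
	//   if (SafepointSynchronize::state != _not_synchronized
	//       || thread->suspend_flags != 0)
	//     JavaThread::check_special_condition_for_native_trans(thread);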

	// change thread state
	__ move(t, _thread_in_Java);
	__ sw(t, thread, in_bytes(JavaThread::thread_state_offset()));
	__ reset_last_Java_frame(thread, true, true);
	// reset handle block
	__ lw(t, thread, in_bytes(JavaThread::active_handles_offset()));
	__ sw(ZERO, t, JNIHandleBlock::top_offset_in_bytes());
	// If result was an oop then unbox and save it in the frame
	{ Label no_oop, store_result;
		// FIXME: addi only supports a 16-bit immediate, so the handler
		// address is materialized with move and compared directly
		__ lw(AT, FP, frame::interpreter_frame_result_handler_offset * wordSize);
		__ move(T6, (int)AbstractInterpreter::result_handler(T_OBJECT));
		__ bne(AT, T6, no_oop);
		__ delayed()->nop();
		__ move(V0, S1);
		__ beq(V0, ZERO, store_result);
		__ delayed()->nop();
		// unbox
		__ lw(V0, V0, 0);
		__ bind(store_result);
		__ sw(V0, FP, (frame::interpreter_frame_oop_temp_offset) * wordSize);
		// keep stack depth as expected by pushing oop which will eventually be discarded
		__ bind(no_oop);
	}
	{
		Label no_reguard;
		__ lw(t, thread, in_bytes(JavaThread::stack_guard_state_offset()));
		__ move(AT, (int)JavaThread::stack_guard_yellow_disabled);
		__ bne(t, AT, no_reguard);
		__ delayed()->nop();
		__ pushad();
		__ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages),
				relocInfo::runtime_call_type);
		__ delayed()->nop();
		__ popad();
		__ bind(no_reguard);
	}
	// restore BCP to have a legal interpreter frame,
	// i.e., bci == 0 <=> BCP == code_base()
	// Can't call_VM until bcp is within a reasonable range.
	__ get_method(method);      // method is junk from thread_in_native to now.
	__ verify_oop(method);
	__ lw(BCP, method, in_bytes(methodOopDesc::const_offset()));   // get constMethodOop
	__ lea(BCP, Address(BCP, in_bytes(constMethodOopDesc::codes_offset()))); // get codebase
	// handle exceptions (exception handling will handle unlocking!)
	{ 
		Label L;
		__ lw(t, thread, in_bytes(Thread::pending_exception_offset()));
		__ beq(t, ZERO, L);
		__ delayed()->nop();
		// Note: At some point we may want to unify this with the code used in 
		// call_VM_base();
		// i.e., we should use the StubRoutines::forward_exception code. For now this
		// doesn't work here because the sp is not correctly set at this point.
		__ MacroAssembler::call_VM(noreg, CAST_FROM_FN_PTR(address, 
					InterpreterRuntime::throw_pending_exception));
		__ should_not_reach_here();
		__ bind(L);
	}

	// do unlocking if necessary
	{ Label L;
		__ lw(t, method, in_bytes(methodOopDesc::access_flags_offset()));
		__ andi(t, t, JVM_ACC_SYNCHRONIZED);
		__ beq(t, ZERO, L);
		// the code below should be shared with interpreter macro assembler implementation
		{ Label unlock;
			// BasicObjectLock will be first in list,
			// since this is a synchronized method. However, need
			// to check that the object has not been unlocked by 
			// an explicit monitorexit bytecode.        
			__ delayed()->addi(T6, FP, frame::interpreter_frame_initial_sp_offset 
					* wordSize - (int)sizeof(BasicObjectLock));
			// address of first monitor

			__ lw(t, T6, BasicObjectLock::obj_offset_in_bytes());
			__ bne(t, ZERO, unlock);
			__ delayed()->nop();

			// Entry already unlocked, need to throw exception
			__ MacroAssembler::call_VM(NOREG, CAST_FROM_FN_PTR(address, 
				InterpreterRuntime::throw_illegal_monitor_state_exception));
			__ should_not_reach_here();

			__ bind(unlock);        
			__ unlock_object(T6);             
		}
		__ bind(L);
	}    

	// jvmti/jvmpi support
	// Note: This must happen _after_ handling/throwing any exceptions since
	//       the exception handler code notifies the runtime of method exits
	//       too. If this happens before, method entry/exit notifications are
	//       not properly paired (was bug - gri 11/22/99).
	__ notify_method_exit(false, vtos, InterpreterMacroAssembler::NotifyJVMTI );

	// restore the potential result in V0:V1 and F0:F1, then call the
	// result handler to normalize and handle the result
	__ move(V0, S1);
	__ move(V1, S3);
	__ mtc1(S4, F0);
	__ mtc1(S5, F1);
	__ lw(t, FP, (frame::interpreter_frame_result_handler_offset) * wordSize);
	__ jalr(t);
	__ delayed()->nop();

	// remove activation
	__ lw(SP, FP, frame::interpreter_frame_sender_sp_offset * wordSize); // get sender sp
	__ lw(RA, FP, frame::interpreter_frame_return_addr_offset * wordSize); // get return address
	__ lw(FP, FP, frame::interpreter_frame_sender_fp_offset * wordSize); // restore sender's fp
	__ jr(RA);
	__ delayed()->nop();

#ifndef CORE
	if (inc_counter) {
		// Handle overflow of counter and compile method
		__ bind(invocation_counter_overflow);
		generate_counter_overflow(&continue_after_compile);
		// entry_point is the beginning of this
		// function and checks again for compiled code
	}
#endif
	return entry_point;
}
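
// Overall shape of the native entry generated above, as a hedged summary
// (the thread states are the real JavaThreadState values used in the code):
//
//   build fixed frame (bcp = 0); lock if synchronized;
//   pass JNIEnv in A0 (+ mirror handle for static methods);
//   state = _thread_in_native;        call native function via T9;
//   state = _thread_in_native_trans;  sync();  safepoint/suspend poll;
//   state = _thread_in_Java;  unbox oop result;  rethrow pending exception;
//   unlock if synchronized;  run result handler;  pop frame and return.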

//
// Generic interpreted method entry to (asm) interpreter
//
// Layout of frame just at the entry
//
//   [ argument word n-1	] <--- sp
//     ...
//   [ argument word 0  	]
// Prerequisite to the generated stub: the callee methodOop in T7.
// Note: you must save the caller's bcp before calling the generated stub.
//
address InterpreterGenerator::generate_normal_entry(bool synchronized) {
  // determine code generation flags
  bool inc_counter  = UseCompiler || CountCompiledCalls;

	// T7: methodOop
	// T5: sender's sp
	address entry_point = __ pc();
/*
#ifndef CORE
	// check if compiled code exists
	Label run_compiled_code;
	if (!CompileTheWorld) {
	check_for_compiled_code(run_compiled_code);
	}
#endif
*/
#ifndef CORE
	const Address invocation_counter(T7, 
	in_bytes(methodOopDesc::invocation_counter_offset() + InvocationCounter::counter_offset()));
#endif

	// get parameter size (always needed)
	__ lhu(V0, T7, in_bytes(methodOopDesc::size_of_parameters_offset()));

	// T7: methodOop
	// V0: size of parameters
	// T5: sender's sp; it may differ from sp + wordSize if we were called via c2i
	// get size of locals in words to T2
	__ lhu(T2, T7, in_bytes(methodOopDesc::size_of_locals_offset()));       	
	// T2 = no. of additional locals, locals include parameters
	__ sub(T2, T2, V0);                                

	// see if we've got enough room on the stack for locals plus overhead.
	// Layout of frame at this point
	//
	// [ argument word n-1  ] <--- sp
	//   ...
	// [ argument word 0  	]
	generate_stack_overflow_check();
	// after this function, the layout of frame does not change

	// compute beginning of parameters (S7)
	__ sll(LVP, V0, Interpreter::stackElementScale());
	__ addiu(LVP, LVP, (-1) * wordSize);
	__ add(LVP, LVP, SP);
	// remember current sp
	//__ move(T0, SP);		// SP --> T0

	// T2 - # of additional locals
	// allocate space for locals
	// explicitly initialize locals
	{
		Label exit, loop;
		__ beq(T2, ZERO, exit);
		__ delayed()->nop();
		__ bind(loop);
		if (TaggedStackInterpreter) __ addi(SP, SP, -1 * wordSize);
		__ sw(ZERO, SP, -1 * wordSize);     // initialize local variables
		__ addiu(T2, T2, -1);               // until everything initialized
		__ bne(T2, ZERO, loop);
		__ delayed();
		__ addiu(SP, SP, (-1) * wordSize);  // fill delay slot
		__ bind(exit);
	}

#ifndef CORE
	if (inc_counter) __ lw(T3, invocation_counter);  // (pre-)fetch invocation count
#endif
	// 				
	// [ local var m-1	] <--- sp
	//   ...
	// [ local var 0	]
	// [ argument word n-1	] <--- T0
	//   ...
	// [ argument word 0  	] <--- S7

	// initialize fixed part of activation frame

	generate_fixed_frame(false);


	// after this function, the layout of the frame is as follows
	//
	// [ monitor block top        ] <--- sp ( the top monitor entry )
	// [ byte code pointer        ] (if native, bcp = 0)
	// [ constant pool cache      ]
	// [ methodOop                ]
	// [ locals offset            ]
	// [ sender's sp              ]
	// [ sender's fp              ] <--- fp
	// [ return address           ] 
	// [ local var m-1            ]
	//   ...
	// [ local var 0              ]
	// [ argument word n-1        ] <--- ( sender's sp )
	//   ...
	// [ argument word 0          ] <--- S7


	// make sure method is not native & not abstract
#ifdef ASSERT
	__ lw(T0, T7, in_bytes(methodOopDesc::access_flags_offset()));
	{
		Label L;
		__ andi(T2, T0, JVM_ACC_NATIVE);
		__ beq(T2, ZERO, L);
		__ delayed()->nop();
		__ stop("tried to execute native method as non-native");
		__ bind(L);
	}
	{ Label L;
		__ andi(T2, T0, JVM_ACC_ABSTRACT);
		__ beq(T2, ZERO, L);
		__ delayed()->nop();
		__ stop("tried to execute abstract method in interpreter");
		__ bind(L);
	}
#endif

  // Since at this point in the method invocation the exception handler
  // would try to exit the monitor of synchronized methods which hasn't
  // been entered yet, we set the thread local variable
  // _do_not_unlock_if_synchronized to true. The remove_activation will
  // check this flag.

#ifndef OPT_THREAD
	Register thread = T0;
	__ get_thread(T0);
#else
	Register thread = TREG;
#endif
	__ move(AT, (int)true);
	__ sb(AT, thread, in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));

#ifndef CORE
	// increment invocation count & check for overflow
	Label invocation_counter_overflow;
	Label profile_method;
	Label profile_method_continue;
	if (inc_counter) {
		generate_counter_incr(&invocation_counter_overflow, &profile_method, 
				&profile_method_continue);
		if (ProfileInterpreter) {
			__ bind(profile_method_continue);
		}
	}

	Label continue_after_compile;
	__ bind(continue_after_compile);

#endif // CORE

	bang_stack_shadow_pages(false);

	// reset the _do_not_unlock_if_synchronized flag
#ifndef OPT_THREAD
	__ get_thread(thread);
#endif
	__ sb(ZERO, thread, in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));

	// check for synchronized methods
	// Must happen AFTER invocation_counter check and stack overflow check,
	// so method is not locked if overflows.
	//
	if (synchronized) {
		// Allocate monitor and lock method
		lock_method();
	} else {
		// no synchronization necessary
#ifdef ASSERT
		{ Label L;
			__ lw(AT, T7, in_bytes(methodOopDesc::access_flags_offset()));
			__ andi(T2, AT, JVM_ACC_SYNCHRONIZED);
			__ beq(T2, ZERO, L);
			__ delayed()->nop();
			__ stop("method needs synchronization");
			__ bind(L);
		}
#endif
	}
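	// lock_method() is expected to grow the stack by one monitor entry and
	// lock the receiver (or the class oop for a static method); roughly
	// (illustrative sketch, the details live in lock_method()):
	//
	//   SP -= frame::interpreter_frame_monitor_size() * wordSize;  // new BasicObjectLock
	//   monitor->set_obj(receiver_or_class);                       // object to lock
	//   lock_object(monitor);                                      // fast/slow path locking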

	// layout of frame after lock_method
	// [ monitor entry            ] <--- sp
	//   ...
	// [ monitor entry            ]
	// [ monitor block top        ] ( the top monitor entry )
	// [ byte code pointer        ] (if native, bcp = 0)
	// [ constant pool cache      ]
	// [ methodOop                ]
	// [ locals offset            ]
	// [ sender's sp              ]
	// [ sender's fp              ]
	// [ return address           ] <--- fp
	// [ local var m-1            ]
	//   ...
	// [ local var 0              ]
	// [ argument word n-1        ] <--- ( sender's sp )
	//   ...
	// [ argument word 0          ] <--- S7


	// start execution
#ifdef ASSERT
	{ Label L;
		__ lw(AT, FP, frame::interpreter_frame_monitor_block_top_offset * wordSize);
		__ beq(AT, SP, L);
		__ delayed()->nop();
		__ stop("broken stack frame setup in interpreter in native");
		__ bind(L);
	}
#endif

	// jvmti/jvmpi support
	__ notify_method_entry();

	__ dispatch_next(vtos);

#ifndef CORE
	// invocation counter overflow
	if (inc_counter) {
		if (ProfileInterpreter) {
			// We have decided to profile this method in the interpreter
			__ bind(profile_method);

			__ call_VM(noreg, CAST_FROM_FN_PTR(address, 
						InterpreterRuntime::profile_method), T5, true);

			__ lw(T7, FP, method_offset);
			__ lw(FSR, T7, in_bytes(methodOopDesc::method_data_offset()));
			__ sw(FSR, FP, frame::interpreter_frame_mdx_offset * wordSize);
			__ test_method_data_pointer(FSR, profile_method_continue);
			__ addiu(FSR, FSR, in_bytes(methodDataOopDesc::data_offset()));
			__ sw(FSR, FP, frame::interpreter_frame_mdx_offset * wordSize);
			__ b(profile_method_continue);
			__ delayed()->nop();
		}
		// Handle overflow of counter and compile method
		__ bind(invocation_counter_overflow);
		generate_counter_overflow(&continue_after_compile); 
	}

#endif
	return entry_point;
}

// Entry points
//
// Here we generate the various kinds of entries into the interpreter.
// The two main entry types are generic bytecode methods and native
// call methods.  These both come in synchronized and non-synchronized
// versions but the frame layout they create is very similar. The
// other method entry types are really just special purpose entries
// that are really entry and interpretation all in one. These are for
// trivial methods like accessor, empty, or special math methods.
//
// When control flow reaches any of the entry types for the interpreter
// the following holds ->
//
// Arguments:
//
// T7: methodOop
// V0: receiver
//
//
// Stack layout immediately at entry
//
// [ parameter n-1      ] <--- sp
//   ...
// [ parameter 0        ]
// [ expression stack   ] (caller's java expression stack)

// Assuming that we don't go to one of the trivial specialized entries
// the stack will look like below when we are ready to execute the
// first bytecode (or call the native routine). The register usage
// will be as the template based interpreter expects (see
// interpreter_mips.hpp).
//
// local variables follow incoming parameters immediately (i.e.,
// the return address is moved to the end of the locals).
//
// [ monitor entry            ] <--- sp
//   ...
// [ monitor entry            ]
// [ monitor block top        ] ( the top monitor entry )
// [ byte code pointer        ] (if native, bcp = 0)
// [ constant pool cache      ]
// [ methodOop                ]
// [ locals offset            ]
// [ sender's sp              ]
// [ sender's fp              ]
// [ return address           ] <--- fp
// [ local var m-1            ]
//   ...
// [ local var 0              ]
// [ argument word n-1        ] <--- ( sender's sp )
//   ...
// [ argument word 0          ] <--- S7

address AbstractInterpreterGenerator::generate_method_entry(
                                        AbstractInterpreter::MethodKind kind) {
  // determine code generation flags
  bool synchronized = false;
  address entry_point = NULL;
  switch (kind) {
    case Interpreter::zerolocals             :                                                                            break;
    case Interpreter::zerolocals_synchronized: synchronized = true;                                                       break;
    case Interpreter::native                 : entry_point = ((InterpreterGenerator*)this)->generate_native_entry(false); break;
    case Interpreter::native_synchronized    : entry_point = ((InterpreterGenerator*)this)->generate_native_entry(true);  break;
    case Interpreter::empty                  : entry_point = ((InterpreterGenerator*)this)->generate_empty_entry();       break;
    case Interpreter::accessor               : entry_point = ((InterpreterGenerator*)this)->generate_accessor_entry();    break;
    case Interpreter::abstract               : entry_point = ((InterpreterGenerator*)this)->generate_abstract_entry();    break;

    case Interpreter::java_lang_math_sin     : // fall thru
    case Interpreter::java_lang_math_cos     : // fall thru
    case Interpreter::java_lang_math_tan     : // fall thru
    case Interpreter::java_lang_math_abs     : // fall thru
    case Interpreter::java_lang_math_log     : // fall thru
    case Interpreter::java_lang_math_log10   : // fall thru
    case Interpreter::java_lang_math_sqrt    : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind);    break;

    default                                  : ShouldNotReachHere();                                                      break;
  }
  if (entry_point) return entry_point;

  return ((InterpreterGenerator*)this)->generate_normal_entry(synchronized);
}

// How much stack a method activation needs in words.
int AbstractInterpreter::size_top_interpreter_activation(methodOop method) {

	const int entry_size    = frame::interpreter_frame_monitor_size();

	// total overhead size: entry_size + (saved fp through expr stack bottom).
	// be sure to change this if you add/subtract anything to/from the overhead area
	const int overhead_size = -(frame::interpreter_frame_initial_sp_offset) + entry_size;

	const int stub_code = 6;  // see generate_call_stub
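	// Worked example (illustrative): with stackElementWords() == 1, a method
	// with max_locals == 3 and max_stack == 4 needs
	// overhead_size + (3 + 4) + 6 words for its top activation.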
	const int method_stack = (method->max_locals() + method->max_stack()) *
					Interpreter::stackElementWords();
	return overhead_size + method_stack + stub_code;
}

int AbstractInterpreter::layout_activation(methodOop method,
                                           int tempcount,
                                           int popframe_extra_args,
                                           int moncount,
                                           int callee_param_count,
                                           int callee_locals,
                                           frame* caller,
                                           frame* interpreter_frame,
                                           bool is_top_frame) {
  // Note: This calculation must exactly parallel the frame setup
  // in AbstractInterpreterGenerator::generate_method_entry.
  // If interpreter_frame!=NULL, set up the method, locals, and monitors.
  // The frame interpreter_frame, if not NULL, is guaranteed to be the
  // right size, as determined by a previous call to this method.
  // It is also guaranteed to be walkable even though it is in a skeletal state

  // fixed size of an interpreter frame:
  int max_locals = method->max_locals() * Interpreter::stackElementWords();
  int extra_locals = (method->max_locals() - method->size_of_parameters()) *
                     Interpreter::stackElementWords();

  int overhead = frame::sender_sp_offset - frame::interpreter_frame_initial_sp_offset;
  // Our locals were accounted for by the caller (or last_frame_adjust on the transition).
  // Since the callee parameters already account for the callee's params we only need to
  // account for the extra locals.

  int size = overhead +
             ((callee_locals - callee_param_count) * Interpreter::stackElementWords()) +
             (moncount * frame::interpreter_frame_monitor_size()) +
             tempcount * Interpreter::stackElementWords() + popframe_extra_args;
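  // Worked example (illustrative): with stackElementWords() == 1, one monitor
  // (moncount == 1), two stack temps (tempcount == 2), no popframe args, and
  // callee_locals == callee_param_count, this reduces to
  //   size = overhead + frame::interpreter_frame_monitor_size() + 2   // words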
  if (interpreter_frame != NULL) {
#ifdef ASSERT
    assert(caller->sp() == interpreter_frame->interpreter_frame_sender_sp(), "Frame not properly walkable");
#endif

    interpreter_frame->interpreter_frame_set_method(method);
    // NOTE the difference in using sender_sp and interpreter_frame_sender_sp
    // interpreter_frame_sender_sp is the original sp of the caller (the unextended_sp)
    // and sender_sp is fp+8
    intptr_t* locals = interpreter_frame->sender_sp() + max_locals - 1;

    interpreter_frame->interpreter_frame_set_locals(locals);
    BasicObjectLock* montop = interpreter_frame->interpreter_frame_monitor_begin();
    BasicObjectLock* monbot = montop - moncount;
    interpreter_frame->interpreter_frame_set_monitor_end(monbot);

    // set last sp
    intptr_t* esp = (intptr_t*) monbot - tempcount * Interpreter::stackElementWords() -
                    popframe_extra_args;
    interpreter_frame->interpreter_frame_set_last_sp(esp);
    // All frames but the initial interpreter frame we fill in have a
    // value for sender_sp that allows walking the stack but isn't
    // truly correct. Correct the value here.
    if (extra_locals != 0 &&
        interpreter_frame->sender_sp() == interpreter_frame->interpreter_frame_sender_sp()) {
      interpreter_frame->set_interpreter_frame_sender_sp(caller->sp() + extra_locals);
    }
    *interpreter_frame->interpreter_frame_cache_addr() = 
      method->constants()->cache();
  }
  return size;
}

//-----------------------------------------------------------------------------
// Exceptions

void TemplateInterpreterGenerator::generate_throw_exception() {
  // Entry point in previous activation (i.e., if the caller was
  // interpreted)
  Interpreter::_rethrow_exception_entry = __ pc();

  // Restore sp to interpreter_frame_last_sp even though we are going
  // to empty the expression stack for the exception processing.
  __ sw(ZERO, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  
  // V0: exception
  // V1: return address/pc that threw exception
  __ restore_bcp();                              // S0/BCP points to the call/send bytecode
  __ restore_locals();

  // Entry point for exceptions thrown within interpreter code
  Interpreter::_throw_exception_entry = __ pc();  
  // expression stack is undefined here
  // V0: exception
  // BCP: exception bcp
  __ verify_oop(V0);

  // expression stack must be empty before entering the VM in case of an exception
  __ empty_expression_stack();
  // find exception handler address and preserve exception oop
  __ move(A1, V0);
  __ call_VM(V1, CAST_FROM_FN_PTR(address, InterpreterRuntime::exception_handler_for_exception), A1);
  // V0: exception handler entry point
  // V1: preserved exception oop
  // S0: bcp for exception handler
  __ addi(SP, SP, (-1) * wordSize);
  __ sw(V1, SP, 0);    // push exception which is now the only value on the stack
  __ jr(V0);           // jump to exception handler (may be _remove_activation_entry!)
  __ delayed()->nop();

  // If the exception is not handled in the current frame the frame is removed and
  // the exception is rethrown (i.e. exception continuation is _rethrow_exception).
  //
  // Note: At this point the bci is still the bci for the instruction which caused
  //       the exception and the expression stack is empty. Thus, for any VM calls
  //       at this point, GC will find a legal oop map (with empty expression stack).

  // In current activation
  // V0: exception
  // BCP: exception bcp

  //
  // JVMTI PopFrame support
  //

  Interpreter::_remove_activation_preserving_args_entry = __ pc();
  __ empty_expression_stack();
  // Set the popframe_processing bit in pending_popframe_condition indicating that we are
  // currently handling popframe, so that call_VMs that may happen later do not trigger new
  // popframe handling cycles.
#ifndef OPT_THREAD
  Register thread = T2;
  __ get_thread(thread);
#else
  Register thread = TREG;
#endif
  __ lw(T3, thread, in_bytes(JavaThread::popframe_condition_offset()));
  __ ori(T3, T3, JavaThread::popframe_processing_bit);
  __ sw(T3, thread, in_bytes(JavaThread::popframe_condition_offset()));
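  // In C terms (sketch):
  //
  //   thread->_popframe_condition |= JavaThread::popframe_processing_bit;
  //
  // so call_VMs issued below do not start a new popframe handling cycle.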

#ifndef CORE
  {
    // Check to see whether we are returning to a deoptimized frame.
    // (The PopFrame call ensures that the caller of the popped frame is
    // either interpreted or compiled and deoptimizes it if compiled.)
    // In this case, we can't call dispatch_next() after the frame is
    // popped, but instead must save the incoming arguments and restore
    // them after deoptimization has occurred.
    //
    // Note that we don't compare the return PC against the
    // deoptimization blob's unpack entry because of the presence of
    // adapter frames in C2.
    Label caller_not_deoptimized;
    __ lw(A0, FP, frame::return_addr_offset * wordSize);
    __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::interpreter_contains), A0);
    __ bne(V0, ZERO, caller_not_deoptimized);
    __ delayed()->nop();

    // Compute size of arguments for saving when returning to deoptimized caller
    __ get_method(A1);
    __ verify_oop(A1);
    __ lhu(A1, A1, in_bytes(methodOopDesc::size_of_parameters_offset()));
    __ shl(A1, Interpreter::logStackElementSize());
    __ restore_locals();
    // A2 = locals - argument size in bytes + wordSize, the bottom of the argument area
    __ sub(A2, LVP, A1);
    __ addiu(A2, A2, wordSize);
    // Save these arguments
#ifndef OPT_THREAD
		__ get_thread(A0);
#else
		__ move(A0, TREG);
#endif
		__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, Deoptimization::popframe_preserve_args), A0, A1, A2);


		__ remove_activation(vtos, T9, false, false, false);

    // Inform deoptimization that it is responsible for restoring these arguments
#ifndef OPT_THREAD
		__ get_thread(thread);
#endif
		__ move(AT, JavaThread::popframe_force_deopt_reexecution_bit);
		__ sw(AT, thread, in_bytes(JavaThread::popframe_condition_offset()));
    // Continue in deoptimization handler
    __ jr(T9);
    __ delayed()->nop();

    __ bind(caller_not_deoptimized);
  }
#endif /* !CORE */

  
  __ remove_activation(vtos, T3, 
                       /* throw_monitor_exception */ false, 
                       /* install_monitor_exception */ false,
                       /* notify_jvmdi */ false);

  // A previous I2C followed by a deoptimization might have moved the
  // outgoing arguments further up the stack. PopFrame expects the
  // mutations to those outgoing arguments to be preserved and other
  // constraints basically require this frame to look exactly as
  // though it had previously invoked an interpreted activation with
  // no space between the top of the expression stack (current
  // last_sp) and the top of stack. Rather than force deopt to
  // maintain this kind of invariant all the time we call a small
  // fixup routine to move the mutated arguments onto the top of our
  // expression stack if necessary.
  // Note: x86 performs this fixup; it may be unnecessary on MIPS (@jerome).
  __ move(T8, SP);
  __ lw(T6, FP, frame::interpreter_frame_last_sp_offset * wordSize);
#ifndef OPT_THREAD
  __ get_thread(thread);
#endif
  // PC must point into interpreter here
  __ set_last_Java_frame(thread, noreg, FP, __ pc());
  __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::popframe_move_outgoing_args), thread, T8, T6);
  __ get_thread(thread);
  __ reset_last_Java_frame(thread, true, true);
  // Restore the last_sp and null it out
  __ lw(SP, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  __ sw(ZERO, FP, frame::interpreter_frame_last_sp_offset * wordSize);

  // Finish with popframe handling
  __ restore_bcp();
  __ restore_locals();
#ifndef CORE
  // The method data pointer was incremented already during
  // call profiling. We have to restore the mdp for the current bcp.
  if (ProfileInterpreter) {
    __ set_method_data_pointer_for_bcp();
  }
#endif // !CORE
  // Clear the popframe condition flag
  __ get_thread(thread);
  __ move(AT, JavaThread::popframe_inactive);
  __ sw(AT, thread, in_bytes(JavaThread::popframe_condition_offset()));
  __ dispatch_next(vtos);
  // end of PopFrame support

  Interpreter::_remove_activation_entry = __ pc();
  
  // preserve exception over this code sequence (pop the exception oop)
  __ lw(T0, SP, 0);
  __ addi(SP, SP, wordSize);
#ifndef OPT_THREAD
  __ get_thread(thread);
#endif
  __ sw(T0, thread, in_bytes(JavaThread::vm_result_offset()));
  // remove the activation (without doing throws on illegalMonitorExceptions)
  __ remove_activation(vtos, T3, false, true, false);
  // restore exception
#ifndef OPT_THREAD
  __ get_thread(thread);
#endif
  __ lw(T0, thread, in_bytes(JavaThread::vm_result_offset()));
  __ sw(ZERO, thread, in_bytes(JavaThread::vm_result_offset()));
  __ verify_oop(T0);

  // In between activations - previous activation type unknown yet
  // compute continuation point - the continuation point expects the
  // following registers set up:
  //
  // T0: exception
  // T3: return address/pc that threw exception
  // SP: expression stack of caller
  // FP: fp of caller
  __ addi(SP, SP, (-2) * wordSize);
  __ sw(T0, SP, wordSize);                      // save exception
  __ sw(T3, SP, 0);                             // save return address
  __ move(A0, T3);
  __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), A0);
  __ move(T7, V0);                              // save exception handler
  __ lw(V0, SP, wordSize);                      // restore exception
  __ lw(V1, SP, 0);                             // restore return address
  __ addi(SP, SP, 2 * wordSize);

  // Note that an "issuing PC" is actually the next PC after the call
  __ jr(T7);                                    // jump to exception handler of caller
  __ delayed()->nop();
}


//
// JVMTI ForceEarlyReturn support
//
address TemplateInterpreterGenerator::generate_earlyret_entry_for(TosState state) {
  address entry = __ pc();
  // aoqi: FIXME? should bcp/locals be restored here?
  // __ restore_bcp();
  // __ restore_locals();
  __ empty_expression_stack();
  __ empty_FPU_stack();
  __ load_earlyret_value(state);

  __ get_thread(TREG);
  __ lw(TREG, TREG, in_bytes(JavaThread::jvmti_thread_state_offset()));
  const Address cond_addr(TREG, in_bytes(JvmtiThreadState::earlyret_state_offset()));

  // Clear the earlyret state
  __ move(AT, JvmtiThreadState::earlyret_inactive);
  __ sw(AT, cond_addr);

  __ remove_activation(state, T0,
                       false, /* throw_monitor_exception */
                       false, /* install_monitor_exception */
                       true); /* notify_jvmdi */
  __ jr(T0);
  __ delayed()->nop();
  return entry;
} // end of ForceEarlyReturn support


//-----------------------------------------------------------------------------
// Helper for vtos entry point generation

void TemplateInterpreterGenerator::set_vtos_entry_points(Template* t,
                                                         address& bep,
                                                         address& cep,
                                                         address& sep,
                                                         address& aep,
                                                         address& iep,
                                                         address& lep,
                                                         address& fep,
                                                         address& dep,
                                                         address& vep) {
  assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
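  // Each non-vtos entry spills the tosca register(s) for its state and then
  // falls into the common vtos entry; e.g. a caller arriving at fep
  // effectively executes (illustrative sketch):
  //
  //   push(ftos);   // spill the float top-of-stack value
  //   goto L;       // join the vtos path and generate_and_dispatch(t)
  //
  // bep/cep/sep/iep share one entry because those states use the same
  // integer tosca register.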
  Label L;
  fep = __ pc(); __ push(ftos); __ b(L); __ delayed()->nop();
  dep = __ pc(); __ push(dtos); __ b(L); __ delayed()->nop();
  lep = __ pc(); __ push(ltos); __ b(L); __ delayed()->nop();
  aep = __ pc(); __ push(atos); __ b(L); __ delayed()->nop();
  bep = cep = sep = iep = __ pc(); __ push(itos); 
  vep = __ pc(); __ bind(L);    // fall through
  generate_and_dispatch(t);
}


//-----------------------------------------------------------------------------
// Generation of individual instructions

// helpers for generate_and_dispatch


InterpreterGenerator::InterpreterGenerator(StubQueue* code)
  : TemplateInterpreterGenerator(code) {
   generate_all(); // down here so it can be "virtual"
}

//-----------------------------------------------------------------------------

// Non-product code
#ifndef PRODUCT
address TemplateInterpreterGenerator::generate_trace_code(TosState state) {
  address entry = __ pc();

	// prepare expression stack
	__ push(state);       // save tosca

	// tos & tos2, added by yjl 7/15/2005:
	// trace_bytecode actually takes 4 arguments, the last two being tos and tos2.
	// That works on x86, but under the MIPS o32 calling convention the callee
	// may spill A2-A3 into the stack slots it assumes hold tos and tos2; when
	// the expression stack holds fewer than two values this corrupts the stack.
	// So we pass the two top-of-stack words explicitly.
	__ lw(A2, SP, 0);
	__ lw(A3, SP, 1 * wordSize);

	// pass arguments & call tracer
	__ call_VM(noreg, CAST_FROM_FN_PTR(address, SharedRuntime::trace_bytecode), RA, A2, A3);
	__ move(RA, V0);    // make sure return address is not destroyed by pop(state)

	// restore expression stack
	__ pop(state);        // restore tosca

	// return
	__ jr(RA);
	__ delayed()->nop();

	return entry;
}

void TemplateInterpreterGenerator::count_bytecode() {
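	// Equivalent C sketch of the generated code (not atomic, which is
	// acceptable for a non-product statistic):
	//
	//   BytecodeCounter::_counter_value++;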
	__ move(T8, (int)&BytecodeCounter::_counter_value);
	__ lw(AT, T8, 0);
	__ addi(AT, AT, 1);
	__ sw(AT, T8, 0);
}

void TemplateInterpreterGenerator::histogram_bytecode(Template* t) {
	__ move(T8, (int)&BytecodeHistogram::_counters[t->bytecode()]);
	__ lw(AT, T8, 0);
	__ addi(AT, AT, 1);
	__ sw(AT, T8, 0);
}

void TemplateInterpreterGenerator::histogram_bytecode_pair(Template* t) {
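	// Equivalent C sketch of the generated code: combine the previous index
	// (shifted down) with the current bytecode (shifted up), then bump the
	// pair counter:
	//
	//   int i = (BytecodePairHistogram::_index >> BytecodePairHistogram::log2_number_of_codes) |
	//           ((int)t->bytecode() << BytecodePairHistogram::log2_number_of_codes);
	//   BytecodePairHistogram::_index = i;
	//   BytecodePairHistogram::_counters[i]++;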
	__ move(T8, (int)&BytecodePairHistogram::_index);
	__ lw(T7, T8, 0);
	__ srl(T7, T7, BytecodePairHistogram::log2_number_of_codes);
	__ move(T8, ((int)t->bytecode()) << BytecodePairHistogram::log2_number_of_codes);
	__ orr(T7, T7, T8);
	__ move(T8, (int)&BytecodePairHistogram::_index);
	__ sw(T7, T8, 0);
	__ sll(T7, T7, 2);
	__ move(T8, (int)BytecodePairHistogram::_counters);
	__ add(T8, T8, T7);
	__ lw(AT, T8, 0);
	__ addi(AT, AT, 1);
	__ sw(AT, T8, 0);
}


void TemplateInterpreterGenerator::trace_bytecode(Template* t) {
  // Call a little run-time stub to avoid blow-up for each bytecode.
  // The run-time stub saves the right registers, depending on
  // the tosca in-state for the given template.

	address entry = Interpreter::trace_code(t->tos_in());
	assert(entry != NULL, "entry must have been generated");
	__ call(entry, relocInfo::none);
	__ delayed()->nop();
}


void TemplateInterpreterGenerator::stop_interpreter_at() {
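	// Equivalent C sketch of the generated check:
	//
	//   if (BytecodeCounter::_counter_value == StopInterpreterAt)
	//     os::breakpoint();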
  Label L;
	__ move(T8, (int)&BytecodeCounter::_counter_value);
	__ lw(T8, T8, 0);
	__ move(AT, StopInterpreterAt);
	__ bne(T8, AT, L);
	__ delayed()->nop();
	__ call(CAST_FROM_FN_PTR(address, os::breakpoint), relocInfo::runtime_call_type);
	__ delayed()->nop();
	__ bind(L);
}
#endif // !PRODUCT
#endif // ! CC_INTERP