view hotspot/src/cpu/mips/vm/mips.ad @ 1:c1e1428eff7c

Preliminary port to the MIPS architecture. With this commit, the interpreter passes 140/141 regression tests, 8/8 SPECjvm98 tests, and 31/37 SPECjvm2008 tests. The compiler passes 136/141 regression tests, but it cannot yet run the SPECjvm98 and SPECjvm2008 benchmarks.
author LIU Qi <liuqi82@gmail.com>
date Thu, 30 Sep 2010 13:48:16 +0800

//
// Copyright 2003-2008 Sun Microsystems, Inc.  All Rights Reserved.
// Copyright 2010 Lemote, Inc.  All Rights Reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
// CA 95054 USA or visit www.sun.com if you need additional information or
// have any questions.
//
//

// GodSon2 Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.



// Format of the definitions in the register block below:
// reg_def name (call convention, c-call convention, ideal type, encoding);
// 		call convention : 
//			NS  = No-Save
//			SOC = Save-On-Call
//			SOE = Save-On-Entry
//			AS  = Always-Save
//		ideal type :
//			see opto/opcodes.hpp for more info
// reg_class name (reg, ...);
// alloc_class name (reg, ...); 
register %{
	//Integer Registers
	reg_def ZERO	(NS, 	NS, 	Op_RegI, 0, R0->as_VMReg());
	reg_def AT		(NS, 	NS, 	Op_RegI, 1, AT->as_VMReg());
	reg_def V0		(SOC, SOC,	Op_RegI, 2, V0->as_VMReg());
	reg_def V1		(SOC, SOC,	Op_RegI, 3, V1->as_VMReg());
	reg_def A0		(SOC, SOC,	Op_RegI, 4, A0->as_VMReg());
	reg_def A1		(SOC, SOC,	Op_RegI, 5, A1->as_VMReg());
	reg_def A2		(SOC, SOC,	Op_RegI, 6, A2->as_VMReg());
	reg_def A3		(SOC, SOC,	Op_RegI, 7, A3->as_VMReg());
	reg_def T0		(SOC, SOC,	Op_RegI, 8, T0->as_VMReg());
	reg_def T1		(SOC, SOC,	Op_RegI, 9, T1->as_VMReg());
	reg_def T2		(SOC, SOC,	Op_RegI, 10, T2->as_VMReg());
	reg_def T3		(SOC, SOC,	Op_RegI, 11, T3->as_VMReg());
	reg_def T4		(SOC, SOC,	Op_RegI, 12, T4->as_VMReg());
	reg_def T5		(SOC, SOC,	Op_RegI, 13, T5->as_VMReg());
	reg_def T6		(SOC, SOC,	Op_RegI, 14, T6->as_VMReg());
	reg_def T7		(SOC, SOC,	Op_RegI, 15, T7->as_VMReg());
	reg_def S0		(SOE, SOE,	Op_RegI, 16, S0->as_VMReg());
	reg_def S1		(SOE, SOE,	Op_RegI, 17, S1->as_VMReg());
	reg_def S2		(SOE, SOE,	Op_RegI, 18, S2->as_VMReg());
	reg_def S3		(SOE, SOE,	Op_RegI, 19, S3->as_VMReg());
	reg_def S4		(SOE, SOE,	Op_RegI, 20, S4->as_VMReg());
	reg_def S5		(SOE, SOE,	Op_RegI, 21, S5->as_VMReg());
	reg_def S6		(SOE, SOE,	Op_RegI, 22, S6->as_VMReg());
	reg_def S7		(SOE, SOE,	Op_RegI, 23, S7->as_VMReg());
	reg_def T8		(SOC, SOC,	Op_RegI, 24, T8->as_VMReg());
	reg_def T9		(SOC, SOC,	Op_RegI, 25, T9->as_VMReg());
	reg_def K0		(NS,	NS,		Op_RegI, 26, K0->as_VMReg());
	reg_def K1		(NS,	NS,		Op_RegI, 27, K1->as_VMReg());
	reg_def GP		(NS,	NS,		Op_RegI, 28, GP->as_VMReg());
	reg_def SP		(NS,	NS,		Op_RegI, 29, SP->as_VMReg());
	reg_def FP		(NS,	NS,		Op_RegI, 30, FP->as_VMReg());
	reg_def RA		(NS,	SOE,	Op_RegI, 31, RA->as_VMReg());

	// Float registers. 
	reg_def F0		(SOC,	SOC,	Op_RegF, 0, F0->as_VMReg());
	reg_def F1		(SOC,	SOC,	Op_RegF, 1, F1->as_VMReg());
	reg_def F2		(SOC,	SOC,	Op_RegF, 2, F2->as_VMReg());
	reg_def F3		(SOC,	SOC,	Op_RegF, 3, F3->as_VMReg());
	reg_def F4		(SOC,	SOC,	Op_RegF, 4, F4->as_VMReg());
	reg_def F5		(SOC,	SOC,	Op_RegF, 5, F5->as_VMReg());
	reg_def F6		(SOC,	SOC,	Op_RegF, 6, F6->as_VMReg());
	reg_def F7		(SOC,	SOC,	Op_RegF, 7, F7->as_VMReg());
	reg_def F8		(SOC,	SOC,	Op_RegF, 8, F8->as_VMReg());
	reg_def F9		(SOC,	SOC,	Op_RegF, 9, F9->as_VMReg());
	reg_def F10		(SOC,	SOC,	Op_RegF, 10, F10->as_VMReg());
	reg_def F11		(SOC,	SOC,	Op_RegF, 11, F11->as_VMReg());
	reg_def F12		(SOC,	SOC,	Op_RegF, 12, F12->as_VMReg());
	reg_def F13		(SOC,	SOC,	Op_RegF, 13, F13->as_VMReg());
	reg_def F14		(SOC,	SOC,	Op_RegF, 14, F14->as_VMReg());
	reg_def F15		(SOC,	SOC,	Op_RegF, 15, F15->as_VMReg());
	reg_def F16		(SOC,	SOC,	Op_RegF, 16, F16->as_VMReg());
	reg_def F17		(SOC,	SOC,	Op_RegF, 17, F17->as_VMReg());
	reg_def F18		(SOC,	SOC,	Op_RegF, 18, F18->as_VMReg());
	reg_def F19		(SOC,	SOC,	Op_RegF, 19, F19->as_VMReg());
	reg_def F20		(SOC,	SOC,	Op_RegF, 20, F20->as_VMReg());
	reg_def F21		(SOC,	SOC,	Op_RegF, 21, F21->as_VMReg());
	reg_def F22		(SOC,	SOC,	Op_RegF, 22, F22->as_VMReg());
	reg_def F23		(SOC,	SOC,	Op_RegF, 23, F23->as_VMReg());
	reg_def F24		(SOC,	SOC,	Op_RegF, 24, F24->as_VMReg());
	reg_def F25		(SOC,	SOC,	Op_RegF, 25, F25->as_VMReg());
	reg_def F26		(SOC,	SOC,	Op_RegF, 26, F26->as_VMReg());
	reg_def F27		(SOC,	SOC,	Op_RegF, 27, F27->as_VMReg());
	reg_def F28		(SOC,	SOC,	Op_RegF, 28, F28->as_VMReg());
	reg_def F29		(SOC,	SOC,	Op_RegF, 29, F29->as_VMReg());
	reg_def F30		(SOC,	SOC,	Op_RegF, 30, F30->as_VMReg());
	reg_def F31		(SOC,	SOC,	Op_RegF, 31, F31->as_VMReg());

	alloc_class chunk0(	T0, T1, T2, T3, T4, T5, T6, T7,
			S0, S1, S2, S3, S4, S5, S6, S7);

	alloc_class chunk1(	AT, T8, T9, SP, FP, GP, ZERO, RA, K0, K1);

	// Class for all registers
	reg_class any_reg(T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, 
			S0, S1, S2, S3, S4, S5, S6, S7, V0, V1, A0, A1, A2, A3, AT,
			SP, FP, RA, ZERO, GP, K0, K1);

	// Class for general registers
	reg_class e_reg(T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, 
			S0, S1, S2, S3, S4, S5, S6, S7, V0, V1, A0, A1, A2, A3, AT);

	// Class of registers that can appear in an address with no offset.
	// EBP and ESP require an extra instruction byte for zero offset.
	// Used in fast-unlock
	//reg_class p_reg(EDX, EDI, ESI, EBX);
	reg_class p_reg(T0, T1, T2, T3, T4, T5, T6, T7, S0, S1, S2, S3, S4, S5, S6, S7);

	reg_class long_reg(V0,V1, A0,A1, A2,A3);

	// Class of integer register pairs that aligns with calling convention
	reg_class ret_reg(V0,V1);
	reg_class p0_reg(A0,A1);
	reg_class p2_reg(A2,A3);

	// Floating point registers.
	reg_class flt_reg(	F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,
											F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,F31 );
	reg_class dbl_reg( F0,F2,F4,F6,F8,F10,F12,F14,F16,F18,F20,F22,F24,F26,F28,F30 );

	reg_class flt_arg0( F12 );
	reg_class dbl_arg0( F12 );
	reg_class dbl_arg1( F14 );
%}
//----------DEFINITION BLOCK---------------------------------------------------
// Define name --> value mappings to inform the ADLC of an integer valued name
// Current support includes integer values in the range [0, 0x7FFFFFFF]
// Format:
//        int_def  <name>         ( <int_value>, <expression>);
// Generated Code in ad_<arch>.hpp
//        #define  <name>   (<expression>)
//        // value == <int_value>
// Generated code in ad_<arch>.cpp adlc_verification()
//        assert( <name> == <int_value>, "Expect (<expression>) to equal <int_value>");
//
definitions %{
	// The default cost (of an ALU instruction).
	int_def DEFAULT_COST      (    100,     100);
	int_def HUGE_COST         (1000000, 1000000);

	// Memory refs are twice as expensive as run-of-the-mill.
	int_def MEMORY_REF_COST   (    200, DEFAULT_COST * 2);

	// Branches are even more expensive.
	int_def BRANCH_COST       (    300, DEFAULT_COST * 3);
	// We use the jr instruction to construct calls, so they are more expensive.
	// by yjl 2/28/2006
	int_def CALL_COST         (    500, DEFAULT_COST * 5);
%}
						

source %{

#define __ _masm.

	// ****************************************************************************
	// temporary fix to generate new relocation info
	#define   RELOC_IMM32    0
	#define   RELOC_DISP32   1
	#define   RELOC_CALL32   2
	// ****************************************************************************

	// How to find the high register of a Long pair, given the low register
	#define   HIGH_FROM_LOW(x) ((x)+1)
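	// For example (illustrative, relying on the reg_def order above): the long
	// return value lives in the V0:V1 pair, so HIGH_FROM_LOW(V0_num) names V1.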

	void emit_orri(CodeBuffer &cbuf, const MachNode* n, int opcode, int rs_enc, int rt_enc, int imm) {
		int insn = (opcode<<26) | (rs_enc<<21) | (rt_enc<<16) | bitfield(imm, 0, 16);
		*((int*)cbuf.code_end()) = insn;
		cbuf.set_code_end(cbuf.code_end() + sizeof(insn));
	}

	void emit_rrro(CodeBuffer &cbuf, const MachNode* n, int rs_enc, int rt_enc, int rd_enc, int opcode) {
		int insn = (rs_enc<<21) | (rt_enc<<16) | (rd_enc<<11) | opcode;
		*((int*)cbuf.code_end()) = insn;
		cbuf.set_code_end(cbuf.code_end() + sizeof(insn));
	}
	
	void emit_rrso(CodeBuffer &cbuf, const MachNode* n, int rt_enc, int rd_enc, int sa, int opcode) {
		int insn = (rt_enc<<16) | (rd_enc<<11) | (sa<<6) | opcode;
		*((int*)cbuf.code_end()) = insn;
		cbuf.set_code_end(cbuf.code_end() + sizeof(insn));
	}
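	// A minimal sketch (illustrative only, not used by the matcher) of the MIPS32
	// I-type layout that emit_orri produces, written out so the field positions
	// are obvious.  The field widths and the addiu opcode (0x09) are architectural
	// facts; the helper name itself is made up here.
	static inline int sketch_itype(int opcode, int rs, int rt, int imm16) {
		// opcode[31:26] | rs[25:21] | rt[20:16] | imm[15:0]
		return (opcode << 26) | (rs << 21) | (rt << 16) | (imm16 & 0xffff);
	}
	// e.g. sketch_itype(0x09, 29 /*SP*/, 8 /*T0*/, -4) encodes "addiu T0, SP, -4".
	// emit_rrro above covers the R-type form, with rd in bits [15:11] and the
	// function code in bits [5:0]; emit_rrso adds the shift amount in bits [10:6].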
	
	// I don't know the real differences among the following series; just keep them the same for now.
	// by yjl 1/6/2006

	// !!!!! Special hack to get all type of calls to specify the byte offset
	//       from the start of the call to the point where the return address
	//       will point.
	int MachCallStaticJavaNode::ret_addr_offset() {
		return NativeCall::return_address_offset;
	}

	int MachCallDynamicJavaNode::ret_addr_offset() {
		return NativeMovConstReg::instruction_size + NativeCall::return_address_offset;
	}

	int MachCallRuntimeNode::ret_addr_offset() {
		return NativeCall::return_address_offset;
	}

	// change here, by yjl 2/28/2006
	int MachCallCompiledJavaNode::ret_addr_offset() {
		return NativeCall::return_address_offset;
	}

	// change here, by yjl 2/28/2006
	int MachCallInterpreterNode::ret_addr_offset() {
		// Offset from start of this code to where return address points
		return NativeCall::return_address_offset;
	}

	// change here, by yjl 2/28/2006
	int MachCallNativeNode::ret_addr_offset() {
		return MachCallRuntimeNode::ret_addr_offset(); 
	}

	// Indicate if the safepoint node needs the polling page as an input.
	// Since x86 does have absolute addressing, it doesn't.
	// I don't know what should be returned on Godson; just keep it the same as x86.
	// by yjl 1/6/2006
	// I think it's right now. by yjl 2/28/2006
	bool SafePointNode::needs_polling_address_input() {
		return false;
	}

	//
	// Compute padding required for nodes which need alignment
	//

	// The address of the call instruction needs to be 4-byte aligned to
	// ensure that it does not span a cache line so that it can be patched.
	
	// What does in_24_bit_fp_mode mean?
	// I just ignore it for now.
	// by yjl 1/6/2006
	// it's only needed in x86. by yjl 2/28/2006
	int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
		return round_to(current_offset, alignment_required()) - current_offset;
	}

	// The address of the call instruction needs to be 4-byte aligned to
	// ensure that it does not span a cache line so that it can be patched.
	int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
		return round_to(current_offset, alignment_required()) - current_offset;
	}

	// The address of the call instruction needs to be 4-byte aligned to
	// ensure that it does not span a cache line so that it can be patched.
	int CallInterpreterDirectNode::compute_padding(int current_offset) const {
		return round_to(current_offset, alignment_required()) - current_offset;
	}
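	// Worked example of the padding computation above (assuming a 4-byte
	// alignment_required()): at current_offset == 6, round_to(6, 4) == 8, so
	// compute_padding() returns 2; at an already aligned offset it returns 0.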

	void add_oop_Relocation(CodeBuffer &cbuf, jobject h) {
		OopRecorder *oop_recorder = cbuf.oop_recorder();
		assert(oop_recorder != NULL, "CodeBuffer must have OopRecorder");

		// Create relocation information, record Oop
		int oop_index = oop_recorder->find_index(h);
		RelocationHolder rspec = oop_Relocation::spec(oop_index);

		assert(h == NULL || JNIHandles::resolve(h)->is_perm(), "cannot embed non-perm oops in code");

		// add Relocation information to the CodeBuffer
		cbuf.relocate(cbuf.mark(), rspec);
	}
	

#ifndef PRODUCT
	void MachBreakpointNode::format( PhaseRegAlloc * ) const {
		tty->print("break");
	}
#endif

	//=============================================================================
#ifndef PRODUCT
	void MachPrologNode::format( PhaseRegAlloc *ra_ ) const {
		Compile* C = ra_->C;
		
		for (int i = 0; i < OptoPrologueNops; i++) {
			tty->print_cr("nop"); tty->print("\t");
		}

		if( VerifyThread ) {
			tty->print_cr("Verify_Thread"); tty->print("\t");
		}
			

		int framesize = C->frame_slots() << LogBytesPerInt;
		assert(framesize % (2*wordSize) == wordSize, "aligned frame size");

		/*if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
			tty->print("move\tAT, 0xBADB100D\t# Majik cookie for stack depth check\n\t");
			tty->print("sw\t\tAT, SP, -4\n\t");
		}*/

		// Calls to C2R adapters often do not accept exceptional returns.
		// We require that their callers must bang for them.  But be careful, because
		// some VM calls (such as call site linkage) can use several kilobytes of
		// stack.  But the stack safety zone should account for that.
		// See bugs 4446381, 4468289, 4497237.
		if (C->need_stack_bang(framesize)) {
			tty->print_cr("# stack bang"); tty->print("\t"); 
		}

		if( C->start()->Opcode() == Op_StartI2C) {
			///tty->print_cr( "MOV    EBX,ESP\t\t# move old ESP to temp");  
			///tty->print_cr( "\tAND    ESP,-8\t\t# Round ESP to even");
			///tty->print_cr( "\tPUSH   EBX\t\t# Old ESP for EVEN alignment");
			///tty->print   ( "\t" );
			tty->print_cr("move		T8, SP");
			tty->print_cr("move		AT, -8");
			tty->print_cr("andr		SP, SP, AT");
			tty->print_cr("sw			T8, SP, -4");
			tty->print_cr("addiu	SP, SP, -4");
		} else if( C->start()->Opcode() == Op_StartOSR ) {
    	///tty->print_cr( "MOV    EBX,EDI\t\t# Move locals ptr to interpreter_arg_ptr_reg");
    	///tty->print   ( "\t" );
			// FIXME: I don't know the meaning of this code yet. by yjl 2/21/2006
  	}

		tty->print("addiu\t,%d\t# Create frame", framesize);
	}
#endif


	void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
		Compile* C = ra_->C;
		MacroAssembler masm(&cbuf);
#define __ masm.

		// WARNING: Initial instruction MUST be 16 bytes or longer so that
		// NativeJump::patch_verified_entry will be able to patch out the entry code safely. 
		/*if( C->in_24_bit_fp_mode() ) {
			MacroAssembler masm(&cbuf);
			Address cntrl_addr_24 = Address((int)StubRoutines::addr_fpu_cntrl_wrd_24(), relocInfo::none);
			masm.fldcw(cntrl_addr_24);
			}*/

		int framesize = C->frame_slots() << LogBytesPerInt;
		///framesize -= wordSize;      // Remove 1 for return adr already pushed
		assert(framesize % (2*wordSize) == wordSize, "aligned frame size");

		if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
			///emit_opcode(cbuf, 0x68); // push 0xbadb100d
			///emit_d32(cbuf, 0xbadb100d);
			///framesize -= wordSize;  
			__ move(AT, 0xbadb100d);
			__ sw(AT, SP, -4);
		}

		// Calls to C2R adapters often do not accept exceptional returns.
		// We require that their callers must bang for them.  But be careful, because
		// some VM calls (such as call site linkage) can use several kilobytes of
		// stack.  But the stack safety zone should account for that.
		// See bugs 4446381, 4468289, 4497237.
		if (C->need_stack_bang(framesize)) {
			///MacroAssembler masm(&cbuf);
			///masm.generate_stack_overflow_check(framesize);
			__ generate_stack_overflow_check(framesize);
		}

		if( C->start()->Opcode() == Op_StartI2C) {
			///emit_opcode(cbuf, 0x8B);             // MOV  reg,ESP
			///emit_rm(cbuf, 0x3, EBX_enc, ESP_enc);// interpreter_arg_ptr_reg
			///emit_opcode(cbuf,0x83);              // AND ESP,-8 ; Round ESP to even
			///emit_rm(cbuf,0x3,0x4,ESP_enc);
			///emit_d8(cbuf,-8);
			///emit_opcode(cbuf,0x50+EBX_enc);      // PUSH EBX (old ESP)
			__ move(T8, SP);
			__ move(AT, -8);
			__ andr(SP, SP, AT);
			__ sw(T8, SP, -4);
			//__ addiu(SP, SP -4);
		} else if( C->start()->Opcode() == Op_StartOSR ) {
			///emit_opcode(cbuf, 0x8B);             // MOV  
			///emit_rm(cbuf, 0x3, EBX_enc, EDI_enc);// MOV EBX,EDI locals ptr to EBX
			// FIXME: I don't know the meaning of this code yet. by yjl 2/21/2006
		}


		if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
			if (framesize) {
				///emit_opcode(cbuf, 0x83);   // sub  SP,#framesize
				///emit_rm(cbuf, 0x3, 0x05, ESP_enc);
				///emit_d8(cbuf, framesize);
				__ subiu(SP, SP, framesize);
			}
		} else {
			///emit_opcode(cbuf, 0x81);   // sub  SP,#framesize
			///emit_rm(cbuf, 0x3, 0x05, ESP_enc); 
			///emit_d32(cbuf, framesize);
			__ subiu(SP, SP, framesize);
		}
#undef __
	}

	uint MachPrologNode::size(PhaseRegAlloc *ra_) const {
		return MachNode::size(ra_); // too many variables; just compute it the hard way
	}

	int MachPrologNode::reloc() const {
		return 0; // a large enough number
	}

	//=============================================================================
#ifndef PRODUCT
	void MachEpilogNode::format( PhaseRegAlloc *ra_ ) const {
		Compile *C = ra_->C;
		int framesize = C->frame_slots() << LogBytesPerInt;
		///framesize -= wordSize;      // Remove 1 for return adr already pushed
		assert(framesize % (2*wordSize) == wordSize, "aligned frame size");

		///if( C->in_24_bit_fp_mode() ) {
		///  tty->print("FLDCW  standard control word");
		///  tty->cr(); tty->print("\t");
		///}
		if( framesize ) {
			///tty->print("ADD    ESP,%d\t# Destroy frame",framesize);
			///tty->cr(); tty->print("\t");
			tty->print_cr("addi		SP, SP, %d", framesize);
		}
		if( C->start()->Opcode() == Op_StartI2C) {
			///tty->print("POP    ESP\t\t# Recover prior ESP");
			///tty->cr(); tty->print("\t");
			tty->print_cr("lw			SP, SP, -4");
		}
		if( do_polling() && SafepointPolling && C->is_method_compilation() ) {
			///tty->print("TEST  PollPage,EAX\t! Poll Safepoint");
			///tty->cr(); tty->print("\t");
			tty->print_cr("lw			ZERO, PollPage, 0");
		}
	}
#endif

	void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
		Compile *C = ra_->C;
		MacroAssembler masm(&cbuf);
#define __ masm.

		// If method set FPU control word, restore to standard control word
		///if( C->in_24_bit_fp_mode() ) {
		///  MacroAssembler masm(&cbuf);
		///  Address cntrl_addr_std  = Address((int)StubRoutines::addr_fpu_cntrl_wrd_std(), relocInfo::none); 
		///  masm.fldcw(cntrl_addr_std);
		///}

		int framesize = C->frame_slots() << LogBytesPerInt;
		///framesize -= wordSize;      // Remove 1 for return adr already pushed
		assert(framesize % (2*wordSize) == wordSize, "aligned frame size");

		///if( framesize >= 128 ) {
		///emit_opcode(cbuf, 0x81); // add  SP, #framesize
		///emit_rm(cbuf, 0x3, 0x00, ESP_enc);
		///emit_d32(cbuf, framesize);
		///}
		///else if( framesize ) {
		///  emit_opcode(cbuf, 0x83); // add  SP, #framesize
		///  emit_rm(cbuf, 0x3, 0x00, ESP_enc);
		///  emit_d8(cbuf, framesize);
		//}
		__ addiu(SP, SP, framesize);

		if( C->start()->Opcode() == Op_StartI2C) {
			///emit_opcode(cbuf,0x58+ESP_enc); // POP ESP
			__ lw(SP, SP, -4);
		}

		if( do_polling() && SafepointPolling && C->is_method_compilation() ) {
			///cbuf.relocate(cbuf.code_end(), relocInfo::poll_return_type, 0);
			///emit_opcode(cbuf,0x85);
			///emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
			///emit_d32(cbuf, (intptr_t)os::get_polling_page());
			__ relocate(relocInfo::poll_return_type);
			__ lui(AT, Assembler::split_high((int)os::get_polling_page()));
			__ lw(ZERO, AT, Assembler::split_low((int)os::get_polling_page()));
		}
#undef __
	}

	uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
		Compile *C = ra_->C;
		int size = 4;

		if (C->start()->Opcode() == Op_StartI2C) {
			size += 4;
		}

		if ( do_polling() && SafepointPolling && C->is_method_compilation() ) {
			size += 8;
		}

		return size;
	}

	int MachEpilogNode::reloc() const {
		return 0; // a large enough number
	}

	const Pipeline * MachEpilogNode::pipeline() const {
		return MachNode::pipeline_class();
	}

	int MachEpilogNode::safepoint_offset() const { return 0; }

	//=============================================================================

	enum RC { rc_bad, rc_int, rc_float, rc_stack };
	static enum RC rc_class( OptoReg::Name reg ) {
		if( reg == OptoReg::Bad ) return rc_bad;
		if( reg <= RA_num ) return rc_int;
		if( reg <= F31_num ) return rc_float;
  
		assert( reg >= SharedInfo::stack0, "blow up if spilling flags" );
		return rc_stack;
	}
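	// For instance (following the reg_def order above): rc_class(T0_num) is
	// rc_int, rc_class(F4_num) is rc_float, and any OptoReg name at or beyond
	// SharedInfo::stack0 classifies as rc_stack.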

	// I don't think we need this. by yjl 2/21/2006
	/*static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg, int opcode, const char *op_str, int size ) {
		if( cbuf ) {
			emit_opcode  (*cbuf, opcode );
			encode_RegMem(*cbuf, Matcher::_regEncode[reg], ESP_enc, 0x4, 0, offset, false);
#ifndef PRODUCT
		} else if( !do_size ) { 
			if( size != 0 ) tty->print("\n\t"); 
			if( is_load ) tty->print("%s   %s,[ESP + #%d]",op_str,SharedInfo::regName[reg],offset); 
			else          tty->print("%s   [ESP + #%d],%s",op_str,offset,SharedInfo::regName[reg]); 
#endif
		}
		int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
		return size+3+offset_size;
	}

	// Helper for XMM registers.  Extra opcode bits, limited syntax.
	static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load, 
			int offset, int reg_lo, int reg_hi, int size ) {
		if( cbuf ) {
			if( reg_lo+1 == reg_hi ) { // double move?
				emit_opcode  (*cbuf, 0xF2 );
			} else {
				emit_opcode  (*cbuf, 0xF3 );
			}
			emit_opcode  (*cbuf, 0x0F );
			emit_opcode  (*cbuf, is_load ? 0x10 : 0x11 );
			encode_RegMem(*cbuf, Matcher::_regEncode[reg_lo], ESP_enc, 0x4, 0, offset, false);
#ifndef PRODUCT
		} else if( !do_size ) { 
			if( size != 0 ) tty->print("\n\t"); 
			if( reg_lo+1 == reg_hi ) { // double move?
				if( is_load ) tty->print("MOVSD  %s:%s,[ESP + #%d]",SharedInfo::regName[reg_lo],SharedInfo::regName[reg_hi],offset); 
				else          tty->print("MOVSD  [ESP + #%d],%s:%s",offset,SharedInfo::regName[reg_lo],SharedInfo::regName[reg_hi]); 
			} else {
				if( is_load ) tty->print("MOVSS  %s,[ESP + #%d]",SharedInfo::regName[reg_lo],offset); 
				else          tty->print("MOVSS  [ESP + #%d],%s",offset,SharedInfo::regName[reg_lo]); 
			}
#endif
		}
		int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
		return size+5+offset_size;
	}

	static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo, 
			int src_hi, int dst_hi, int size ) {
		if( cbuf ) {
			emit_opcode(*cbuf, (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 0xF2 : 0xF3 );
			emit_opcode(*cbuf, 0x0F );
			emit_opcode(*cbuf, 0x10 );
			emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
#ifndef PRODUCT
		} else if( !do_size ) { 
			if( size != 0 ) tty->print("\n\t"); 
			if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
				tty->print("MOVSD  %s:%s,%s:%s",SharedInfo::regName[dst_lo],SharedInfo::regName[dst_hi],SharedInfo::regName[src_lo],SharedInfo::regName[src_hi]); 
			} else {
				tty->print("MOVSS  %s,%s",SharedInfo::regName[dst_lo],SharedInfo::regName[src_lo]); 
			}
#endif
		}
		return size+4;
	}

	static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size ) {
		if( cbuf ) {
			emit_opcode(*cbuf, 0x8B );
			emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst], Matcher::_regEncode[src] );
#ifndef PRODUCT
		} else if( !do_size ) { 
			if( size != 0 ) tty->print("\n\t"); 
			tty->print("MOV    %s,%s",SharedInfo::regName[dst],SharedInfo::regName[src]); 
#endif
		}
		return size+2;
	}

	static int impl_fp_store_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int src_hi, int dst_lo, int dst_hi, int offset, int size ) {
		if( src_lo != FPR1L_num ) {      // Move value to top of FP stack, if not already there
			if( cbuf ) {
				emit_opcode( *cbuf, 0xD9 );  // FLD (i.e., push it)
				emit_d8( *cbuf, 0xC0-1+Matcher::_regEncode[src_lo] );
#ifndef PRODUCT
			} else if( !do_size ) { 
				if( size != 0 ) tty->print("\n\t"); 
				tty->print("FLD    %s",SharedInfo::regName[src_lo]);
#endif
			}
			size += 2;
		}

		int st_op = (src_lo != FPR1L_num) ? EBX_num 
			//store & pop : 
			EDX_num 
			//store no pop;
		const char *op_str;
		int op;
		if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double store?
			op_str = (src_lo != FPR1L_num) ? "DSTP" : "DST ";
			op = 0xDD;
		} else {                   // 32-bit store
			op_str = (src_lo != FPR1L_num) ? "FSTP" : "FST ";
			op = 0xD9;
			assert( src_hi == OptoReg::Bad && dst_hi == OptoReg::Bad, "no non-adjacent float-stores" );
		}

		return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size);
	}*/

	uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size ) const {
		// Get registers to move 
		OptoReg::Name src_hi = ra_->get_reg_hi(in(1));
		OptoReg::Name src_lo = ra_->get_reg_lo(in(1));
		OptoReg::Name dst_hi = ra_->get_reg_hi(this );
		OptoReg::Name dst_lo = ra_->get_reg_lo(this );

		enum RC src_hi_rc = rc_class(src_hi);
		enum RC src_lo_rc = rc_class(src_lo);
		enum RC dst_hi_rc = rc_class(dst_hi);
		enum RC dst_lo_rc = rc_class(dst_lo);

		assert( src_lo != OptoReg::Bad && dst_lo != OptoReg::Bad, "must move at least 1 register" );

		MacroAssembler *masm = NULL;
#define __ masm->

		if (cbuf) {
			masm = new MacroAssembler(cbuf); 
		}

		// Generate spill code!
		int size = 0;

		if( src_lo == dst_lo && src_hi == dst_hi ) 
			return size;            // Self copy, no move

		// --------------------------------------
		// Check for mem-mem move.  push/pop to move.
		if( src_lo_rc == rc_stack && dst_lo_rc == rc_stack ) {
			if( src_hi == dst_lo ) { // overlapping stack copy ranges
				assert( src_hi_rc == rc_stack && dst_hi_rc == rc_stack, "we only expect a stk-stk copy here" );
				///size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_hi),ESI_num,0xFF,"PUSH",size);
				///size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_hi),EAX_num,0x8F,"POP ",size);
				if (cbuf) {
					__ lw(AT, SP, ra_->reg2offset(src_hi));
					__ sw(AT, SP, ra_->reg2offset(dst_hi));
#ifndef PRODUCT
				} else {
					if (!do_size) {
						tty->print_cr("lw			AT, SP, %d", ra_->reg2offset(src_hi));
						tty->print_cr("sw			AT, SP, %d", ra_->reg2offset(dst_hi));
					}
#endif
				}
				
				size += 8;
				src_hi_rc = dst_hi_rc = rc_bad;  // flag as already moved the hi bits
			}
			// move low bits
			///size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_lo),ESI_num,0xFF,"PUSH",size);
			///size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_lo),EAX_num,0x8F,"POP ",size);
			if (cbuf) {
				__ lw(AT, SP, ra_->reg2offset(src_lo));
				__ sw(AT, SP, ra_->reg2offset(dst_lo));
#ifndef PRODUCT
			} else {
				if (!do_size) {
					tty->print_cr("lw			AT, SP, %d", ra_->reg2offset(src_lo));
					tty->print_cr("sw			AT, SP, %d", ra_->reg2offset(dst_lo));
				}
#endif
			}

			size += 8;
			
			if( src_hi_rc == rc_stack && dst_hi_rc == rc_stack ) { // mov hi bits
				///size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_hi),ESI_num,0xFF,"PUSH",size);
				///size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_hi),EAX_num,0x8F,"POP ",size);
				if (cbuf) {
					__ lw(AT, SP, ra_->reg2offset(src_hi));
					__ sw(AT, SP, ra_->reg2offset(dst_hi));
#ifndef PRODUCT
				} else {
					if (!do_size) {
						tty->print_cr("lw			AT, SP, %d", ra_->reg2offset(src_hi));
						tty->print_cr("sw			AT, SP, %d", ra_->reg2offset(dst_hi));
					}
#endif
				}
				size += 8;
			}
		} else

		// --------------------------------------
		// Check for integer reg-reg copy
		if( src_lo_rc == rc_int && dst_lo_rc == rc_int ) {
			///size = impl_mov_helper(cbuf,do_size,src_lo,dst_lo,size);
			if (cbuf) {
				__ move(Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo]);
				if (dst_hi_rc!=rc_bad) {
					__ move(Matcher::_regEncode[dst_hi], Matcher::_regEncode[src_hi]);
					size += 4;
				}
#ifndef PRODUCT
			} else if (!do_size) {
				tty->print_cr("move		%s, %s", SharedInfo::regName[dst_lo], SharedInfo::regName[src_lo]);
				if (dst_hi_rc!=rc_bad) {
					tty->print_cr("move		%s, %s", SharedInfo::regName[dst_hi], SharedInfo::regName[src_hi]);
					size += 4;	
				}
#endif
			}
			size += 4;
		} else

		// Check for integer store
		if( src_lo_rc == rc_int && dst_lo_rc == rc_stack ) {
			///size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_lo),src_lo,0x89,"MOV ",size);
			if (cbuf) {
				__ sw(Matcher::_regEncode[src_lo], SP, ra_->reg2offset(dst_lo));
				if (dst_hi_rc!=rc_bad) {
					__ sw(Matcher::_regEncode[src_hi], SP, ra_->reg2offset(dst_hi));
					size +=4;
				}
#ifndef PRODUCT
			} else if (!do_size) {
				tty->print_cr("sw			%s, %d(SP)", SharedInfo::regName[src_lo], ra_->reg2offset(dst_lo));
				if (dst_hi_rc!=rc_bad) {
					tty->print_cr("sw			%s, %d(SP)", SharedInfo::regName[src_hi], ra_->reg2offset(dst_hi));
					size +=4;
				}
#endif
			}
			size += 4;
		} else

		// Check for integer load
		if( dst_lo_rc == rc_int && src_lo_rc == rc_stack ) {
			///size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_lo),dst_lo,0x8B,"MOV ",size);
			if (cbuf) {
				__ lw(Matcher::_regEncode[dst_lo], SP, ra_->reg2offset(src_lo));
				if (dst_hi_rc!=rc_bad) {
					__ lw(Matcher::_regEncode[dst_hi], SP, ra_->reg2offset(src_hi));
					size +=4;
				}
#ifndef PRODUCT
			} else if (!do_size) {
				tty->print_cr("lw			%s, %d(SP)", SharedInfo::regName[dst_lo], ra_->reg2offset(src_lo));
				if (dst_hi_rc!=rc_bad) {
					tty->print_cr("lw			%s, %d(SP)", SharedInfo::regName[dst_hi], ra_->reg2offset(src_hi));
					size +=4;
				}
#endif
			}
			size += 4;
		} else

		// --------------------------------------
		// Check for float reg-reg copy
		if( src_lo_rc == rc_float && dst_lo_rc == rc_float ) {
			assert( (src_hi_rc == rc_bad && dst_hi_rc == rc_bad) ||
					(src_lo+1 == src_hi && dst_lo+1 == dst_hi && (src_lo&1)==0 && (dst_lo&1)==0), "no non-adjacent float-moves" );
			if( cbuf ) {   
				///if( src_lo != FPR1L_num ) {
				///	emit_opcode  (*cbuf, 0xD9 );           // FLD    ST(i)
				///	emit_d8      (*cbuf, 0xC0+Matcher::_regEncode[src_lo]-1 );
				///	emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
				///	emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_lo] );
				///} else {
				///	emit_opcode  (*cbuf, 0xDD );           // FST    ST(i)
				///	emit_d8      (*cbuf, 0xD0+Matcher::_regEncode[dst_lo]-1 );
				///}
				
				if (src_hi_rc==rc_bad) {
					__ mfc1(AT, Matcher::_regEncode[src_lo]);
					__ mtc1(AT, Matcher::_regEncode[dst_lo]);
				} else {
					__ dmfc1(AT, Matcher::_regEncode[src_lo]);
					__ dmtc1(AT, Matcher::_regEncode[dst_lo]);
				}
#ifndef PRODUCT
			} else if( !do_size ) { 
				///if( size != 0 ) tty->print("\n\t"); 
				///if( src_lo != FPR1L_num ) tty->print("FLD    %s\n\tFSTP   %s",SharedInfo::regName[src_lo],SharedInfo::regName[dst_lo]);
				///else                      tty->print(             "FST    %s",                            SharedInfo::regName[dst_lo]);
				if (src_hi_rc==rc_bad) {
					tty->print_cr("mfc0		AT, %s", SharedInfo::regName[src_lo]);
					tty->print_cr("mtc0   AT, %s", SharedInfo::regName[dst_lo]);
				} else {
					tty->print_cr("dmfc0	AT, %s", SharedInfo::regName[src_lo]);
					tty->print_cr("dmtc0  AT, %s", SharedInfo::regName[dst_lo]);
				}
#endif
			}
			size += 8;
		} else

		// Check for float store
		if( src_lo_rc == rc_float && dst_lo_rc == rc_stack ) {
			///return impl_fp_store_helper(cbuf,do_size,src_lo,src_hi,dst_lo,dst_hi,ra_->reg2offset(dst_lo),size);
			assert( (src_hi_rc == rc_bad && dst_hi_rc == rc_bad) ||
					(src_lo+1 == src_hi && dst_lo+1 == dst_hi && (src_lo&1)==0 && (dst_lo&1)==0), "no non-adjacent float-moves" );
			if( cbuf ) {   
					__ swc1(Matcher::_regEncode[src_lo], SP, ra_->reg2offset(dst_lo));
					size += 4;
					if (src_hi_rc!=rc_bad) {
						__ swc1(Matcher::_regEncode[src_hi], SP, ra_->reg2offset(dst_hi));
						size += 4;
					}
#ifndef PRODUCT
			} else if(!do_size) {
				tty->print_cr("swc1		%s, %d(SP)", SharedInfo::regName[src_lo], ra_->reg2offset(dst_lo));
				size+=4;
				if (src_hi_rc!=rc_bad) {
					tty->print_cr("swc1		%s, %d(SP)", SharedInfo::regName[src_hi], ra_->reg2offset(dst_hi));
					size +=4;
				}
#endif
			} else {
				if (src_hi_rc!=rc_bad) size+=8;
				else size+=4;
			}
		} else

		// Check for float load
		if( dst_lo_rc == rc_float && src_lo_rc == rc_stack ) {
			assert( (src_hi_rc == rc_bad && dst_hi_rc == rc_bad) ||
					(src_lo+1 == src_hi && dst_lo+1 == dst_hi && (src_lo&1)==0 && (dst_lo&1)==0), "no non-adjacent float-moves" );
			if( cbuf ) {   
					__ lwc1(Matcher::_regEncode[dst_lo], SP, ra_->reg2offset(src_lo));
					size += 4;
					if (src_hi_rc!=rc_bad) {
						__ lwc1(Matcher::_regEncode[dst_hi], SP, ra_->reg2offset(src_hi));
						size += 4;
					}
#ifndef PRODUCT
			} else if(!do_size) {
				tty->print_cr("lwc1		%s, %d(SP)", SharedInfo::regName[src_lo], ra_->reg2offset(dst_lo));
				size+=4;
				if (src_hi_rc!=rc_bad) {
					tty->print_cr("lwc1		%s, %d(SP)", SharedInfo::regName[src_hi], ra_->reg2offset(dst_hi));
					size +=4;
				}
#endif
			} else {
				if (src_hi_rc!=rc_bad) size+=8;
				else size+=4;
			}
		}

#undef __
		if (cbuf) {
			delete masm;
		}

		assert( size > 0, "missed a case" );

		return size;
	}

#ifndef PRODUCT
	void MachSpillCopyNode::format( PhaseRegAlloc *ra_ ) const {
		implementation( NULL, ra_, false );
	}
#endif

	void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
		implementation( &cbuf, ra_, false );
	}

	uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
		return implementation( NULL, ra_, true );
	}

	//=============================================================================
#ifndef PRODUCT
	void MachNopNode::format( PhaseRegAlloc * ) const {
		tty->print("NOP    # Pad for loops and calls");
	}
#endif

	void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const {
		///emit_opcode(cbuf, 0x90);      // nop 
		MacroAssembler masm(cbuf);
		masm.nop();
	}

	uint MachNopNode::size(PhaseRegAlloc *) const {
		return 4;
	}


	//=============================================================================
#ifndef PRODUCT
	void BoxLockNode::format( PhaseRegAlloc *ra_ ) const {
		int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
		int reg = ra_->get_reg_lo(this);
		///tty->print("LEA    %s,[ESP + #%d]",Matcher::regName[reg],offset);
		tty->print_cr("addiu	%s, SP, %d", Matcher::regName[reg], offset);
	}
#endif

	void BoxLockNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
		int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
		int reg = ra_->get_encode(this);
		///if( offset >= 128 ) {
		///  emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
		///  emit_rm(cbuf, 0x2, reg, 0x04);
		///  emit_rm(cbuf, 0x0, 0x04, ESP_enc);
		///  emit_d32(cbuf, offset);
		///}
		///else {
		///  emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
		///  emit_rm(cbuf, 0x1, reg, 0x04);
		///  emit_rm(cbuf, 0x0, 0x04, ESP_enc);
		///  emit_d8(cbuf, offset);
		///}
		MacroAssembler masm(&cbuf);
		masm.addiu(reg, SP, offset);
	}

	uint BoxLockNode::size(PhaseRegAlloc *ra_) const {
		///int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
		///if( offset >= 128 ) {
		///  return 7;
		///}
		///else {
		///  return 4;
		///}
		return 4;
	}

	//=============================================================================

	// What is the point of setting a static_stub_Relocation ahead of the stub? by yjl 2/22/2006
	// emit call stub, compiled java to interpreter
	void emit_java_to_interp(CodeBuffer &cbuf ) {
		// Stub is fixed up when the corresponding call is converted from calling
		// compiled code to calling interpreted code.
		// mov ebx,0
		// jmp -1

		cbuf.start_a_stub();
		MacroAssembler masm(&cbuf);
#define __ masm.
		// static stub relocation stores the instruction address of the call
		///cbuf.relocate(cbuf.code_end(), 
		///      static_stub_Relocation::spec(cbuf.mark()), RELOC_IMM32);
		// static stub relocation also tags the methodOop in the code-stream.
		///cbuf.relocate(cbuf.code_end(), 
		///      oop_Relocation::spec_for_immediate(), RELOC_IMM32);
		///emit_opcode(cbuf, 0xB8 | EAX_enc); // mov EAX, method
		///emit_d32(cbuf,0);               // method is zapped till fixup time
		///cbuf.set_mark();
		///emit_opcode(cbuf, 0xE9);        // jmp    entry
		///emit_d32_reloc(cbuf, -1 -(int)cbuf.code_end()-4, 
		///             runtime_call_Relocation::spec(), RELOC_IMM32 );

		__ relocate(static_stub_Relocation::spec(cbuf.mark()));
		// tag the methodOop load below; the oop is zapped till fixup time
		__ relocate(oop_Relocation::spec_for_immediate());
		__ lui(T7, 0);
		__ addiu(T7, T7, 0);
		cbuf.set_mark();
		__ relocate(runtime_call_Relocation::spec());
		__ lui(T9, Assembler::split_high(-1));
		__ addiu(T9, T9, Assembler::split_low(-1));
		__ jr(T9);
		__ delayed()->nop();


#undef __
		cbuf.end_a_stub();
	}
	// size of call stub, compiled java to interpreter
	uint size_java_to_interp() {
		return 20;
	}
	// relocation entries for call stub, compiled java to interpreter
	uint reloc_java_to_interp() {
		return 4;  // 3 in emit_java_to_interp + 1 in Java_Static_Call
	}

	//=============================================================================
#ifndef PRODUCT
	void MachUEPNode::format( PhaseRegAlloc *ra_ ) const {
		///tty->print_cr(  "CMP    EAX,[ECX+4]\t# Inline cache check");
		///tty->print_cr("\tJNE    OptoRuntime::handle_ic_miss_stub");
		///tty->print_cr("\tNOP");
		///tty->print_cr("\tNOP");
		tty->print_cr("\tlw		AT, %d(%s)", oopDesc::klass_offset_in_bytes(), RECEIVER->name());
		tty->print_cr("\tbeq	AT, %s, L", IC_Klass->name());
		tty->print_cr("\tnop");
		tty->print_cr("\tjmp	OptoRuntime::handle_ic_miss_stub");
		tty->print_cr("\tnop");
		tty->print_cr("L:");
		if( !OptoBreakpoint )
			tty->print_cr("nop");
	}
#endif

	void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
		MacroAssembler masm(&cbuf);
#define __ masm.
#ifdef ASSERT
		uint code_size = cbuf.code_size();
#endif
		///masm.cmpl(eax, Address(ecx, oopDesc::klass_offset_in_bytes()));
		Label L;
		///masm.jcc(Assembler::notEqual, OptoRuntime::handle_ic_miss_stub(), relocInfo::runtime_call_type);
		__ lw(AT, RECEIVER, oopDesc::klass_offset_in_bytes());
		__ beq(AT, IC_Klass, L);
		__ delayed()->nop();
		__ jmp(OptoRuntime::handle_ic_miss_stub(), relocInfo::runtime_call_type);
		__ delayed()->nop();
		__ bind(L);
		/* WARNING these NOPs are critical so that verified entry point is properly
			 aligned for patching by NativeJump::patch_verified_entry() */
		// no need now for godson2. by yjl 2/22/2006
		///masm.nop();
		///masm.nop();
		if( !OptoBreakpoint ) // Leave space for int3
			///   masm.nop();
			__ nop();

		assert(cbuf.code_size() - code_size == size(ra_), "checking code size of inline cache node");
	}

	uint MachUEPNode::size(PhaseRegAlloc *ra_) const {
		return OptoBreakpoint ? 20 : 24;
	}

	uint offset_start_of_table() {
		return 0;
	}

//=============================================================================
#ifndef PRODUCT
	void MachC2IEntriesNode::format( PhaseRegAlloc *ra_ ) const {
		int ic_reg  = Matcher::inline_cache_reg();
		int rec_reg = Matcher::compiler_method_oop_reg();
		const char *ic_name  = Matcher::regName[ic_reg];
		const char *rec_name = Matcher::regName[rec_reg];
		const char *fp_name  = "FP";

		tty->print_cr("------ MKH Unverified Entry Point");
		int disp = oopDesc::klass_offset_in_bytes();
		// Access receiver klass: this->klass
		///tty->print_cr( "\tMOV    %s,[%s+%d]\t# Receiver klass", tmp_name, rec_name, disp);
		tty->print_cr("\tlw\t\tAT, %d(%s)", disp, rec_name);
		disp = compiledICHolderOopDesc::holder_klass_offset();
		///tty->print_cr( "\tCMP    %s,[%s+compiledICHolderOopDesc::holder_klass_offset()  %d]", tmp_name, ic_name, disp);
		tty->print_cr("\tlw\t\tT8, %d(%s)", disp, ic_name);
		tty->print_cr("\tbne\t\tAT, T8, OptoRuntime::handle_ic_miss_stub");

		// Unpack compiledIC                 
		disp = compiledICHolderOopDesc::holder_method_offset();
		///tty->print_cr( "\tMOV    %s,[%s+compiledICHolderOopDesc::holder_method_offset() %d]", ic_name, ic_name, disp);
		tty->print_cr("\tmove\t%s, %d(%s)", ic_name, disp, ic_name);

		// Jump to inline cache miss fixup if check fails                 
		///tty->print_cr( "\tJNE    OptoRuntime::handle_ic_miss_stub");

		tty->print_cr( "------ Std Verified Entry Point");
	}
#endif

	void MachC2IEntriesNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
		int ic_reg  = Matcher::inline_cache_reg();
		int rec_reg = Matcher::compiler_method_oop_reg();
		int ic_encode  = Matcher::_regEncode[ic_reg];
		int rec_encode = Matcher::_regEncode[rec_reg];
		///int tmp_encode = Matcher::_regEncode[tmp_reg];	// no tmp_reg here; AT and T8 serve as scratch below

		// !!!!!
		// Check that the registers are distinct, check ic_reg != rec_reg
		// checked in ADLC
		assert( ic_reg != rec_reg, "registers must be distinct");
		///assert( ic_reg != tmp_reg, "registers must be distinct");
		// Check that these registers are caller-saved
		assert( register_save_policy[ic_reg] == 'C' || 
				register_save_policy[ic_reg] == 'A',
				"This register must be caller-saved or always-saved.\n");
		assert( register_save_policy[rec_reg] == 'C' || 
				register_save_policy[rec_reg] == 'A',
				"This register must be caller-saved or always-saved.\n");
		///assert( register_save_policy[tmp_reg] == 'C' || 
		///		register_save_policy[tmp_reg] == 'A',
		///		"This register must be caller-saved or always-saved.\n");

		MacroAssembler masm(&cbuf);
#define __ masm.
		Label L;

		// ------ MKH Entry Point, Unverified
		// Receives the MethodKlassHolder in inline_cache_reg

		// size 13+6
		// Access "this" pointer from stack

		// Access receiver klass: this->klass
		int disp     = oopDesc::klass_offset_in_bytes();
		///assert( -128 <= disp && disp <= 127, "klass_offset_in_bytes is small");
		///emit_opcode(cbuf, 0x8B);   // MOV tmp_reg,[rec_reg+klass_offset_in_bytes]
		///emit_rm(cbuf, 0x01, tmp_encode, rec_encode ); // R/M byte
		///emit_d8(cbuf, disp);              // Displacement
		__ lw(AT, rec_encode, disp);
		

		// Compare this->klass, in rec_reg, with inline_cached_klass
		disp     = compiledICHolderOopDesc::holder_klass_offset();
		///assert( -128 <= disp && disp <= 127, "holder_klass_offset is small displacement");
		///emit_opcode(cbuf, 0x3B);  // CMP tmp_reg,[ic_reg+holder_klass_offset]
		///emit_rm(cbuf, 0x01, tmp_encode, ic_encode );  // R/M byte
		///emit_d8(cbuf, disp );              // Displacement
		__ lw(T8, ic_encode, disp);

		__ beq(AT, T8, L);
		__ delayed()->nop();

		// Access method_oop from compiledICHolder
		disp     = compiledICHolderOopDesc::holder_method_offset();
		///assert( -128 <= disp && disp <= 127, "holder_method_offset is small");
		///emit_opcode(cbuf, 0x8B);     // MOV    ic_reg,[ic_reg+holder_method_offset]
		///emit_rm(cbuf, 0x01, ic_encode, ic_encode ); // R/M byte
		///emit_d8(cbuf, disp);              // Displacement

		// I don't think we need the runtime call relocation here.
		// FIXME by yjl 2/24/2005
		__ lui(T9, Assembler::split_high((int)OptoRuntime::handle_ic_miss_stub()));
		__ addiu(T9, T9, Assembler::split_low((int)OptoRuntime::handle_ic_miss_stub()));
		__ jr(T9);
		__ delayed()->lw(ic_encode, ic_encode, disp);

		__ bind(L);

		///cbuf.set_mark();
		///emit_opcode(cbuf, 0x0F);           // JNE    FIXUP
		///emit_opcode(cbuf, 0x85);
		// Grab address for fixup branch in unvalidated entry
		///address addr = OptoRuntime::handle_ic_miss_stub();
		///emit_d32_reloc(cbuf, addr - cbuf.code_end()-sizeof(int32), 
		///		runtime_call_Relocation::spec(), RELOC_IMM32 );

		// ------ Std Verified Entry Point
		// Receives a method oop in inline_cache_reg  
#undef __
	}

	uint MachC2IEntriesNode::size(PhaseRegAlloc *ra_) const {
		return 32;
	}

	//=============================================================================

#ifndef PRODUCT
	void MachC2IcheckICNode::format( PhaseRegAlloc *ra_ ) const {
		// Get the register. The inline cache register will contain the methodOop at
		// this point; compiler_method_oop_reg is used temporarily.
		int method_oop = Matcher::inline_cache_reg();

		const char *method_oop_name  = Matcher::regName[method_oop];

		tty->print_cr( "------ checkIC ------");
		int disp = in_bytes(methodOopDesc::compiled_code_offset());
		///tty->print_cr( "\tMOV    %s,[%s+in_bytes(methodOopDesc::compiled_code_offset()) %d]", temp_name, method_oop_name, disp);
		///tty->print_cr( "\tTEST   %s, %s\t# code exists?" , temp_name, temp_name);
		///tty->print_cr( "\tJNE    OptoRuntime::handle_wrong_method_stub()");
		tty->print_cr("\tlw\t\tAT, %d(%s)", disp, method_oop_name);
		tty->print_cr("\tbnez\tAT, OptoRuntime::handle_wrong_method_stub()");
	}
#endif

	void MachC2IcheckICNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {  
		int method_oop_reg     = Matcher::inline_cache_reg();
		int method_oop_encode  = Matcher::_regEncode[method_oop_reg];

		MacroAssembler masm(&cbuf);
#define __ masm.

		Label L;

		// Access code field from methodOop
		int disp = in_bytes(methodOopDesc::compiled_code_offset());
		///assert( -128 <= disp && disp <= 127, "code offset to big");
		// MOV    temp_reg,[method_oop_reg+methodOop::compiled_code_offset_in_bytes]
		///emit_opcode(cbuf, 0x8B);
		///emit_rm(cbuf, 0x01, temp_encode, method_oop_encode ); // R/M byte
		///emit_d8(cbuf, disp);              // Displacement
		__ lw(AT, method_oop_encode, disp);
		__ beq(AT, ZERO, L);	// no compiled code yet: skip the handle_wrong_method jump
		__ delayed()->nop();


		// TEST temp_reg, temp_reg
		///emit_opcode(cbuf, 0x85);
		///emit_rm(cbuf, 0x03 ,temp_encode, temp_encode);

		// jne clear_ic_stub()
		///cbuf.set_mark();
		///emit_opcode(cbuf, 0x0F);           
		///emit_opcode(cbuf, 0x85);
		// Grab address for fixup branch in unvalidated entry
		///address addr = OptoRuntime::handle_wrong_method_stub();
		///emit_d32_reloc(cbuf, addr - cbuf.code_end()-sizeof(int32), 
		///              runtime_call_Relocation::spec(), RELOC_IMM32 );  
		__ lui(T9, Assembler::split_high((int)OptoRuntime::handle_wrong_method_stub()));
		__ addiu(T9, T9, Assembler::split_low((int)OptoRuntime::handle_wrong_method_stub()));
		__ jr(T9);
		__ delayed()->nop();

		__ bind(L);
#undef __
	}

	uint MachC2IcheckICNode::size(PhaseRegAlloc *ra_) const {
		return 28;
	}

	//=============================================================================
	// Emit exception handler code.  Stuff framesize into a register
	// and call a VM stub routine.
	void emit_exception_handler( CodeBuffer &cbuf ) {
		MacroAssembler masm(&cbuf);
#define __ masm.

		// Lazy deopt bug 4932387. If the last instruction is a call then we
		// need an area to patch where we won't overwrite the exception
		// handler. The four nops below leave room for a NativeCall-sized patch.
		///for (int i = 0; i < NativeCall::instruction_size ; i++ ) {
		///	emit_opcode(cbuf, 0x90);
		///}
		__ nop();
		__ nop();
		__ nop();
		__ nop();

		// Now mark the functional start of the exception handler
		cbuf.set_exception_offset(cbuf.code_size());
		///cbuf.set_mark();
		///emit_opcode(cbuf, 0xE9);        // jmp    entry
		///emit_d32_reloc(cbuf, ((int)OptoRuntime::exception_blob()->instructions_begin()) - ((int)cbuf.code_end())-4, 
		///		runtime_call_Relocation::spec(), RELOC_IMM32 );
		__ lui(T9, Assembler::split_high((int)OptoRuntime::exception_blob()->instructions_begin()));
		__ addiu(T9, T9, Assembler::split_low((int)OptoRuntime::exception_blob()->instructions_begin()));
		__ jr(T9);
		__ delayed()->nop();
#undef __
	}

	uint size_exception_handler() {
		// NativeCall instruction size is the same as NativeJump.
		// The exception handler starts out as a jump and can be patched to
		// a call by deoptimization. The *2 is because of the padding
		// we need to make sure that deopt patches don't accidentally
		// overwrite patched exception handler (4932387)
		///return 2*NativeCall::instruction_size;
		return 32;
	}

	int Matcher::regnum_to_fpu_offset(int regnum) {
		return regnum - 32; // The FP registers are in the second chunk
	}
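	// Example (assuming the reg_def order above, 32 integer registers first):
	// F0 is OptoReg number 32, so regnum_to_fpu_offset(32) returns 0.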

	bool is_positive_zero_float(jfloat f) {
		return jint_cast(f) == jint_cast(0.0F);
	}

	bool is_positive_one_float(jfloat f) {
		return jint_cast(f) == jint_cast(1.0F);
	}

	bool is_positive_zero_double(jdouble d) {
		return jlong_cast(d) == jlong_cast(0.0);
	}

	bool is_positive_one_double(jdouble d) {
		return jlong_cast(d) == jlong_cast(1.0);
	}

	// JumpTable support
	const bool Matcher::jumpTableSupported(void) {
		return false;
	}

	// This is UltraSparc specific, true just means we have fast l2f conversion
	const bool Matcher::convL2FSupported(void) {
		return true;
	}

	// Is this branch offset short enough that a short branch can be used?
	//
	// NOTE: If the platform does not provide any short branch variants, then
	//       this method should return false for offset 0.
	///bool Matcher::is_short_branch_offset(int offset) {
	///	return (-128 <= offset && offset <= 127);
	///}

	// Should the Matcher clone shifts on addressing modes, expecting them to
	// be subsumed into complex addressing expressions or compute them into
	// registers?  True for Intel but false for most RISCs
	const bool Matcher::clone_shift_expressions = false;

	// Is it better to copy float constants, or load them directly from memory?
	// Intel can load a float constant from a direct address, requiring no
	// extra registers.  Most RISCs will have to materialize an address into a
	// register first, so they would do better to copy the constant from stack.
	const bool Matcher::rematerialize_float_constants = false;

	// If CPU can load and store mis-aligned doubles directly then no fixup is 
	// needed.  Else we split the double into 2 integer pieces and move it 
	// piece-by-piece.  Only happens when passing doubles into C code as the 
	// Java calling convention forces doubles to be aligned.
	const bool Matcher::misaligned_doubles_ok = false;


	// What does this mean? Just leave it as it is for now.
	// by yjl 2/24/2006
	void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
		// Get the memory operand from the node
		uint numopnds = node->num_opnds();        // Virtual call for number of operands
		uint skipped  = node->oper_input_base();  // Sum of leaves skipped so far
		assert( idx >= skipped, "idx too low in pd_implicit_null_fixup" ); 
		uint opcnt     = 1;                 // First operand
		uint num_edges = node->_opnds[1]->num_edges(); // leaves for first operand 
		while( idx >= skipped+num_edges ) {
			skipped += num_edges;
			opcnt++;                          // Bump operand count
			assert( opcnt < numopnds, "Accessing non-existent operand" );
			num_edges = node->_opnds[opcnt]->num_edges(); // leaves for next operand
		}

		MachOper *memory = node->_opnds[opcnt];
		MachOper *new_memory = NULL;
		switch (memory->opcode()) {
			case DIRECT:
			case INDOFFSET32X:
				// No transformation necessary.
				return;
			case INDIRECT:
				new_memory = new indirect_win95_safeOper( );
				break;
			case INDOFFSET8:
				new_memory = new indOffset8_win95_safeOper(memory->disp(NULL, NULL, 0));
				break;
			case INDOFFSET32:
				new_memory = new indOffset32_win95_safeOper(memory->disp(NULL, NULL, 0));
				break;
			case INDINDEXOFFSET:
				new_memory = new indIndexOffset_win95_safeOper(memory->disp(NULL, NULL, 0));
				break;
			case INDINDEXSCALE:
				new_memory = new indIndexScale_win95_safeOper(memory->scale());
				break;
			case INDINDEXSCALEOFFSET:
				new_memory = new indIndexScaleOffset_win95_safeOper(memory->scale(), memory->disp(NULL, NULL, 0));
				break;
			case LOAD_LONG_INDIRECT:
			case LOAD_LONG_INDOFFSET32:
				// Does not use EBP as address register, use { EDX, EBX, EDI, ESI}
				return;
			default:
				assert(false, "unexpected memory operand in pd_implicit_null_fixup()");
				return;
		}
		node->_opnds[opcnt] = new_memory;
	}

	// Advertise here if the CPU requires explicit rounding operations
	// to implement the UseStrictFP mode.
	const bool Matcher::strict_fp_requires_explicit_rounding = false;

	// Do floats take an entire double register or just half?
	const bool Matcher::float_in_double = true;
	// Do ints take an entire long register or just half?
	const bool Matcher::int_in_long = false;


	// What is the range of offsets for allocator spill instructions?  
	// Offsets larger than this will encode to a 'large' instruction and 
	// offsets same size or smaller will encode to a 'small' instruction.
	// On Sparc the 'small' offset is from 0 to 4096; offsets larger than
	// this will not have any sane encoding (there's no spare register to
	// build up a large offset).  However, 4096 should be plenty large 
	// enough.  On Intel the 'small' offset is from 0 to 127; 'large' offsets
	// are +128 on up.  The allocator will match both large and small versions
	// of load/store [SP+offset] instructions, and will clone such instructions
	// in fixup_spills and patch in the correct offset.
	///const int Matcher::short_spill_offset_limit = 128;

	// Return whether or not this register is ever used as an argument.  This
	// function is used on startup to build the trampoline stubs in generateOptoStub.  
	// Registers not mentioned will be killed by the VM call in the trampoline, and 
	// arguments in those registers will not be available to the callee.
	bool Matcher::can_be_arg( int reg ) {
		///f(  reg == ECX_num   || reg == EDX_num   ) return true;
		///if( (reg == XMM0a_num || reg == XMM1a_num) && UseSSE>=1 ) return true;
		///if( (reg == XMM0b_num || reg == XMM1b_num) && UseSSE==2 ) return true;
		if (reg>=A0_num && reg<=A3_num) return true;
		if (reg>=F12_num && reg<=F15_num) return true;
		return false;
	}
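	// This mirrors the o32 calling convention assumed elsewhere in this file:
	// the first four integer/pointer arguments arrive in A0..A3 and the first
	// two FP arguments in F12 and F14 (F13/F15 carry their upper halves).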

	bool Matcher::is_spillable_arg( int reg ) {
		return can_be_arg(reg);
	}
%}

//----------ENCODING BLOCK-----------------------------------------------------
// This block specifies the encoding classes used by the compiler to output
// byte streams.  Encoding classes generate functions which are called by
// Machine Instruction Nodes in order to generate the bit encoding of the
// instruction.  Operands specify their base encoding interface with the
// interface keyword.  Four interfaces are currently supported:
// REG_INTER, CONST_INTER, MEMORY_INTER, & COND_INTER.  REG_INTER causes an
// operand to generate a function which returns its register number when
// queried.   CONST_INTER causes an operand to generate a function which
// returns the value of the constant when queried.  MEMORY_INTER causes an
// operand to generate four functions which return the Base Register, the
// Index Register, the Scale Value, and the Offset Value of the operand when
// queried.  COND_INTER causes an operand to generate six functions which
// return the encoding code (ie - encoding bits for the instruction)
// associated with each basic boolean condition for a conditional instruction.
// Instructions specify two basic values for encoding.  They use the
// ins_encode keyword to specify their encoding class (which must be one of
// the class names specified in the encoding block), and they use the
// opcode keyword to specify, in order, their primary, secondary, and
// tertiary opcode.  Only the opcode sections which a particular instruction
// needs for encoding need to be specified.
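// For example (illustrative only, not an instruction defined in this file), an
// instruct would tie the two together roughly like this:
//   instruct loadI_example(iRegI dst, memory mem) %{
//     match(Set dst (LoadI mem));
//     opcode(0x23);                  // lw; supplied to the enc_class as $primary
//     ins_encode( orri(mem, dst) );
//   %}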
encode %{
	enc_class orri( memory mem, iRegI dst ) %{
		// I-type encoding: opcode from $primary, base register as rs,
		// destination as rt, displacement as the 16-bit immediate.
		emit_orri(cbuf, this, $primary, $mem$$base, $dst$$reg, $mem$$disp);
	%}
					
%}


//---------mFRAME--------------------------------------------------------------
// Definition of frame structure and management information.
//
//  S T A C K   L A Y O U T    Allocators stack-slot number
//                             |   (to get allocators register number
//  G  Owned by    |        |  v    add SharedInfo::stack0)
//  r   CALLER     |        |
//  o     |        +--------+      pad to even-align allocators stack-slot 
//  w     V        |  pad0  |        numbers; owned by CALLER
//  t   -----------+--------+----> Matcher::_in_arg_limit, unaligned
//  h     ^        |   in   |  5   
//        |        |  args  |  4   Holes in incoming args owned by SELF
//  |     |    old |        |  3
//  |     |     SP-+--------+----> Matcher::_old_SP, even aligned
//  v     |        |  ret   |  3   return address
//     Owned by    +--------+
//      Self       |  pad2  |  2   pad to align old SP
//        |        +--------+  1
//        |        | locks  |  0
//        |        +--------+----> SharedInfo::stack0, even aligned  
//        |        |  pad1  | 11   pad to align new SP
//        |        +--------+
//        |        |        | 10
//        |        | spills |  9   spills
//        V        |        |  8   (pad0 slot for callee)
//      -----------+--------+----> Matcher::_out_arg_limit, unaligned
//        ^        |  out   |  7   
//        |        |  args  |  6   Holes in outgoing args owned by CALLEE
//   Owned by  new |        |
//    Callee    SP-+--------+----> Matcher::_new_SP, even aligned
//                 |        |
//
// Note 1: Only region 8-11 is determined by the allocator.  Region 0-5 is 
//         known from SELF's arguments and the Java calling convention.
//         Region 6-7 is determined per call site.
// Note 2: If the calling convention leaves holes in the incoming argument 
//         area, those holes are owned by SELF.  Holes in the outgoing area
//         are owned by the CALLEE.  Holes should not be necessary in the
//         incoming area, as the Java calling convention is completely under
//         the control of the AD file.  Doubles can be sorted and packed to
//         avoid holes.  Holes in the outgoing arguments may be necessary for
//         varargs C calling conventions.
// Note 3: Region 0-3 is even aligned, with pad2 as needed.  Region 3-5 is 
//         even aligned with pad0 as needed.
//         Region 6 is even aligned.  Region 6-7 is NOT even aligned;
//         region 6-11 is even aligned; it may be padded out more so that
//         the region from SP to FP meets the minimum stack alignment.
// Note 4: For I2C adapters, the incoming FP may not meet the minimum stack
//         alignment.  Region 11, pad1, may be dynamically extended so that
//         SP meets the minimum alignment.

frame %{
  stack_direction(TOWARDS_LOW);

  // These three registers define part of the calling convention 
  // between compiled code and the interpreter.
	// SEE StartI2CNode::calling_convention & StartC2INode::calling_convention & StartOSRNode::calling_convention 
	// for more information. by yjl 3/16/2006
  inline_cache_reg(IC_Klass);          // Inline Cache Register or methodOop for I2C
  interpreter_arg_ptr_reg(A0);         // Argument pointer for I2C adapters
  compiler_method_oop_reg(RECEIVER);   // Temporary in compiled entry-points
  interpreter_method_oop_reg(T7);      // Method Oop Register when calling interpreter

  // Optional: name the operand used by cisc-spilling to access [stack_pointer + offset]
  ///cisc_spilling_operand_name(indOffset32);  

  // Number of stack slots consumed by locking an object
	// generate Compile::sync_stack_slots
  sync_stack_slots(1);

  frame_pointer(SP);
  // Interpreter stores its frame pointer in a register which is 
  // stored to the stack by I2CAdaptors.
  // I2CAdaptors convert from interpreted java to compiled java.
  interpreter_frame_pointer(FP);

	// generate Matcher::stack_alignment
  stack_alignment(6);            // Log of alignment size in bits (64-bit -> 6)

  // Number of stack slots between incoming argument block and the start of 
  // a new frame.  The PROLOG must add this many slots to the stack.  The
  // EPILOG must remove this many slots.  Intel needs one slot for
  // return address.
	// generate Matcher::in_preserve_stack_slots
  in_preserve_stack_slots(VerifyStackAtCalls);

  // Number of stack slots reserved just above SELF's SP.
  // After a call, these remain between outgoing parameters and callee's frame.
  out_preserve_stack_slots(0);

  // Number of outgoing stack slots killed above the out_preserve_stack_slots
  // for calls to C.  Supports the var-args backing area for register parms.
  varargs_C_out_slots_killed(0);

  // The after-PROLOG location of the return address.  Location of
  // return address specifies a type (REG or STACK) and a number
  // representing the register number (i.e. - use a register name) or
  // stack slot.
  // Ret Addr is on stack in slot 0 if no locks or verification or alignment.
  // Otherwise, it is above the locks and verification slot and alignment word
  return_addr(STACK - 1 + round_to(1 + VerifyStackAtCalls + Compile::current()->sync() * Compile::current()->sync_stack_slots(), WordsPerLong));

  // Body of function which returns an integer array locating
  // arguments either in registers or in stack slots.  Passed an array
  // of ideal registers called "sig" and a "length" count.  Stack-slot
  // offsets are based on outgoing arguments, i.e. a CALLER setting up
  // arguments for a CALLEE.  Incoming stack arguments are
  // automatically biased by the preserve_stack_slots field above.
	
	// will be generated as Matcher::calling_convention(OptoRegPair *sig, uint length, bool is_outgoing)
	// StartNode::calling_convention calls this. by yjl 3/16/2006
	// (an illustrative trace of the assignment rules follows the block below)
  calling_convention %{           
    uint    stack = 0;          // Starting stack position for args on stack
    int ireg=A0_num, freg = F12_num;
		
		// Now pick where all else goes.
    for( int i = 0; i < length; i++) {
      // From the type and the argument number (count) compute the location
      switch( sig[i].ideal_reg() ) {
      case Op_RegI:
      case Op_RegP:
        if( stack<4 )  {
          sig[i].set1(ireg++); stack++; freg++;
        } else {
          sig[i].set1(SharedInfo::stack2reg(stack++));
        }
        break;
      case Op_RegF:
        if( stack<4 ) {
          sig[i].set1(freg++); stack++; ireg++;
        } else {
          sig[i].set1(SharedInfo::stack2reg(stack++));
        }
        break;
      case Op_RegL:      
				//align first
				if ( stack%2 ) {
					stack++; ireg++; freg++;
				}
				if ( stack<4 ) {
					sig[i].set2(ireg); ireg+=2; freg+=2; stack+=2;
				} else {
					sig[i].set2(SharedInfo::stack2reg(stack)); stack+=2;	
				}
        break;
      case Op_RegD:
				//align first
				if ( stack%2 ) {
					stack++; ireg++; freg++;
				}
				if ( stack<4 ) {
					sig[i].set2(freg); ireg+=2; freg+=2; stack+=2;
				} else {
					sig[i].set2(SharedInfo::stack2reg(stack)); stack+=2;	
				}
        break;
      case 0: sig[i].set_bad(); break;
      default:
        ShouldNotReachHere();
        break;
      }
    }

  %}
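
  // Illustrative trace of the rules above (descriptive comment only, the
  // values follow from the code): for a signature (int, long, int) the loop
  // assigns
  //   arg0 (Op_RegI): A0              (stack = 1)
  //   arg1 (Op_RegL): A2:A3 pair      (aligned first, stack = 4)
  //   arg2 (Op_RegI): stack slot 4    (register arguments exhausted)
  // The integer and float argument cursors advance together, so each argument
  // consumes its slot in both banks, which mirrors the MIPS o32 convention.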


  // Body of function which returns an integer array locating
  // arguments either in registers or in stack slots.  Passed an array
  // of ideal registers called "sig" and a "length" count.  Stack-slot
  // offsets are based on outgoing arguments, i.e. a CALLER setting up
  // arguments for a CALLEE.  Incoming stack arguments are
  // automatically biased by the preserve_stack_slots field above.

	// SEE CallRuntimeNode::calling_convention for more information. by yjl 3/16/2006
  c_calling_convention %{           
		uint    stack = 0;          // Starting stack position for args on stack
    int ireg=A0_num, freg = F12_num;
		
		// Now pick where all else goes.
    for( int i = 0; i < length; i++) {
      // From the type and the argument number (count) compute the location
      switch( sig[i].ideal_reg() ) {
      case Op_RegI:
      case Op_RegP:
        if( stack<4 )  {
          sig[i].set1(ireg++); stack++; freg++;
        } else {
          sig[i].set1(SharedInfo::stack2reg(stack++));
        }
        break;
      case Op_RegF:
        if( stack<4 ) {
          sig[i].set1(freg++); stack++; ireg++;
        } else {
          sig[i].set1(SharedInfo::stack2reg(stack++));
        }
        break;
      case Op_RegL:      
				//align first
				if ( stack%2 ) {
					stack++; ireg++; freg++;
				}
				if ( stack<4 ) {
					sig[i].set2(ireg); ireg+=2; freg+=2; stack+=2;
				} else {
					sig[i].set2(SharedInfo::stack2reg(stack)); stack+=2;	
				}
        break;
      case Op_RegD:
				//align first
				if ( stack%2 ) {
					stack++; ireg++; freg++;
				}
				if ( stack<4 ) {
					sig[i].set2(freg); ireg+=2; freg+=2; stack+=2;
				} else {
					sig[i].set2(SharedInfo::stack2reg(stack)); stack+=2;	
				}
        break;
      case 0: sig[i].set_bad(); break;
      default:
        ShouldNotReachHere();
        break;
      }
    }
  %}

  // Location of C & interpreter return values
	// register(s) contain(s) return value for Op_StartI2C and Op_StartOSR. 
	// SEE Matcher::match. by yjl 3/16/2006
  c_return_value %{
    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
    static int lo[Op_RegL+1] = { 0, 0, V0_num,      V0_num,      F0_num,    F0_num, V0_num };
    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, F1_num, V1_num };
    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  %}

  // Location of return values
	// register(s) contain(s) return value for Op_StartC2I and Op_Start. 
	// SEE Matcher::match. by yjl 3/16/2006
  return_value %{
    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
    static int lo[Op_RegL+1] = { 0, 0, V0_num,      V0_num,      F0_num,    F0_num, V0_num };
    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, F1_num, V1_num };
    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  %}

%}

//----------ATTRIBUTES---------------------------------------------------------
//----------Operand Attributes-------------------------------------------------
op_attrib op_cost(0);        // Required cost attribute

//----------Instruction Attributes---------------------------------------------
ins_attrib ins_cost(100);       // Required cost attribute
ins_attrib ins_size(32);         // Required size attribute (in bits)
ins_attrib ins_pc_relative(0);  // Required PC Relative flag
ins_attrib ins_short_branch(0); // Required flag: is this instruction a
                                // non-matching short branch variant of some
                                // long branch?
ins_attrib ins_alignment(4);    // Required alignment attribute (must be a power of 2)
                                // specifies the alignment that some part of the instruction (not
                                // necessarily the start) requires.  If > 1, a compute_padding()
                                // function must be provided for the instruction

//----------OPERANDS-----------------------------------------------------------
// Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user defined types which are used in
// instruction definitions.

//----------Simple Operands----------------------------------------------------
// Immediate Operands
// Integer Immediate
operand immI() %{
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for test vs zero
operand immI0() %{
  predicate( n->get_int() == 0 );
  match(ConI);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for test vs not-zero
operand immInz() %{
  predicate( n->get_int() != 0 );
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for test vs not-minus-1
operand immInone() %{
  predicate( n->get_int() != -1 );
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for increment
operand immI1() %{
  predicate( n->get_int() == 1 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant for decrement
operand immI_M1() %{
  predicate( n->get_int() == -1 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Valid scale values for addressing modes
operand immI2() %{
  predicate(0 <= n->get_int() && (n->get_int() <= 3));
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI8() %{
  predicate((-128 <= n->get_int()) && (n->get_int() <= 127));
  match(ConI);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI16() %{
  predicate((-32768 <= n->get_int()) && (n->get_int() <= 32767));
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for long shifts
operand immI_32() %{
  predicate( n->get_int() == 32 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Integer Immediate: the value 10
operand immI10() %{
  predicate(n->get_int() == 10);
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI_1_31() %{
  predicate( n->get_int() >= 1 && n->get_int() <= 31 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI_32_63() %{
  predicate( n->get_int() >= 32 && n->get_int() <= 63 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Pointer Immediate
operand immP() %{
  match(ConP);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// NULL Pointer Immediate
operand immP0() %{
  predicate( n->get_ptr() == 0 );
  match(ConP);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate
operand immL() %{
  match(ConL);

  op_cost(20);
  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate zero
operand immL0() %{
  predicate( n->get_long() == 0L );
  match(ConL);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Long immediate from 0 to 127.
// Used for a shorter form of long mul by 10.
operand immL_127() %{
  predicate((0 <= n->get_long()) && (n->get_long() <= 127));
  match(ConL);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: low 32-bit mask
operand immL_32bits() %{
  predicate(n->get_long() == 0xFFFFFFFFL);
  match(ConL);
  op_cost(20);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: low 32-bit mask
operand immL32() %{
  predicate(n->get_long() == (int)(n->get_long()));
  match(ConL);
  op_cost(20);

  format %{ %}
  interface(CONST_INTER);
%}

//Double Immediate zero
operand immD0() %{
  // Do additional (and counter-intuitive) test against NaN to work around VC++
  // bug that generates code such that NaNs compare equal to 0.0
  predicate( n->getd() == 0.0 && !g_isnan(n->getd()) );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate 
operand immD1() %{
  predicate( n->getd() == 1.0 );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate
operand immD() %{
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

operand immXD() %{
  predicate(UseSSE == 2);
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate zero
operand immXD0() %{
  // Do additional (and counter-intuitive) test against NaN to work around VC++
  // bug that generates code such that NaNs compare equal to 0.0 AND do not
  // compare equal to -0.0.
  predicate( UseSSE==2 && jlong_cast(n->getd()) == 0 );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate zero
operand immF0() %{
  predicate( n->getf() == 0.0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate
operand immF() %{
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate
operand immXF() %{
  predicate(UseSSE >= 1);
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate zero.  Zero and not -0.0
operand immXF0() %{
  predicate( UseSSE >= 1 && jint_cast(n->getf()) == 0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Immediates for special shifts (sign extend)

// Constants for increment
operand immI_16() %{
  predicate( n->get_int() == 16 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI_24() %{
  predicate( n->get_int() == 24 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant for byte-wide masking
operand immI_255() %{
  predicate( n->get_int() == 255 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Register Operands
// Integer Register
operand eRegI() %{
  constraint(ALLOC_IN_RC(e_reg));
  match(RegI);
  match(xRegI);
  match(eAXRegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eDXRegI);
  match(eDIRegI);
  match(eSIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Subset of Integer Register
operand xRegI(eRegI reg) %{
  constraint(ALLOC_IN_RC(x_reg));
  match(reg);
  match(eAXRegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eDXRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Special Registers
operand eAXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(eax_reg));
  match(reg);
  match(eRegI);

  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Special Registers
operand eBXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(ebx_reg));
  match(reg);
  match(eRegI);

  format %{ "EBX" %}
  interface(REG_INTER);
%}

operand eCXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(ecx_reg));
  match(reg);
  match(eRegI);

  format %{ "ECX" %}
  interface(REG_INTER);
%}

operand eDXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(edx_reg));
  match(reg);
  match(eRegI);

  format %{ "EDX" %}
  interface(REG_INTER);
%}

operand eDIRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(edi_reg));
  match(reg);
  match(eRegI);

  format %{ "EDI" %}
  interface(REG_INTER);
%}

operand naxRegI() %{
  constraint(ALLOC_IN_RC(nax_reg));
  match(RegI);
  match(eCXRegI);
  match(eDXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

operand nadxRegI() %{
  constraint(ALLOC_IN_RC(nadx_reg));
  match(RegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

operand ncxRegI() %{
  constraint(ALLOC_IN_RC(ncx_reg));
  match(RegI);
  match(eAXRegI);
  match(eDXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// // This operand was used by cmpFastUnlock, but conflicted with 'object' reg
// // 
operand eSIRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(esi_reg));
   match(reg);
   match(eRegI);
 
   format %{ "ESI" %}
   interface(REG_INTER);
%}

// Pointer Register
operand anyRegP() %{
  constraint(ALLOC_IN_RC(any_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);
  match(eRegP);

  format %{ %}
  interface(REG_INTER);
%}

operand eRegP() %{
  constraint(ALLOC_IN_RC(e_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// On windows95, EBP is not safe to use for implicit null tests.
operand eRegP_win95_safe() %{
  constraint(ALLOC_IN_RC(e_reg_win95_safe));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  op_cost(100);
  format %{ %}
  interface(REG_INTER);
%}

operand naxRegP() %{
  constraint(ALLOC_IN_RC(nax_reg));
  match(RegP);
  match(eBXRegP);
  match(eDXRegP);
  match(eCXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

operand nabxRegP() %{
  constraint(ALLOC_IN_RC(nabx_reg));
  match(RegP);
  match(eCXRegP);
  match(eDXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

operand pRegP() %{
  constraint(ALLOC_IN_RC(p_reg));
  match(RegP);
  match(eBXRegP);
  match(eDXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// Special Registers
// Return a pointer value
operand eAXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(eax_reg));
  match(reg);
  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Used in AtomicAdd
operand eBXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(ebx_reg));
  match(reg);
  format %{ "EBX" %}
  interface(REG_INTER);
%}

// Tail-call (interprocedural jump) to interpreter
operand eCXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(ecx_reg));
  match(reg);
  format %{ "ECX" %}
  interface(REG_INTER);
%}

operand eSIRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(reg);
  format %{ "ESI" %}
  interface(REG_INTER);
%}

// Used in rep stosw
operand eDIRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(edi_reg));
  match(reg);
  format %{ "EDI" %}
  interface(REG_INTER);
%}

operand eBPRegP() %{
  constraint(ALLOC_IN_RC(ebp_reg));
  match(RegP);
  format %{ "EBP" %}
  interface(REG_INTER);
%}

operand eRegL() %{
  constraint(ALLOC_IN_RC(long_reg));
  match(RegL);
  match(eADXRegL);

  format %{ %}
  interface(REG_INTER);
%}

operand eADXRegL( eRegL reg ) %{
  constraint(ALLOC_IN_RC(eadx_reg));
  match(reg);

  format %{ "EDX:EAX" %}
  interface(REG_INTER);
%}

operand eBCXRegL( eRegL reg ) %{
  constraint(ALLOC_IN_RC(ebcx_reg));
  match(reg);

  format %{ "EBX:ECX" %}
  interface(REG_INTER);
%}

// Special case for integer high multiply
operand eADXRegL_low_only() %{
  constraint(ALLOC_IN_RC(eadx_reg));
  match(RegL);

  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Flags register, used as output of compare instructions
operand eFlagsReg() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS" %}
  interface(REG_INTER);
%}

// Flags register, used as output of FLOATING POINT compare instructions
operand eFlagsRegU() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS_U" %}
  interface(REG_INTER);
%}

// Condition Code Register used by long compare
operand flagsReg_long_LTGE() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_LTGE" %}
  interface(REG_INTER);
%}
operand flagsReg_long_EQNE() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_EQNE" %}
  interface(REG_INTER);
%}
operand flagsReg_long_LEGT() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_LEGT" %}
  interface(REG_INTER);
%}

// Float register operands
operand regD() %{
  constraint(ALLOC_IN_RC(dbl_reg));
  match(RegD);
  match(regDPR1);
  match(regDPR2);
  format %{ %}
  interface(REG_INTER);
%}

operand regDPR1(regD reg) %{
  constraint(ALLOC_IN_RC(dbl_reg0));
  match(reg);
  format %{ "FPR1" %}
  interface(REG_INTER);
%}

operand regDPR2(regD reg) %{
  constraint(ALLOC_IN_RC(dbl_reg1));
  match(reg);
  format %{ "FPR2" %}
  interface(REG_INTER);
%}

// XMM Double register operands
operand regXD() %{
  predicate( UseSSE==2 );
  constraint(ALLOC_IN_RC(xdb_reg));
  match(RegD);
  format %{ %}
  interface(REG_INTER);
%}

// Float register operands
operand regF() %{
  constraint(ALLOC_IN_RC(flt_reg));
  match(RegF);
  match(regFPR1);
  format %{ %}
  interface(REG_INTER);
%}

// Float register operands
operand regFPR1(regF reg) %{
  constraint(ALLOC_IN_RC(flt_reg0));
  match(reg);
  format %{ "FPR1" %}
  interface(REG_INTER);
%}

// XMM register operands
operand regX() %{
  predicate( UseSSE>=1 );
  constraint(ALLOC_IN_RC(xmm_reg));
  match(RegF);
  format %{ %}
  interface(REG_INTER);
%}


//----------Memory Operands----------------------------------------------------
// Direct Memory Operand
operand direct(immP addr) %{
  match(addr);

  format %{ "[$addr]" %}
  interface(MEMORY_INTER) %{
    base(0xFFFFFFFF);
    index(0x4);
    scale(0x0);
    disp($addr);
  %}
%}

// Indirect Memory Operand
operand indirect(eRegP reg) %{
  constraint(ALLOC_IN_RC(e_reg));
  match(reg);

  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Short Offset Operand
operand indOffset8(eRegP reg, immI8 off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32(eRegP reg, immI off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32X(eRegI reg, immP off) %{
  match(AddP off reg);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndexOffset(eRegP reg, eRegI ireg, immI off) %{
  match(AddP (AddP reg ireg) off);

  op_cost(10);
  format %{"[$reg + $off + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndex(eRegP reg, eRegI ireg) %{
  match(AddP reg ireg);

  op_cost(10);
  format %{"[$reg + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp(0x0);
  %}
%}

// // -------------------------------------------------------------------------
// // 486 architecture doesn't support "scale * index + offset" without a base
// // -------------------------------------------------------------------------
// // Scaled Memory Operands
// // Indirect Memory Times Scale Plus Offset Operand
// operand indScaleOffset(immP off, eRegI ireg, immI2 scale) %{
//   match(AddP off (LShiftI ireg scale));
// 
//   op_cost(10);
//   format %{"[$off + $ireg << $scale]" %}
//   interface(MEMORY_INTER) %{
//     base(0x4);
//     index($ireg);
//     scale($scale);
//     disp($off);
//   %}
// %}

// Indirect Memory Times Scale Plus Index Register 
operand indIndexScale(eRegP reg, eRegI ireg, immI2 scale) %{
  match(AddP reg (LShiftI ireg scale));

  op_cost(10);
  format %{"[$reg + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp(0x0);
  %}
%}

// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
operand indIndexScaleOffset(eRegP reg, immI off, eRegI ireg, immI2 scale) %{
  match(AddP (AddP reg (LShiftI ireg scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp($off);
  %}
%}

//----------Load Long Memory Operands------------------------------------------
// The load-long idiom will use its address expression again after loading
// the first word of the long.  If the load-long destination overlaps with
// registers used in the addressing expression, the 2nd half will be loaded
// from a clobbered address.  Fix this by requiring that load-long use
// address registers that do not overlap with the load-long target.

// load-long support
operand load_long_RegP() %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(RegP);
  match(eSIRegP);
  op_cost(100);
  format %{  %}
  interface(REG_INTER);
%}

// Indirect Memory Operand Long
operand load_long_indirect(load_long_RegP reg) %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(reg);

  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand load_long_indOffset32(load_long_RegP reg, immI off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

opclass load_long_memory(load_long_indirect, load_long_indOffset32);


//----------Special Memory Operands--------------------------------------------
// Stack Slot Operand - This operand is used for loading and storing temporary
//                      values on the stack where a match requires a value to
//                      flow through memory.
operand stackSlotP(sRegP reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotI(sRegI reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotF(sRegF reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotD(sRegD reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotL(sRegL reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

//----------Memory Operands - Win95 Implicit Null Variants----------------
// Indirect Memory Operand
operand indirect_win95_safe(eRegP_win95_safe reg)
%{
  constraint(ALLOC_IN_RC(e_reg));
  match(reg);

  op_cost(100);
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Short Offset Operand
operand indOffset8_win95_safe(eRegP_win95_safe reg, immI8 off)
%{
  match(AddP reg off);

  op_cost(100);
  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32_win95_safe(eRegP_win95_safe reg, immI off)
%{
  match(AddP reg off);

  op_cost(100);
  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndexOffset_win95_safe(eRegP_win95_safe reg, eRegI ireg, immI off)
%{
  match(AddP (AddP reg ireg) off);

  op_cost(100);
  format %{"[$reg + $off + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Times Scale Plus Index Register 
operand indIndexScale_win95_safe(eRegP_win95_safe reg, eRegI ireg, immI2 scale)
%{
  match(AddP reg (LShiftI ireg scale));

  op_cost(100);
  format %{"[$reg + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp(0x0);
  %}
%}

// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
operand indIndexScaleOffset_win95_safe(eRegP_win95_safe reg, immI off, eRegI ireg, immI2 scale)
%{
  match(AddP (AddP reg (LShiftI ireg scale)) off);

  op_cost(100);
  format %{"[$reg + $off + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp($off);
  %}
%}

//----------Conditional Branch Operands----------------------------------------
// Comparison Op  - This is the operation of the comparison, and is limited to
//                  the following set of codes:
//                  L (<), LE (<=), G (>), GE (>=), E (==), NE (!=)
//
// Other attributes of the comparison, such as unsignedness, are specified
// by the comparison instruction that sets a condition code flags register.
// That result is represented by a flags operand whose subtype is appropriate
// to the unsignedness (etc.) of the comparison.
//
// Later, the instruction which matches both the Comparison Op (a Bool) and
// the flags (produced by the Cmp) specifies the coding of the comparison op
// by matching a specific subtype of Bool operand below, such as cmpOpU.
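//
// For a concrete pairing in this file, see cmovI_reg further below: it matches
//   Set dst (CMoveI (Binary cop cr) (Binary dst src))
// so the cmpOp (the Bool) and the eFlagsReg produced by the compare are
// consumed together, and the cmpOp's COND_INTER codes give enc_cmov its
// condition selector (an illustrative reading of existing rules, not new code).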

// Comparison Code
operand cmpOp() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4);
    not_equal(0x5);
    less(0xC);
    greater_equal(0xD);
    less_equal(0xE);
    greater(0xF);
  %}
%}

// Comparison Code, unsigned compare.  Used by FP also, with
// C2 (unordered) turned into GT or LT already.  The other bits
// C0 and C3 are turned into Carry & Zero flags.
operand cmpOpU() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4);
    not_equal(0x5);
    less(0x2);
    greater_equal(0x3);
    less_equal(0x6);
    greater(0x7);
  %}
%}

// Comparison Code for FP conditional move
operand cmpOp_fcmov() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal        (0x0C8);
    not_equal    (0x1C8);
    less         (0x0C0);
    greater_equal(0x1C0);
    less_equal   (0x0D0);
    greater      (0x1D0);
  %}
%}

// Comparison Code used in long compares
operand cmpOp_commute() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4);
    not_equal(0x5);
    less(0xF);
    greater_equal(0xE);
    less_equal(0xD);
    greater(0xC);
  %}
%}

//----------OPERAND CLASSES----------------------------------------------------
// Operand Classes are groups of operands that are used to simplify
// instruction definitions by not requiring the AD writer to specify separate
// instructions for every form of operand when the instruction accepts
// multiple operand types with the same basic encoding and format.  The classic
// case of this is memory operands.

opclass memory(direct, indirect, indOffset8, indOffset32, indOffset32X, indIndexOffset,
               indIndex, indIndexScale, indIndexScaleOffset);

// Long memory operations are encoded in 2 instructions and a +4 offset.  
// This means some kind of offset is always required and you cannot use
// an oop as the offset (done when working on static globals).
opclass long_memory(direct, indirect, indOffset8, indOffset32, indIndexOffset,
                    indIndex, indIndexScale, indIndexScaleOffset);
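
// With these classes, an instruction declared with a single "memory mem"
// operand (for example loadI or storeI below) matches every addressing form
// listed in the class ([reg], [reg + off], [reg + index << scale + off], and
// so on), so one rule covers all of them.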


//----------PIPELINE-----------------------------------------------------------
// Rules which define the behavior of the target architecture's pipeline.
pipeline %{

	//----------ATTRIBUTES---------------------------------------------------------
	attributes %{
		fixed_size_instructions;        		// Fixed size instructions
		branch_has_delay_slot;							// branches have a delay slot on gs2
		max_instructions_per_bundle = 4;   	// Up to 4 instructions per bundle
		instruction_unit_size = 4;         	// An instruction is 4 bytes long
		instruction_fetch_unit_size = 32;  	// The processor fetches one line
		instruction_fetch_units = 1;       	// of 32 bytes

		// List of nop instructions
		nops( MachNop );
	%}

	//----------RESOURCES----------------------------------------------------------
	// Resources are the functional units available to the machine

	// godson2c pipeline
	// 4 decoders; a "bundle" is limited to 4 instructions decoded per cycle,
	// 1 load/store op per cycle, 1 branch, 2 FPU,
	// 2 ALU ops; only ALU0 handles mul/div instructions.
	resources( D0, D1, D2, D3, DECODE = D0 | D1 | D2 | D3, 
			MEM, BR, FPU0, FPU1, FPU = FPU0 | FPU1, 
			ALU0, ALU1, ALU = ALU0 | ALU1 );
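
	// For example, the pipe classes below book these units per stage: ialu_reg
	// takes any DECODE slot at S0 and any ALU at S3, while ialu_reg_reg_alu0
	// pins its work to ALU0, matching the note above that only ALU0 handles
	// mul/div (descriptive comment only).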

	//----------PIPELINE DESCRIPTION-----------------------------------------------
	// Pipeline Description specifies the stages in the machine's pipeline

	// godson 2c pipeline
	// I don't know the details of the godson 2c pipeline, so it is left as a
	// generic placeholder for now.
	// by yjl 2/21/2006
	pipe_desc(S0, S1, S2, S3, S4, S5, S6);

	//----------PIPELINE CLASSES---------------------------------------------------
	// Pipeline Classes describe the stages in which input and output are
	// referenced by the hardware pipeline.

	// Naming convention: ialu or fpu
	// Then: _reg
	// Then: _reg if there is a 2nd register
	// Then: _long if it's a pair of instructions implementing a long
	// Then: _fat if it requires the big decoder
	//   Or: _mem if it requires the big decoder and a memory unit.

	// Integer ALU reg operation
	pipe_class ialu_reg(eRegI dst) %{
		single_instruction;
		dst    : S4(write);
		dst    : S3(read);
		DECODE : S0;        // any decoder
		ALU    : S3;        // any alu
	%}

	// Long ALU reg operation
	pipe_class ialu_reg_long(eRegL dst) %{
		instruction_count(2);
		dst    : S4(write);
		dst    : S3(read);
		DECODE : S0(2);     // any 2 decoders
		ALU    : S3(2);     // both alus
	%}

	// Integer ALU reg-reg operation
	pipe_class ialu_reg_reg(eRegI dst, eRegI src) %{
		single_instruction;
		dst    : S4(write);
		src    : S3(read);
		DECODE : S0;        // any decoder
		ALU    : S3;        // any alu
	%}

	// Long ALU reg-reg operation
	pipe_class ialu_reg_reg_long(eRegL dst, eRegL src) %{
		instruction_count(2);
		dst    : S4(write);
		src    : S3(read);
		DECODE : S0(2);     // any 2 decoders
		ALU    : S3(2);     // both alus
	%}

	// Integer Store to Memory
	pipe_class ialu_mem_reg(memory mem, eRegI src) %{
		single_instruction;
		mem    : S3(read);
		src    : S5(read);
		D0     : S0;        // big decoder only
		ALU    : S4;        // any alu
		MEM    : S3;
	%}

	// Long Store to Memory
	pipe_class ialu_mem_long_reg(memory mem, eRegL src) %{
		instruction_count(2);
		mem    : S3(read);
		src    : S5(read);
		D0     : S0(2);     // big decoder only; twice
		ALU    : S4(2);     // any 2 alus
		MEM    : S3(2);     // Both mems
	%}

	// Integer ALU0 reg-reg operation
	pipe_class ialu_reg_reg_alu0(eRegI dst, eRegI src) %{
		single_instruction;
		dst    : S4(write);
		src    : S3(read);
		D0     : S0;        // Big decoder only
		ALU0   : S3;        // only alu0
	%}

	// Integer ALU reg-imm operation
	pipe_class ialu_cr_reg_imm(eFlagsReg cr, eRegI src1) %{
		single_instruction;
		cr     : S4(write);
		src1   : S3(read);
		DECODE : S0;        // any decoder
		ALU    : S3;        // any alu
	%}

	// Float reg-reg operation
	pipe_class fpu_reg(regD dst) %{
		instruction_count(2);
		dst    : S3(read);
		DECODE : S0(2);     // any 2 decoders
		FPU    : S3;
	%}

	// Float reg-reg operation
	pipe_class fpu_reg_reg(regD dst, regD src) %{
		instruction_count(2);
		dst    : S4(write);
		src    : S3(read);
		DECODE : S0(2);     // any 2 decoders
		FPU    : S3;
	%}

	// Float reg-reg operation
	pipe_class fpu_reg_reg_reg(regD dst, regD src1, regD src2) %{
		instruction_count(3);
		dst    : S4(write);
		src1   : S3(read);
		src2   : S3(read);
		DECODE : S0(3);     // any 3 decoders
		FPU    : S3(2);
	%}

	// UnConditional branch
	pipe_class pipe_jmp( label labl ) %{
		single_instruction;
		BR   : S3;
	%}

	// Conditional branch
	pipe_class pipe_jcc( cmpOp cmp, eFlagsReg cr, label labl ) %{
		single_instruction;
		cr    : S1(read);
		BR    : S3;
	%}

	// The real do-nothing guy
	pipe_class empty( ) %{
		instruction_count(0);
	%}

	// Define the class for the Nop node
	define %{
		MachNop = empty;
	%}

%}

//----------INSTRUCTIONS-------------------------------------------------------
// 
// match      -- States which machine-independent subtree may be replaced 
//               by this instruction.
// ins_cost   -- The estimated cost of this instruction is used by instruction
//               selection to identify a minimum cost tree of machine 
//               instructions that matches a tree of machine-independent 
//               instructions.
// format     -- A string providing the disassembly for this instruction.
//               The value of an instruction's operand may be inserted 
//               by referring to it with a '$' prefix.
// opcode     -- Three instruction opcodes may be provided.  These are referred 
//               to within an encode class as $primary, $secondary, and $tertiary
//               respectively.  The primary opcode is commonly used to 
//               indicate the type of machine instruction, while secondary 
//               and tertiary are often used for prefix options or addressing 
//               modes.
// ins_encode -- A list of encode classes with parameters. The encode class
//               name must have been defined in an 'enc_class' specification
//               in the encode section of the architecture description.
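//
// As a concrete reading of these fields, box_handle below replaces a Box ideal
// node: "match" names the subtree it covers, ins_cost(110) is its selection
// cost, "format" is the printed disassembly string, opcode(0x8D) supplies
// $primary, and ins_encode chains the OpcP and RegMem encoding classes
// (descriptive comment only, no new rule).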

instruct box_handle( eRegP dst, stackSlotP src) %{
  match( Set dst (Box src) );
  ins_cost(110);
  format %{ "LEA    $dst,$src\t! (box node)" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_reg_fat );
%}


//----------Load/Store/Move Instructions---------------------------------------
//----------Load Instructions--------------------------------------------------
// Load Byte (8bit signed)
instruct loadB(xRegI dst, memory mem) %{
  match(Set dst (LoadB mem));

  ins_cost(125);
  format %{ "MOVSX8 $dst,$mem" %}
  opcode(0xBE, 0x0F);
  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Byte (8bit UNsigned)
instruct loadUB(xRegI dst, memory mem, immI_255 bytemask) %{
  match(Set dst (AndI (LoadB mem) bytemask));

  ins_cost(125);
  format %{ "MOVZX8 $dst,$mem" %}
  opcode(0xB6, 0x0F);
  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Char (16bit unsigned)
instruct loadC(eRegI dst, memory mem) %{
  match(Set dst (LoadC mem));

  ins_cost(125);
  format %{ "MOVZX  $dst,$mem" %}
  opcode(0xB7, 0x0F);
  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Integer
instruct loadI(eRegI dst, memory mem) %{
  match(Set dst (LoadI mem));

  ins_cost(125);
  format %{ "lw    $dst,$mem" %}
  //opcode(0x8B);
  //ins_encode( OpcP, RegMem(dst,mem));
  //ins_pipe( ialu_reg_mem );
%}

// Load Long.  Cannot clobber address while loading, so restrict address 
// register to ESI
instruct loadL(eRegL dst, load_long_memory mem) %{
  predicate(!Compile::current()->alias_type(n->adr_type())->is_volatile());
  match(Set dst (LoadL mem));

  ins_cost(250);
  format %{ "MOV    $dst.lo,$mem\n\t"
            "MOV    $dst.hi,$mem+4" %}
  opcode(0x8B, 0x8B);
  ins_encode( OpcP, RegMem(dst,mem), OpcS, RegMem_Hi(dst,mem));
  ins_pipe( ialu_reg_long_mem );
%}

// Volatile Load Long.  Must be atomic, so do 64-bit FILD
// then store it down to the stack and reload on the int 
// side.
instruct loadL_volatile(stackSlotL dst, memory mem) %{
  predicate(Compile::current()->alias_type(n->adr_type())->is_volatile());
  match(Set dst (LoadL mem));

  ins_cost(200);
  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
            "FISTp  $dst" %}
  ins_encode(enc_loadL_volatile(mem,dst));
  ins_pipe( fpu_reg_mem );
%}

// Load Range
instruct loadRange(eRegI dst, memory mem) %{
  match(Set dst (LoadRange mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}


// Load Pointer
instruct loadP(eRegP dst, memory mem) %{
  match(Set dst (LoadP mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Klass Pointer
instruct loadKlass(eRegP dst, memory mem) %{
  match(Set dst (LoadKlass mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Short (16bit signed)
instruct loadS(eRegI dst, memory mem) %{
  match(Set dst (LoadS mem));

  ins_cost(125);
  format %{ "MOVSX  $dst,$mem" %}
  opcode(0xBF, 0x0F);
  ins_encode( OpcS, OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Double
instruct loadD(regD dst, memory mem) %{
  predicate(UseSSE<=1);
  match(Set dst (LoadD mem));

  ins_cost(150);
  format %{ "FLD_D  ST,$mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDD);               /* DD /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Double to XMM
instruct loadXD(regXD dst, memory mem) %{
  predicate(UseSSE==2);
  match(Set dst (LoadD mem));
  ins_cost(145);
  format %{ "MOVSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Load to XMM register (single-precision floating point)
// MOVSS instruction
instruct loadX(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (LoadF mem));
  ins_cost(145);
  format %{ "MOVSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Load Float
instruct loadF(regF dst, memory mem) %{
  predicate(UseSSE==0);
  match(Set dst (LoadF mem));

  ins_cost(150);
  format %{ "FLD_S  ST,$mem\n\t"
            "FSTP   $dst" %}
  opcode(0xD9);               /* D9 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Effective Address
instruct leaP8(eRegP dst, indOffset8 mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaP32(eRegP dst, indOffset32 mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaPIdxOff(eRegP dst, indIndexOffset mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaPIdxScale(eRegP dst, indIndexScale mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaPIdxScaleOff(eRegP dst, indIndexScaleOffset mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

// Load Constant
instruct loadConI(eRegI dst, immI src) %{
  match(Set dst src);

  format %{ "MOV    $dst,$src" %}
  ins_encode( LdImmI(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Load Constant zero
instruct loadConI0(eRegI dst, immI0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
 
  ins_cost(50);
  format %{ "XOR    $dst,$dst" %}
  opcode(0x33);  /* + rd */
  ins_encode( OpcP, RegReg( dst, dst ) );
  ins_pipe( ialu_reg );
%}

instruct loadConP(eRegP dst, immP src) %{
  match(Set dst src);

  format %{ "MOV    $dst,$src" %}
  opcode(0xB8);  /* + rd */
  ins_encode( LdImmP(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

instruct loadConL(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
  ins_cost(200);
  format %{ "MOV    $dst.lo,$src.lo\n\t"
            "MOV    $dst.hi,$src.hi" %}
  opcode(0xB8);
  ins_encode( LdImmL_Lo(dst, src), LdImmL_Hi(dst, src) );
  ins_pipe( ialu_reg_long_fat );
%}

instruct loadConL0(eRegL dst, immL0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
  ins_cost(150);
  format %{ "XOR    $dst.lo,$dst.lo\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33,0x33);
  ins_encode( RegReg_Lo(dst,dst), RegReg_Hi(dst, dst) );
  ins_pipe( ialu_reg_long );
%}

// Load return address of the following native call into a register
instruct loadConPc(eRegP dst, method offset_to_call_return) %{
  match(Set dst (LoadPC));
  effect(USE offset_to_call_return);
  format %{ "MOV    $dst, PC" %}
  size(5);
  opcode(0xB8);  /* + rd */
  ins_encode( LdImmPc(dst, offset_to_call_return) );
  ins_pipe( ialu_reg_fat );
%}

instruct loadConF(regF dst, immF src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_S  ST,$src\n\t"
            "FSTP   $dst" %}
  opcode(0xD9, 0x00);       /* D9 /0 */
  ins_encode(LdImmF(src), Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_con );
%}

instruct loadConX(regX dst, immXF con) %{
  match(Set dst con);
  format %{ "MOVSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), LdImmX(dst, con));
  ins_pipe( pipe_slow );
%}

instruct loadConX0(regX dst, immXF0 src) %{
  match(Set dst src);
  format %{ "XORPS  $dst,$dst\t# Zero XMM register" %}
  ins_encode( Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
  ins_pipe( pipe_slow );
%}

instruct loadConD(regD dst, immD src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_D  ST,$src\n\t"
            "FSTP   $dst" %}
  ins_encode(LdImmD(src), Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_con );
%}

instruct loadConXD(regXD dst, immXD con) %{
  match(Set dst con);
  format %{ "MOVSD  $dst,[$con]" %}
  ins_encode(Opcode(0xF2), Opcode(0x0F), Opcode(0x10), LdImmXD(dst, con));
  ins_pipe( pipe_slow );
%}

instruct loadConXD0(regXD dst, immXD0 src) %{
  match(Set dst src);
  format %{ "XORPD  $dst,$dst\t# Zero XMM register" %}
  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
  ins_pipe( pipe_slow );
%}

// Load Stack Slot
instruct loadSSI(eRegI dst, stackSlotI src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "MOV    $dst,$src" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );     
%}

instruct loadSSL(eRegL dst, stackSlotL src) %{
  match(Set dst src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi" %}
  opcode(0x8B, 0x8B);
  ins_encode( OpcP, RegMem( dst, src ), OpcS, RegMem_Hi( dst, src ) );
  ins_pipe( ialu_mem_long_reg );
%}

// Load Stack Slot
instruct loadSSP(eRegP dst, stackSlotP src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "MOV    $dst,$src" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );     
%}

// Load Stack Slot
instruct loadSSF(regF dst, stackSlotF src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_S  $src\n\t"
            "FSTP   $dst" %}
  opcode(0xD9);               /* D9 /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem );      
%}

// Load Stack Slot
instruct loadSSD(regD dst, stackSlotD src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_D  $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDD);               /* DD /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_mem );      
%}

// Prefetch with SSE instruction
instruct prefetch1( memory mem ) %{
  predicate (UseSSE>=1);
  match( Prefetch mem );
  ins_cost(125);

  format %{ "PREFETCH_L2 $mem\t! Prefetch to level 2 cache" %}
  opcode( 0x0F, 0x18 );     /* Opcode 0F 18 /3 */
  ins_encode( OpcP, OpcS, RMopc_Mem(0x03,mem));
  ins_pipe( pipe_slow );
%}

// Prefetch - MOV into EAX. 
// NOT safe against out-of-range requests.
instruct prefetch0( memory mem, eFlagsReg cr ) %{
  predicate (UseSSE==0);
  match( Prefetch mem );
  effect( KILL cr );
  ins_cost(100);

  format %{ "CMP    EAX,$mem\t! Prefetch only, no flags" %}
  opcode( 0x3B );
  ins_encode( OpcP, RegMem( EAX, mem ) );
  ins_pipe( ialu_cr_reg_imm );
%}


//----------Store Instructions-------------------------------------------------
// Store Byte
instruct storeB(memory mem, xRegI src) %{
  match(Set mem (StoreB mem src));

  ins_cost(125);
  format %{ "MOV8   $mem,$src" %}
  opcode(0x88);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );     
%}

// Store Char/Short
instruct storeC(memory mem, eRegI src) %{
  match(Set mem (StoreC mem src));

  ins_cost(125);
  format %{ "MOV16  $mem,$src" %}
  opcode(0x89, 0x66);
  ins_encode( OpcS, OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );     
%}

// Store Integer
instruct storeI(memory mem, eRegI src) %{
  match(Set mem (StoreI mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src" %}
  opcode(0x89);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );     
%}

// Store Long
instruct storeL(long_memory mem, eRegL src) %{
  predicate(!Compile::current()->alias_type(n->adr_type())->is_volatile());
  match(Set mem (StoreL mem src));

  ins_cost(200);
  format %{ "MOV    $mem,$src.lo\n\t"
            "MOV    $mem+4,$src.hi" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, mem ), OpcS, RegMem_Hi( src, mem ) );
  ins_pipe( ialu_mem_long_reg );
%}

// Volatile Store Long.  Must be atomic, so move it into
// the FP TOS and then do a 64-bit FIST.  Has to probe the
// target address before the store (for null-ptr checks)
// so the memory operand is used twice in the encoding.
instruct storeL_volatile(memory mem, stackSlotL src, eFlagsReg cr ) %{
  predicate(Compile::current()->alias_type(n->adr_type())->is_volatile());
  match(Set mem (StoreL mem src));
  effect( KILL cr );
  ins_cost(400);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "FILD   $src\n\t"
            "FISTp  $mem\t # 64-bit atomic volatile long store" %}
  opcode(0x3B);
  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeL_volatile(mem,src));
  ins_pipe( fpu_reg_mem );
%}

// Store Pointer; for storing unknown oops and raw pointers
instruct storeP(memory mem, anyRegP src) %{
  match(Set mem (StoreP mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src" %}
  opcode(0x89);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );     
%}

// Store Integer Immediate
instruct storeImmI(memory mem, immI src) %{
  match(Set mem (StoreI mem src));

  ins_cost(150);
  format %{ "MOV    $mem,$src" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  ins_pipe( ialu_mem_imm );     
%}

// Store Short/Char Immediate
instruct storeImmI16(memory mem, immI16 src) %{
  match(Set mem (StoreC mem src));

  ins_cost(150);
  format %{ "MOV16  $mem,$src" %}
  opcode(0xC7);     /* C7 /0 Same as 32 store immediate with prefix */
  ins_encode( SizePrefix, OpcP, RMopc_Mem(0x00,mem),  Con16( src ));
  ins_pipe( ialu_mem_imm );     
%}

// Store Pointer Immediate; null pointers or constant oops that do not
// need card-mark barriers.
instruct storeImmP(memory mem, immP src) %{
  match(Set mem (StoreP mem src));

  ins_cost(150);
  format %{ "MOV    $mem,$src" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  ins_pipe( ialu_mem_imm );     
%}

// Store Byte Immediate
instruct storeImmB(memory mem, immI8 src) %{
  match(Set mem (StoreB mem src));

  ins_cost(150);
  format %{ "MOV8   $mem,$src" %}
  opcode(0xC6);               /* C6 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  ins_pipe( ialu_mem_imm );     
%}

// Store CMS card-mark Immediate
instruct storeImmCM(memory mem, immI8 src) %{
  match(Set mem (StoreCM mem src));

  ins_cost(150);
  format %{ "MOV8   $mem,$src\t! CMS card-mark imm0" %}
  opcode(0xC6);               /* C6 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  ins_pipe( ialu_mem_imm );     
%}

// Store Double
instruct storeD( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreD mem src));

  ins_cost(100);
  format %{ "FST_D  $mem,$src" %}
  opcode(0xDD);       /* DD /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store double does rounding on x86
instruct storeD_rounded( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreD mem (RoundDouble src)));

  ins_cost(100);
  format %{ "FST_D  $mem,$src\t# round" %}
  opcode(0xDD);       /* DD /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store XMM register to memory (double-precision floating points)
// MOVSD instruction
instruct storeXD(memory mem, regXD src) %{
  predicate(UseSSE==2);
  match(Set mem (StoreD mem src));
  ins_cost(95);
  format %{ "MOVSD  $mem,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
  ins_pipe( pipe_slow );
%}

// Store XMM register to memory (single-precision floating point)
// MOVSS instruction
instruct storeX(memory mem, regX src) %{
  predicate(UseSSE>=1);
  match(Set mem (StoreF mem src));
  ins_cost(95);
  format %{ "MOVSS  $mem,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
  ins_pipe( pipe_slow );
%}

// Store Float
instruct storeF( memory mem, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set mem (StoreF mem src));

  ins_cost(100);
  format %{ "FST_S  $mem,$src" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float does rounding on x86
instruct storeF_rounded( memory mem, regFPR1 src) %{
  match(Set mem (StoreF mem (RoundFloat src)));

  ins_cost(100);
  format %{ "FST_S  $mem,$src\t# round" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float does rounding on x86
instruct storeF_Drounded( memory mem, regDPR1 src) %{
  match(Set mem (StoreF mem (ConvD2F src)));

  ins_cost(100);
  format %{ "FST_S  $mem,$src\t# D-round" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float
instruct storeF_imm( memory mem, immF src) %{
  match(Set mem (StoreF mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src\t# store float" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32F_as_bits( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Integer to stack slot
instruct storeSSI(stackSlotI dst, eRegI src) %{
  match(Set dst src);

  ins_cost(100);
  format %{ "MOV    $dst,$src" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Integer to stack slot
instruct storeSSP(stackSlotP dst, eRegP src) %{
  match(Set dst src);

  ins_cost(100);
  format %{ "MOV    $dst,$src" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Long to stack slot
instruct storeSSL(stackSlotL dst, eRegL src) %{
  match(Set dst src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  ins_pipe( ialu_mem_long_reg );
%}

//----------MemBar Instructions-----------------------------------------------
// Memory barrier flavors
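// Note: several of these flavors deliberately have empty encodings: the
// *_lock variants rely on the neighboring CMPXCHG in FastLock/FastUnlock for
// ordering, membar_cpu_order is a compile-time ordering constraint only, and
// unnecessary_membar_volatile matches when a prior barrier makes it redundant.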

instruct membar_acquire() %{
  match(MemBarAcquire);
  ins_cost(400);

  size(0);
  format %{ "MEMBAR-acquire" %}
  ins_encode( enc_membar_acquire );
  ins_pipe(pipe_slow);
%}

instruct membar_acquire_lock() %{
  match(MemBarAcquire);
  predicate(Matcher::prior_fast_lock(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-acquire (prior CMPXCHG in FastLock so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_release() %{
  match(MemBarRelease);
  ins_cost(400);

  size(0);
  format %{ "MEMBAR-release" %}
  ins_encode( enc_membar_release );
  ins_pipe(pipe_slow);
%}

instruct membar_release_lock() %{
  match(MemBarRelease);
  predicate(Matcher::post_fast_unlock(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-release (a FastUnlock follows so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_volatile() %{
  match(MemBarVolatile);
  ins_cost(400);

  format %{ "MEMBAR-volatile" %}
  ins_encode( enc_membar_volatile );
  ins_pipe(pipe_slow);
%}

instruct unnecessary_membar_volatile() %{
  match(MemBarVolatile);
  predicate(Matcher::post_store_load_barrier(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-volatile (unnecessary so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_cpu_order() %{
  match(MemBarCPUOrder);
  ins_cost(1);

  format %{ "MEMBAR-CPUOrder" %}
  ins_encode( );
  ins_pipe(empty);
%}

//----------Move Instructions--------------------------------------------------
instruct castL2P(eAXRegP dst, eADXRegL src) %{
  match(Set dst (CastL2P src));
  format %{ "#castL2P of eAX" %}
  ins_encode( /*empty encoding*/ );
  ins_pipe(empty);
%}

instruct castP2L(eADXRegL dst, eAXRegP src, eFlagsReg cr) %{
  match(Set dst (CastP2L src));
  effect(KILL cr);
  ins_cost(50);
  format %{ "#castP2L of eAX\n\t"
            "XOR    EDX,EDX" %}
  opcode(0x33);  /* + rd */
  ins_encode( OpcP, RegReg( EDX, EDX ) );
  ins_pipe( ialu_reg );
%}

instruct castP2I(eRegI dst, eRegP src ) %{
  match(Set dst (CastP2I src));
  ins_cost(50);
  format %{ "MOV    $dst,$src\t# Cast ptr to int" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

//----------Conditional Move---------------------------------------------------
// Conditional move
instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovI_regU( eRegI dst, eRegI src, eFlagsRegU cr, cmpOpU cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move
instruct cmovI_mem(cmpOp cop, eFlagsReg cr, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Conditional move
instruct cmovI_memu(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Conditional move
instruct cmovP_reg(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move (non-P6 version)
// Note:  a CMoveP is generated for  stubs and native wrappers
//        regardless of whether we are on a P6, so we
//        emulate a cmov here
instruct cmovP_reg_nonP6(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(300);
  format %{ "Jn$cop   skip\n\t"
          "MOV    $dst,$src\t# pointer\n"
      "skip:" %}
  opcode(0x8b);
  ins_encode( enc_cmov_branch(cop, 0x2), OpcP, RegReg(dst, src));
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move
instruct cmovP_regU(eRegP dst, eRegP src, eFlagsRegU cr, cmpOpU cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// DISABLED: Requires the ADLC to emit a bottom_type call that
// correctly meets the two pointer arguments; one is an incoming
// register but the other is a memory operand.  ALSO appears to
// be buggy with implicit null checks.
//
//// Conditional move
//instruct cmovP_mem(cmpOp cop, eFlagsReg cr, eRegP dst, memory src) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
//  ins_cost(250);
//  format %{ "CMOV$cop $dst,$src\t# ptr" %}
//  opcode(0x0F,0x40);
//  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
//  ins_pipe( pipe_cmov_mem );
//%}
//
//// Conditional move
//instruct cmovP_memU(cmpOpU cop, eFlagsRegU cr, eRegP dst, memory src) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
//  ins_cost(250);
//  format %{ "CMOV$cop $dst,$src\t# ptr" %}
//  opcode(0x0F,0x40);
//  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
//  ins_pipe( pipe_cmov_mem );
//%}

// Conditional move
instruct fcmovD_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "FCMOV$cop $dst,$src\t# double" %}
  opcode(0xDA);
  ins_encode( enc_cmov_d(cop,src) );
  ins_pipe( pipe_cmovD_reg );
%}

// Conditional move
instruct fcmovF_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "FCMOV$cop $dst,$src\t# float" %}
  opcode(0xDA);
  ins_encode( enc_cmov_d(cop,src) );
  ins_pipe( pipe_cmovD_reg );
%}

// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOV    $dst,$src\t# double\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_D(src), OpcP, RegOpc(dst) );
  ins_pipe( pipe_cmovD_reg );
%}

// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop    skip\n\t"
            "MOV    $dst,$src\t# float\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_F(src), OpcP, RegOpc(dst) );
  ins_pipe( pipe_cmovD_reg );
%}

// No CMOVE with SSE/SSE2
instruct fcmovX_regS(cmpOp cop, eFlagsReg cr, regX dst, regX src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSS  $dst,$src\t# float\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x04 ), MovX_reg(dst,src));
  ins_pipe( pipe_slow );
%}

// No CMOVE with SSE/SSE2
instruct fcmovXD_regS(cmpOp cop, eFlagsReg cr, regXD dst, regXD src) %{
  predicate (UseSSE==2);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSD  $dst,$src\t# float\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), MovXD_reg(dst,src));
  ins_pipe( pipe_slow );
%}

// unsigned version
instruct fcmovX_regU(cmpOpU cop, eFlagsRegU cr, regX dst, regX src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSS  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode( enc_cmov_branch( cop, 0x4 ), MovX_reg(dst,src) );
  ins_pipe( pipe_slow );
%}

// unsigned version
instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
  predicate (UseSSE==2);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSD  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode( enc_cmov_branch( cop, 0x4 ), MovXD_reg(dst,src) );
  ins_pipe( pipe_slow );
%}

instruct cmovL_reg(cmpOp cop, eFlagsReg cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
            "CMOV$cop $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovL_regU(cmpOpU cop, eFlagsRegU cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
            "CMOV$cop $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

//----------Arithmetic Instructions--------------------------------------------
//----------Addition Instructions----------------------------------------------
// Integer Addition Instructions
instruct addI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(2);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct addI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  format %{ "ADD    $dst,$src" %}
  opcode(0x81, 0x00); /* /0 id */
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(1);
  format %{ "INC    $dst" %}
  opcode(0x40); /*  */
  ins_encode( Opc_plus( primary, dst ) );
  ins_pipe( ialu_reg );
%}

instruct leaI_eReg_immI(eRegI dst, eRegI src0, immI src1) %{
  match(Set dst (AddI src0 src1));
  ins_cost(110);

  format %{ "LEA    $dst,[$src0 + $src1]" %}
  opcode(0x8D); /* 0x8D /r */
  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  ins_pipe( ialu_reg_reg );
%}

instruct leaP_eReg_immI(eRegP dst, eRegP src0, immI src1) %{
  match(Set dst (AddP src0 src1));
  ins_cost(110);

  format %{ "LEA    $dst,[$src0 + $src1]\t# ptr" %}
  opcode(0x8D); /* 0x8D /r */
  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  ins_pipe( ialu_reg_reg );
%}

instruct decI_eReg(eRegI dst, immI_M1 src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(1);
  format %{ "DEC    $dst" %}
  opcode(0x48); /*  */
  ins_encode( Opc_plus( primary, dst ) );
  ins_pipe( ialu_reg );
%}

instruct addP_eReg(eRegP dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (AddP dst src));
  effect(KILL cr);

  size(2);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct addP_eReg_imm(eRegP dst, immI src, eFlagsReg cr) %{
  match(Set dst (AddP dst src));
  effect(KILL cr);

  format %{ "ADD    $dst,$src" %}
  opcode(0x81,0x00); /* Opcode 81 /0 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

instruct addI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (AddI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

instruct addI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "ADD    $dst,$src" %}
  opcode(0x01);  /* Opcode 01 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Add Memory with Immediate
instruct addI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src" %}
  opcode(0x81);               /* Opcode 81 /0 id */
  ins_encode( OpcSE( src ), RMopc_Mem(0x00,dst), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

instruct incI_mem(memory dst, immI1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "INC    $dst" %}
  opcode(0xFF);               /* Opcode FF /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,dst));
  ins_pipe( ialu_mem_imm );
%}

instruct decI_mem(memory dst, immI_M1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "DEC    $dst" %}
  opcode(0xFF);               /* Opcode FF /1 */
  ins_encode( OpcP, RMopc_Mem(0x01,dst));
  ins_pipe( ialu_mem_imm );
%}


instruct checkCastPP( eRegP dst ) %{
  match(Set dst (CheckCastPP dst));

  size(0);
  format %{ "#checkcastPP of $dst" %}
  ins_encode( /*empty encoding*/ );
  ins_pipe( empty );
%}

instruct castPP( eRegP dst ) %{
  match(Set dst (CastPP dst));
  format %{ "#castPP of $dst" %}
  ins_encode( /*empty encoding*/ );
  ins_pipe( empty );
%}


// Load-locked - same as a regular pointer load when used with compare-swap
instruct loadPLocked(eRegP dst, memory mem) %{
  match(Set dst (LoadPLocked mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem\t# Load ptr. locked" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// LoadLong-locked - same as a volatile long load when used with compare-swap
instruct loadLLocked(stackSlotL dst, load_long_memory mem) %{
  match(Set dst (LoadLLocked mem));

  ins_cost(200);
  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
            "FISTp  $dst" %}
  ins_encode(enc_loadL_volatile(mem,dst));
  ins_pipe( fpu_reg_mem );
%}

// Conditional-store of the updated heap-top.
// Used during allocation of the shared heap.
// Sets flags (EQ) on success.  Implemented with a CMPXCHG on Intel.
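// A rough sketch of the allocation fast path this supports (illustrative
// pseudo-code only, not the actual runtime source):
//   old_top = *heap_top_ptr;                  // expected value, in EAX
//   new_top = old_top + object_size;
//   CMPXCHG [heap_top_ptr], new_top;          // succeeds iff top is unchanged
//   if (EQ)  the new object starts at old_top
//   else     EAX now holds the current top: retry or take the slow path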
instruct storePConditional( memory heap_top_ptr, eAXRegP oldval, eRegP newval, eFlagsReg cr ) %{
  match(Set cr (StorePConditional heap_top_ptr (Binary oldval newval)));
  // EAX is killed if there is contention, but then it's also unused.
  // In the common case of no contention, EAX holds the new oop address.
  format %{ "CMPXCHG $heap_top_ptr,$newval\t# If EAX==$heap_top_ptr Then store $newval into $heap_top_ptr" %}
  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval,heap_top_ptr) );
  ins_pipe( pipe_cmpxchg );
%}

// Conditional-store of a long value
// Returns a boolean value: 1 on success, 0 on failure.  Implemented with a CMPXCHG8 on Intel.
// mem_ptr can actually be in either ESI or EDI
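// (CMPXCHG8B compares EDX:EAX against the 8-byte operand at [mem_ptr]; on a
// match it stores ECX:EBX there and sets ZF, otherwise it loads the current
// memory value into EDX:EAX and clears ZF.)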
instruct storeLConditional( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  match(Set res (StoreLConditional mem_ptr (Binary oldval newval)));
  // EDX:EAX is killed if there is contention, but then it's also unused.
  // In the common case of no contention, EDX:EAX still holds the expected old value.
  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg8(mem_ptr),
              enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

// Conditional-store of a long value
// ZF flag is set on success, reset otherwise. Implemented with a CMPXCHG8 on Intel.
// mem_ptr can actually be in either ESI or EDI
instruct storeLConditional_flags( eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr, immI0 zero ) %{
  match(Set cr (CmpI (StoreLConditional mem_ptr (Binary oldval newval)) zero));
  // EDX:EAX is killed if there is contention, but then it's also unused.
  // In the common case of no contention, EDX:EAX still holds the expected old value.
  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t" %}
  ins_encode( enc_cmpxchg8(mem_ptr) );
  ins_pipe( pipe_cmpxchg );
%}

// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them

instruct compareAndSwapL( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg8(mem_ptr),
              enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

instruct compareAndSwapP( eRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
  match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

instruct compareAndSwapI( eRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
  match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

//----------Subtraction Instructions-------------------------------------------
// Integer Subtraction Instructions
instruct subI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (SubI dst src));
  effect(KILL cr);

  size(2);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct subI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (SubI dst src));
  effect(KILL cr);

  format %{ "SUB    $dst,$src" %}
  opcode(0x81,0x05);  /* Opcode 81 /5 */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

instruct subI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (SubI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

instruct subI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (SubI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "SUB    $dst,$src" %}
  opcode(0x29);  /* Opcode 29 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Subtract from a pointer
instruct subP_eReg(eRegP dst, eRegI src, immI0 zero, eFlagsReg cr) %{
  match(Set dst (AddP dst (SubI zero src)));
  effect(KILL cr);

  size(2);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct negI_eReg(eRegI dst, immI0 zero, eFlagsReg cr) %{
  match(Set dst (SubI zero dst));
  effect(KILL cr);

  size(2);
  format %{ "NEG    $dst" %}
  opcode(0xF7,0x03);  // Opcode F7 /3
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}


//----------Multiplication/Division Instructions-------------------------------
// Integer Multiplication Instructions
// Multiply Register
instruct mulI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (MulI dst src));
  effect(KILL cr);

  size(3);
  ins_cost(300);
  format %{ "IMUL   $dst,$src" %}
  opcode(0xAF, 0x0F);
  ins_encode( OpcS, OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Multiply 32-bit Immediate
instruct mulI_eReg_imm(eRegI dst, eRegI src, immI imm, eFlagsReg cr) %{
  match(Set dst (MulI src imm));
  effect(KILL cr);

  ins_cost(300);
  format %{ "IMUL   $dst,$src,$imm" %}
  opcode(0x69);  /* 69 /r id */
  ins_encode( OpcSE(imm), RegReg( dst, src ), Con8or32( imm ) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

instruct loadConL_low_only(eADXRegL_low_only dst, immL32 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);

  // Note that this is artificially increased to make it more expensive than loadConL
  ins_cost(250);
  format %{ "MOV    EAX,$src\t// low word only" %}
  opcode(0xB8);
  ins_encode( LdImmL_Lo(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Multiply by 32-bit Immediate, taking the shifted high order results
//  (special case for shift by 32)
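// In source terms this is (int)(((long)src1 * con) >> 32): the one-operand
// IMUL leaves the full 64-bit product in EDX:EAX, so the shift by 32 falls
// out as simply taking the EDX half and no explicit shift is emitted.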
instruct mulI_imm_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32 cnt, eFlagsReg cr) %{
  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->is_Type()->type()->is_long()->get_con() >= min_jint &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->is_Type()->type()->is_long()->get_con() <= max_jint );
  effect(USE_KILL src1, KILL cr);

  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  ins_cost(0*100 + 1*400 - 150);
  format %{ "IMUL   EDX:EAX,$src1" %}
  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  ins_pipe( pipe_slow );
%}

// Multiply by 32-bit Immediate, taking the shifted high order results
instruct mulI_imm_RShift_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->is_Type()->type()->is_long()->get_con() >= min_jint &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->is_Type()->type()->is_long()->get_con() <= max_jint );
  effect(USE_KILL src1, KILL cr);

  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  ins_cost(1*100 + 1*400 - 150);
  format %{ "IMUL   EDX:EAX,$src1\n\t"
            "SAR    EDX,$cnt-32" %}
  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  ins_pipe( pipe_slow );
%}

// Multiply Memory 32-bit Immediate
instruct mulI_mem_imm(eRegI dst, memory src, immI imm, eFlagsReg cr) %{
  match(Set dst (MulI (LoadI src) imm));
  effect(KILL cr);

  ins_cost(300);
  format %{ "IMUL   $dst,$src,$imm" %}
  opcode(0x69);  /* 69 /r id */
  ins_encode( OpcSE(imm), RegMem( dst, src ), Con8or32( imm ) );
  ins_pipe( ialu_reg_mem_alu0 );
%}

// Multiply Memory
instruct mulI(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (MulI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(350);
  format %{ "IMUL   $dst,$src" %}
  opcode(0xAF, 0x0F);
  ins_encode( OpcS, OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem_alu0 );
%}

// Multiply Register Int to Long
instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{
  // Basic Idea: long = (long)int * (long)int
  match(Set dst (MulL (ConvI2L src) (ConvI2L src1)));
  effect(DEF dst, USE src, USE src1, KILL flags);

  ins_cost(300);
  format %{ "IMUL   $dst,$src1" %}

  ins_encode( long_int_multiply( dst, src1 ) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

instruct mulIS_eReg(eADXRegL dst, eBCXRegL mask, eRegL mask1, eFlagsReg flags, eAXRegI src, nadxRegI src1) %{
  // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
  match(Set dst (MulL (AndL (ConvI2L src) mask) (AndL (ConvI2L src1) mask1)));
  predicate(_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
            _kids[0]->_kids[1]->_leaf->is_Type()->type()->is_long()->get_con() == 0xFFFFFFFFl &&
            _kids[1]->_kids[1]->_leaf->Opcode() == Op_ConL &&
            _kids[1]->_kids[1]->_leaf->is_Type()->type()->is_long()->get_con() == 0xFFFFFFFFl );
  effect(DEF dst, USE src, USE src1, USE mask, USE mask1, KILL flags);

  ins_cost(300);
  format %{ "MUL    $dst,$src1" %}  

  ins_encode( long_uint_multiply(dst, src1) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Multiply Register Long
instruct mulL_eReg(eADXRegL dst, eRegL src, eFlagsReg cr, eSIRegI esi) %{
  match(Set dst (MulL dst src));
  effect(KILL cr, KILL esi);
  ins_cost(4*100+3*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
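//             (x_hi * y_hi would only affect bits 64 and above, so it is
//             dropped; carries out of the low 32x32 product are already
//             folded into hi(x_lo * y_lo) by the widening MUL.)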
  format %{ "MOV    ESI,$src.lo\n\t"
            "IMUL   ESI,EDX\n\t"
            "MOV    EDX,$src.hi\n\t"
            "IMUL   EDX,EAX\n\t"
            "ADD    ESI,EDX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,ESI" %}
  ins_encode( long_multiply( dst, src, esi ) );
  ins_pipe( pipe_slow );
%}

// Multiply Register Long by small constant
instruct mulL_eReg_con(eADXRegL dst, immL_127 src, eFlagsReg cr, eSIRegI esi) %{
  match(Set dst (MulL dst src));
  effect(KILL cr, KILL esi);
  ins_cost(2*100+2*400);
  size(12);
// Basic idea: lo(result) = lo(src * EAX)
//             hi(result) = hi(src * EAX) + lo(src * EDX)
  format %{ "IMUL   ESI,EDX,$src\n\t"
            "MOV    EDX,$src\n\t"
            "MUL    EDX\t# EDX*EAX -> EDX:EAX\n\t"
            "ADD    EDX,ESI" %}
  ins_encode( long_multiply_con( dst, src, esi ) );
  ins_pipe( pipe_slow );
%}

// Integer DIV with Register
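// The compare against 0x80000000 below special-cases min_jint / -1: on x86
// that IDIV would raise an integer-overflow fault, so the divide is skipped,
// leaving EAX == min_jint with EDX cleared, which is the result Java requires.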
instruct divI_eReg(eAXRegI eax, eDXRegI edx, eCXRegI div, eFlagsReg cr) %{
  match(Set eax (DivI eax div));
  effect(KILL edx, KILL cr);
  size(26);
  ins_cost(30*100+10*100);
  format %{ "CMP    EAX,0x80000000\n\t"
            "JNE,s  normal\n\t"
            "XOR    EDX,EDX\n\t"
            "CMP    ECX,-1\n\t"
            "JE,s   done\n"
    "normal: CDQ\n\t"
            "IDIV   $div\n\t"
    "done:"        %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Divide Register Long
instruct divL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  match(Set dst (DivL src1 src2));
  effect( KILL cr, KILL cx, KILL bx );
  ins_cost(10000);
  format %{ "PUSH   $src1.hi\n\t"
            "PUSH   $src1.lo\n\t"
            "PUSH   $src2.hi\n\t"
            "PUSH   $src2.lo\n\t"
            "CALL   SharedRuntime::ldiv\n\t"
            "ADD    ESP,16" %}
  ins_encode( long_div(src1,src2) );
  ins_pipe( pipe_slow );
%}

// Integer MOD with Register
instruct modI_eReg(eDXRegI edx, eAXRegI eax, eCXRegI div, eFlagsReg cr) %{
  match(Set edx (ModI eax div));
  effect(KILL eax, KILL cr);

  size(26);
  ins_cost(300);
  format %{ "CDQ\n\t"
            "IDIV   $div" %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Remainder Register Long
instruct modL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  match(Set dst (ModL src1 src2));
  effect( KILL cr, KILL cx, KILL bx );
  ins_cost(10000);
  format %{ "PUSH   $src1.hi\n\t"
            "PUSH   $src1.lo\n\t"
            "PUSH   $src2.hi\n\t"
            "PUSH   $src2.lo\n\t"
            "CALL   SharedRuntime::lrem\n\t"
            "ADD    ESP,16" %}
  ins_encode( long_mod(src1,src2) );
  ins_pipe( pipe_slow );
%}

// Integer Shift Instructions
// Shift Left by one
instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xD1, 0x4);  /* D1 /4 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Shift Left by 8-bit immediate
instruct salI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xC1, 0x4);  /* C1 /4 ib */
  ins_encode( RegOpcImm( dst, shift) );
  ins_pipe( ialu_reg );
%}

// Shift Left by variable
instruct salI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xD3, 0x4);  /* D3 /4 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}

// Arithmetic shift right by one
instruct sarI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD1, 0x7);  /* D1 /7 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Arithmetic shift right by one
instruct sarI_mem_1(memory dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD1, 0x7);  /* D1 /7 */
  ins_encode( OpcP, RMopc_Mem(secondary,dst) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by 8-bit immediate
instruct sarI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( RegOpcImm( dst, shift ) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by 8-bit immediate
instruct sarI_mem_imm(memory dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);

  format %{ "SAR    $dst,$shift" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( OpcP, RMopc_Mem(secondary, dst ), Con8or32( shift ) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by variable
instruct sarI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD3, 0x7);  /* D3 /7 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}

// Logical shift right by one
instruct shrI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xD1, 0x5);  /* D1 /5 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Logical Shift Right by 8-bit immediate
instruct shrI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xC1, 0x5);  /* C1 /5 ib */
  ins_encode( RegOpcImm( dst, shift) );
  ins_pipe( ialu_reg );
%}

// Shift Left by 24, followed by Arithmetic Shift Right by 24.
// This idiom is used by the compiler for the i2b bytecode.
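// (MOVSX of the low byte yields the same sign-extension in one instruction,
// which is why the shift pair collapses into a single sign-extending move.)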
instruct i2b(eRegI dst, xRegI src, immI_24 twentyfour, eFlagsReg cr) %{
  match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));
  effect(KILL cr);

  size(3);
  format %{ "MOVSX  $dst,$src :8" %}
  opcode(0xBE, 0x0F);
  ins_encode( OpcS, OpcP, RegReg( dst, src));
  ins_pipe( ialu_reg_reg );
%}

// Shift Left by 16, followed by Arithmetic Shift Right by 16.
// This idiom is used by the compiler for the i2s bytecode.
instruct i2s(eRegI dst, xRegI src, immI_16 sixteen, eFlagsReg cr) %{
  match(Set dst (RShiftI (LShiftI src sixteen) sixteen));
  effect(KILL cr);

  size(3);
  format %{ "MOVSX  $dst,$src :16" %}
  opcode(0xBF, 0x0F);
  ins_encode( OpcS, OpcP, RegReg( dst, src));
  ins_pipe( ialu_reg_reg );
%}


// Logical Shift Right by variable
instruct shrI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xD3, 0x5);  /* D3 /5 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}


//----------Logical Instructions-----------------------------------------------
//----------Integer Logical Instructions---------------------------------------
// And Instructions
// And Register with Register
instruct andI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (AndI dst src));
  effect(KILL cr);

  size(2);
  format %{ "AND    $dst,$src" %}
  opcode(0x23);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// And Register with Immediate
instruct andI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (AndI dst src));
  effect(KILL cr);

  format %{ "AND    $dst,$src" %}
  opcode(0x81,0x04);  /* Opcode 81 /4 */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// And Register with Memory
instruct andI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (AndI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "AND    $dst,$src" %}
  opcode(0x23);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// And Memory with Register
instruct andI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "AND    $dst,$src" %}
  opcode(0x21);  /* Opcode 21 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// And Memory with Immediate
instruct andI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "AND    $dst,$src" %}
  opcode(0x81, 0x4);  /* Opcode 81 /4 id */
  // ins_encode( MemImm( dst, src) );
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

// Or Instructions
// Or Register with Register
instruct orI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (OrI dst src));
  effect(KILL cr);

  size(2);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Or Register with Immediate
instruct orI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (OrI dst src));
  effect(KILL cr);

  format %{ "OR     $dst,$src" %}
  opcode(0x81,0x01);  /* Opcode 81 /1 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Or Register with Memory
instruct orI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (OrI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Or Memory with Register
instruct orI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "OR     $dst,$src" %}
  opcode(0x09);  /* Opcode 09 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Or Memory with Immediate
instruct orI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "OR     $dst,$src" %}
  opcode(0x81,0x1);  /* Opcode 81 /1 id */
  // ins_encode( MemImm( dst, src) );
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

// Xor Instructions
// Xor Register with Register
instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (XorI dst src));
  effect(KILL cr);

  size(2);
  format %{ "XOR    $dst,$src" %}
  opcode(0x33);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Xor Register with Immediate
instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (XorI dst src));
  effect(KILL cr);

  format %{ "XOR    $dst,$src" %}
  opcode(0x81,0x06);  /* Opcode 81 /6 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Xor Register with Memory
instruct xorI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (XorI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "XOR    $dst,$src" %}
  opcode(0x33);
  ins_encode( OpcP, RegMem(dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Xor Memory with Register
instruct xorI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "XOR    $dst,$src" %}
  opcode(0x31);  /* Opcode 31 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Xor Memory with Immediate
instruct xorI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "XOR    $dst,$src" %}
  opcode(0x81,0x6);  /* Opcode 81 /6 id */
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

//----------Convert Int to Boolean---------------------------------------------
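// The expansions below copy the value and then use a NEG/ADC pair: NEG sets
// the carry flag iff the value was non-zero, and ADC dst,src computes
// -src + src + CF, so dst ends up as (src != 0) ? 1 : 0.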

instruct movI_nocopy(eRegI dst, eRegI src) %{
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct ci2b( eRegI dst, eRegI src, eFlagsReg cr ) %{
  effect( USE_DEF dst, USE src, KILL cr );
  
  size(4);
  format %{ "NEG    $dst\n\t"
            "ADC    $dst,$src" %}
  ins_encode( neg_reg(dst),
              OpcRegReg(0x13,dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convI2B( eRegI dst, eRegI src, eFlagsReg cr ) %{
  match(Set dst (Conv2B src));

  expand %{
    movI_nocopy(dst,src);
    ci2b(dst,src,cr);
  %}
%}

instruct movP_nocopy(eRegI dst, eRegP src) %{
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct cp2b( eRegI dst, eRegP src, eFlagsReg cr ) %{
  effect( USE_DEF dst, USE src, KILL cr );
  format %{ "NEG    $dst\n\t"
            "ADC    $dst,$src" %}
  ins_encode( neg_reg(dst),
              OpcRegReg(0x13,dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convP2B( eRegI dst, eRegP src, eFlagsReg cr ) %{
  match(Set dst (Conv2B src));

  expand %{
    movP_nocopy(dst,src);
    cp2b(dst,src,cr);
  %}
%}

instruct cmpLTMask( eCXRegI dst, ncxRegI p, ncxRegI q, eFlagsReg cr ) %{
  match(Set dst (CmpLTMask p q));
  effect( KILL cr );
  ins_cost(400);

  // SETlt can only use low byte of EAX, EBX, ECX, or EDX as destination
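  // Net effect: dst = (p < q) ? -1 : 0.  SETlt writes 0 or 1 into the cleared
  // dst, and NEG turns a 1 into an all-ones mask.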
  format %{ "XOR    $dst,$dst\n\t"
            "CMP    $p,$q\n\t"
            "SETlt  $dst\n\t"
            "NEG    $dst" %}
  ins_encode( OpcRegReg(0x33,dst,dst),
              OpcRegReg(0x3B,p,q),
              setLT_reg(dst), neg_reg(dst) );
  ins_pipe( pipe_slow );
%}

instruct cmpLTMask0( eRegI dst, immI0 zero, eFlagsReg cr ) %{
  match(Set dst (CmpLTMask dst zero));
  effect( DEF dst, KILL cr );
  ins_cost(100);

  format %{ "SAR    $dst,31" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( RegOpcImm( dst, 0x1F ) );
  ins_pipe( ialu_reg );
%}


instruct cadd_cmpLTMask1( ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp, eFlagsReg cr ) %{
  match(Set p (AddI (AndI (CmpLTMask p q) y) (SubI p q)));
  effect( USE_KILL tmp, KILL cr );
  ins_cost(400);
  // annoyingly, $tmp has no edges so you can't ask for it in
  // any format or encoding
  format %{ "SUB    $p,$q\n\t"
            "SBB    ECX,ECX\n\t"
            "AND    ECX,$y\n\t"
            "ADD    $p,ECX" %}
  ins_encode( enc_cmpLTP(p,q,y,tmp) ); 
  ins_pipe( pipe_cmplt );
%}

instruct cadd_cmpLTMask2( ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp, eFlagsReg cr ) %{
  match(Set p (AddI (SubI p q) (AndI (CmpLTMask p q) y)));
  effect( USE_KILL tmp, KILL cr );
  ins_cost(400);

  format %{ "SUB    $p,$q\n\t"
            "SBB    ECX,ECX\n\t"
            "AND    ECX,$y\n\t"
            "ADD    $p,ECX" %}
  ins_encode( enc_cmpLTP(p,q,y,tmp) ); 
  ins_pipe( pipe_cmplt );
%}

/* If I enable these 2, I encourage spilling in the inner loop of compress.
instruct cadd_cmpLTMask1_mem( ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr ) %{
  match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q)));
  effect( USE_KILL tmp, KILL cr );
  ins_cost(400);

  format %{ "SUB    $p,$q\n\t"
            "SBB    ECX,ECX\n\t"
            "AND    ECX,$y\n\t"
            "ADD    $p,ECX" %}
  ins_encode( enc_cmpLTP_mem(p,q,y,tmp) ); 
%}

instruct cadd_cmpLTMask2_mem( ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr ) %{
  match(Set p (AddI (SubI p q) (AndI (CmpLTMask p q) (LoadI y))));
  effect( USE_KILL tmp, KILL cr );
  ins_cost(400);

  format %{ "SUB    $p,$q\n\t"
            "SBB    ECX,ECX\n\t"
            "AND    ECX,$y\n\t"
            "ADD    $p,ECX" %}
  ins_encode( enc_cmpLTP_mem(p,q,y,tmp) ); 
%}
*/

//----------Long Instructions------------------------------------------------
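// Longs live in 32-bit register pairs here, so each operation is performed in
// two halves: the low words first, then the high words, with the carry or
// borrow propagated via ADC/SBB where the operation needs it.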
// Add Long Register with Register
instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (AddL dst src));
  effect(KILL cr);
  ins_cost(200);
  format %{ "ADD    $dst.lo,$src.lo\n\t"
            "ADC    $dst.hi,$src.hi" %}
  opcode(0x03, 0x13);
  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Add Long Register with Immediate
instruct addL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (AddL dst src));
  effect(KILL cr);
  format %{ "ADD    $dst.lo,$src.lo\n\t"
            "ADC    $dst.hi,$src.hi" %}
  opcode(0x81,0x00,0x02);  /* Opcode 81 /0, 81 /2 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Add Long Register with Memory
instruct addL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (AddL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "ADD    $dst.lo,$mem\n\t"
            "ADC    $dst.hi,$mem+4" %}
  opcode(0x03, 0x13);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Subtract Long Register with Register.
instruct subL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (SubL dst src));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SUB    $dst.lo,$src.lo\n\t"
            "SBB    $dst.hi,$src.hi" %}
  opcode(0x2B, 0x1B);
  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Subtract Long Register with Immediate
instruct subL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (SubL dst src));
  effect(KILL cr);
  format %{ "SUB    $dst.lo,$src.lo\n\t"
            "SBB    $dst.hi,$src.hi" %}
  opcode(0x81,0x05,0x03);  /* Opcode 81 /5, 81 /3 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Subtract Long Register with Memory
instruct subL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (SubL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "SUB    $dst.lo,$mem\n\t"
            "SBB    $dst.hi,$mem+4" %}
  opcode(0x2B, 0x1B);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}
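
// Negate Long Register.  Two's-complement negation in halves: NEG of the low
// word sets the borrow (CF) exactly when it is non-zero, and the final SBB
// subtracts that borrow from the already-negated high word.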

instruct negL_eReg(eRegL dst, immL0 zero, eFlagsReg cr) %{
  match(Set dst (SubL zero dst));
  effect(KILL cr);
  ins_cost(300);
  format %{ "NEG    $dst.hi\n\tNEG    $dst.lo\n\tSBB    $dst.hi,0" %}
  ins_encode( neg_long(dst) );
  ins_pipe( ialu_reg_reg_long );
%}

// And Long Register with Register
instruct andL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (AndL dst src));
  effect(KILL cr);
  format %{ "AND    $dst.lo,$src.lo\n\t"
            "AND    $dst.hi,$src.hi" %}
  opcode(0x23,0x23);
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// And Long Register with Immediate
instruct andL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (AndL dst src));
  effect(KILL cr);
  format %{ "AND    $dst.lo,$src.lo\n\t"
            "AND    $dst.hi,$src.hi" %}
  opcode(0x81,0x04,0x04);  /* Opcode 81 /4, 81 /4 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// And Long Register with Memory
instruct andL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (AndL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "AND    $dst.lo,$mem\n\t"
            "AND    $dst.hi,$mem+4" %}
  opcode(0x23, 0x23);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Or Long Register with Register
instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (OrL dst src));
  effect(KILL cr);
  format %{ "OR     $dst.lo,$src.lo\n\t"
            "OR     $dst.hi,$src.hi" %}
  opcode(0x0B,0x0B);
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Or Long Register with Immediate
instruct orl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (OrL dst src));
  effect(KILL cr);
  format %{ "OR     $dst.lo,$src.lo\n\t"
            "OR     $dst.hi,$src.hi" %}
  opcode(0x81,0x01,0x01);  /* Opcode 81 /1, 81 /1 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Or Long Register with Memory
instruct orl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (OrL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "OR     $dst.lo,$mem\n\t"
            "OR     $dst.hi,$mem+4" %}
  opcode(0x0B,0x0B);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Xor Long Register with Register
instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (XorL dst src));
  effect(KILL cr);
  format %{ "XOR    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$src.hi" %}
  opcode(0x33,0x33);
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Xor Long Register with Immediate
instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (XorL dst src));
  effect(KILL cr);
  format %{ "XOR    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$src.hi" %}
  opcode(0x81,0x06,0x06);  /* Opcode 81 /6, 81 /6 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Xor Long Register with Memory
instruct xorl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (XorL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "XOR    $dst.lo,$mem\n\t"
            "XOR    $dst.hi,$mem+4" %}
  opcode(0x33,0x33);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Shift Left Long by 1-31
instruct shlL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHLD   $dst.hi,$dst.lo,$cnt\n\t"
            "SHL    $dst.lo,$cnt" %}
  opcode(0xC1, 0x4, 0xA4);  /* 0F/A4, then C1 /4 ib */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 32-63
instruct shlL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.hi,$dst.lo\n"
          "\tSHL    $dst.hi,$cnt-32\n"
          "\tXOR    $dst.lo,$dst.lo" %}
  opcode(0xC1, 0x4);  /* C1 /4 ib */
  ins_encode( move_long_big_shift_clr(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by variable
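// x86 masks 32-bit shift counts to 5 bits, so counts of 32-63 are handled by
// first moving the low word into the high word and clearing the low word; the
// SHLD/SHL pair then shifts by count & 31, yielding the correct 64-bit result.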
instruct salL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (LShiftL dst shift));
  effect(KILL cr);
  ins_cost(500+200);
  size(17);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "XOR    $dst.lo,$dst.lo\n"
    "small:\tSHLD   $dst.hi,$dst.lo,$shift\n\t"
            "SHL    $dst.lo,$shift" %}
  ins_encode( shift_left_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}

// Logical Shift Right Long by 1-31
instruct shrL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (URShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
            "SHR    $dst.hi,$cnt" %}
  opcode(0xC1, 0x5, 0xAC);  /* 0F/AC, then C1 /5 ib */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Logical Shift Right Long by 32-63
instruct shrL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (URShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.lo,$dst.hi\n"
          "\tSHR    $dst.lo,$cnt-32\n"
          "\tXOR    $dst.hi,$dst.hi" %}
  opcode(0xC1, 0x5);  /* C1 /5 ib */
  ins_encode( move_long_big_shift_clr(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Logical Shift Right Long by variable
instruct shrL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (URShiftL dst shift));
  effect(KILL cr);
  ins_cost(600);
  size(17);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.lo,$dst.hi\n\t"
            "XOR    $dst.hi,$dst.hi\n"
    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
            "SHR    $dst.hi,$shift" %}
  ins_encode( shift_right_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}

// Arithmetic Shift Right Long by 1-31
instruct sarL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (RShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
            "SAR    $dst.hi,$cnt" %}
  opcode(0xC1, 0x7, 0xAC);  /* 0F/AC, then C1 /7 ib */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Arithmetic Shift Right Long by 32-63
instruct sarL_eReg_32_63( eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (RShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.lo,$dst.hi\n"
          "\tSAR    $dst.lo,$cnt-32\n"
          "\tSAR    $dst.hi,31" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( move_long_big_shift_sign(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right arithmetic Long by variable
instruct sarL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (RShiftL dst shift));
  effect(KILL cr);
  ins_cost(600);
  size(18);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.lo,$dst.hi\n\t"
            "SAR    $dst.hi,31\n"
    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
            "SAR    $dst.hi,$shift" %}
  ins_encode( shift_right_arith_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}


//----------Double Instructions------------------------------------------------
// Double Math

// Compare & branch

// P6 version of float compare, sets condition codes in EFLAGS
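// FUCOMIP sets ZF/PF/CF directly from the compare; an unordered result (NaN)
// sets PF, and the JNP / MOV ah,1 / SAHF fixup below forces CF (and clears ZF)
// so that unordered compares read as "less than".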
instruct cmpD_cc_P6(eFlagsRegU cr, regD src1, regD src2, eAXRegI eax) %{
  predicate(VM_Version::supports_cmov() && UseSSE <=1);
  match(Set cr (CmpD src1 src2));
  effect(KILL eax);
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction\n\t"
            "JNP    exit\n\t"
            "MOV    ah,1       // saw a NaN, set CF\n\t"
            "SAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              cmpF_P6_fixup );
  ins_pipe( pipe_slow );
%}

// Compare & branch
instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI eax) %{
  predicate(UseSSE<=1);
  match(Set cr (CmpD src1 src2));
  effect(KILL eax);
  ins_cost(200);
  format %{ "FLD    $src1\n\t"
            "FCOMp  $src2\n\t"
            "FNSTSW AX\n\t"
            "TEST   AX,0x400\n\t"
            "JZ,s   flags\n\t"
            "MOV    AH,1\t# unordered treat as LT\n"
    "flags:\tSAHF" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              fpu_flags);
  ins_pipe( pipe_slow );
%}

// Compare vs zero into -1,0,1
instruct cmpD_0(eRegI dst, regD src1, immD0 zero, eAXRegI eax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (CmpD3 src1 zero));
  effect(KILL cr, KILL eax);
  ins_cost(280);
  format %{ "FTSTL  $dst,$src1" %}
  opcode(0xE4, 0xD9); 
  ins_encode( Push_Reg_D(src1),
              OpcS, OpcP, PopFPU,
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1
instruct cmpD_reg(eRegI dst, regD src1, regD src2, eAXRegI eax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (CmpD3 src1 src2));
  effect(KILL cr, KILL eax);
  ins_cost(300);
  format %{ "FCMPL  $dst,$src1,$src2" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpXD_cc(eFlagsRegU cr, regXD dst, regXD src, eAXRegI eax) %{
  predicate(UseSSE==2);
  match(Set cr (CmpD dst src));
  effect(KILL eax);
  ins_cost(145);
  format %{ "COMISD $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpXD_ccmem(eFlagsRegU cr, regXD dst, memory src, eAXRegI eax) %{
  predicate(UseSSE==2);
  match(Set cr (CmpD dst (LoadD src)));
  effect(KILL eax);
  ins_cost(145);
  format %{ "COMISD $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM
instruct cmpXD_reg(eRegI dst, regXD src1, regXD src2, eFlagsReg cr) %{
  predicate(UseSSE==2);
  match(Set dst (CmpD3 src1 src2));
  effect(KILL cr);
  ins_cost(275);
  format %{ "XOR    $dst,$dst\n"
          "\tCOMISD $src1,$src2\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(Xor_Reg(dst), OpcP, OpcS, Opcode(tertiary), RegReg(src1, src2), 
             CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM and memory
instruct cmpXD_regmem(eRegI dst, regXD src1, memory mem, eFlagsReg cr) %{
  predicate(UseSSE==2);
  match(Set dst (CmpD3 src1 (LoadD mem)));
  effect(KILL cr);
  ins_cost(275);
  format %{ "COMISD $src1,$mem\n"
          "\tMOV    $dst,0\t\t# do not blow flags\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(src1, mem),
             LdImmI(dst,0x0), CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}


instruct subD_reg(regD dst, regD src) %{
  predicate (UseSSE <=1);
  match(Set dst (SubD dst src));

  format %{ "FLD    $src\n\t"
            "DSUBp  $dst,ST" %}
  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  ins_cost(150);
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct subD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  predicate (UseSSE <=1);
  match(Set dst (RoundDouble (SubD src1 src2)));
  ins_cost(250);

  format %{ "FLD    $src2\n\t"
            "DSUB   ST,$src1\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x5); 
  ins_encode( Push_Reg_D(src2),
              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


instruct subD_reg_mem(regD dst, memory src) %{
  predicate (UseSSE <=1);
  match(Set dst (SubD dst (LoadD src)));
  ins_cost(150);

  format %{ "FLD    $src\n\t"
            "DSUBp  $dst,ST" %}
  opcode(0xDE, 0x5, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

instruct absD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (AbsD src));
  ins_cost(100);
  format %{ "FABS" %}
  opcode(0xE1, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct absXD_reg( regXD dst ) %{
  predicate(UseSSE==2);
  match(Set dst (AbsD dst));
  format %{ "ANDPD  $dst,[0x7FFFFFFFFFFFFFFF]\t# ABS D by sign masking" %}
  ins_encode( AbsXD_encoding(dst));
  ins_pipe( pipe_slow );
%}

instruct negD_reg(regDPR1 dst, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set dst (NegD src));
  ins_cost(100);
  format %{ "FCHS" %}
  opcode(0xE0, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct negXD_reg( regXD dst ) %{
  predicate(UseSSE==2);
  match(Set dst (NegD dst));
  format %{ "XORPD  $dst,[0x8000000000000000]\t# CHS D by sign flipping" %}
  ins_encode( NegXD_encoding(dst));
  ins_pipe( pipe_slow );
%}

instruct addD_reg(regD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst src));
  format %{ "FLD    $src\n\t"
            "DADD   $dst,ST" %}
  size(4);
  ins_cost(150);
  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}


instruct addD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  predicate(UseSSE<=1);
  match(Set dst (RoundDouble (AddD src1 src2)));
  ins_cost(250);

  format %{ "FLD    $src2\n\t"
            "DADD   ST,$src1\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x0); /* D8 C0+i or D8 /0*/
  ins_encode( Push_Reg_D(src2),
              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


instruct addD_reg_mem(regD dst, memory src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst (LoadD src)));
  ins_cost(150);

  format %{ "FLD    $src\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x0, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

// add-to-memory
instruct addD_mem_reg(memory dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (StoreD dst (RoundDouble (AddD (LoadD dst) src))));
  ins_cost(150);

  format %{ "FLD_D  $dst\n\t"
            "DADD   ST,$src\n\t"
            "FST_D  $dst" %}
  opcode(0xDD, 0x0);
  ins_encode( Opcode(0xDD), RMopc_Mem(0x00,dst),
              Opcode(0xD8), RegOpc(src),
              set_instruction_start,
              Opcode(0xDD), RMopc_Mem(0x03,dst) );
  ins_pipe( fpu_reg_mem );
%}

instruct addD_reg_imm1(regD dst, immD1 src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst src));
  ins_cost(125);
  format %{ "FLD1\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x00);       
  ins_encode( LdImmD(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg );
%}

instruct addD_reg_imm(regD dst, immD src) %{
  predicate(UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (AddD dst src));
  ins_cost(200);
  format %{ "FLD_D  [$src]\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x00);       /* DE /0 */
  ins_encode( LdImmD(src),
              OpcP, RegOpc(dst));
  ins_pipe( fpu_reg_mem );
%}

instruct addD_reg_imm_round(stackSlotD dst, regD src, immD con) %{
  predicate(UseSSE<=1 && _kids[0]->_kids[1]->_leaf->getd() != 0.0 && _kids[0]->_kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (RoundDouble (AddD src con)));
  ins_cost(200);
  format %{ "FLD_D  [$con]\n\t"
            "DADD   ST,$src\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x00);       /* D8 /0 */
  ins_encode( LdImmD(con),
              OpcP, RegOpc(src), Pop_Mem_D(dst));
  ins_pipe( fpu_mem_reg_con );
%}

// Add two double precision floating point values in xmm
instruct addXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (AddD dst src));
  format %{ "ADDSD  $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct addXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE==2);
  match(Set dst (AddD dst con));
  format %{ "ADDSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), LdImmXD(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct addXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE==2);
  match(Set dst (AddD dst (LoadD mem)));
  format %{ "ADDSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Sub two double precision floating point values in xmm
instruct subXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (SubD dst src));
  format %{ "SUBSD  $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct subXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE==2);
  match(Set dst (SubD dst con));
  format %{ "SUBSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), LdImmXD(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct subXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE==2);
  match(Set dst (SubD dst (LoadD mem)));
  format %{ "SUBSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Mul two double precision floating point values in xmm
instruct mulXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (MulD dst src));
  format %{ "MULSD  $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct mulXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE==2);
  match(Set dst (MulD dst con));
  format %{ "MULSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), LdImmXD(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct mulXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE==2);
  match(Set dst (MulD dst (LoadD mem)));
  format %{ "MULSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Div two double precision floating point values in xmm
instruct divXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (DivD dst src));
  format %{ "DIVSD  $dst,$src" %}
  opcode(0xF2, 0x0F, 0x5E); 
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct divXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE==2);
  match(Set dst (DivD dst con));
  format %{ "DIVSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), LdImmXD(dst, con));
  ins_pipe( pipe_slow );
%}

instruct divXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE==2);
  match(Set dst (DivD dst (LoadD mem)));
  format %{ "DIVSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}


instruct mulD_reg(regD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (MulD dst src));
  format %{ "FLD    $src\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  ins_cost(150);
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Strict FP instruction biases argument before multiply then 
// biases result to avoid double rounding of subnormals.
// 
// scale arg1 by multiplying arg1 by 2^(-15360)
// load arg2
// multiply scaled arg1 by arg2
// rescale product by 2^(15360)
// 
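// Informal note on why this particular bias value works: 2^(-1022) * 2^(-15360)
// = 2^(-16382), so the bias maps the double-precision subnormal threshold onto
// the x87 extended-precision subnormal threshold.  A product that would be a
// subnormal double therefore underflows, and is rounded, only once and at the
// correct bit position; the rescale by 2^(+15360) and the final store back to
// a 64-bit double are then exact, so the double rounding is avoided.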
instruct strictfp_mulD_reg(regD dst, regD src) %{
  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  match(Set dst (MulD dst src));
  ins_cost(1);   // Select this instruction for all strict FP double multiplies

  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    $src\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
            "DMULp  $dst,ST\n\t" %}
  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  ins_encode( strictfp_bias1(dst),
              Push_Reg_D(src),
              OpcP, RegOpc(dst),
              strictfp_bias2(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct mulD_reg_imm(regD dst, immD src) %{
  predicate( UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (MulD dst src));
  ins_cost(200);
  format %{ "FLD_D  [$src]\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1); /* DE /1 */
  ins_encode( LdImmD(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}


instruct mulD_reg_mem(regD dst, memory src) %{
  predicate( UseSSE<=1 );
  match(Set dst (MulD dst (LoadD src)));
  ins_cost(200);
  format %{ "FLD_D  $src\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1, 0xDD); /* DE C8+i or DE /1*/  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

// 
// Cisc-alternate to reg-reg multiply
instruct mulD_reg_mem_cisc(regD dst, regD src, memory mem) %{
  predicate( UseSSE<=1 );
  match(Set dst (MulD src (LoadD mem)));
  ins_cost(250);
  format %{ "FLD_D  $mem\n\t"
            "DMUL   ST,$src\n\t"
            "FSTP_D $dst" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadD D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem),
              OpcReg_F(src),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}


// MACRO3 -- addD a mulD
// This instruction is a '2-address' instruction in that the result goes 
// back to src2.  This eliminates a move from the macro; possibly the 
// register allocator will have to add it back (and maybe not).
instruct addD_mulD_reg(regD src2, regD src1, regD src0) %{
  predicate( UseSSE<=1 );
  match(Set src2 (AddD (MulD src0 src1) src2));
  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
            "DMUL   ST,$src1\n\t"
            "DADDp  $src2,ST" %}
  ins_cost(250);
  opcode(0xDD); /* LoadD DD /0 */
  ins_encode( Push_Reg_F(src0),
              FMul_ST_reg(src1),
              FAddP_reg_ST(src2) );
  ins_pipe( fpu_reg_reg_reg );
%}


// MACRO3 -- subD a mulD
instruct subD_mulD_reg(regD src2, regD src1, regD src0) %{
  predicate( UseSSE<=1 );
  match(Set src2 (SubD (MulD src0 src1) src2));
  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
            "DMUL   ST,$src1\n\t"
            "DSUBRp $src2,ST" %}
  ins_cost(250);
  ins_encode( Push_Reg_F(src0),
              FMul_ST_reg(src1),
              Opcode(0xDE), Opc_plus(0xE0,src2));
  ins_pipe( fpu_reg_reg_reg );
%}


instruct divD_reg(regD dst, regD src) %{
  predicate( UseSSE<=1 );
  match(Set dst (DivD dst src));

  format %{ "FLD    $src\n\t"
            "FDIVp  $dst,ST" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_cost(150);
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Strict FP instruction biases argument before division then 
// biases result, to avoid double rounding of subnormals.
// 
// scale dividend by multiplying dividend by 2^(-15360)
// load divisor
// divide scaled dividend by divisor
// rescale quotient by 2^(15360)
// 
instruct strictfp_divD_reg(regD dst, regD src) %{
  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  match(Set dst (DivD dst src));
  ins_cost(1);   // Select this instruction for all strict FP double divides

  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    $src\n\t"
            "FDIVp  $dst,ST\n\t"
            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
            "DMULp  $dst,ST\n\t" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( strictfp_bias1(dst),
              Push_Reg_D(src),
              OpcP, RegOpc(dst),
              strictfp_bias2(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct divD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  predicate( UseSSE<=1 && !(Compile::current()->has_method() && Compile::current()->method()->is_strict()) );
  match(Set dst (RoundDouble (DivD src1 src2)));

  format %{ "FLD    $src1\n\t"
            "FDIV   ST,$src2\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2), Pop_Mem_D(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


instruct modD_reg(regD dst, regD src, eAXRegI eax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (ModD dst src));
  effect(KILL eax, KILL cr); // emitModD() uses EAX and EFLAGS

  format %{ "DMOD   $dst,$src" %}
  ins_cost(250);
  ins_encode(Push_Reg_Mod_D(dst, src),
              emitModD(),
              Push_Result_Mod_D(src),
              Pop_Reg_D(dst));
  ins_pipe( pipe_slow );
%}
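
// Note on the DMOD sequence: emitModD() (spelled out in the modXD_reg format
// below) loops on FPREM because FPREM may return only a partial remainder,
// signalled by the C2 condition flag.  FNSTSW AX / SAHF copies C0/C2/C3 into
// CF/PF/ZF, so the "JP loop" branch repeats FPREM until C2 (visible as PF)
// clears and the remainder is final.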

instruct modXD_reg(regXD dst, regXD src0, regXD src1, eAXRegI eax, regFPR1 tmp, eFlagsReg cr) %{
  predicate(UseSSE==2);
  match(Set dst (ModD src0 src1));
  effect(KILL eax, KILL tmp, KILL cr);

  format %{ "SUB    ESP,8\n"
          "\tMOVSD  [ESP+0],$src1\n"
          "\tFPOP\n"
          "\tFLD_D  [ESP+0]\n"
          "\tMOVSD  [ESP+0],$src0\n"
          "\tFLD_D  [ESP+0]\n"
     "loop:\tFPREM\n"
          "\tFWAIT\n"
          "\tFNSTSW AX\n"
          "\tSAHF\n"
          "\tJP     loop\n"
          "\tFSTP_D [ESP+0]\n"
          "\tMOVSD  $dst,[ESP+0]\n"
          "\tADD    ESP,8"
    %}
  ins_cost(250);
  ins_encode( Push_ModD_encoding(src0, src1), emitModD(), Push_ResultXD(dst));
  ins_pipe( pipe_slow );
%}

instruct sinD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (SinD src));
  ins_cost(1800);
  format %{ "DSIN" %}
  opcode(0xD9, 0xFE);
  ins_encode( OpcP, OpcS );
  ins_pipe( pipe_slow );
%}

instruct sinXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE==2);
  match(Set dst (SinD src));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  ins_cost(1800);
  format %{ "DSIN" %}
  opcode(0xD9, 0xFE);
  ins_encode( Push_SrcXD(src), OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

instruct cosD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (CosD src));
  ins_cost(1800);
  format %{ "DCOS" %}
  opcode(0xD9, 0xFF);
  ins_encode( OpcP, OpcS );
  ins_pipe( pipe_slow );
%}

instruct cosXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE==2);
  match(Set dst (CosD src));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  ins_cost(1800);
  format %{ "DCOS" %}
  opcode(0xD9, 0xFF);
  ins_encode( Push_SrcXD(src), OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

instruct tanD_reg(regD dst, regD src) %{
  predicate (UseSSE<=1);
  match(Set dst(TanD src));
  format %{ "DTAN   $dst,$src" %}
  opcode(0xD9, 0xF2);
  ins_encode( Push_Reg_D(src),
              OpcP, OpcS, Pop_Reg_D(dst) );
  ins_pipe( pipe_slow );
%}

instruct tanXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE==2);
  match(Set dst(TanD src));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  format %{ "DTAN   $dst,$src" %}
  opcode(0xD9, 0xF2);
  ins_encode( Push_SrcXD(src),
              OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

instruct atanD_reg(regD dst, regD src) %{
  predicate (UseSSE<=1);
  match(Set dst(AtanD dst src));
  format %{ "DATA   $dst,$src" %}
  opcode(0xD9, 0xF3);
  ins_encode( Push_Reg_D(src),
              OpcP, OpcS, RegOpc(dst) );
  ins_pipe( pipe_slow );
%}

instruct atanXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE==2);
  match(Set dst(AtanD dst src));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  format %{ "DATA   $dst,$src" %}
  opcode(0xD9, 0xF3);
  ins_encode( Push_SrcXD(src),
              OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

instruct sqrtD_reg(regD dst, regD src) %{
  predicate (UseSSE<=1);
  match(Set dst (SqrtD src));
  format %{ "DSQRT  $dst,$src" %}
  opcode(0xFA, 0xD9);
  ins_encode( Push_Reg_D(src),
              OpcS, OpcP, Pop_Reg_D(dst) );
  ins_pipe( pipe_slow );
%}


instruct powD_reg(regDPR1 X, regDPR2 Y) %{
  match(Set X (PowD X Y));
  effect(KILL Y);
  format %{ "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
            "FDUP   \t\t\t# Q Q\n\t"
            "FRNDINT\t\t\t# int(Q) Q\n\t"
            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
            "FXCH   ST(1)\t\t# frac(Q) int(Q)\n\t"
            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADDP [1.000] instead
            "FSCALE \t\t\t# 2^int(Q)*2^frac(Q)=2^Q int(Q)"
             %}
  ins_encode( Opcode(0xD9), Opcode(0xF1),   // fyl2x
              Opcode(0xD9), Opcode(0xC0),   // fdup = fld st(0)
              Opcode(0xD9), Opcode(0xFC),   // frndint
              Opcode(0xDC), Opcode(0xE9),   // fsub st(1) -= st(0)
              Opcode(0xD9), Opcode(0xC9),   // fxch st(1)
              Opcode(0xD9), Opcode(0xF0),   // f2xm1
              Opcode(0xD9), Opcode(0xE8),   // fld1
              Opcode(0xDE), Opcode(0xC1),   // faddp
              Opcode(0xD9), Opcode(0xFD) ); // fscale
  ins_pipe( pipe_slow );
%}

//-------------Float Instructions-------------------------------
// Float Math

// Code for float compare:
//     fcompp();
//     fwait(); fnstsw_ax();
//     sahf();
//     movl(dst, unordered_result);
//     jcc(Assembler::parity, exit);
//     movl(dst, less_result);
//     jcc(Assembler::below, exit);
//     movl(dst, equal_result);
//     jcc(Assembler::equal, exit);
//     movl(dst, greater_result);
//   exit:
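//
// FNSTSW AX / SAHF maps the FPU condition codes C0/C2/C3 into CF/PF/ZF.  An
// unordered compare (at least one NaN) sets all three, so the parity test
// above detects NaN, while CF and ZF give the ordinary below/equal results.
// The P6 FUCOMIP form below sets CF/PF/ZF directly, without the SAHF dance.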

// P6 version of float compare, sets condition codes in EFLAGS
instruct cmpF_cc_P6(eFlagsRegU cr, regF src1, regF src2, eAXRegI eax) %{
  predicate(VM_Version::supports_cmov() && UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  effect(KILL eax);
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction\n\t"
            "JNP    exit\n\t"
            "MOV    ah,1       // saw a NaN, set CF (treat as LT)\n\t"
            "SAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              cmpF_P6_fixup );
  ins_pipe( pipe_slow );
%}


// Compare & branch
instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI eax) %{
  predicate(UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  effect(KILL eax);
  ins_cost(200);
  format %{ "FLD    $src1\n\t"
            "FCOMp  $src2\n\t"
            "FNSTSW AX\n\t"
            "TEST   AX,0x400\n\t"
            "JZ,s   flags\n\t"
            "MOV    AH,1\t# unordered treat as LT\n"
    "flags:\tSAHF" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              fpu_flags);
  ins_pipe( pipe_slow );
%}

// Compare vs zero into -1,0,1
instruct cmpF_0(eRegI dst, regF src1, immF0 zero, eAXRegI eax, eFlagsReg cr) %{
  predicate(UseSSE == 0);
  match(Set dst (CmpF3 src1 zero));
  effect(KILL cr, KILL eax);
  ins_cost(280);
  format %{ "FTSTL  $dst,$src1" %}
  opcode(0xE4, 0xD9); 
  ins_encode( Push_Reg_D(src1),
              OpcS, OpcP, PopFPU,
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1
instruct cmpF_reg(eRegI dst, regF src1, regF src2, eAXRegI eax, eFlagsReg cr) %{
  predicate(UseSSE == 0);
  match(Set dst (CmpF3 src1 src2));
  effect(KILL cr, KILL eax);
  ins_cost(300);
  format %{ "FCMPL  $dst,$src1,$src2" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpX_cc(eFlagsRegU cr, regX dst, regX src, eAXRegI eax) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF dst src));
  effect(KILL eax);
  ins_cost(145);
  format %{ "COMISS $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegReg(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpX_ccmem(eFlagsRegU cr, regX dst, memory src, eAXRegI eax) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF dst (LoadF src)));
  effect(KILL eax);
  ins_cost(145);
  format %{ "COMISS $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegMem(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM
instruct cmpX_reg(eRegI dst, regX src1, regX src2, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (CmpF3 src1 src2));
  effect(KILL cr);
  ins_cost(275);
  format %{ "XOR    $dst,$dst\n"
          "\tCOMISS $src1,$src2\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x0F, 0x2F);
  ins_encode(Xor_Reg(dst), OpcP, OpcS, RegReg(src1, src2), CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM and memory
instruct cmpX_regmem(eRegI dst, regX src1, memory mem, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (CmpF3 src1 (LoadF mem)));
  effect(KILL cr);
  ins_cost(275);
  format %{ "COMISS $src1,$mem\n"
          "\tMOV    $dst,0\t\t# do not blow flags\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegMem(src1, mem), LdImmI(dst,0x0), CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}

// Spill to obtain 24-bit precision
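// (The *24* variants below are selected when select_24_bit_instr() is true,
//  i.e. when the FPU control word is not simply left in 24-bit precision mode
//  for the whole method, e.g. because it mixes float and double arithmetic.
//  Storing the result through a 32-bit stack slot (FSTP_S) then supplies the
//  explicit rounding to float that Java semantics require.)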
instruct subF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (SubF src1 src2));

  format %{ "FSUB   $dst,$src1 - $src2" %}
  opcode(0xD8, 0x4); /* D8 E0+i or D8 /4 mod==0x3 ;; result in TOS */
  ins_encode( Push_Reg_F(src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
// 
// This instruction does not round to 24-bits
instruct subF_reg(regF dst, regF src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (SubF dst src));

  format %{ "FSUB   $dst,$src" %}
  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  ins_encode( Push_Reg_F(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Spill to obtain 24-bit precision
instruct addF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0); /* D8 C0+i */
  ins_encode( Push_Reg_F(src2),
              OpcReg_F(src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
// 
// This instruction does not round to 24-bits
instruct addF_reg(regF dst, regF src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF dst src));

  format %{ "FLD    $src\n\t"
            "FADDp  $dst,ST" %}
  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  ins_encode( Push_Reg_F(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Add two single precision floating point values in xmm
instruct addX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (AddF dst src));
  format %{ "ADDSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct addX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (AddF dst con));
  format %{ "ADDSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct addX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (AddF dst (LoadF mem)));
  format %{ "ADDSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}

// Subtract two single precision floating point values in xmm
instruct subX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (SubF dst src));
  format %{ "SUBSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct subX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (SubF dst con));
  format %{ "SUBSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct subX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (SubF dst (LoadF mem)));
  format %{ "SUBSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Multiply two single precision floating point values in xmm
instruct mulX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (MulF dst src));
  format %{ "MULSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct mulX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (MulF dst con));
  format %{ "MULSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct mulX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (MulF dst (LoadF mem)));
  format %{ "MULSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Divide two single precision floating point values in xmm
instruct divX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (DivF dst src));
  format %{ "DIVSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct divX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (DivF dst con));
  format %{ "DIVSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct divX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (DivF dst (LoadF mem)));
  format %{ "DIVSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Get the square root of a double precision floating point value in xmm
instruct sqrtXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (SqrtD src));
  format %{ "SQRTSD $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct sqrtXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE==2);
  match(Set dst (SqrtD (LoadD mem)));
  format %{ "SQRTSD $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}

instruct absF_reg(regFPR1 dst, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set dst (AbsF src));
  ins_cost(100);
  format %{ "FABS" %}
  opcode(0xE1, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct absX_reg(regX dst ) %{
  predicate(UseSSE>=1);
  match(Set dst (AbsF dst));
  format %{ "ANDPS  $dst,[0x7FFFFFFF]\t# ABS F by sign masking" %}
  ins_encode( AbsXF_encoding(dst));
  ins_pipe( pipe_slow );
%}

instruct negF_reg(regFPR1 dst, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set dst (NegF src));
  ins_cost(100);
  format %{ "FCHS" %}
  opcode(0xE0, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct negX_reg( regX dst ) %{
  predicate(UseSSE>=1);
  match(Set dst (NegF dst));
  format %{ "XORPS  $dst,[0x80000000]\t# CHS F by sign flipping" %}
  ins_encode( NegXF_encoding(dst));
  ins_pipe( pipe_slow );
%}

// Cisc-alternate to addF_reg
// Spill to obtain 24-bit precision
instruct addF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "FLD    $src2\n\t"
            "FADD   ST,$src1\n\t"
            "FSTP_S $dst" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_F(src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}
// 
// Cisc-alternate to addF_reg
// This instruction does not round to 24-bits
instruct addF_reg_mem(regF dst, memory src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF dst (LoadF src)));

  format %{ "FADD   $dst,$src" %}
  opcode(0xDE, 0x0, 0xD9); /* DE C0+i or DE /0*/  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Following two instructions are for _222_mpegaudio
// Spill to obtain 24-bit precision
instruct addF24_mem_reg(stackSlotF dst, regF src2, memory src1 ) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}

// Cisc-spill variant
// Spill to obtain 24-bit precision
instruct addF24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "FADD   $dst,$src1,$src2 cisc" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}

// Spill to obtain 24-bit precision
instruct addF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 /0 */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}


// Spill to obtain 24-bit precision
instruct addF24_reg_imm(stackSlotF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));
  format %{ "FLD    $src1\n\t"
            "FADD   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x00);       /* D8 /0 */
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_reg_con );
%}
// 
// This instruction does not round to 24-bits
instruct addF_reg_imm(regF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));
  format %{ "FLD    $src1\n\t"
            "FADD   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x00);       /* D8 /0 */
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_reg_con );
%}

// Spill to obtain 24-bit precision
instruct mulF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FLD    $src1\n\t"
            "FMUL   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1); /* D8 C8+i or D8 /1 ;; result in TOS */
  ins_encode( Push_Reg_F(src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
// 
// This instruction does not round to 24-bits
instruct mulF_reg(regF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FLD    $src1\n\t"
            "FMUL   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1); /* D8 C8+i */
  ins_encode( Push_Reg_F(src2),
              OpcReg_F(src1),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_reg_reg );
%}


// Spill to obtain 24-bit precision
// Cisc-alternate to reg-reg multiply
instruct mulF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "FLDS   $src2\n\t"
            "FMUL   $src1\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_F(src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}
// 
// This instruction does not round to 24-bits
// Cisc-alternate to reg-reg multiply
instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "FMUL   $dst,$src1,$src2" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_F(src1),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}

// Spill to obtain 24-bit precision
instruct mulF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FMUL   $dst,$src1,$src2" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 /1 */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}

// Spill to obtain 24-bit precision
instruct mulF24_reg_imm(stackSlotF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FMULc $dst,$src1,$src2" %}
  opcode(0xD8, 0x1);  /* D8 /1*/
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_reg_con );
%}
// 
// This instruction does not round to 24-bits
instruct mulF_reg_imm(regF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FMULc $dst. $src1, $src2" %}
  opcode(0xD8, 0x1);  /* D8 /1*/
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_reg_con );
%}


// 
// MACRO1 -- subsume unshared load into mulF
// This instruction does not round to 24-bits
instruct mulF_reg_load1(regF dst, regF src, memory mem1 ) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF (LoadF mem1) src));

  format %{ "FLD    $mem1    ===MACRO1===\n\t"
            "FMUL   ST,$src\n\t"
            "FSTP   $dst" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem1),
              OpcReg_F(src),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}
// 
// MACRO2 -- addF a mulF which subsumed an unshared load
// This instruction does not round to 24-bits
instruct addF_mulF_reg_load1(regF dst, memory mem1, regF src1, regF src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF (MulF (LoadF mem1) src1) src2));
  ins_cost(95);

  format %{ "FLD    $mem1     ===MACRO2===\n\t"
            "FMUL   ST,$src1  subsume mulF left load\n\t"
            "FADD   ST,$src2\n\t"
            "FSTP   $dst" %}
  opcode(0xD9); /* LoadF D9 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem1),
              FMul_ST_reg(src1),
              FAdd_ST_reg(src2),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem_reg_reg );
%}
 
// MACRO3 -- addF a mulF
// This instruction does not round to 24-bits.  It is a '2-address'
// instruction in that the result goes back to src2.  This eliminates
// a move from the macro; possibly the register allocator will have
// to add it back (and maybe not).
instruct addF_mulF_reg(regF src2, regF src1, regF src0) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set src2 (AddF (MulF src0 src1) src2));

  format %{ "FLD    $src0     ===MACRO3===\n\t"
            "FMUL   ST,$src1\n\t"
            "FADDP  $src2,ST" %}
  opcode(0xD9); /* LoadF D9 /0 */
  ins_encode( Push_Reg_F(src0),
              FMul_ST_reg(src1),
              FAddP_reg_ST(src2) );
  ins_pipe( fpu_reg_reg_reg );
%}

// MACRO4 -- divF subF
// This instruction does not round to 24-bits
instruct subF_divF_reg(regF dst, regF src1, regF src2, regF src3) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (DivF (SubF src2 src1) src3));

  format %{ "FLD    $src2   ===MACRO4===\n\t"
            "FSUB   ST,$src1\n\t"
            "FDIV   ST,$src3\n\t"
            "FSTP  $dst" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( Push_Reg_F(src2),
              subF_divF_encode(src1,src3),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_reg_reg_reg );
%}

// Spill to obtain 24-bit precision
instruct divF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (DivF src1 src2));

  format %{ "FDIV   $dst,$src1,$src2" %}
  opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
  ins_encode( Push_Reg_F(src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
// 
// This instruction does not round to 24-bits
instruct divF_reg(regF dst, regF src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (DivF dst src));

  format %{ "FDIV   $dst,$src" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( Push_Reg_F(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}


// Spill to obtain 24-bit precision
instruct modF24_reg(stackSlotF dst, regF src1, regF src2, eAXRegI eax, eFlagsReg cr) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ModF src1 src2));
  effect(KILL eax, KILL cr); // emitModD() uses EAX and EFLAGS

  format %{ "FMOD   $dst,$src1,$src2" %}
  ins_encode( Push_Reg_Mod_D(src1, src2),
              emitModD(),
              Push_Result_Mod_D(src2),
              Pop_Mem_F(dst));
  ins_pipe( pipe_slow );
%}
// 
// This instruction does not round to 24-bits
instruct modF_reg(regF dst, regF src, eAXRegI eax, eFlagsReg cr) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ModF dst src));
  effect(KILL eax, KILL cr); // emitModD() uses EAX and EFLAGS

  format %{ "FMOD   $dst,$src" %}
  ins_encode(Push_Reg_Mod_D(dst, src),
              emitModD(),
              Push_Result_Mod_D(src),
              Pop_Reg_F(dst));
  ins_pipe( pipe_slow );
%}

instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI eax, regFPR1 tmp, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (ModF src0 src1));
  effect(KILL eax, KILL tmp, KILL cr);
  format %{ "SUB    ESP,4\n"
          "\tMOVSS  [ESP+0],$src1\n"
          "\tFPOP\n"
          "\tFLD_S  [ESP+0]\n"
          "\tMOVSS  [ESP+0],$src0\n"
          "\tFLD_S  [ESP+0]\n"
     "loop:\tFPREM\n"
          "\tFWAIT\n"
          "\tFNSTSW AX\n"
          "\tSAHF\n"
          "\tJP     loop\n"
          "\tFSTP_S [ESP+0]\n"
          "\tMOVSS  $dst,[ESP+0]\n"
          "\tADD    ESP,4"
    %}
  ins_cost(250);
  ins_encode( Push_ModX_encoding(src0, src1), emitModD(), Push_ResultX(dst,0x4));
  ins_pipe( pipe_slow );
%}


//----------Arithmetic Conversion Instructions---------------------------------
// The conversion operations are all alphabetically sorted.  Please keep it that way!

instruct roundFloat_mem_reg(stackSlotF dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (RoundFloat src));
  ins_cost(125);
  format %{ "FLD    $src\n\t"
            "FSTP_S $dst\t# F-round" %}
  ins_encode( Push_Reg_F(src),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_reg );
%}

instruct roundDouble_mem_reg(stackSlotD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (RoundDouble src));
  ins_cost(125);
  format %{ "FLD    $src\n\t"
            "FSTP_D $dst\t# D-round" %}
  ins_encode( Push_Reg_D(src),
              Pop_Mem_D(dst));
  ins_pipe( fpu_mem_reg );
%}

// Force rounding to 24-bit precision and 8-bit exponent
instruct convD2F_reg(stackSlotF dst, regD src) %{
  predicate(UseSSE==0);
  match(Set dst (ConvD2F src));
  format %{ "D2F    $dst,$src" %}
  expand %{
    roundFloat_mem_reg(dst,src);
  %}
%}

// Force rounding to 24-bit precision and 8-bit exponent
instruct convD2X_reg(regX dst, regD src, eFlagsReg cr) %{
  predicate(UseSSE==1);
  match(Set dst (ConvD2F src));
  effect( KILL cr );
  format %{ "SUB    ESP,4\n\t"
            "FLD    $src\n\t"
            "FSTP_S [ESP]\t# F-round\n\t"
            "MOVSS  $dst,[ESP]\n\t"
            "ADD ESP,4" %}
  ins_encode( D2X_encoding(dst, src));
  ins_pipe( pipe_slow );
%}

// Force rounding double precision to single precision
instruct convXD2X_reg(regX dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (ConvD2F src));
  format %{ "CVTSD2SS $dst,$src" %}
  opcode(0xF2, 0x0F, 0x5A); 
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct convX2D_reg(regD dst, regX src, eFlagsReg cr) %{
  predicate(UseSSE==1);
  match(Set dst (ConvF2D src));
  effect( KILL cr );
  format %{ "SUB    ESP,4\n\t"
            "MOVSS  [ESP] $src\n\t"
            "FLD    [ESP]\n\t"
            "ADD    ESP,4\n\t"
            "FSTP   $dst" %}
  ins_encode( X2D_encoding(dst, src), Pop_Reg_D(dst));
  ins_pipe( pipe_slow );
%}

instruct convX2XD_reg(regXD dst, regX src) %{
  predicate(UseSSE==2);
  match(Set dst (ConvF2D src));
  format %{ "CVTSS2SD $dst,$src" %}
  opcode(0xF3, 0x0F, 0x5A); 
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
  predicate(UseSSE<=1);
  match(Set dst (ConvD2I src));
  effect( KILL tmp, KILL cr );
  format %{ "FLD    $src\t# Convert double to int \n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,4\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "CMP    EAX,0x80000000\n\t"
            "JNE,s  fast\n\t"   
            "FLD_D  $src\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  ins_encode( Push_Reg_D(src), D2I_encoding(src) );
  ins_pipe( pipe_slow );
%}

// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
instruct convXD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regXD src, eFlagsReg cr ) %{
  predicate(UseSSE==2);
  match(Set dst (ConvD2I src));
  effect( KILL tmp, KILL cr );
  format %{ "CVTTSD2SI $dst, $src\n\t" 
            "CMP    $dst,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP, 8\n\t"
            "MOVSD  [ESP], $src\n\t"
            "FLD_D  [ESP]\n\t"
            "ADD    ESP, 8\n\t"
            "CALL   d2i_wrapper\n\t"
      "fast:" %}
  opcode(0x1); // double-precision conversion
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
  ins_pipe( pipe_slow );
%}

instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
  match(Set dst (ConvD2L src));
  effect( KILL cr );
  format %{ "FLD    $src\t# Convert double to long\n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,8\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( Push_Reg_D(src),  D2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

// XMM lacks a float/double->long conversion, so use the old FPU stack.
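// (On IA-32, CVTTSS2SI/CVTTSD2SI can only produce a 32-bit result -- the
//  64-bit form needs REX.W and thus x86-64 -- so F2L/D2L must spill the XMM
//  value and use the x87 FIST with a 64-bit operand instead.)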
instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
  predicate (UseSSE==2);
  match(Set dst (ConvD2L src));
  effect( KILL cr );
  format %{ "SUB    ESP,8\t# Convert double to long\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLD_D  [ESP]\n\t"
            "FLDCW  trunc mode\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP,8\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLD_D  [ESP]\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( XD2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

instruct convF2D_reg(regD dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2D src));
  format %{ "FLD    $src\n\t"
            "FSTP   $dst" %}
  ins_encode(Push_Reg_F(src), Pop_Reg_D(dst));
  ins_pipe( fpu_reg_reg );
%}

// Convert a double to an int.  Java semantics require complex handling of
// the corner cases.  So we set the rounding mode to 'zero' (truncate),
// store the double down as an int, and reset the rounding mode to
// 'nearest'.  The hardware stores a flag value down if the conversion
// overflowed or the input was a NaN; we check for this and go the slow
// path if needed.
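// The 0x80000000 sentinel checked below is the x87 "integer indefinite"
// value that FIST stores when the conversion overflows or the input is a
// NaN.  Since 0x80000000 is also the legitimate result for inputs near
// Integer.MIN_VALUE, the slow path reloads the original value so that the
// d2i_wrapper stub can apply the exact Java rules (NaN -> 0, saturate to
// MIN_VALUE/MAX_VALUE on overflow).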
instruct convF2I_reg_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2I src));
  effect( KILL tmp, KILL cr );
  format %{ "FLD    $src\t# Convert float to int \n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,4\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "CMP    EAX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  // D2I_encoding works for F2I
  ins_encode( Push_Reg_F(src), D2I_encoding(src) );
  ins_pipe( pipe_slow );
%}

instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
  match(Set dst (ConvF2L src));
  effect( KILL cr );
  format %{ "FLD    $src\t# Convert float to long\n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,8\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  // D2L_encoding works for F2L
  ins_encode( Push_Reg_F(src), D2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

// XMM lacks a float/double->long conversion, so use the old FPU stack.
instruct convX2L_reg_reg( eADXRegL dst, regX src, eFlagsReg cr ) %{
  predicate (UseSSE>=1);
  match(Set dst (ConvF2L src));
  effect( KILL cr );
  format %{ "SUB    ESP,8\t# Convert float to long\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "FLDCW  trunc mode\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP,4\t# Convert float to long\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "ADD    ESP,4\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( X2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

instruct convI2D_reg(regD dst, stackSlotI src) %{
  predicate( UseSSE<=1 );
  match(Set dst (ConvI2D src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode(Push_Mem_I(src), Pop_Reg_D(dst));
  ins_pipe( fpu_reg_mem );
%}

instruct convI2XD_reg(regXD dst, eRegI src) %{
  predicate( UseSSE==2 );
  match(Set dst (ConvI2D src));
  format %{ "CVTSI2SD $dst,$src" %}
  opcode(0xF2, 0x0F, 0x2A);  
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct convI2XD_mem(regXD dst, memory mem) %{
  predicate( UseSSE==2 );
  match(Set dst (ConvI2D (LoadI mem)));
  format %{ "CVTSI2SD $dst,$mem" %}
  opcode(0xF2, 0x0F, 0x2A);  
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}

instruct convI2D_mem(regD dst, memory mem) %{
  predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2D (LoadI mem)));
  format %{ "FILD   $mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDB);      /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_D(dst));
  ins_pipe( fpu_reg_mem );
%}

// Convert a byte to a float; no rounding step needed.
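// A value masked to 0..255 fits exactly in the 24-bit float significand, so
// the result needs no explicit store-back rounding even in 24-bit mode.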
instruct conv24I2F_reg(regF dst, stackSlotI src) %{
  predicate( UseSSE==0 && n->in(1)->Opcode() == Op_AndI && n->in(1)->in(2)->is_Con() && n->in(1)->in(2)->get_int() == 255 );
  match(Set dst (ConvI2F src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}

  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode(Push_Mem_I(src), Pop_Reg_F(dst));
  ins_pipe( fpu_reg_mem );
%}

// In 24-bit mode, force exponent rounding by storing back out
instruct convI2F_SSF(stackSlotF dst, stackSlotI src) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F src));
  ins_cost(200);
  format %{ "FILD   $src\n\t"
            "FSTP_S $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode( Push_Mem_I(src),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_mem );
%}
 
// In 24-bit mode, force exponent rounding by storing back out
instruct convI2F_SSF_mem(stackSlotF dst, memory mem) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F (LoadI mem)));
  ins_cost(200);
  format %{ "FILD   $mem\n\t"
            "FSTP_S $dst" %}
  opcode(0xDB);  /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_mem );
%}
 
// This instruction does not round to 24-bits
instruct convI2F_reg(regF dst, stackSlotI src) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode( Push_Mem_I(src),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_mem );
%}

// This instruction does not round to 24-bits
instruct convI2F_mem(regF dst, memory mem) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F (LoadI mem)));
  format %{ "FILD   $mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDB);      /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_mem );
%}

// Convert an int to a float in xmm; no rounding step needed.
instruct convI2X_reg(regX dst, eRegI src) %{
  predicate(UseSSE>=1);
  match(Set dst (ConvI2F src));
  format %{ "CVTSI2SS $dst, $src" %}

  opcode(0xF3, 0x0F, 0x2A);  /* F3 0F 2A /r */
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// Convert a float in xmm to an int reg.
instruct convX2I_reg(eAXRegI dst, eDXRegI tmp, regX src, eFlagsReg cr ) %{
  predicate(UseSSE>=1);
  match(Set dst (ConvF2I src));
  effect( KILL tmp, KILL cr );
  format %{ "CVTTSS2SI $dst, $src\n\t" 
            "CMP    $dst,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP, 4\n\t"
            "MOVSS  [ESP], $src\n\t"
            "FLD    [ESP]\n\t"
            "ADD    ESP, 4\n\t"
            "CALL   d2i_wrapper\n\t"
      "fast:" %}        
  opcode(0x0); // single-precision conversion
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
  ins_pipe( pipe_slow );
%}

instruct convI2L_reg( eRegL dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (ConvI2L src));
  effect(KILL cr);
  format %{ "MOV    $dst.lo,$src\n\t"
            "MOV    $dst.hi,$src\n\t"
            "SAR    $dst.hi,31" %}
  ins_encode(convert_int_long(dst,src));
  ins_pipe( ialu_reg_reg_long );
%}

// Zero-extend convert int to long
instruct convI2L_reg_zex(eRegL dst, eRegI src, immL_32bits mask, eFlagsReg flags ) %{
  match(Set dst (AndL (ConvI2L src) mask) );
  effect( KILL flags );
  format %{ "MOV    $dst.lo,$src\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convI2L_reg_reg_zex(eRegL dst, eRegI src, eRegL mask, eFlagsReg flags) %{
  match(Set dst (AndL (ConvI2L src) mask) );
  predicate(_kids[1]->_leaf->Opcode() == Op_ConL && 
            _kids[1]->_leaf->is_Type()->type()->is_long()->get_con() == 0xFFFFFFFFL);
  effect( KILL flags );
  format %{ "MOV    $dst.lo,$src\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}  

// Zero-extend long
instruct zerox_long(eRegL dst, eRegL src, immL_32bits mask, eFlagsReg flags ) %{
  match(Set dst (AndL src mask) );
  effect( KILL flags );
  format %{ "MOV    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$dst.hi\n\t" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convL2D_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE<=1);
  match(Set dst (ConvL2D src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD   ST,[ESP + #0]\n\t"
            "ADD    ESP,8\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double(src), Pop_Mem_D(dst));
  ins_pipe( pipe_slow );
%}

instruct convL2XD_reg( regXD dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE==2);
  match(Set dst (ConvL2D src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD_D [ESP]\n\t"
            "FSTP_D [ESP]\n\t"
            "MOVSD  $dst,[ESP]\n\t"
            "ADD    ESP,8" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double2(src), Push_ResultXD(dst));
  ins_pipe( pipe_slow );
%}

instruct convL2X_reg( regX dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE>=1);
  match(Set dst (ConvL2F src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD_D [ESP]\n\t"
            "FSTP_S [ESP]\n\t"
            "MOVSS  $dst,[ESP]\n\t"
            "ADD    ESP,8" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double2(src), Push_ResultX(dst,0x8));
  ins_pipe( pipe_slow );
%}

instruct convL2F_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (ConvL2F src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD   ST,[ESP + #0]\n\t"
            "ADD    ESP,8\n\t"
            "FSTP_S $dst\t# F-round" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double(src), Pop_Mem_F(dst));
  ins_pipe( pipe_slow );
%}

instruct convL2I_reg( eRegI dst, eRegL src ) %{
  match(Set dst (ConvL2I src));
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src.lo" %}
  ins_encode(enc_CopyL_Lo(dst,src));
  ins_pipe( ialu_reg_reg );
%}


instruct MoveF2I_stack_reg(eRegI dst, stackSlotF src) %{
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );
  ins_cost(125);
  format %{ "MOV    $dst,$src\t# MoveF2I_stack_reg" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );     
%}

instruct MoveF2I_reg_stack(stackSlotI dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );

  ins_cost(125);
  format %{ "FLD    $src\n\t"
            "FSTP_S $dst\t# MoveF2I_reg_stack" %}
  ins_encode( Push_Reg_F(src),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_reg );
%}

instruct MoveF2I_reg_stack_sse(stackSlotI dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );

  ins_cost(95);
  format %{ "MOVSS  $dst,$src\t# MoveF2I_reg_stack_sse" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, dst));
  ins_pipe( pipe_slow );
%}

instruct MoveI2F_reg_stack(stackSlotF dst, eRegI src) %{
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(100);
  format %{ "MOV    $dst,$src\t# MoveI2F_reg_stack" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}


instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
  predicate(UseSSE==0);
  match(Set dst (MoveI2F src));
  effect(DEF dst, USE src);

  ins_cost(125);
  format %{ "FLD_S  $src\n\t"
            "FSTP   $dst\t# MoveI2F_stack_reg" %}
  opcode(0xD9);               /* D9 /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem );      
%}

instruct MoveI2F_stack_reg_sse(regX dst, stackSlotI src) %{
  predicate(UseSSE>=1);
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(145);
  format %{ "MOVSS  $dst,$src\t# MoveI2F_stack_reg_sse" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
  ins_pipe( pipe_slow );
%}

instruct MoveD2L_stack_reg(eRegL dst, stackSlotD src) %{
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);

  ins_cost(250);
  format %{ "MOV    $dst.lo,$src\n\t"
            "MOV    $dst.hi,$src+4\t# MoveD2L_stack_reg" %}
  opcode(0x8B, 0x8B);
  ins_encode( OpcP, RegMem(dst,src), OpcS, RegMem_Hi(dst,src));
  ins_pipe( ialu_mem_long_reg );     
%}

instruct MoveD2L_reg_stack(stackSlotL dst, regD src) %{
  predicate(UseSSE<2);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);

  ins_cost(125);
  format %{ "FLD    $src\n\t"
            "FSTP_D $dst\t# MoveD2L_reg_stack" %}
  ins_encode( Push_Reg_D(src),
              Pop_Mem_D(dst));
  ins_pipe( fpu_mem_reg );
%}

instruct MoveD2L_reg_stack_sse(stackSlotL dst, regXD src) %{
  predicate(UseSSE==2);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);
  ins_cost(145);

  format %{ "MOVSD  $dst,$src\t# MoveD2L_reg_stack_sse" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src,dst));
  ins_pipe( pipe_slow );
%}

instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
  predicate(UseSSE<2);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(125);

  format %{ "FLD_D  $src\n\t"
            "FSTP   $dst\t# MoveL2D_stack_reg" %}
  opcode(0xDD);               /* DD /0, FLD m64real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_mem );      
%}


instruct MoveL2D_stack_reg_sse(regXD dst, stackSlotL src) %{
  predicate(UseSSE==2);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(145);
  format %{ "MOVSD  $dst,$src\t# MoveL2D_stack_reg_sse" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
  ins_pipe( pipe_slow );
%}


instruct MoveL2D_reg_stack(stackSlotD dst, eRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi\t# MoveL2D_reg_stack" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  ins_pipe( ialu_mem_long_reg );
%}



// =======================================================================
// fast clearing of an array

instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, eRegI dummy, eFlagsReg cr) %{
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
  format %{ "SHL    ECX,1\t# Convert doublewords to words\n\t"
            "XOR    EAX,EAX\n\t"
            "REP STOS\t# store EAX into [EDI++] while ECX--" %}
  opcode(0,0x4);
  ins_encode( Opcode(0xD1), RegOpc(ECX),
              OpcRegReg(0x33,EAX,EAX),
              Opcode(0xF3), Opcode(0xAB) );
  ins_pipe( pipe_slow );
%}
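
// A hedged C-level sketch of the clear loop the encoding above emits (illustrative
// only; 'base' and 'cnt' mirror the $base/$cnt operands, and cnt is doubled by the
// SHL into 32-bit store units):
//
//   static void clear_words(int* base, unsigned cnt) {
//     cnt <<= 1;                      // SHL    ECX,1
//     int zero = 0;                   // XOR    EAX,EAX
//     while (cnt--) *base++ = zero;   // REP STOS: store EAX into [EDI++] while ECX--
//   }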

instruct string_compare(eDIRegP str1, eSIRegP str2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result, eFlagsReg cr) %{
  match(Set result (StrComp str1 str2));
  effect(USE_KILL str1, USE_KILL str2, KILL tmp1, KILL tmp2, KILL cr);
  //ins_cost(300);

  format %{ "String Compare $str1,$str2 -> $result    // KILL EAX, EBX" %}
  ins_encode( enc_String_Compare() );
  ins_pipe( pipe_slow );
%}

//----------Control Flow Instructions------------------------------------------
// Signed compare Instructions
instruct compI_eReg(eFlagsReg cr, eRegI op1, eRegI op2) %{
  match(Set cr (CmpI op1 op2));
  effect( DEF cr, USE op1, USE op2 );
  format %{ "CMP    $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

instruct compI_eReg_imm(eFlagsReg cr, eRegI op1, immI op2) %{
  match(Set cr (CmpI op1 op2));
  effect( DEF cr, USE op1 );
  format %{ "CMP    $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  // ins_encode( RegImm( op1, op2) );  /* Was CmpImm */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Cisc-spilled version of cmpI_eReg
instruct compI_eReg_mem(eFlagsReg cr, eRegI op1, memory op2) %{
  match(Set cr (CmpI op1 (LoadI op2)));
 
  format %{ "CMP    $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

instruct testI_reg( eFlagsReg cr, eRegI src, immI0 zero ) %{
  match(Set cr (CmpI src zero));
  effect( DEF cr, USE src );

  format %{ "TEST   $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

instruct testI_reg_imm( eFlagsReg cr, eRegI src, immI con, immI0 zero ) %{
  match(Set cr (CmpI (AndI src con) zero));

  format %{ "TEST   $src,$con" %}
  opcode(0xF7,0x00);
  ins_encode( OpcP, RegOpc(src), Con32(con) );
  ins_pipe( ialu_cr_reg_imm );
%}

instruct testI_reg_mem( eFlagsReg cr, eRegI src, memory mem, immI0 zero ) %{
  match(Set cr (CmpI (AndI src mem) zero));

  format %{ "TEST   $src,$mem" %}
  opcode(0x85);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_cr_reg_mem );
%}

// Unsigned compare Instructions; really, same as signed except they
// produce an eFlagsRegU instead of eFlagsReg.
instruct compU_eReg(eFlagsRegU cr, eRegI op1, eRegI op2) %{
  match(Set cr (CmpU op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

instruct compU_eReg_imm(eFlagsRegU cr, eRegI op1, immI op2) %{
  match(Set cr (CmpU op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// // Cisc-spilled version of cmpU_eReg
instruct compU_eReg_mem(eFlagsRegU cr, eRegI op1, memory op2) %{
  match(Set cr (CmpU op1 (LoadI op2)));
 
  format %{ "CMPu   $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// // Cisc-spilled version of cmpU_eReg
//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, eRegI op2) %{
//  match(Set cr (CmpU (LoadI op1) op2));
// 
//  format %{ "CMPu   $op1,$op2" %}
//  ins_cost(500);
//  opcode(0x39);  /* Opcode 39 /r */
//  ins_encode( OpcP, RegMem( op1, op2) );
//%}

instruct testU_reg( eFlagsRegU cr, eRegI src, immI0 zero ) %{
  match(Set cr (CmpU src zero));

  format %{ "TESTu  $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Unsigned pointer compare Instructions
instruct compP_eReg(eFlagsRegU cr, eRegP op1, eRegP op2) %{
  match(Set cr (CmpP op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

instruct compP_eReg_imm(eFlagsRegU cr, eRegP op1, immP op2) %{
  match(Set cr (CmpP op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// // Cisc-spilled version of cmpP_eReg
instruct compP_eReg_mem(eFlagsRegU cr, eRegP op1, memory op2) %{
  match(Set cr (CmpP op1 (LoadP op2)));
 
  format %{ "CMPu   $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// // Cisc-spilled version of cmpP_eReg
//instruct compP_mem_eReg(eFlagsRegU cr, memory op1, eRegP op2) %{
//  match(Set cr (CmpP (LoadP op1) op2));
// 
//  format %{ "CMPu   $op1,$op2" %}
//  ins_cost(500);
//  opcode(0x39);  /* Opcode 39 /r */
//  ins_encode( OpcP, RegMem( op1, op2) );
//%}

// Compare raw pointer (used in out-of-heap check).
// Only works because non-oop pointers must be raw pointers
// and raw pointers have no anti-dependencies.
instruct compP_mem_eReg( eFlagsRegU cr, eRegP op1, memory op2 ) %{
  predicate( !n->in(2)->in(2)->bottom_type()->isa_oop_ptr() );
  match(Set cr (CmpP op1 (LoadP op2)));
 
  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

//
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
instruct testP_reg( eFlagsReg cr, eRegP src, immP0 zero ) %{
  match(Set cr (CmpP src zero));

  format %{ "TEST   $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Cisc-spilled version of testP_reg
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
instruct testP_Reg_mem( eFlagsReg cr, memory op, immI0 zero ) %{
  match(Set cr (CmpP (LoadP op) zero));
 
  format %{ "TEST   $op,0xFFFFFFFF" %}
  ins_cost(500);
  opcode(0xF7);               /* Opcode F7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,op), Con_d32(0xFFFFFFFF) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Yanked all unsigned pointer compare operations.
// Pointer compares are done with CmpP which is already unsigned.

//----------Max and Min--------------------------------------------------------
// Min Instructions
////
//   *** Min and Max using the conditional move are slower than the
//   *** branch version on a Pentium III.
// // Conditional move for min
//instruct cmovI_reg_lt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
//  effect( USE_DEF op2, USE op1, USE cr );
//  format %{ "CMOVlt $op2,$op1\t! min" %}
//  opcode(0x4C,0x0F);
//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
//  ins_pipe( pipe_cmov_reg );
//%}
//
//// Min Register with Register (P6 version)
//instruct minI_eReg_p6( eRegI op1, eRegI op2 ) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set op2 (MinI op1 op2));
//  ins_cost(200);
//  expand %{
//    eFlagsReg cr;
//    compI_eReg(cr,op1,op2);
//    cmovI_reg_lt(op2,op1,cr);
//  %}
//%}

// Min Register with Register (generic version)
instruct minI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
  match(Set dst (MinI dst src));
  effect(KILL flags);
  ins_cost(300);

  format %{ "MIN    $dst,$src" %}
  opcode(0xCC);
  ins_encode( min_enc(dst,src) );
  ins_pipe( pipe_slow );
%}

// Max Register with Register
//   *** Min and Max using the conditional move are slower than the
//   *** branch version on a Pentium III.
// // Conditional move for max
//instruct cmovI_reg_gt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
//  effect( USE_DEF op2, USE op1, USE cr );
//  format %{ "CMOVgt $op2,$op1\t! max" %}
//  opcode(0x4F,0x0F);
//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
//  ins_pipe( pipe_cmov_reg );
//%}
//
// // Max Register with Register (P6 version)
//instruct maxI_eReg_p6( eRegI op1, eRegI op2 ) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set op2 (MaxI op1 op2));
//  ins_cost(200);
//  expand %{
//    eFlagsReg cr;
//    compI_eReg(cr,op1,op2);
//    cmovI_reg_gt(op2,op1,cr);
//  %}
//%}

// Max Register with Register (generic version)
instruct maxI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
  match(Set dst (MaxI dst src));
  effect(KILL flags);
  ins_cost(300);

  format %{ "MAX    $dst,$src" %}
  opcode(0xCC);
  ins_encode( max_enc(dst,src) );
  ins_pipe( pipe_slow );
%}

// ============================================================================
// Branch Instructions
// Jump Direct - Label defines a relative address from JMP+1
instruct jmpDir(label labl) %{
  match(Goto);
  effect(USE labl);

  ins_cost(300);
  format %{ "JMP    $labl" %}
  size(5);
  opcode(0xE9);
  ins_encode( OpcP, Lbl( labl ) );
  ins_pipe( pipe_jmp );
  ins_pc_relative(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpCon(cmpOp cop, eFlagsReg cr, label labl) %{
  match(If cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop    $labl" %}
  size(6);
  opcode(0x0F, 0x80);
  ins_encode( Jcc( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEnd(cmpOp cop, eFlagsReg cr, label labl) %{
  match(CountedLoopEnd cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop    $labl\t# Loop end" %}
  size(6);
  opcode(0x0F, 0x80);
  ins_encode( Jcc( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEndU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,u  $labl\t# Loop end" %}
  size(6);
  opcode(0x0F, 0x80);
  ins_encode( Jcc( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
%}

// Jump Direct Conditional - using unsigned comparison
instruct jmpConU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,u  $labl" %}
  size(6);
  opcode(0x0F, 0x80);
  ins_encode( Jcc( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
%}

// ============================================================================
// The 2nd slow-half of a subtype check.  Scan the subklass's 2ndary superklass
// array for an instance of the superklass.  Set a hidden internal cache on a
// hit (cache is checked with exposed code in gen_subtype_check()).  Return
// NZ for a miss or zero for a hit.  The encoding ALSO sets flags.
instruct partialSubtypeCheck( eDIRegI result, eSIRegP sub, eAXRegP super, eCXRegI ecx, eFlagsReg cr ) %{
  match(Set result (PartialSubtypeCheck sub super));
  effect( KILL ecx, KILL cr );

  ins_cost(1000);
  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
            "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
            "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
            "JNE,s  miss\t\t# Missed: EDI not-zero\n\t"
            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache\n\t"
            "XOR    $result,$result\t\t Hit: EDI zero\n\t"
     "miss:\t" %}

  opcode(0x1); // Force a XOR of EDI
  ins_encode( enc_PartialSubtypeCheck() );
  ins_pipe( pipe_slow );
%}

instruct partialSubtypeCheck_vs_Zero( eFlagsReg cr, eSIRegP sub, eAXRegP super, eCXRegI ecx, eDIRegI result ) %{
  match(Set cr (PartialSubtypeCheck sub super));
  effect( KILL ecx, KILL result );

  ins_cost(1000);
  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
            "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
            "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
            "JNE,s  miss\t\t# Missed: flags NZ\n\t"
            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache, flags Z\n\t"
     "miss:\t" %}

  opcode(0x0);  // No need to XOR EDI
  ins_encode( enc_PartialSubtypeCheck() );
  ins_pipe( pipe_slow );
%}
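
// A hedged C-style sketch of the secondary-supers scan both encodings above perform
// (the accessor names are illustrative, taken from the format comments, not real code):
//
//   static int partial_subtype_check(Klass* sub, Klass* super) {
//     Klass** scan = sub_secondary_supers_data(sub);     // EDI after the ADD
//     int     len  = sub_secondary_supers_length(sub);   // ECX, length to scan
//     for (int i = 0; i < len; i++) {                    // REPNE SCASD
//       if (scan[i] == super) {
//         sub_set_secondary_super_cache(sub, super);     // hit: update the cache
//         return 0;                                      // hit: result/flags zero
//       }
//     }
//     return 1;                                          // miss: non-zero / NZ
//   }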

// ============================================================================
// Branch Instructions -- short offset versions
// 
// These instructions are used to replace jumps of a long offset (the default
// match) with jumps of a shorter offset.  These instructions are all tagged
// with the ins_short_branch attribute, which causes the ADLC to suppress the
// match rules in general matching.  Instead, the ADLC generates a conversion
// method in the MachNode which can be used to do in-place replacement of the
// long variant with the shorter variant.  The compiler will determine if a
// branch can be taken by the is_short_branch_offset() predicate in the machine
// specific code section of the file.
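//
// A hedged sketch of the reach test such a replacement depends on (illustrative only;
// the real check is the is_short_branch_offset() code mentioned above):
//
//   static int fits_in_rel8(int branch_pc, int target_pc) {
//     int off = target_pc - (branch_pc + 2);   // rel8 is relative to the end of the 2-byte branch
//     return -128 <= off && off <= 127;        // signed 8-bit displacement
//   }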

// Jump Direct - Label defines a relative address from JMP+1
instruct jmpDir_short(label labl) %{
  match(Goto);
  effect(USE labl);

  ins_cost(300);
  format %{ "JMP,s  $labl" %}
  size(2);
  opcode(0xEB);
  ins_encode( OpcP, LblShort( labl ) );
  ins_pipe( pipe_jmp );
  ins_pc_relative(1);
  ins_short_branch(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpCon_short(cmpOp cop, eFlagsReg cr, label labl) %{
  match(If cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,s  $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEnd_short(cmpOp cop, eFlagsReg cr, label labl) %{
  match(CountedLoopEnd cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,s  $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEndU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}

// Jump Direct Conditional - using unsigned comparison
instruct jmpConU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}

// ============================================================================
// Long Compare
//
// Currently we hold longs in 2 registers.  Comparing such values efficiently
// is tricky.  The flavor of compare used depends on whether we are testing
// for LT, LE, or EQ.  For a simple LT test we can check just the sign bit.
// The GE test is the negated LT test.  The LE test can be had by commuting
// the operands (yielding a GE test) and then negating; negate again for the
// GT test.  The EQ test is done by ORcc'ing the high and low halves, and the
// NE test is negated from that.

// Due to a shortcoming in the ADLC, it mixes up expressions like:
// (foo (CmpI (CmpL X Y) 0)) and (bar (CmpI (CmpL X 0L) 0)).  Note the 
// difference between 'Y' and '0L'.  The tree-matches for the CmpI sections 
// are collapsed internally in the ADLC's dfa-gen code.  The match for 
// (CmpI (CmpL X Y) 0) is silently replaced with (CmpI (CmpL X 0L) 0) and the 
// foo match ends up with the wrong leaf.  One fix is to not match both 
// reg-reg and reg-zero forms of long-compare.  This is unfortunate because 
// both forms beat the trinary form of long-compare and both are very useful 
// on Intel which has so few registers.
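//
// A hedged C sketch of the two-register scheme described above (illustrative only):
//
//   typedef struct { unsigned lo; int hi; } long2;   // a long held in two 32-bit registers
//   static int long_lt(long2 a, long2 b) {           // LT: CMP the low halves, SBB the high
//     return (a.hi < b.hi) || (a.hi == b.hi && a.lo < b.lo);
//   }
//   static int long_is_zero(long2 a) {               // EQ vs zero: OR the halves, test Z
//     return ((unsigned)a.hi | a.lo) == 0;
//   }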

// Manifest a CmpL result in an integer register.  Very painful.
// This is the test to avoid.
instruct cmpL3_reg_reg(eSIRegI dst, eRegL src1, eRegL src2, eFlagsReg flags ) %{
  match(Set dst (CmpL3 src1 src2));
  effect( KILL flags );
  ins_cost(1000);
  format %{ "XOR    $dst,$dst\n\t"
            "CMP    $src1.hi,$src2.hi\n\t"
            "JLT,s  m_one\n\t"
            "JGT,s  p_one\n\t"
            "CMP    $src1.lo,$src2.lo\n\t"
            "JB,s   m_one\n\t"
            "JEQ,s  done\n"
    "p_one:\tINC    $dst\n\t"
            "JMP,s  done\n"
    "m_one:\tDEC    $dst\n"
     "done:" %}
  opcode(0x3B, 0x1B);
  ins_encode( cmpl3_flag(src1,src2,dst) );
  ins_pipe( pipe_slow );
%}

//======
// Manifest a CmpL result in the normal flags.  Only good for LT or GE
// compares.  Can be used for LE or GT compares by reversing arguments.
// NOT GOOD FOR EQ/NE tests.  
instruct cmpL_zero_flags_LTGE( flagsReg_long_LTGE flags, eRegL src, immL0 zero ) %{
  match( Set flags (CmpL src zero ));
  ins_cost(100);
  format %{ "TEST   $src.hi,$src.hi" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg_Hi2( src, src ) );
  ins_pipe( ialu_cr_reg_reg );
%}
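
// A hedged one-liner for the zero case above (illustrative only): for LT/GE against zero
// only the sign of the high word matters, which is why a single TEST of $src.hi suffices:
//
//   static int long_lt_zero(int hi) { return hi < 0; }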

// Manifest a CmpL result in the normal flags.  Only good for LT or GE
// compares.  Can be used for LE or GT compares by reversing arguments.
// NOT GOOD FOR EQ/NE tests.
instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, eSIRegI tmp ) %{
  match( Set flags (CmpL src1 src2 ));
  effect( KILL tmp );
  ins_cost(300);
  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
            "MOV    ESI,$src1.hi\n\t"
            "SBB    ESI,$src2.hi\t! Compute flags for long compare" %}
  ins_encode( long_cmp_flags2( src1, src2 ) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Long compares reg < zero/reg OR reg >= zero/reg.
// Just a wrapper for a normal branch, plus the predicate test.
instruct cmpL_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge );
  expand %{
    jmpCon(cmp,flags,labl);    // JLT or JGE...
  %}
%}

// Compare 2 longs and CMOVE longs.
instruct cmovLL_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovLL_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, eRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovPP_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regD dst, regD src) %{
  predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}

// Compare 2 longs and CMOVE doubles
instruct cmovXDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regXD dst, regXD src) %{
  predicate( UseSSE==2 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFF_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regF dst, regF src) %{
  predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovXX_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regX dst, regX src) %{
  predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovX_regS(cmp,flags,dst,src);
  %}
%}
 
//======
// Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.  
instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, eSIRegI tmp ) %{
  match( Set flags (CmpL src zero ));
  effect(KILL tmp);
  ins_cost(200);
  format %{ "MOV    ESI,$src.lo\n\t"
            "OR     ESI,$src.hi\t! Long is EQ/NE 0?" %}
  ins_encode( long_cmp_flags0( src ) );
  ins_pipe( ialu_reg_reg_long );
%}

// Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.  
instruct cmpL_reg_flags_EQNE( flagsReg_long_EQNE flags, eRegL src1, eRegL src2 ) %{
  match( Set flags (CmpL src1 src2 ));
  ins_cost(200+300);
  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
            "JNE,s  skip\n\t"
            "CMP    $src1.hi,$src2.hi\n\t"
     "skip:\t" %}
  ins_encode( long_cmp_flags1( src1, src2 ) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Long compare reg == zero/reg OR reg != zero/reg
// Just a wrapper for a normal branch, plus the predicate test.
instruct cmpL_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne );
  expand %{
    jmpCon(cmp,flags,labl);    // JEQ or JNE...
  %}
%}

// Compare 2 longs and CMOVE longs.
instruct cmovLL_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovLL_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, eRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovPP_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regD dst, regD src) %{
  predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}

// Compare 2 longs and CMOVE doubles
instruct cmovXDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regXD dst, regXD src) %{
  predicate( UseSSE==2 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFF_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regF dst, regF src) %{
  predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovXX_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regX dst, regX src) %{
  predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovX_regS(cmp,flags,dst,src);
  %}
%}
 
//======
// Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
// Same as cmpL_reg_flags_LEGT except must negate src
instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, eSIRegI tmp ) %{
  match( Set flags (CmpL src zero ));
  effect( KILL tmp );
  ins_cost(300);
  format %{ "XOR    ESI,ESI\t# Long compare for -$src < 0, use commuted test\n\t"
            "CMP    ESI,$src.lo\n\t"
            "SBB    ESI,$src.hi\n\t" %}
  ins_encode( long_cmp_flags3(src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
// Same as cmpL_reg_flags_LTGE except operands swapped.  Swapping operands
// requires a commuted test to get the same result.  
instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, eSIRegI tmp ) %{
  match( Set flags (CmpL src1 src2 ));
  effect( KILL tmp );
  ins_cost(300);
  format %{ "CMP    $src2.lo,$src1.lo\t! Long compare, swapped operands, use with commuted test\n\t"
            "MOV    ESI,$src2.hi\n\t"
            "SBB    ESI,$src1.hi\t! Compute flags for long compare" %}
  ins_encode( long_cmp_flags2( src2, src1 ) );
  ins_pipe( ialu_cr_reg_reg );
%}
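
// A hedged note on the commuted test (illustrative only): with the operands swapped the
// flags describe (src2 - src1), so
//     src1 >  src2   holds exactly when   src2 <  src1   (take the LT condition), and
//     src1 <= src2   holds exactly when   src2 >= src1   (take the GE condition),
// which is the mapping the cmpOp_commute operand used below is intended to apply.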

// Long compares reg < zero/reg OR reg >= zero/reg.
// Just a wrapper for a normal branch, plus the predicate test
instruct cmpL_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt || _kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le );
  ins_cost(300);
  expand %{
    jmpCon(cmp,flags,labl);    // JGT or JLE...
  %}
%}

// Compare 2 longs and CMOVE longs.
instruct cmovLL_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovLL_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi+4" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) ); 
  ins_pipe( pipe_cmov_reg_long );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, eRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Compare 2 longs and CMOVE ptrs.
instruct cmovPP_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regD dst, regD src) %{
  predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}

// Compare 2 longs and CMOVE doubles
instruct cmovXDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regXD dst, regXD src) %{
  predicate( UseSSE==2 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFF_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regF dst, regF src) %{
  predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}


instruct cmovXX_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regX dst, regX src) %{
  predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->is_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovX_regS(cmp,flags,dst,src);
  %}
%}

// ============================================================================
// inlined locking and unlocking

instruct cmpFastLock( eFlagsReg cr, naxRegP object, naxRegP box, eAXRegI tmp) %{
  match( Set cr (FastLock object box) );
  effect( KILL tmp );
  ins_cost(300);
  format %{ "FASTLOCK $object, $box, kill EAX" %}
  ins_encode( Fast_Lock(object,box,tmp) );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
%}

instruct cmpFastUnlock( eFlagsReg cr, nabxRegP object, eAXRegP box, eBXRegP tmp ) %{
  match( Set cr (FastUnlock object box) );
  effect( KILL box, KILL tmp );
  ins_cost(300);
  format %{ "FASTUNLOCK $object, kills $box, EBX" %}
  ins_encode( Fast_Unlock(object,box,tmp) );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
%}

// ============================================================================
// Safepoint Instructions
instruct safePoint( ) %{
  match(SafePoint);
  predicate(!SafepointPolling);
  format %{ "Safepoint_ " %}
  opcode(0x90); /* NOP = 0x90 */
  ins_encode( OpcP, OpcP, safepoint_reloc );
  ins_pipe( empty );
%}

instruct safePoint_poll(eFlagsReg cr) %{
  match(SafePoint);
  predicate(SafepointPolling);
  effect(KILL cr);

  format %{ "TSTL   #polladdr,EAX\t! Safepoint: poll for GC" %}
  size(6);
  ins_cost(125);
  ins_encode( Safepoint_Poll() );
  ins_pipe( ialu_reg_mem );
%}
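
// A hedged sketch of what the poll above amounts to (illustrative only; the address name
// is hypothetical): the emitted TEST is just a read of a dedicated polling page, and the
// VM write-protects that page when a safepoint is requested so every compiled thread
// traps at its next poll.
//
//   volatile int* poll = (volatile int*)polling_page_address;   // hypothetical symbol
//   (void)*poll;   // faults into the safepoint handler while a safepoint is pending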

// ============================================================================
// Procedure Call/Return Instructions
// Call Java Static Instruction
// Note: If this code changes, the corresponding ret_addr_offset() and
//       compute_padding() functions will have to be adjusted.
instruct CallStaticJavaDirect(method meth) %{
  match(CallStaticJava);
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL,static " %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_FPU,
              Java_Static_Call( meth ),
              call_epilog,
              post_call_FPU );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
  ins_alignment(4);
%}

// Call Java Dynamic Instruction
// Note: If this code changes, the corresponding ret_addr_offset() and
//       compute_padding() functions will have to be adjusted.
instruct CallDynamicJavaDirect(method meth) %{
  match(CallDynamicJava);
  effect(USE meth);

  ins_cost(300);
  format %{ "MOV    EAX,(oop)-1\n\t"
            "CALL,dynamic" %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_FPU,
              Java_Dynamic_Call( meth ),
              call_epilog,
              post_call_FPU );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
  ins_alignment(4);
%}

// Call Compiled Java Instruction
// Required: Used in converter frame from interpreter to compiler
instruct CallCompiledJavaDirect( method meth, eBPRegP interp_fp ) %{
  match(CallCompiledJava);
  effect(USE meth, KILL interp_fp);

  ins_cost(300);
  format %{ "CALL    *[EAX+compiled_code_entry_point_offset] // compiled code" %}
  opcode(0xFF, 0x02); /* FF /2 */
  ins_encode( Java_Compiled_Call( meth ),
              FFree_Float_Stack_After_Return );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
%}

// Call Java Interpreter Instruction
// Required: Used in converter frame from compiled code to interpreter
// Note: If this code changes, the corresponding ret_addr_offset() and
//       compute_padding() functions will have to be adjusted.
instruct CallInterpreterDirect( method meth ) %{
  match(CallInterpreter);
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL,interpreter " %}
  opcode(0xE8); /* E8 cd */
  // Use FFREEs to clear entries in float stack
  ins_encode( FFree_Float_Stack_All,
              Xor_Reg(EBP),
              Java_To_Runtime( meth ) );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
  ins_alignment(4);
%}

// Call Runtime Instruction
instruct CallRuntimeDirect(method meth) %{
  match(CallRuntime );
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL,runtime " %}
  opcode(0xE8); /* E8 cd */
  // Use FFREEs to clear entries in float stack
  ins_encode( pre_call_FPU,
              FFree_Float_Stack_All,
              Java_To_Runtime( meth ),
              post_call_FPU );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
%}

// Call runtime without safepoint
instruct CallLeafDirect(method meth) %{
  match(CallLeaf);
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL_LEAF,runtime " %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_FPU,
              FFree_Float_Stack_All,
              Java_To_Runtime( meth ),
              Verify_FPU_For_Leaf, post_call_FPU );
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
%}

instruct CallLeafNoFPDirect(method meth) %{
  match(CallLeafNoFP);
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL_LEAF_NOFP,runtime " %}
  opcode(0xE8); /* E8 cd */
  ins_encode(Java_To_Runtime(meth));
  ins_pipe( pipe_slow );
  ins_pc_relative(1);
%}


// Return Instruction
// Remove the return address & jump to it.
// Notice: We always emit a nop after a ret to make sure there is room 
// for safepoint patching
instruct Ret() %{
  match(Return);
  format %{ "RET" %}
  opcode(0xC3);  
  ins_encode(RetWithNops());
  ins_pipe( pipe_jmp );
%}

// Tail Call; Jump from runtime stub to Java code.
// Also known as an 'interprocedural jump'.
// Target of jump will eventually return to caller.
// TailJump below removes the return address.
instruct TailCalljmpInd(eRegP jump_target, eAXRegP method_oop) %{
  match(TailCall jump_target method_oop );
  ins_cost(300);
  format %{ "JMP    $jump_target \t# EAX holds method oop" %}
  opcode(0xFF, 0x4);  /* Opcode FF /4 */
  ins_encode( OpcP, RegOpc(jump_target) );
  ins_pipe( pipe_jmp );
%}


// Tail Jump; remove the return address; jump to target.
// TailCall above leaves the return address around.
instruct tailjmpInd(eRegP jump_target, eAXRegP ex_oop) %{
  match( TailJump jump_target ex_oop );
  ins_cost(300);
  format %{ "POP    EDX\t# pop return address into dummy\n\t"
            "JMP    $jump_target " %}
  opcode(0xFF, 0x4);  /* Opcode FF /4 */
  ins_encode( enc_pop_edx,
              OpcP, RegOpc(jump_target) );
  ins_pipe( pipe_jmp );
%}

// Create exception oop: created by stack-crawling runtime code.
// Created exception is now available to this handler, and is setup
// just prior to jumping to this handler.  No code emitted.
instruct CreateException( eAXRegP ex_oop )
%{
  match(Set ex_oop (CreateEx));

  size(0);
  // use the following format syntax
  format %{ "# exception oop is in EAX; no code emitted" %}
  ins_encode();
  ins_pipe( empty );
%}


// Rethrow exception: 
// The exception oop will come in the first argument position.
// Then JUMP (not call) to the rethrow stub code.
instruct RethrowException()
%{
  match(Rethrow);

  // use the following format syntax
  format %{ "JMP    rethrow_stub" %}
  ins_encode(enc_rethrow);
  ins_pipe( pipe_jmp );
%}


//----------PEEPHOLE RULES-----------------------------------------------------
// These must follow all instruction definitions as they use the names
// defined in the instructions definitions.
// 
// peepmatch ( root_instr_name [preceding_instruction]* );
//
// peepconstraint %{
// (instruction_number.operand_name relational_op instruction_number.operand_name
//  [, ...] );
// // instruction numbers are zero-based using left to right order in peepmatch
//
// peepreplace ( instr_name  ( [instruction_number.operand_name]* ) );
// // provide an instruction_number.operand_name for each operand that appears
// // in the replacement instruction's match rule
//
// ---------VM FLAGS---------------------------------------------------------
// 
// All peephole optimizations can be turned off using -XX:-OptoPeephole
// 
// Each peephole rule is given an identifying number starting with zero and
// increasing by one in the order seen by the parser.  An individual peephole
// can be enabled, and all others disabled, by using -XX:OptoPeepholeAt=#
// on the command-line.
// 
// ---------CURRENT LIMITATIONS----------------------------------------------
// 
// Only match adjacent instructions in same basic block
// Only equality constraints
// Only constraints between operands, not (0.dest_reg == EAX_enc)
// Only one replacement instruction
//
// ---------EXAMPLE----------------------------------------------------------
//
// // pertinent parts of existing instructions in architecture description
// instruct movI(eRegI dst, eRegI src) %{
//   match(Set dst (CopyI src));
// %}
// 
// instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
//   match(Set dst (AddI dst src));
//   effect(KILL cr);
// %}
// 
// // Change (inc mov) to lea
// peephole %{
//   // increment preceded by register-register move
//   peepmatch ( incI_eReg movI );
//   // require that the destination register of the increment 
//   // match the destination register of the move
//   peepconstraint ( 0.dst == 1.dst );
//   // construct a replacement instruction that sets
//   // the destination to ( move's source register + one )
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
// 
// Implementation no longer uses movX instructions since 
// machine-independent system no longer uses CopyX nodes.
// 
// peephole %{
//   peepmatch ( incI_eReg movI );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
// 
// peephole %{
//   peepmatch ( decI_eReg movI );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
// 
// peephole %{
//   peepmatch ( addI_eReg_imm movI );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
// 
// peephole %{
//   peepmatch ( addP_eReg_imm movP );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaP_eReg_immI( 0.dst 1.src 0.src ) );
// %}

// // Change load of spilled value to only a spill
// instruct storeI(memory mem, eRegI src) %{
//   match(Set mem (StoreI mem src));
// %}
// 
// instruct loadI(eRegI dst, memory mem) %{
//   match(Set dst (LoadI mem));
// %}
// 
//peephole %{
//  peepmatch ( loadI storeI );
//  peepconstraint ( 1.src == 0.dst, 1.mem == 0.mem );
//  peepreplace ( storeI( 1.mem 1.mem 1.src ) );
//%}

//----------SMARTSPILL RULES---------------------------------------------------
// These must follow all instruction definitions as they use the names
// defined in the instructions definitions.