Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for X86Base.Pause() and ArmBase.Yield() #61065

Merged
merged 4 commits into from
Nov 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3319,6 +3319,7 @@ class Compiler
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type,
GenTree* op1,
Expand Down
32 changes: 17 additions & 15 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1240,21 +1240,22 @@ class emitter

#define PERFSCORE_THROUGHPUT_1C 1.0f // Single Issue

#define PERFSCORE_THROUGHPUT_2C 2.0f // slower - 2 cycles
#define PERFSCORE_THROUGHPUT_3C 3.0f // slower - 3 cycles
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles
#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
#define PERFSCORE_THROUGHPUT_2C 2.0f // slower - 2 cycles
#define PERFSCORE_THROUGHPUT_3C 3.0f // slower - 3 cycles
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles
#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
#define PERFSCORE_THROUGHPUT_140C 140.0f // slower - 140 cycles

#define PERFSCORE_LATENCY_ILLEGAL -1024.0f

Expand All @@ -1281,6 +1282,7 @@ class emitter
#define PERFSCORE_LATENCY_26C 26.0f
#define PERFSCORE_LATENCY_62C 62.0f
#define PERFSCORE_LATENCY_69C 69.0f
#define PERFSCORE_LATENCY_140C 140.0f
#define PERFSCORE_LATENCY_400C 400.0f // Intel microcode issue with these instuctions

#define PERFSCORE_LATENCY_BRANCH_DIRECT 1.0f // cost of an unconditional branch
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14588,6 +14588,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insThroughput = PERFSCORE_THROUGHPUT_ZERO;
result.insLatency = PERFSCORE_LATENCY_ZERO;
}
else if (ins == INS_yield)
{
// @ToDo - find out the actual latency, match x86/x64 for now
result.insThroughput = PERFSCORE_THROUGHPUT_140C;
result.insLatency = PERFSCORE_LATENCY_140C;
}
else
{
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
Expand Down
14 changes: 11 additions & 3 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2898,7 +2898,8 @@ void emitter::emitIns(instruction ins)
ins == INS_r_movsp || ins == INS_r_stosb || ins == INS_r_stosd || ins == INS_r_stosp || ins == INS_ret ||
ins == INS_sahf || ins == INS_stosb || ins == INS_stosd || ins == INS_stosp
// These instructions take zero operands
|| ins == INS_vzeroupper || ins == INS_lfence || ins == INS_mfence || ins == INS_sfence);
|| ins == INS_vzeroupper || ins == INS_lfence || ins == INS_mfence || ins == INS_sfence ||
ins == INS_pause);

assert(assertCond);
}
Expand Down Expand Up @@ -12333,8 +12334,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
// Due to elided register moves, we can't have the following assert.
// For example, consider:
// t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx
// /--* t85 byref
// * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
// /--* t85 byref
// * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
// Here, V01 is type `long` on entry, then is stored as a byref. But because
// the register allocator assigned the same register, no instruction was
// generated, and we only (currently) make gcref/byref changes in emitter GC info
Expand Down Expand Up @@ -16104,6 +16105,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
break;

case INS_pause:
{
result.insLatency = PERFSCORE_LATENCY_140C;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's 14x lower on pre-Skylakes. It reminds me this issue #8744

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but all of our latencies are currently dependent on and configured for Skylake.

We'd need some more advanced setup if we wanted to track per architecture latencies/throughput.

result.insThroughput = PERFSCORE_THROUGHPUT_140C;
break;
}

default:
// unhandled instruction insFmt combination
perfScoreUnhandledInstruction(id, &result);
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21948,6 +21948,12 @@ GenTree* Compiler::gtNewSimdZeroNode(var_types type,
return gtNewSimdHWIntrinsicNode(type, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
}

GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID)
{
return new (this, GT_HWINTRINSIC)
GenTreeHWIntrinsic(type, hwIntrinsicID, CORINFO_TYPE_UNDEF, 0, /* isSimdAsHWIntrinsic */ false);
}

GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID)
{
SetOpLclRelatedToSIMDIntrinsic(op1);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ struct HWIntrinsic final

if (baseType == TYP_UNKNOWN)
{
assert(category == HW_Category_Scalar);
assert((category == HW_Category_Scalar) || (category == HW_Category_Special));

if (HWIntrinsicInfo::BaseTypeFromFirstArg(id))
{
Expand Down
24 changes: 20 additions & 4 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,17 +308,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
var_types retType,
unsigned simdSize)
{
echesakov marked this conversation as resolved.
Show resolved Hide resolved
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
int numArgs = sig->numArgs;
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
int numArgs = sig->numArgs;

if (!featureSIMD || !IsBaselineSimdIsaSupported())
{
return nullptr;
}

assert(numArgs >= 0);
assert(varTypeIsArithmetic(simdBaseType));

var_types simdBaseType = TYP_UNKNOWN;

if (intrinsic != NI_ArmBase_Yield)
{
simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
}

GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
Expand All @@ -327,6 +333,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

switch (intrinsic)
{
case NI_ArmBase_Yield:
{
assert(sig->numArgs == 0);
assert(JITtype2varType(sig->retType) == TYP_VOID);
assert(simdSize == 0);

retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;
}

case NI_Vector64_Abs:
case NI_Vector128_Abs:
{
Expand Down
19 changes: 19 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
emitSize = emitActualTypeSize(intrin.baseType);
opt = INS_OPTS_NONE;
}
else if (intrin.category == HW_Category_Special)
{
assert(intrin.id == NI_ArmBase_Yield);

emitSize = EA_UNKNOWN;
opt = INS_OPTS_NONE;
}
else
{
emitSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
Expand Down Expand Up @@ -443,6 +450,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_ArmBase_Yield:
{
ins = INS_yield;
break;
}

default:
ins = HWIntrinsicInfo::lookupIns(intrin.id, intrin.baseType);
break;
Expand Down Expand Up @@ -735,6 +748,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_ArmBase_Yield:
{
GetEmitter()->emitIns(ins);
break;
}

// mvni doesn't support the range of element types, so hard code the 'opts' value.
case NI_Vector64_get_Zero:
case NI_Vector64_get_AllBitsSet:
Expand Down
21 changes: 16 additions & 5 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1447,6 +1447,8 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node)
{
NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;

genConsumeOperands(node);

switch (intrinsicId)
{
case NI_X86Base_BitScanForward:
Expand All @@ -1459,16 +1461,25 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node)
var_types targetType = node->TypeGet();
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);

genConsumeOperands(node);
genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType), targetReg, op1);
genProduceReg(node);
break;
}

case NI_X86Base_Pause:
{
assert(node->GetSimdBaseType() == TYP_UNKNOWN);
assert(node->gtGetOp1() == nullptr);
assert(node->gtGetOp2() == nullptr);
GetEmitter()->emitIns(INS_pause);
break;
}

default:
unreached();
break;
}

genProduceReg(node);
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -1532,7 +1543,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)

case NI_SSE_StoreFence:
{
assert(baseType == TYP_VOID);
assert(baseType == TYP_UNKNOWN);
assert(op1 == nullptr);
assert(op2 == nullptr);
emit->emitIns(INS_sfence);
Expand Down Expand Up @@ -1617,7 +1628,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)

case NI_SSE2_LoadFence:
{
assert(baseType == TYP_VOID);
assert(baseType == TYP_UNKNOWN);
assert(op1 == nullptr);
assert(op2 == nullptr);
emit->emitIns(INS_lfence);
Expand All @@ -1626,7 +1637,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)

case NI_SSE2_MemoryFence:
{
assert(baseType == TYP_VOID);
assert(baseType == TYP_UNKNOWN);
assert(op1 == nullptr);
assert(op2 == nullptr);
emit->emitIns(INS_mfence);
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ HARDWARE_INTRINSIC(Aes, PolynomialMultiplyWideningUpper,
// Base Intrinsics
HARDWARE_INTRINSIC(ArmBase, LeadingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_clz, INS_clz, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoFloatingPointUsed)
HARDWARE_INTRINSIC(ArmBase, ReverseElementBits, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rbit, INS_rbit, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
HARDWARE_INTRINSIC(ArmBase, Yield, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size Number of arguments Instructions Category Flags
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ HARDWARE_INTRINSIC(Vector256, Xor,
// X86Base Intrinsics
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Pause, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
Expand Down
24 changes: 20 additions & 4 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
case InstructionSet_Vector256:
case InstructionSet_Vector128:
case InstructionSet_X86Base:
return impBaseIntrinsic(intrinsic, clsHnd, method, sig, simdBaseJitType, retType, simdSize);
case InstructionSet_SSE:
return impSSEIntrinsic(intrinsic, method, sig);
Expand Down Expand Up @@ -548,8 +549,13 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
return nullptr;
}

var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
var_types simdBaseType = TYP_UNKNOWN;

if (intrinsic != NI_X86Base_Pause)
{
simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
}

switch (intrinsic)
{
Expand Down Expand Up @@ -1532,6 +1538,16 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_X86Base_Pause:
{
assert(sig->numArgs == 0);
assert(JITtype2varType(sig->retType) == TYP_VOID);
assert(simdSize == 0);

retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;
}

default:
{
return nullptr;
Expand Down Expand Up @@ -1604,7 +1620,7 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAND
case NI_SSE_StoreFence:
assert(sig->numArgs == 0);
assert(JITtype2varType(sig->retType) == TYP_VOID);
retNode = gtNewSimdHWIntrinsicNode(TYP_VOID, intrinsic, CORINFO_TYPE_VOID, 0);
retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;

default:
Expand Down Expand Up @@ -1667,7 +1683,7 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAN
assert(JITtype2varType(sig->retType) == TYP_VOID);
assert(simdSize == 0);

retNode = gtNewSimdHWIntrinsicNode(TYP_VOID, intrinsic, CORINFO_TYPE_VOID, simdSize);
retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;
}

Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/instrsarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1563,6 +1563,9 @@ INST1(uxth, "uxth", 0, IF_DR_2H, 0x53003C00)
INST1(nop, "nop", 0, IF_SN_0A, 0xD503201F)
// nop SN_0A 1101010100000011 0010000000011111 D503 201F

INST1(yield, "yield", 0, IF_SN_0A, 0xD503203F)
// yield SN_0A 1101010100000011 0010000000111111 D503 203F

INST1(bkpt, "bkpt", 0, IF_SN_0A, 0xD43E0000)
// brpt SN_0A 1101010000111110 0000000000000000 D43E 0000 0xF000

Expand Down
Loading