Skip to content

Commit 37ce2ac

Browse files
authored
Merge pull request #21263 from BradleyWood/fma-0.51
(0.51) x86: Implement fma intrinsic
2 parents bc02dc5 + 69eb2d9 commit 37ce2ac

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed

runtime/compiler/x/codegen/J9CodeGenerator.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,18 @@ J9::X86::CodeGenerator::suppressInliningOfRecognizedMethod(TR::RecognizedMethod
438438
switch (method)
439439
{
440440
case TR::java_lang_Object_clone:
441+
return true;
442+
case TR::java_lang_Math_fma_F:
443+
case TR::java_lang_Math_fma_D:
444+
case TR::java_lang_StrictMath_fma_F:
445+
case TR::java_lang_StrictMath_fma_D:
446+
{
447+
static bool disableInlineFMA = feGetEnv("TR_DisableInlineFMA");
448+
449+
if (disableInlineFMA || !self()->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_FMA))
450+
return false;
451+
}
452+
441453
return true;
442454
default:
443455
return false;

runtime/compiler/x/codegen/J9TreeEvaluator.cpp

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9086,6 +9086,150 @@ inlineNanoTime(
90869086
#endif
90879087
#endif // LINUX
90889088

9089+
// Emits an inline x86 fused multiply-add sequence for a node with three children and returns
// the floating point register that holds the result.
//
// Going by the instruction forms selected below (and taking the generate* helpers to pass
// operands in instruction order), the value produced is
//    firstChild * secondChild + thirdChild
// computed by one VFMADD*SD (double) or VFMADD*SS (float) instruction.
//
// node - node with three children; its data type selects the double or single precision opcodes
// cg   - code generator used to allocate the result register, evaluate children and emit instructions
//
// The target CPU must support the FMA feature; this is enforced with a fatal assertion.
// Before returning, each child's reference count is decremented and result is set as the
// node's register.
TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGenerator* cg)
9090+
{
9091+
TR::Node *firstChild = node->getFirstChild();
9092+
TR::Node *secondChild = node->getSecondChild();
9093+
TR::Node *thirdChild = node->getThirdChild();
9094+
9095+
// Registers of the children that end up being evaluated; a child that is consumed through a
// memory reference instead leaves its entry NULL.
TR::Register *lhsReg = NULL;
9096+
TR::Register *midReg = NULL;
9097+
TR::Register *rhsReg = NULL;
9098+
// Fresh register that every branch below uses as the first (destination) operand of the FMA,
// so the FMA never targets a child's own register.
TR::Register *result = cg->allocateRegister(TR_FPR);
9099+
9100+
// A child is a candidate for being used directly as a memory operand when it has no register
// yet, its reference count is exactly 1, and its opcode is a variable load.
bool memLoadLhs = !firstChild->getRegister() && firstChild->getReferenceCount() == 1
9101+
&& firstChild->getOpCode().isLoadVar();
9102+
9103+
bool memLoadMiddle = !secondChild->getRegister() && secondChild->getReferenceCount() == 1
9104+
&& secondChild->getOpCode().isLoadVar();
9105+
9106+
bool memLoadRhs = !thirdChild->getRegister() && thirdChild->getReferenceCount() == 1
9107+
&& thirdChild->getOpCode().isLoadVar();
9108+
9109+
// Double nodes use the *SD opcodes and MOVSD; everything else uses the *SS opcodes and MOVSS.
bool is64Bit = node->getDataType().isDouble();
9110+
9111+
TR::InstOpCode::Mnemonic fpMovRegRegOpcode = is64Bit ? TR::InstOpCode::MOVSDRegReg : TR::InstOpCode::MOVSSRegReg;
9112+
result->setIsSinglePrecision(!is64Bit);
9113+
9114+
TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_FMA), "Cannot generate inline fma implementation without FMA extensions");
9115+
9116+
// Choose fma instruction carefully, based on operand form, to reduce number of copies
//
// Notation used in the branch comments: "fma = a (i) * b (j) + c (k)" names the instruction
// form, where i, j and k are operand positions of the emitted instruction (1 is result, 2 is
// the second register, 3 is the last register or memory operand). For example the 231 form
// multiplies operands 2 and 3 and adds operand 1. The two multiplicand slots are
// interchangeable, so "a" and "b" in these comments do not always line up with the first and
// second child. Only one operand of an instruction can come from memory, so when two children
// are foldable loads one of them is moved from memory into result first.
if (memLoadLhs)
9118+
{
9119+
// First child is used as a memory operand in all three sub-cases. Default to the 231 form,
// where it is the memory multiplicand and result must already hold the addend.
TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD231SDRegRegMem : TR::InstOpCode::VFMADD231SSRegRegMem;
9120+
TR::MemoryReference *lhsMR = generateX86MemoryReference(firstChild, cg);
9121+
9122+
if (memLoadRhs)
9123+
{
9124+
// fma = a (2) * b (3) + c (1)
// Third child is loaded from memory straight into result (the addend slot); the second
// child has to live in a register; the first child is the memory multiplicand.
TR::MemoryReference *rhsMR = generateX86MemoryReference(thirdChild, cg);
9126+
generateRegMemInstruction(TR::InstOpCode::MOVSRegMem(is64Bit), node, result, rhsMR, cg);
9127+
9128+
midReg = cg->evaluate(secondChild);
9129+
memLoadMiddle = false; // Second child was evaluated just above, so it is released below as an evaluated child
9130+
generateRegRegMemInstruction(opcode, node, result, midReg, lhsMR, cg);
9131+
}
9132+
else if (memLoadMiddle)
9133+
{
9134+
// fma = a (1) * b (3) + c (2)
// First and second child are both foldable loads: the first is loaded from memory into
// result, the second is the memory multiplicand, and the evaluated third child is the
// register addend.
opcode = is64Bit ? TR::InstOpCode::VFMADD132SDRegRegMem : TR::InstOpCode::VFMADD132SSRegRegMem;
9136+
9137+
TR::MemoryReference *midMR = generateX86MemoryReference(secondChild, cg);
9138+
rhsReg = cg->evaluate(thirdChild);
9139+
9140+
generateRegMemInstruction(TR::InstOpCode::MOVSRegMem(is64Bit), node, result, lhsMR, cg);
9141+
generateRegRegMemInstruction(opcode, node, result, rhsReg, midMR, cg);
9142+
}
9143+
else
9144+
{
9145+
// fma = a (2) * b (3) + c (1)
// Only the first child is folded: the evaluated third child is copied into result as the
// addend and the evaluated second child is the register multiplicand.
midReg = cg->evaluate(secondChild);
9147+
rhsReg = cg->evaluate(thirdChild);
9148+
generateRegRegInstruction(fpMovRegRegOpcode, node, result, rhsReg, cg);
9149+
generateRegRegMemInstruction(opcode, node, result, midReg, lhsMR, cg);
9150+
}
9151+
}
9152+
else if (memLoadMiddle)
9153+
{
9154+
// First child is not a foldable load and is evaluated; the second child is consumed from memory.
TR::MemoryReference *midMR = generateX86MemoryReference(secondChild, cg);
9155+
lhsReg = cg->evaluate(firstChild);
9156+
9157+
if (memLoadRhs)
9158+
{
9159+
// fma = a (2) * b (1) + c (3)
// Second child is loaded from memory into result (a multiplicand slot) and the third
// child is the memory addend.
TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD213SDRegRegMem : TR::InstOpCode::VFMADD213SSRegRegMem;
9161+
TR::MemoryReference *rhsMR = generateX86MemoryReference(thirdChild, cg);
9162+
9163+
generateRegMemInstruction(TR::InstOpCode::MOVSRegMem(is64Bit), node, result, midMR, cg);
9164+
generateRegRegMemInstruction(opcode, node, result, lhsReg, rhsMR, cg);
9165+
}
9166+
else
9167+
{
9168+
// fma = a (1) * b (3) + c (2)
// result starts as a copy of the first child, the second child is the memory multiplicand
// and the evaluated third child is the register addend.
TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD132SDRegRegMem : TR::InstOpCode::VFMADD132SSRegRegMem;
9170+
rhsReg = cg->evaluate(thirdChild);
9171+
9172+
generateRegRegInstruction(fpMovRegRegOpcode, node, result, lhsReg, cg);
9173+
generateRegRegMemInstruction(opcode, node, result, rhsReg, midMR, cg);
9174+
}
9175+
}
9176+
else if (memLoadRhs)
9177+
{
9178+
// fma = a (2) * b (1) + c (3)
// Only the third child is folded: both multiplicands are evaluated, result starts as a copy
// of the first child, and the third child is the memory addend.
TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD213SDRegRegMem : TR::InstOpCode::VFMADD213SSRegRegMem;
9180+
9181+
TR::MemoryReference *rhsMR = generateX86MemoryReference(thirdChild, cg);
9182+
lhsReg = cg->evaluate(firstChild);
9183+
midReg = cg->evaluate(secondChild);
9184+
9185+
generateRegRegInstruction(fpMovRegRegOpcode, node, result, lhsReg, cg);
9186+
generateRegRegMemInstruction(opcode, node, result, midReg, rhsMR, cg);
9187+
}
9188+
else
9189+
{
9190+
// fma = a (2) * b (1) + c (3)
// No child can be folded: all three are evaluated and the register-only 213 form is used,
// with result starting as a copy of the first child.
TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD213SDRegRegReg : TR::InstOpCode::VFMADD213SSRegRegReg;
9192+
9193+
lhsReg = cg->evaluate(firstChild);
9194+
midReg = cg->evaluate(secondChild);
9195+
rhsReg = cg->evaluate(thirdChild);
9196+
9197+
generateRegRegInstruction(fpMovRegRegOpcode, node, result, lhsReg, cg);
9198+
generateRegRegRegInstruction(opcode, node, result, midReg, rhsReg, cg);
9199+
}
9200+
9201+
// Release the children. A child whose memLoad flag is still set was consumed through a memory
// reference and is released with recursivelyDecReferenceCount; a child that was evaluated into
// a register is released with decReferenceCount.
// NOTE(review): presumably the recursive form is needed because a folded load node was never
// evaluated itself - confirm against the code generator's reference counting rules.
if (memLoadLhs)
9202+
{
9203+
cg->recursivelyDecReferenceCount(firstChild);
9204+
}
9205+
else
9206+
{
9207+
cg->decReferenceCount(firstChild);
9208+
}
9209+
9210+
if (memLoadMiddle)
9211+
{
9212+
cg->recursivelyDecReferenceCount(secondChild);
9213+
}
9214+
else
9215+
{
9216+
cg->decReferenceCount(secondChild);
9217+
}
9218+
9219+
if (memLoadRhs)
9220+
{
9221+
cg->recursivelyDecReferenceCount(thirdChild);
9222+
}
9223+
else
9224+
{
9225+
cg->decReferenceCount(thirdChild);
9226+
}
9227+
9228+
// Record the result on the node before handing it back to the caller.
node->setRegister(result);
9229+
9230+
return result;
9231+
}
9232+
90899233
// Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
90909234
//
90919235
// Conversion process example:
@@ -12155,6 +12299,18 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c
1215512299
return TR::TreeEvaluator::inlineStringLatin1Inflate(node, cg);
1215612300
}
1215712301
break;
12302+
case TR::java_lang_Math_fma_F:
12303+
case TR::java_lang_Math_fma_D:
12304+
case TR::java_lang_StrictMath_fma_F:
12305+
case TR::java_lang_StrictMath_fma_D:
12306+
{
12307+
static bool disableInlineFMA = feGetEnv("TR_DisableInlineFMA") != NULL;
12308+
12309+
if (!disableInlineFMA && cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_FMA))
12310+
return inlineMathFma(node, cg);
12311+
12312+
break;
12313+
}
1215812314
case TR::jdk_internal_util_ArraysSupport_vectorizedHashCode:
1215912315
{
1216012316
if (cg->getSupportsInlineVectorizedHashCode())

runtime/compiler/x/codegen/J9TreeEvaluator.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ class OMR_EXTENSIBLE TreeEvaluator: public J9::TreeEvaluator
146146
static TR::Register *awrtbarEvaluator(TR::Node *node, TR::CodeGenerator *cg);
147147
static TR::Register *awrtbariEvaluator(TR::Node *node, TR::CodeGenerator *cg);
148148
static TR::Register *inlineStringLatin1Inflate(TR::Node *node, TR::CodeGenerator *cg);
149+
static TR::Register *inlineMathFma(TR::Node* node, TR::CodeGenerator* cg);
149150
static TR::Register *inlineVectorizedHashCode(TR::Node* node, TR::CodeGenerator* cg);
150151
static TR::Register *vectorizedHashCodeReductionHelper(TR::Node* node,
151152
TR::Register **vectorRegisters,

test/functional/Java9andUp/playlist.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@
318318
<variations>
319319
<variation>-Xint</variation>
320320
<variation>-Xjit:count=1,disableAsyncCompilation</variation>
321+
<variation>-Xjit:count=1,disableGRA,disableLinkageRegisterAllocation,disableLocalCSE,disableAsyncCompilation</variation>
321322
</variations>
322323
<command>$(JAVA_COMMAND) $(JVM_OPTIONS) \
323324
-cp $(Q)$(RESOURCES_DIR)$(P)$(TESTNG)$(P)$(TEST_RESROOT)$(D)GeneralTest.jar$(P)$(LIB_DIR)$(D)asm-all.jar$(Q) \

0 commit comments

Comments
 (0)