@@ -9086,6 +9086,150 @@ inlineNanoTime(
9086
9086
#endif
9087
9087
#endif // LINUX
9088
9088
9089
// Generates an inline fused multiply-add (first * second + third) for a node with
// three children, using the scalar x86 VFMADD instructions.
//
// node : the call node; its data type selects the double (SD) or float (SS) opcodes.
// cg   : the code generator used to evaluate children and emit instructions.
//
// Returns a newly allocated TR_FPR register holding the result; the same register
// is also set on the node.
//
// A child that has no register yet, has a reference count of 1, and is a load-var
// opcode is used directly as the memory operand of the FMA instruction (or loaded
// straight into the result register) instead of being evaluated into its own
// register. Only one memory operand is used per FMA instruction, so the 132 / 213 /
// 231 form is chosen per combination of such children.
//
// In the branch comments below, a / b / c are the first / second / third child and
// the digit in parentheses is the instruction operand position that value occupies:
// (1) = result register, (2) = second register operand, (3) = last operand
// (register or memory). The mnemonic digits name which operands are multiplied and
// which is added, e.g. 231 means operand2 * operand3 + operand1.
//
// Fatally asserts if the target CPU does not report OMR_FEATURE_X86_FMA.
+ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGenerator* cg)
9090
+ {
9091
+ TR::Node *firstChild = node->getFirstChild(); // a
9092
+ TR::Node *secondChild = node->getSecondChild(); // b
9093
+ TR::Node *thirdChild = node->getThirdChild(); // c
9094
+
9095
+ TR::Register *lhsReg = NULL;
9096
+ TR::Register *midReg = NULL;
9097
+ TR::Register *rhsReg = NULL;
9098
+ TR::Register *result = cg->allocateRegister(TR_FPR);
9099
+
// memLoadXxx: the child is an unevaluated, singly-referenced load, so it can be
// consumed through a memory reference rather than through cg->evaluate().
9100
+ bool memLoadLhs = !firstChild->getRegister() && firstChild->getReferenceCount() == 1
9101
+ && firstChild->getOpCode().isLoadVar();
9102
+
9103
+ bool memLoadMiddle = !secondChild->getRegister() && secondChild->getReferenceCount() == 1
9104
+ && secondChild->getOpCode().isLoadVar();
9105
+
9106
+ bool memLoadRhs = !thirdChild->getRegister() && thirdChild->getReferenceCount() == 1
9107
+ && thirdChild->getOpCode().isLoadVar();
9108
+
// Here is64Bit means "double precision operation", derived from the node's data type.
9109
+ bool is64Bit = node->getDataType().isDouble();
9110
+
9111
+ TR::InstOpCode::Mnemonic fpMovRegRegOpcode = is64Bit ? TR::InstOpCode::MOVSDRegReg : TR::InstOpCode::MOVSSRegReg;
9112
+ result->setIsSinglePrecision(!is64Bit);
9113
+
9114
+ TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_FMA), "Cannot generate inline fma implementation without FMA extensions");
9115
+
9116
+ // Pick the FMA form (132 / 213 / 231) from which children can be memory operands, so that
// the result register needs at most one load or register copy before the FMA instruction.
9117
+ if (memLoadLhs)
9118
+ {
9119
+ TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD231SDRegRegMem : TR::InstOpCode::VFMADD231SSRegRegMem;
9120
+ TR::MemoryReference *lhsMR = generateX86MemoryReference(firstChild, cg);
9121
+
9122
+ if (memLoadRhs)
9123
+ {
9124
+ // 231 form: fma = a (3, memory) * b (2) + c (1, loaded from memory into result)
9125
+ TR::MemoryReference *rhsMR = generateX86MemoryReference(thirdChild, cg);
9126
+ generateRegMemInstruction(TR::InstOpCode::MOVSRegMem(is64Bit), node, result, rhsMR, cg);
9127
+
9128
+ midReg = cg->evaluate(secondChild);
9129
+ memLoadMiddle = false; // b must be evaluated into a register here; clear the flag so the cleanup below uses decReferenceCount for it
9130
+ generateRegRegMemInstruction(opcode, node, result, midReg, lhsMR, cg);
9131
+ }
9132
+ else if (memLoadMiddle)
9133
+ {
9134
+ // 132 form: fma = a (1, loaded from memory into result) * b (3, memory) + c (2)
9135
+ opcode = is64Bit ? TR::InstOpCode::VFMADD132SDRegRegMem : TR::InstOpCode::VFMADD132SSRegRegMem;
9136
+
9137
+ TR::MemoryReference *midMR = generateX86MemoryReference(secondChild, cg);
9138
+ rhsReg = cg->evaluate(thirdChild);
9139
+
9140
+ generateRegMemInstruction(TR::InstOpCode::MOVSRegMem(is64Bit), node, result, lhsMR, cg);
9141
+ generateRegRegMemInstruction(opcode, node, result, rhsReg, midMR, cg);
9142
+ }
9143
+ else
9144
+ {
9145
+ // 231 form: fma = a (3, memory) * b (2) + c (1, copied into result)
9146
+ midReg = cg->evaluate(secondChild);
9147
+ rhsReg = cg->evaluate(thirdChild);
9148
+ generateRegRegInstruction(fpMovRegRegOpcode, node, result, rhsReg, cg);
9149
+ generateRegRegMemInstruction(opcode, node, result, midReg, lhsMR, cg);
9150
+ }
9151
+ }
9152
+ else if (memLoadMiddle)
9153
+ {
9154
+ TR::MemoryReference *midMR = generateX86MemoryReference(secondChild, cg);
9155
+ lhsReg = cg->evaluate(firstChild);
9156
+
9157
+ if (memLoadRhs)
9158
+ {
9159
+ // 213 form: fma = a (2) * b (1, loaded from memory into result) + c (3, memory)
9160
+ TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD213SDRegRegMem : TR::InstOpCode::VFMADD213SSRegRegMem;
9161
+ TR::MemoryReference *rhsMR = generateX86MemoryReference(thirdChild, cg);
9162
+
9163
+ generateRegMemInstruction(TR::InstOpCode::MOVSRegMem(is64Bit), node, result, midMR, cg);
9164
+ generateRegRegMemInstruction(opcode, node, result, lhsReg, rhsMR, cg);
9165
+ }
9166
+ else
9167
+ {
9168
+ // 132 form: fma = a (1, copied into result) * b (3, memory) + c (2)
9169
+ TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD132SDRegRegMem : TR::InstOpCode::VFMADD132SSRegRegMem;
9170
+ rhsReg = cg->evaluate(thirdChild);
9171
+
9172
+ generateRegRegInstruction(fpMovRegRegOpcode, node, result, lhsReg, cg);
9173
+ generateRegRegMemInstruction(opcode, node, result, rhsReg, midMR, cg);
9174
+ }
9175
+ }
9176
+ else if (memLoadRhs)
9177
+ {
9178
+ // 213 form: fma = a (1, copied into result) * b (2) + c (3, memory)
9179
+ TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD213SDRegRegMem : TR::InstOpCode::VFMADD213SSRegRegMem;
9180
+
9181
+ TR::MemoryReference *rhsMR = generateX86MemoryReference(thirdChild, cg);
9182
+ lhsReg = cg->evaluate(firstChild);
9183
+ midReg = cg->evaluate(secondChild);
9184
+
9185
+ generateRegRegInstruction(fpMovRegRegOpcode, node, result, lhsReg, cg);
9186
+ generateRegRegMemInstruction(opcode, node, result, midReg, rhsMR, cg);
9187
+ }
9188
+ else
9189
+ {
9190
+ // No memory operands; 213 register form: fma = a (1, copied into result) * b (2) + c (3)
9191
+ TR::InstOpCode::Mnemonic opcode = is64Bit ? TR::InstOpCode::VFMADD213SDRegRegReg : TR::InstOpCode::VFMADD213SSRegRegReg;
9192
+
9193
+ lhsReg = cg->evaluate(firstChild);
9194
+ midReg = cg->evaluate(secondChild);
9195
+ rhsReg = cg->evaluate(thirdChild);
9196
+
9197
+ generateRegRegInstruction(fpMovRegRegOpcode, node, result, lhsReg, cg);
9198
+ generateRegRegRegInstruction(opcode, node, result, midReg, rhsReg, cg);
9199
+ }
9200
+
// Release the children. A child consumed through a memory reference was never
// evaluated, so it is released with recursivelyDecReferenceCount; a child that
// was evaluated into a register is released with decReferenceCount.
9201
+ if (memLoadLhs)
9202
+ {
9203
+ cg->recursivelyDecReferenceCount(firstChild);
9204
+ }
9205
+ else
9206
+ {
9207
+ cg->decReferenceCount(firstChild);
9208
+ }
9209
+
9210
+ if (memLoadMiddle)
9211
+ {
9212
+ cg->recursivelyDecReferenceCount(secondChild);
9213
+ }
9214
+ else
9215
+ {
9216
+ cg->decReferenceCount(secondChild);
9217
+ }
9218
+
9219
+ if (memLoadRhs)
9220
+ {
9221
+ cg->recursivelyDecReferenceCount(thirdChild);
9222
+ }
9223
+ else
9224
+ {
9225
+ cg->decReferenceCount(thirdChild);
9226
+ }
9227
+
9228
+ node->setRegister(result);
9229
+
9230
+ return result;
9231
+ }
9232
+
9089
9233
// Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9090
9234
//
9091
9235
// Conversion process example:
@@ -12155,6 +12299,18 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c
12155
12299
return TR::TreeEvaluator::inlineStringLatin1Inflate(node, cg);
12156
12300
}
12157
12301
break;
12302
+ case TR::java_lang_Math_fma_F:
12303
+ case TR::java_lang_Math_fma_D:
12304
+ case TR::java_lang_StrictMath_fma_F:
12305
+ case TR::java_lang_StrictMath_fma_D:
12306
+ {
12307
+ static bool disableInlineFMA = feGetEnv("TR_DisableInlineFMA") != NULL;
12308
+
12309
+ if (!disableInlineFMA && cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_FMA))
12310
+ return inlineMathFma(node, cg);
12311
+
12312
+ break;
12313
+ }
12158
12314
case TR::jdk_internal_util_ArraysSupport_vectorizedHashCode:
12159
12315
{
12160
12316
if (cg->getSupportsInlineVectorizedHashCode())
0 commit comments