Skip to content

Commit aa9f103

Browse files
authored
Merge pull request #22188 from knn-k/aarch64InlineStrIdxOfStr
AArch64: Inline StringLatin1.indexOf([BI[BII)I
2 parents 672797d + 9201b6e commit aa9f103

File tree

2 files changed

+275
-0
lines changed

2 files changed

+275
-0
lines changed

runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,11 @@ J9::ARM64::CodeGenerator::initialize()
9191
{
9292
cg->setSupportsInlineStringIndexOf();
9393
}
94+
static bool disableInlineStrIdxOfStr = feGetEnv("TR_disableInlineStrIdxOfStr") != NULL;
95+
if ((!TR::Compiler->om.canGenerateArraylets()) && (!comp->getOption(TR_DisableFastStringIndexOf)) && !disableInlineStrIdxOfStr)
96+
{
97+
cg->setSupportsInlineStringIndexOfString();
98+
}
9499
static bool disableInlineStringLatin1Inflate = feGetEnv("TR_disableInlineStringLatin1Inflate") != NULL;
95100
if ((!TR::Compiler->om.canGenerateArraylets()) && (!disableInlineStringLatin1Inflate) && !TR::Compiler->om.isOffHeapAllocationEnabled())
96101
{

runtime/compiler/aarch64/codegen/J9TreeEvaluator.cpp

Lines changed: 270 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6578,6 +6578,267 @@ static TR::Register* inlineIntrinsicIndexOf(TR::Node* node, TR::CodeGenerator* c
65786578
return resultReg;
65796579
}
65806580

6581+
/**
6582+
* \brief
6583+
* Generate inlined instructions equivalent to java/lang/StringLatin1.indexOf([BI[BII)I
6584+
*
6585+
* \param node
6586+
* The tree node
6587+
*
6588+
* \param cg
6589+
* The Code Generator
6590+
*
6591+
* Note that this version does not support discontiguous arrays
6592+
*/
6593+
static TR::Register *inlineIntrinsicStringIndexOfString(TR::Node *node, TR::CodeGenerator *cg)
6594+
{
6595+
static bool verboseInlineStrIdxOfStr = (feGetEnv("TR_verboseInlineStrIdxOfStr") != NULL);
6596+
if (verboseInlineStrIdxOfStr)
6597+
{
6598+
fprintf(stderr, "*Latin1.indexOfString(): %s @%s\n", cg->comp()->signature(), cg->comp()->getHotnessName());
6599+
}
6600+
6601+
TR_ASSERT_FATAL(!TR::Compiler->om.canGenerateArraylets(), "Discontiguous array is not supported");
6602+
6603+
// This evaluator function handles different indexOf() intrinsics, some of which are static calls without a
6604+
// receiver. Hence, the need for static call check.
6605+
const bool isStaticCall = node->getSymbolReference()->getSymbol()->castToMethodSymbol()->isStatic();
6606+
const uint8_t firstCallArgIdx = isStaticCall ? 0 : 1;
6607+
TR::Register *s1Reg = cg->evaluate(node->getChild(firstCallArgIdx));
6608+
TR::Node *s1lenNode = node->getChild(firstCallArgIdx+1);
6609+
TR::Register *s1lenReg = cg->evaluate(s1lenNode);
6610+
TR::Register *s2Reg = cg->evaluate(node->getChild(firstCallArgIdx+2));
6611+
TR::Register *s2lenReg = cg->evaluate(node->getChild(firstCallArgIdx+3));
6612+
TR::Node *offsetNode = node->getChild(firstCallArgIdx+4);
6613+
TR::Register *offsetReg = cg->evaluate(offsetNode);
6614+
6615+
TR::Register *maxReg;
6616+
if (s1lenNode->getReferenceCount() == 1)
6617+
{
6618+
maxReg = s1lenReg;
6619+
}
6620+
else
6621+
{
6622+
maxReg = cg->allocateRegister(TR_GPR);
6623+
generateMovInstruction(cg, node, maxReg, s1lenReg);
6624+
}
6625+
6626+
TR::Register *resultReg;
6627+
if (offsetNode->getReferenceCount() == 1)
6628+
{
6629+
resultReg = offsetReg;
6630+
}
6631+
else
6632+
{
6633+
resultReg = cg->allocateRegister(TR_GPR);
6634+
generateMovInstruction(cg, node, resultReg, offsetReg);
6635+
}
6636+
6637+
TR::Register *s1addrReg = cg->allocateRegister(TR_GPR);
6638+
TR::Register *s1idxReg = cg->allocateRegister(TR_GPR);
6639+
TR::Register *s2addrReg = cg->allocateRegister(TR_GPR);
6640+
TR::Register *s2idxReg = cg->allocateRegister(TR_GPR);
6641+
TR::Register *tmp1Reg = cg->allocateRegister(TR_GPR);
6642+
TR::Register *tmp2Reg = cg->allocateRegister(TR_GPR);
6643+
TR::Register *s2firstCharReg = cg->allocateRegister(TR_VRF);
6644+
TR::Register *vtmp1Reg = cg->allocateRegister(TR_VRF);
6645+
TR::Register *vtmp2Reg = cg->allocateRegister(TR_VRF);
6646+
6647+
TR::RegisterDependencyConditions *dependencies = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(14, 14, cg->trMemory());
6648+
dependencies->addPreCondition(s1Reg, TR::RealRegister::NoReg);
6649+
dependencies->addPreCondition(s2Reg, TR::RealRegister::NoReg);
6650+
dependencies->addPreCondition(s2lenReg, TR::RealRegister::NoReg);
6651+
dependencies->addPreCondition(maxReg, TR::RealRegister::NoReg);
6652+
dependencies->addPreCondition(resultReg, TR::RealRegister::NoReg);
6653+
dependencies->addPreCondition(s1addrReg, TR::RealRegister::NoReg);
6654+
dependencies->addPreCondition(s1idxReg, TR::RealRegister::NoReg);
6655+
dependencies->addPreCondition(s2addrReg, TR::RealRegister::NoReg);
6656+
dependencies->addPreCondition(s2idxReg, TR::RealRegister::NoReg);
6657+
dependencies->addPreCondition(tmp1Reg, TR::RealRegister::NoReg);
6658+
dependencies->addPreCondition(tmp2Reg, TR::RealRegister::NoReg);
6659+
dependencies->addPreCondition(s2firstCharReg, TR::RealRegister::NoReg);
6660+
dependencies->addPreCondition(vtmp1Reg, TR::RealRegister::NoReg);
6661+
dependencies->addPreCondition(vtmp2Reg, TR::RealRegister::NoReg);
6662+
6663+
dependencies->addPostCondition(s1Reg, TR::RealRegister::NoReg);
6664+
dependencies->addPostCondition(s2Reg, TR::RealRegister::NoReg);
6665+
dependencies->addPostCondition(s2lenReg, TR::RealRegister::NoReg);
6666+
dependencies->addPostCondition(maxReg, TR::RealRegister::NoReg);
6667+
dependencies->addPostCondition(resultReg, TR::RealRegister::NoReg);
6668+
dependencies->addPostCondition(s1addrReg, TR::RealRegister::NoReg);
6669+
dependencies->addPostCondition(s1idxReg, TR::RealRegister::NoReg);
6670+
dependencies->addPostCondition(s2addrReg, TR::RealRegister::NoReg);
6671+
dependencies->addPostCondition(s2idxReg, TR::RealRegister::NoReg);
6672+
dependencies->addPostCondition(tmp1Reg, TR::RealRegister::NoReg);
6673+
dependencies->addPostCondition(tmp2Reg, TR::RealRegister::NoReg);
6674+
dependencies->addPostCondition(s2firstCharReg, TR::RealRegister::NoReg);
6675+
dependencies->addPostCondition(vtmp1Reg, TR::RealRegister::NoReg);
6676+
dependencies->addPostCondition(vtmp2Reg, TR::RealRegister::NoReg);
6677+
6678+
TR::LabelSymbol *startLabel = generateLabelSymbol(cg);
6679+
TR::LabelSymbol *outerLoopLabel = generateLabelSymbol(cg);
6680+
TR::LabelSymbol *firstCharLoopLabel = generateLabelSymbol(cg);
6681+
TR::LabelSymbol *firstCharMatchedLabel = generateLabelSymbol(cg);
6682+
TR::LabelSymbol *arrayCmpVectorLoopLabel = generateLabelSymbol(cg);
6683+
TR::LabelSymbol *arrayCmpByteLoopLabel = generateLabelSymbol(cg);
6684+
TR::LabelSymbol *unmatchedLabel = generateLabelSymbol(cg);
6685+
TR::LabelSymbol *notFoundLabel = generateLabelSymbol(cg);
6686+
TR::LabelSymbol *doneLabel = generateLabelSymbol(cg);
6687+
6688+
startLabel->setStartInternalControlFlow();
6689+
doneLabel->setEndInternalControlFlow();
6690+
6691+
generateLabelInstruction(cg, TR::InstOpCode::label, node, startLabel);
6692+
6693+
const int32_t vecWidth = 16;
6694+
6695+
// Addresses of array elements
6696+
#ifdef J9VM_GC_SPARSE_HEAP_ALLOCATION
6697+
if (TR::Compiler->om.isOffHeapAllocationEnabled())
6698+
{
6699+
uint32_t dataAddrOffset = static_cast<int32_t>(cg->comp()->fej9()->getOffsetOfContiguousDataAddrField());
6700+
generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, node, s1addrReg, TR::MemoryReference::createWithDisplacement(cg, s1Reg, dataAddrOffset));
6701+
generateTrg1MemInstruction(cg, TR::InstOpCode::ldrimmx, node, s2addrReg, TR::MemoryReference::createWithDisplacement(cg, s2Reg, dataAddrOffset));
6702+
}
6703+
else
6704+
#endif /* J9VM_GC_SPARSE_HEAP_ALLOCATION */
6705+
{
6706+
uint32_t hdrSize = static_cast<uint32_t>(TR::Compiler->om.contiguousArrayHeaderSizeInBytes());
6707+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmx, node, s1addrReg, s1Reg, hdrSize);
6708+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmx, node, s2addrReg, s2Reg, hdrSize);
6709+
}
6710+
6711+
// First character of s2
6712+
generateTrg1MemInstruction(cg, TR::InstOpCode::ldrbimm, node, tmp1Reg, TR::MemoryReference::createWithDisplacement(cg, s2addrReg, 0));
6713+
generateTrg1Src1Instruction(cg, TR::InstOpCode::vdup16b, node, s2firstCharReg, tmp1Reg);
6714+
6715+
// Calculate max
6716+
generateTrg1Src2Instruction(cg, TR::InstOpCode::subw, node, maxReg, s1lenReg, s2lenReg);
6717+
6718+
// Outer loop
6719+
generateLabelInstruction(cg, TR::InstOpCode::label, node, outerLoopLabel);
6720+
generateCompareInstruction(cg, node, resultReg, maxReg, /* is64bit */ false);
6721+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, notFoundLabel, TR::CC_GT);
6722+
6723+
// Search for the first character
6724+
generateTrg1Src2Instruction(cg, TR::InstOpCode::addx, node, tmp1Reg, s1addrReg, resultReg);
6725+
generateLogicalImmInstruction(cg, TR::InstOpCode::andimmx, node, tmp2Reg, tmp1Reg, true, 3); // N = true, immr:imms = 3 for immediate value 0xf
6726+
generateCompareBranchInstruction(cg, TR::InstOpCode::cbzw, node, tmp2Reg, firstCharLoopLabel);
6727+
6728+
generateTrg1Src2Instruction(cg, TR::InstOpCode::subx, node, tmp1Reg, tmp1Reg, tmp2Reg); // tmp1Reg is 16-byte aligned
6729+
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrimmq, node, vtmp1Reg, TR::MemoryReference::createWithDisplacement(cg, tmp1Reg, 0));
6730+
generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmeq16b, node, vtmp1Reg, vtmp1Reg, s2firstCharReg);
6731+
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, vtmp1Reg, vtmp1Reg, 4); // 8 bits x 16 -> 4 bits x 16
6732+
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, tmp1Reg, vtmp1Reg, 0);
6733+
generateLogicalShiftLeftImmInstruction(cg, node, s1idxReg, tmp2Reg, 2, /* is64bit */ false); // s1idxReg is used for other purpose here
6734+
generateTrg1Src2Instruction(cg, TR::InstOpCode::lsrvx, node, tmp1Reg, tmp1Reg, s1idxReg);
6735+
generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, tmp1Reg, firstCharMatchedLabel);
6736+
6737+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, resultReg, resultReg, vecWidth);
6738+
generateTrg1Src2Instruction(cg, TR::InstOpCode::subw, node, resultReg, resultReg, tmp2Reg);
6739+
6740+
generateCompareInstruction(cg, node, resultReg, maxReg, /* is64bit */ false);
6741+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, notFoundLabel, TR::CC_GT);
6742+
6743+
// (s1addrReg + resultReg) is 16-byte aligned here
6744+
generateLabelInstruction(cg, TR::InstOpCode::label, node, firstCharLoopLabel);
6745+
generateTrg1MemInstruction(cg, TR::InstOpCode::vldroffq, node, vtmp1Reg, TR::MemoryReference::createWithIndexReg(cg, s1addrReg, resultReg));
6746+
generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmeq16b, node, vtmp1Reg, vtmp1Reg, s2firstCharReg);
6747+
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, vtmp1Reg, vtmp1Reg, 4); // 8 bits x 16 -> 4 bits x 16
6748+
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, tmp1Reg, vtmp1Reg, 0);
6749+
generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, tmp1Reg, firstCharMatchedLabel);
6750+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, resultReg, resultReg, vecWidth);
6751+
generateCompareInstruction(cg, node, resultReg, maxReg, /* is64bit */ false);
6752+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, firstCharLoopLabel, TR::CC_LE);
6753+
generateLabelInstruction(cg, TR::InstOpCode::b, node, notFoundLabel);
6754+
6755+
// First character matched in vector
6756+
generateLabelInstruction(cg, TR::InstOpCode::label, node, firstCharMatchedLabel);
6757+
generateTrg1Src1Instruction(cg, TR::InstOpCode::rbitx, node, tmp1Reg, tmp1Reg);
6758+
generateTrg1Src1Instruction(cg, TR::InstOpCode::clzx, node, tmp1Reg, tmp1Reg);
6759+
generateLogicalShiftRightImmInstruction(cg, node, tmp1Reg, tmp1Reg, 2, /* is64bit */ true); // div by 4
6760+
generateTrg1Src2Instruction(cg, TR::InstOpCode::addx, node, resultReg, resultReg, tmp1Reg);
6761+
6762+
generateCompareInstruction(cg, node, resultReg, maxReg, /* is64bit */ false);
6763+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, notFoundLabel, TR::CC_GT);
6764+
6765+
// Compare the rest of s2
6766+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, s1idxReg, resultReg, 1); // s1idx = offset + 1
6767+
loadConstant32(cg, node, 1, s2idxReg); // s2idx = 1
6768+
6769+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::subimmw, node, tmp2Reg, s2lenReg, 1);
6770+
generateLogicalShiftRightImmInstruction(cg, node, tmp2Reg, tmp2Reg, 4, /* is64bit */ false); // div by 16
6771+
generateCompareBranchInstruction(cg, TR::InstOpCode::cbzw, node, tmp2Reg, arrayCmpByteLoopLabel);
6772+
6773+
// Vector comparison
6774+
generateLabelInstruction(cg, TR::InstOpCode::label, node, arrayCmpVectorLoopLabel);
6775+
generateTrg1MemInstruction(cg, TR::InstOpCode::vldroffq, node, vtmp1Reg, TR::MemoryReference::createWithIndexReg(cg, s1addrReg, s1idxReg));
6776+
generateTrg1MemInstruction(cg, TR::InstOpCode::vldroffq, node, vtmp2Reg, TR::MemoryReference::createWithIndexReg(cg, s2addrReg, s2idxReg));
6777+
generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmeq16b, node, vtmp1Reg, vtmp1Reg, vtmp2Reg);
6778+
generateVectorShiftImmediateInstruction(cg, TR::InstOpCode::vshrn_8b, node, vtmp1Reg, vtmp1Reg, 4); // 8 bits x 16 -> 4 bits x 16
6779+
generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, tmp1Reg, vtmp1Reg, 0);
6780+
generateCompareImmInstruction(cg, node, tmp1Reg, -1, /* is64bit */ true);
6781+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, unmatchedLabel, TR::CC_NE);
6782+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, s1idxReg, s1idxReg, vecWidth);
6783+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, s2idxReg, s2idxReg, vecWidth);
6784+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::subsimmw, node, tmp2Reg, tmp2Reg, 1);
6785+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, arrayCmpVectorLoopLabel, TR::CC_NE);
6786+
6787+
// Byte comparison
6788+
generateLabelInstruction(cg, TR::InstOpCode::label, node, arrayCmpByteLoopLabel);
6789+
generateCompareInstruction(cg, node, s2lenReg, s2idxReg);
6790+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, doneLabel, TR::CC_LE); // resultReg has the result
6791+
6792+
generateTrg1MemInstruction(cg, TR::InstOpCode::ldrbimm, node, tmp1Reg, TR::MemoryReference::createWithIndexReg(cg, s1addrReg, s1idxReg));
6793+
generateTrg1MemInstruction(cg, TR::InstOpCode::ldrbimm, node, tmp2Reg, TR::MemoryReference::createWithIndexReg(cg, s2addrReg, s2idxReg));
6794+
generateCompareInstruction(cg, node, tmp1Reg, tmp2Reg, /* is64bit */ false);
6795+
generateConditionalBranchInstruction(cg, TR::InstOpCode::b_cond, node, unmatchedLabel, TR::CC_NE);
6796+
6797+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, s1idxReg, s1idxReg, 1);
6798+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, s2idxReg, s2idxReg, 1);
6799+
generateLabelInstruction(cg, TR::InstOpCode::b, node, arrayCmpByteLoopLabel);
6800+
6801+
// s2 unmatched
6802+
generateLabelInstruction(cg, TR::InstOpCode::label, node, unmatchedLabel);
6803+
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addimmw, node, resultReg, resultReg, 1);
6804+
generateLabelInstruction(cg, TR::InstOpCode::b, node, outerLoopLabel);
6805+
6806+
// Not found
6807+
generateLabelInstruction(cg, TR::InstOpCode::label, node, notFoundLabel);
6808+
loadConstant32(cg, node, -1, resultReg);
6809+
// fall through to doneLabel
6810+
6811+
generateLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, dependencies);
6812+
6813+
cg->stopUsingRegister(s1addrReg);
6814+
cg->stopUsingRegister(s1idxReg);
6815+
cg->stopUsingRegister(s2addrReg);
6816+
cg->stopUsingRegister(s2idxReg);
6817+
cg->stopUsingRegister(tmp1Reg);
6818+
cg->stopUsingRegister(tmp2Reg);
6819+
cg->stopUsingRegister(s2firstCharReg);
6820+
cg->stopUsingRegister(vtmp1Reg);
6821+
cg->stopUsingRegister(vtmp2Reg);
6822+
6823+
if (maxReg != s1lenReg)
6824+
{
6825+
cg->stopUsingRegister(maxReg);
6826+
}
6827+
6828+
node->setRegister(resultReg);
6829+
6830+
if (!isStaticCall)
6831+
{
6832+
cg->recursivelyDecReferenceCount(node->getChild(0));
6833+
}
6834+
for (int32_t i = firstCallArgIdx; i < node->getNumChildren(); i++)
6835+
{
6836+
cg->decReferenceCount(node->getChild(i));
6837+
}
6838+
6839+
return resultReg;
6840+
}
6841+
65816842
/**
65826843
* @brief Generates inlined instructions equivalent to java/lang/StringLatin1.inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len)
65836844
*
@@ -6877,6 +7138,15 @@ J9::ARM64::CodeGenerator::inlineDirectCall(TR::Node *node, TR::Register *&result
68777138
}
68787139
break;
68797140

7141+
case TR::java_lang_StringLatin1_indexOf:
7142+
case TR::com_ibm_jit_JITHelpers_intrinsicIndexOfStringLatin1:
7143+
if (cg->getSupportsInlineStringIndexOfString())
7144+
{
7145+
resultReg = inlineIntrinsicStringIndexOfString(node, cg);
7146+
return true;
7147+
}
7148+
break;
7149+
68807150
case TR::java_lang_String_hashCodeImplDecompressed:
68817151
if (cg->getSupportsInlineStringHashCode())
68827152
{

0 commit comments

Comments
 (0)