Skip to content

Commit e75db69

Browse files
authored
Merge pull request #21377 from BradleyWood/0.51-revert-shc
(0.51) Revert "x86: Implement String.hashCode with vectorizedHashCode()"
2 parents 1efd049 + af8d0b9 commit e75db69

File tree

1 file changed

+188
-5
lines changed

1 file changed

+188
-5
lines changed

runtime/compiler/x/codegen/J9TreeEvaluator.cpp

Lines changed: 188 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9230,12 +9230,195 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
92309230
return result;
92319231
}
92329232

9233+
// Convert the serial String.hashCode computation into an equivalent vectorized form, implemented with SSE instructions
9234+
//
9235+
// Conversion process example:
9236+
//
9237+
// str[8] = example string representing 8 characters (compressed or decompressed)
9238+
//
9239+
// The serial method for creating the hash:
9240+
// hash = 0, offset = 0, count = 8
9241+
// for (int i = offset; i < offset+count; ++i) {
9242+
// hash = (hash << 5) - hash + str[i];
9243+
// }
9244+
//
9245+
// Note that ((hash << 5) - hash) is equivalent to hash * 31
9246+
//
9247+
// Expanding out the for loop:
9248+
// hash = ((((((((0*31+str[0])*31+str[1])*31+str[2])*31+str[3])*31+str[4])*31+str[5])*31+str[6])*31+str[7])
9249+
//
9250+
// Simplified:
9251+
// hash = (31^7)*str[0] + (31^6)*str[1] + (31^5)*str[2] + (31^4)*str[3]
9252+
// + (31^3)*str[4] + (31^2)*str[5] + (31^1)*str[6] + (31^0)*str[7]
9253+
//
9254+
// Rearranged:
9255+
// hash = (31^7)*str[0] + (31^3)*str[4]
9256+
// + (31^6)*str[1] + (31^2)*str[5]
9257+
// + (31^5)*str[2] + (31^1)*str[6]
9258+
// + (31^4)*str[3] + (31^0)*str[7]
9259+
//
9260+
// Factor out [31^3, 31^2, 31^1, 31^0]:
9261+
// hash = 31^3*((31^4)*str[0] + str[4]) Vector[0]
9262+
// + 31^2*((31^4)*str[1] + str[5]) Vector[1]
9263+
// + 31^1*((31^4)*str[2] + str[6]) Vector[2]
9264+
// + 31^0*((31^4)*str[3] + str[7]) Vector[3]
9265+
//
9266+
// Keep factoring out any 31^4 if possible (this example has no such case). If the string was 12 characters long then:
9267+
// 31^3*((31^8)*str[0] + (31^4)*str[4] + (31^0)*str[8]) would become 31^3*(31^4*((31^4)*str[0] + str[4]) + (31^0)*str[8])
9268+
//
9269+
// Vectorization is done by simultaneously calculating the four sums that hash is made of (each -> is a successive step):
9270+
// Vector[0] = str[0] -> multiply 31^4 -> add str[4] -> multiply 31^3
9271+
// Vector[1] = str[1] -> multiply 31^4 -> add str[5] -> multiply 31^2
9272+
// Vector[2] = str[2] -> multiply 31^4 -> add str[6] -> multiply 31^1
9273+
// Vector[3] = str[3] -> multiply 31^4 -> add str[7] -> multiply 1
9274+
//
9275+
// Adding these four vectorized values together produces the required hash.
9276+
// If the number of characters in the string is not a multiple of 4, then the remainder of the hash is calculated serially.
9277+
//
9278+
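// Implementation overview (NOTE: this sketch uses a serial loop for the remainder; the code below instead sets index = length mod 4 (or 4 when that is 0) and zero-pads the first vector so no serial loop is needed -- see "Prepend zeros"):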
9279+
//
9280+
// start_label
9281+
// if size < threshold, goto serial_label, current threshold is 4
9282+
// xmm0 = load 16-byte aligned constant [923521, 923521, 923521, 923521] (923521 = 31^4)
9283+
// xmm1 = 0
9284+
// SSEloop
9285+
// xmm2 = decompressed: load 8 byte value in lower 8 bytes.
9286+
// compressed: load 4 byte value in lower 4 bytes
9287+
// xmm1 = xmm1 * xmm0
9288+
// if(isCompressed)
9289+
// movzxbd xmm2, xmm2
9290+
// else
9291+
// movzxwd xmm2, xmm2
9292+
// xmm1 = xmm1 + xmm2
9293+
// i = i + 4;
9294+
// cmp i, end -3
9295+
// jge SSEloop
9296+
// xmm0 = load 16-byte aligned constant [31^3, 31^2, 31, 1]
9297+
// xmm1 = xmm1 * xmm0 value contains [a0, a1, a2, a3]
9298+
// xmm0 = xmm1
9299+
// xmm0 = xmm0 >> 64 bits
9300+
// xmm1 = xmm1 + xmm0 reduce add [a0+a2, a1+a3, .., ...]
9301+
// xmm0 = xmm1
9302+
// xmm0 = xmm0 >> 32 bits
9303+
// xmm1 = xmm1 + xmm0 reduce add [a0+a2 + a1+a3, .., .., ..]
9304+
// movd xmm1, GPR1
9305+
//
9306+
// serial_label
9307+
//
9308+
// cmp i end
9309+
// jle end
9310+
// serial_loop
9311+
// GPR2 = GPR1
9312+
// GPR1 = GPR1 << 5
9313+
// GPR1 = GPR1 - GPR2
9314+
// GPR2 = load c[i]
9315+
// add GPR1, GPR2
9316+
// inc i
9317+
// cmp i, end
9318+
// jl serial_loop
9319+
//
9320+
// end_label
92339321
static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR::CodeGenerator* cg)
92349322
{
9235-
TR::Register *hashResult = TR::TreeEvaluator::vectorizedHashCodeHelper(node, isCompressed ? TR::Int8 : TR::Int16, NULL, false, cg);
9236-
node->setRegister(hashResult);
9323+
TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
9324+
9325+
const int size = 4;
9326+
auto shift = isCompressed ? 0 : 1;
9327+
9328+
auto address = cg->evaluate(node->getChild(0));
9329+
auto length = cg->evaluate(node->getChild(2));
9330+
auto index = cg->allocateRegister();
9331+
auto hash = cg->allocateRegister();
9332+
auto tmp = cg->allocateRegister();
9333+
auto hashXMM = cg->allocateRegister(TR_VRF);
9334+
auto tmpXMM = cg->allocateRegister(TR_VRF);
9335+
auto multiplierXMM = cg->allocateRegister(TR_VRF);
9336+
9337+
auto begLabel = generateLabelSymbol(cg);
9338+
auto endLabel = generateLabelSymbol(cg);
9339+
auto loopLabel = generateLabelSymbol(cg);
9340+
begLabel->setStartInternalControlFlow();
9341+
endLabel->setEndInternalControlFlow();
9342+
auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
9343+
deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
9344+
deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9345+
deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9346+
deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9347+
deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9348+
deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
9349+
deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
9350+
deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9351+
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9352+
deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9353+
deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9354+
deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
9355+
9356+
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
9357+
generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
9358+
generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
9359+
9360+
// Prepend zeros
9361+
{
9362+
TR::Compilation *comp = cg->comp();
9363+
9364+
static uint64_t MASKDECOMPRESSED[] = { 0x0000000000000000ULL, 0xffffffffffffffffULL };
9365+
static uint64_t MASKCOMPRESSED[] = { 0xffffffff00000000ULL, 0x0000000000000000ULL };
9366+
generateRegMemInstruction(isCompressed ? TR::InstOpCode::MOVDRegMem : TR::InstOpCode::MOVQRegMem, node, hashXMM, generateX86MemoryReference(address, index, shift, -(size << shift) + TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9367+
generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, tmp, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, isCompressed ? MASKCOMPRESSED : MASKDECOMPRESSED), cg), cg);
9368+
9369+
auto mr = generateX86MemoryReference(tmp, index, shift, 0, cg);
9370+
if (comp->target().cpu.supportsAVX())
9371+
{
9372+
generateRegMemInstruction(TR::InstOpCode::PANDRegMem, node, hashXMM, mr, cg);
9373+
}
9374+
else
9375+
{
9376+
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXMM, mr, cg);
9377+
generateRegRegInstruction(TR::InstOpCode::PANDRegReg, node, hashXMM, tmpXMM, cg);
9378+
}
9379+
generateRegRegInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegReg : TR::InstOpCode::PMOVZXWDRegReg, node, hashXMM, hashXMM, cg);
9380+
}
9381+
9382+
// Reduction Loop
9383+
{
9384+
static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
9385+
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9386+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9387+
generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9388+
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9389+
generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
9390+
generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
9391+
generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9392+
generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
9393+
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9394+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9395+
generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
9396+
generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
9397+
}
9398+
9399+
// Finalization
9400+
{
9401+
static uint32_t multiplier[] = { 31*31*31, 31*31, 31, 1 };
9402+
generateRegMemInstruction(TR::InstOpCode::PMULLDRegMem, node, hashXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9403+
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x0e, cg);
9404+
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9405+
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x01, cg);
9406+
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9407+
}
9408+
9409+
generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
92379410

9238-
return hashResult;
9411+
cg->stopUsingRegister(index);
9412+
cg->stopUsingRegister(tmp);
9413+
cg->stopUsingRegister(hashXMM);
9414+
cg->stopUsingRegister(tmpXMM);
9415+
cg->stopUsingRegister(multiplierXMM);
9416+
9417+
node->setRegister(hash);
9418+
cg->decReferenceCount(node->getChild(0));
9419+
cg->recursivelyDecReferenceCount(node->getChild(1));
9420+
cg->decReferenceCount(node->getChild(2));
9421+
return hash;
92399422
}
92409423

92419424
TR::Register* J9::X86::TreeEvaluator::inlineVectorizedHashCode(TR::Node* node, TR::CodeGenerator* cg)
@@ -12036,14 +12219,14 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c
1203612219
return TR::TreeEvaluator::encodeUTF16Evaluator(node, cg);
1203712220

1203812221
case TR::java_lang_String_hashCodeImplDecompressed:
12039-
if (cg->getSupportsInlineStringHashCode() && !cg->getCurrentBlock()->isCold())
12222+
if (cg->getSupportsInlineStringHashCode())
1204012223
returnRegister = inlineStringHashCode(node, false, cg);
1204112224

1204212225
callInlined = (returnRegister != NULL);
1204312226
break;
1204412227

1204512228
case TR::java_lang_String_hashCodeImplCompressed:
12046-
if (cg->getSupportsInlineStringHashCode() && !cg->getCurrentBlock()->isCold())
12229+
if (cg->getSupportsInlineStringHashCode())
1204712230
returnRegister = inlineStringHashCode(node, true, cg);
1204812231

1204912232
callInlined = (returnRegister != NULL);

0 commit comments

Comments
 (0)