@@ -9230,12 +9230,195 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
9230
9230
return result;
9231
9231
}
9232
9232
9233
+ // Convert the serial String.hashCode computation into a vectorized form implemented with SSE instructions
9234
+ //
9235
+ // Conversion process example:
9236
+ //
9237
+ // str[8] = example string representing 8 characters (compressed or decompressed)
9238
+ //
9239
+ // The serial method for creating the hash:
9240
+ // hash = 0, offset = 0, count = 8
9241
+ // for (int i = offset; i < offset+count; ++i) {
9242
+ // hash = (hash << 5) - hash + str[i];
9243
+ // }
9244
+ //
9245
+ // Note that ((hash << 5) - hash) is equivalent to hash * 31
9246
+ //
9247
+ // Expanding out the for loop:
9248
+ // hash = ((((((((0*31+str[0])*31+str[1])*31+str[2])*31+str[3])*31+str[4])*31+str[5])*31+str[6])*31+str[7])
9249
+ //
9250
+ // Simplified:
9251
+ // hash = (31^7)*str[0] + (31^6)*str[1] + (31^5)*str[2] + (31^4)*str[3]
9252
+ // + (31^3)*str[4] + (31^2)*str[5] + (31^1)*str[6] + (31^0)*str[7]
9253
+ //
9254
+ // Rearranged:
9255
+ // hash = (31^7)*str[0] + (31^3)*str[4]
9256
+ // + (31^6)*str[1] + (31^2)*str[5]
9257
+ // + (31^5)*str[2] + (31^1)*str[6]
9258
+ // + (31^4)*str[3] + (31^0)*str[7]
9259
+ //
9260
+ // Factor out [31^3, 31^2, 31^1, 31^0]:
9261
+ // hash = 31^3*((31^4)*str[0] + str[4]) Vector[0]
9262
+ // + 31^2*((31^4)*str[1] + str[5]) Vector[1]
9263
+ // + 31^1*((31^4)*str[2] + str[6]) Vector[2]
9264
+ // + 31^0*((31^4)*str[3] + str[7]) Vector[3]
9265
+ //
9266
+ // Keep factoring out any 31^4 if possible (this example has no such case). If the string was 12 characters long then:
9267
+ // 31^3*((31^8)*str[0] + (31^4)*str[4] + (31^0)*str[8]) would become 31^3*(31^4*((31^4)*str[0] + str[4]) + (31^0)*str[8])
9268
+ //
9269
+ // Vectorization is done by simultaneously calculating the four sums that hash is made of (each -> is a successive step):
9270
+ // Vector[0] = str[0] -> multiply 31^4 -> add str[4] -> multiply 31^3
9271
+ // Vector[1] = str[1] -> multiply 31^4 -> add str[5] -> multiply 31^2
9272
+ // Vector[2] = str[2] -> multiply 31^4 -> add str[6] -> multiply 31^1
9273
+ // Vector[3] = str[3] -> multiply 31^4 -> add str[7] -> multiply 1
9274
+ //
9275
+ // Adding these four vectorized values together produces the required hash.
9276
+ // If the number of characters in the string is not a multiple of 4, the leading (length mod 4) characters are loaded as a partial first vector with zeros prepended, so every vector step still processes 4 lanes.
9277
+ //
9278
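+ // Implementation overview (NOTE(review): this pseudocode shows a serial remainder loop, whereas the code below has a "Prepend zeros" step and no serial loop - confirm and update):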
9279
+ //
9280
+ // start_label
9281
+ // if size < threshold, goto serial_label, current threshold is 4
9282
+ // xmm0 = load 16-byte aligned constant [923521, 923521, 923521, 923521] (923521 = 31^4)
9283
+ // xmm1 = 0
9284
+ // SSEloop
9285
+ // xmm2 = decompressed: load 8 byte value in lower 8 bytes.
9286
+ // compressed: load 4 byte value in lower 4 bytes
9287
+ // xmm1 = xmm1 * xmm0
9288
+ // if(isCompressed)
9289
+ // movzxbd xmm2, xmm2
9290
+ // else
9291
+ // movzxwd xmm2, xmm2
9292
+ // xmm1 = xmm1 + xmm2
9293
+ // i = i + 4;
9294
+ // cmp i, end -3
9295
+ // jl SSEloop
9296
+ // xmm0 = load 16-byte aligned constant [31^3, 31^2, 31, 1]
9297
+ // xmm1 = xmm1 * xmm0 value contains [a0, a1, a2, a3]
9298
+ // xmm0 = xmm1
9299
+ // xmm0 = xmm0 >> 64 bits
9300
+ // xmm1 = xmm1 + xmm0 reduce add [a0+a2, a1+a3, .., ...]
9301
+ // xmm0 = xmm1
9302
+ // xmm0 = xmm0 >> 32 bits
9303
+ // xmm1 = xmm1 + xmm0 reduce add [a0+a2 + a1+a3, .., .., ..]
9304
+ // movd xmm1, GPR1
9305
+ //
9306
+ // serial_label
9307
+ //
9308
+ // cmp i end
9309
+ // jge end_label
9310
+ // serial_loop
9311
+ // GPR2 = GPR1
9312
+ // GPR1 = GPR1 << 5
9313
+ // GPR1 = GPR1 - GPR2
9314
+ // GPR2 = load c[i]
9315
+ // add GPR1, GPR2
9316
+ // inc i
9317
+ // cmp i, end
9318
+ // jl serial_loop
9319
+ //
9320
+ // end_label
9233
9321
static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR::CodeGenerator* cg)
9234
9322
{
9235
- TR::Register *hashResult = TR::TreeEvaluator::vectorizedHashCodeHelper(node, isCompressed ? TR::Int8 : TR::Int16, NULL, false, cg);
9236
- node->setRegister(hashResult);
9323
+ TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
9324
+
9325
+ const int size = 4;
9326
+ auto shift = isCompressed ? 0 : 1;
9327
+
9328
+ auto address = cg->evaluate(node->getChild(0));
9329
+ auto length = cg->evaluate(node->getChild(2));
9330
+ auto index = cg->allocateRegister();
9331
+ auto hash = cg->allocateRegister();
9332
+ auto tmp = cg->allocateRegister();
9333
+ auto hashXMM = cg->allocateRegister(TR_VRF);
9334
+ auto tmpXMM = cg->allocateRegister(TR_VRF);
9335
+ auto multiplierXMM = cg->allocateRegister(TR_VRF);
9336
+
9337
+ auto begLabel = generateLabelSymbol(cg);
9338
+ auto endLabel = generateLabelSymbol(cg);
9339
+ auto loopLabel = generateLabelSymbol(cg);
9340
+ begLabel->setStartInternalControlFlow();
9341
+ endLabel->setEndInternalControlFlow();
9342
+ auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
9343
+ deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
9344
+ deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9345
+ deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9346
+ deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9347
+ deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9348
+ deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
9349
+ deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
9350
+ deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9351
+ deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9352
+ deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9353
+ deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9354
+ deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
9355
+
9356
+ generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
9357
+ generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
9358
+ generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
9359
+
9360
+ // Prepend zeros
9361
+ {
9362
+ TR::Compilation *comp = cg->comp();
9363
+
9364
+ static uint64_t MASKDECOMPRESSED[] = { 0x0000000000000000ULL, 0xffffffffffffffffULL };
9365
+ static uint64_t MASKCOMPRESSED[] = { 0xffffffff00000000ULL, 0x0000000000000000ULL };
9366
+ generateRegMemInstruction(isCompressed ? TR::InstOpCode::MOVDRegMem : TR::InstOpCode::MOVQRegMem, node, hashXMM, generateX86MemoryReference(address, index, shift, -(size << shift) + TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9367
+ generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, tmp, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, isCompressed ? MASKCOMPRESSED : MASKDECOMPRESSED), cg), cg);
9368
+
9369
+ auto mr = generateX86MemoryReference(tmp, index, shift, 0, cg);
9370
+ if (comp->target().cpu.supportsAVX())
9371
+ {
9372
+ generateRegMemInstruction(TR::InstOpCode::PANDRegMem, node, hashXMM, mr, cg);
9373
+ }
9374
+ else
9375
+ {
9376
+ generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXMM, mr, cg);
9377
+ generateRegRegInstruction(TR::InstOpCode::PANDRegReg, node, hashXMM, tmpXMM, cg);
9378
+ }
9379
+ generateRegRegInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegReg : TR::InstOpCode::PMOVZXWDRegReg, node, hashXMM, hashXMM, cg);
9380
+ }
9381
+
9382
+ // Reduction Loop
9383
+ {
9384
+ static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
9385
+ generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9386
+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9387
+ generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9388
+ generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9389
+ generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
9390
+ generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
9391
+ generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9392
+ generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
9393
+ generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9394
+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9395
+ generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
9396
+ generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
9397
+ }
9398
+
9399
+ // Finalization
9400
+ {
9401
+ static uint32_t multiplier[] = { 31*31*31, 31*31, 31, 1 };
9402
+ generateRegMemInstruction(TR::InstOpCode::PMULLDRegMem, node, hashXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9403
+ generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x0e, cg);
9404
+ generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9405
+ generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x01, cg);
9406
+ generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9407
+ }
9408
+
9409
+ generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
9237
9410
9238
- return hashResult;
9411
+ cg->stopUsingRegister(index);
9412
+ cg->stopUsingRegister(tmp);
9413
+ cg->stopUsingRegister(hashXMM);
9414
+ cg->stopUsingRegister(tmpXMM);
9415
+ cg->stopUsingRegister(multiplierXMM);
9416
+
9417
+ node->setRegister(hash);
9418
+ cg->decReferenceCount(node->getChild(0));
9419
+ cg->recursivelyDecReferenceCount(node->getChild(1));
9420
+ cg->decReferenceCount(node->getChild(2));
9421
+ return hash;
9239
9422
}
9240
9423
9241
9424
TR::Register* J9::X86::TreeEvaluator::inlineVectorizedHashCode(TR::Node* node, TR::CodeGenerator* cg)
@@ -12036,14 +12219,14 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c
12036
12219
return TR::TreeEvaluator::encodeUTF16Evaluator(node, cg);
12037
12220
12038
12221
case TR::java_lang_String_hashCodeImplDecompressed:
12039
- if (cg->getSupportsInlineStringHashCode() && !cg->getCurrentBlock()->isCold() )
12222
+ if (cg->getSupportsInlineStringHashCode())
12040
12223
returnRegister = inlineStringHashCode(node, false, cg);
12041
12224
12042
12225
callInlined = (returnRegister != NULL);
12043
12226
break;
12044
12227
12045
12228
case TR::java_lang_String_hashCodeImplCompressed:
12046
- if (cg->getSupportsInlineStringHashCode() && !cg->getCurrentBlock()->isCold() )
12229
+ if (cg->getSupportsInlineStringHashCode())
12047
12230
returnRegister = inlineStringHashCode(node, true, cg);
12048
12231
12049
12232
callInlined = (returnRegister != NULL);
0 commit comments