Skip to content

Commit ca33d74

Browse files
[X86] Improve x86-partial-reduction to support abs intrinsic
The current implementation only recognizes an absolute-value operation implemented by a select instruction. This patch adds support for the abs intrinsic. Differential Revision: https://reviews.llvm.org/D122777
1 parent 72ec2f7 commit ca33d74

File tree

2 files changed

+47
-181
lines changed

2 files changed

+47
-181
lines changed

llvm/lib/Target/X86/X86PartialReduction.cpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919
#include "llvm/IR/Constants.h"
2020
#include "llvm/IR/IRBuilder.h"
2121
#include "llvm/IR/Instructions.h"
22+
#include "llvm/IR/IntrinsicInst.h"
2223
#include "llvm/IR/IntrinsicsX86.h"
2324
#include "llvm/IR/Operator.h"
25+
#include "llvm/IR/PatternMatch.h"
2426
#include "llvm/Pass.h"
2527
#include "llvm/Support/KnownBits.h"
2628

@@ -220,16 +222,21 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
220222
if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
221223
return false;
222224

223-
// Operand should be a select.
224-
auto *SI = dyn_cast<SelectInst>(Op);
225-
if (!SI)
226-
return false;
227-
228-
// Select needs to implement absolute value.
229-
Value *LHS, *RHS;
230-
auto SPR = matchSelectPattern(SI, LHS, RHS);
231-
if (SPR.Flavor != SPF_ABS)
232-
return false;
225+
Value *LHS;
226+
if (match(Op, PatternMatch::m_Intrinsic<Intrinsic::abs>())) {
227+
LHS = Op->getOperand(0);
228+
} else {
229+
// Operand should be a select.
230+
auto *SI = dyn_cast<SelectInst>(Op);
231+
if (!SI)
232+
return false;
233+
234+
Value *RHS;
235+
// Select needs to implement absolute value.
236+
auto SPR = matchSelectPattern(SI, LHS, RHS);
237+
if (SPR.Flavor != SPF_ABS)
238+
return false;
239+
}
233240

234241
// Need a subtract of two values.
235242
auto *Sub = dyn_cast<BinaryOperator>(LHS);
@@ -253,7 +260,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
253260
if (!Op0 || !Op1)
254261
return false;
255262

256-
IRBuilder<> Builder(SI);
263+
IRBuilder<> Builder(Op);
257264

258265
auto *OpTy = cast<FixedVectorType>(Op->getType());
259266
unsigned NumElts = OpTy->getNumElements();
@@ -271,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
271278
IntrinsicNumElts = 16;
272279
}
273280

274-
Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
281+
Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID);
275282

276283
if (NumElts < 16) {
277284
// Pad input with zeroes.
@@ -336,8 +343,8 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
336343
Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
337344
}
338345

339-
SI->replaceAllUsesWith(Ops[0]);
340-
SI->eraseFromParent();
346+
Op->replaceAllUsesWith(Ops[0]);
347+
Op->eraseFromParent();
341348

342349
return true;
343350
}

llvm/test/CodeGen/X86/sad.ll

Lines changed: 26 additions & 167 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,201 +1134,60 @@ bb:
11341134
define dso_local i32 @sad_double_reduction_abs(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %arg2, <16 x i8>* %arg3) {
11351135
; SSE2-LABEL: sad_double_reduction_abs:
11361136
; SSE2: # %bb.0: # %bb
1137-
; SSE2-NEXT: movdqu (%rdi), %xmm11
1138-
; SSE2-NEXT: movdqu (%rsi), %xmm2
1139-
; SSE2-NEXT: pxor %xmm4, %xmm4
1140-
; SSE2-NEXT: movdqa %xmm11, %xmm10
1141-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
1142-
; SSE2-NEXT: movdqa %xmm10, %xmm8
1143-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
1144-
; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
1145-
; SSE2-NEXT: movdqa %xmm11, %xmm9
1146-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
1147-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1148-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
1149-
; SSE2-NEXT: movdqa %xmm2, %xmm5
1150-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1151-
; SSE2-NEXT: movdqa %xmm5, %xmm6
1152-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1153-
; SSE2-NEXT: psubd %xmm6, %xmm8
1154-
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1155-
; SSE2-NEXT: movdqa %xmm2, %xmm6
1156-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1157-
; SSE2-NEXT: psubd %xmm6, %xmm9
1158-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1159-
; SSE2-NEXT: psubd %xmm5, %xmm10
1160-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1161-
; SSE2-NEXT: psubd %xmm2, %xmm11
1162-
; SSE2-NEXT: movdqa %xmm8, %xmm2
1163-
; SSE2-NEXT: psrad $31, %xmm2
1164-
; SSE2-NEXT: pxor %xmm2, %xmm8
1165-
; SSE2-NEXT: psubd %xmm2, %xmm8
1166-
; SSE2-NEXT: movdqa %xmm9, %xmm2
1167-
; SSE2-NEXT: psrad $31, %xmm2
1168-
; SSE2-NEXT: pxor %xmm2, %xmm9
1169-
; SSE2-NEXT: psubd %xmm2, %xmm9
1170-
; SSE2-NEXT: movdqa %xmm10, %xmm2
1171-
; SSE2-NEXT: psrad $31, %xmm2
1172-
; SSE2-NEXT: pxor %xmm2, %xmm10
1173-
; SSE2-NEXT: psubd %xmm2, %xmm10
1174-
; SSE2-NEXT: movdqa %xmm11, %xmm2
1175-
; SSE2-NEXT: psrad $31, %xmm2
1176-
; SSE2-NEXT: pxor %xmm2, %xmm11
1177-
; SSE2-NEXT: psubd %xmm2, %xmm11
1178-
; SSE2-NEXT: movdqu (%rdx), %xmm5
1179-
; SSE2-NEXT: movdqu (%rcx), %xmm0
1180-
; SSE2-NEXT: movdqa %xmm5, %xmm2
1181-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1182-
; SSE2-NEXT: movdqa %xmm2, %xmm6
1183-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1184-
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1185-
; SSE2-NEXT: movdqa %xmm5, %xmm7
1186-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1187-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1188-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1189-
; SSE2-NEXT: movdqa %xmm0, %xmm1
1190-
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1191-
; SSE2-NEXT: movdqa %xmm1, %xmm3
1192-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1193-
; SSE2-NEXT: psubd %xmm3, %xmm6
1194-
; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1195-
; SSE2-NEXT: movdqa %xmm0, %xmm3
1196-
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1197-
; SSE2-NEXT: psubd %xmm3, %xmm7
1198-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1199-
; SSE2-NEXT: psubd %xmm1, %xmm2
1200-
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1201-
; SSE2-NEXT: psubd %xmm0, %xmm5
1202-
; SSE2-NEXT: movdqa %xmm6, %xmm0
1203-
; SSE2-NEXT: psrad $31, %xmm0
1204-
; SSE2-NEXT: pxor %xmm0, %xmm6
1205-
; SSE2-NEXT: psubd %xmm0, %xmm6
1206-
; SSE2-NEXT: movdqa %xmm7, %xmm0
1207-
; SSE2-NEXT: psrad $31, %xmm0
1208-
; SSE2-NEXT: pxor %xmm0, %xmm7
1209-
; SSE2-NEXT: psubd %xmm0, %xmm7
1210-
; SSE2-NEXT: paddd %xmm9, %xmm7
1211-
; SSE2-NEXT: paddd %xmm8, %xmm7
1212-
; SSE2-NEXT: paddd %xmm6, %xmm7
1213-
; SSE2-NEXT: movdqa %xmm2, %xmm0
1214-
; SSE2-NEXT: psrad $31, %xmm0
1215-
; SSE2-NEXT: pxor %xmm0, %xmm2
1216-
; SSE2-NEXT: psubd %xmm0, %xmm2
1217-
; SSE2-NEXT: movdqa %xmm5, %xmm0
1218-
; SSE2-NEXT: psrad $31, %xmm0
1219-
; SSE2-NEXT: pxor %xmm0, %xmm5
1220-
; SSE2-NEXT: psubd %xmm0, %xmm5
1221-
; SSE2-NEXT: paddd %xmm11, %xmm5
1222-
; SSE2-NEXT: paddd %xmm10, %xmm5
1223-
; SSE2-NEXT: paddd %xmm7, %xmm5
1224-
; SSE2-NEXT: paddd %xmm2, %xmm5
1225-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
1226-
; SSE2-NEXT: paddd %xmm5, %xmm0
1137+
; SSE2-NEXT: movdqu (%rdi), %xmm0
1138+
; SSE2-NEXT: movdqu (%rsi), %xmm1
1139+
; SSE2-NEXT: psadbw %xmm0, %xmm1
1140+
; SSE2-NEXT: movdqu (%rdx), %xmm0
1141+
; SSE2-NEXT: movdqu (%rcx), %xmm2
1142+
; SSE2-NEXT: psadbw %xmm0, %xmm2
1143+
; SSE2-NEXT: paddd %xmm1, %xmm2
1144+
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
1145+
; SSE2-NEXT: paddd %xmm2, %xmm0
12271146
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1228-
; SSE2-NEXT: paddd %xmm0, %xmm1
1147+
; SSE2-NEXT: por %xmm0, %xmm1
12291148
; SSE2-NEXT: movd %xmm1, %eax
12301149
; SSE2-NEXT: retq
12311150
;
12321151
; AVX1-LABEL: sad_double_reduction_abs:
12331152
; AVX1: # %bb.0: # %bb
1234-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1235-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1236-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1237-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1238-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1239-
; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
1240-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1241-
; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
1242-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1243-
; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
1244-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1245-
; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
1246-
; AVX1-NEXT: vpabsd %xmm0, %xmm0
1247-
; AVX1-NEXT: vpabsd %xmm1, %xmm1
1248-
; AVX1-NEXT: vpabsd %xmm2, %xmm8
1249-
; AVX1-NEXT: vpabsd %xmm3, %xmm3
1250-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1251-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1252-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1253-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1254-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1255-
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
1256-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1257-
; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
1258-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1259-
; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
1260-
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1261-
; AVX1-NEXT: vpsubd %xmm6, %xmm7, %xmm6
1262-
; AVX1-NEXT: vpabsd %xmm2, %xmm2
1263-
; AVX1-NEXT: vpabsd %xmm4, %xmm4
1264-
; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
1265-
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1266-
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
1267-
; AVX1-NEXT: vpabsd %xmm5, %xmm1
1268-
; AVX1-NEXT: vpabsd %xmm6, %xmm2
1269-
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1270-
; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
1271-
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
1153+
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
1154+
; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1155+
; AVX1-NEXT: vmovdqu (%rdx), %xmm1
1156+
; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
12721157
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
12731158
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
12741159
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
12751160
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1276-
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1161+
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
12771162
; AVX1-NEXT: vmovd %xmm0, %eax
12781163
; AVX1-NEXT: retq
12791164
;
12801165
; AVX2-LABEL: sad_double_reduction_abs:
12811166
; AVX2: # %bb.0: # %bb
1282-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1283-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1284-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1285-
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1286-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1287-
; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1
1288-
; AVX2-NEXT: vpabsd %ymm0, %ymm0
1289-
; AVX2-NEXT: vpabsd %ymm1, %ymm1
1290-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1291-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1292-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1293-
; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
1294-
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1295-
; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
1296-
; AVX2-NEXT: vpabsd %ymm2, %ymm2
1297-
; AVX2-NEXT: vpabsd %ymm3, %ymm3
1298-
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
1299-
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1300-
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
1301-
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1302-
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1167+
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1168+
; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1169+
; AVX2-NEXT: vmovdqu (%rdx), %xmm1
1170+
; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1171+
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
13031172
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
13041173
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
13051174
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1306-
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1175+
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
13071176
; AVX2-NEXT: vmovd %xmm0, %eax
1308-
; AVX2-NEXT: vzeroupper
13091177
; AVX2-NEXT: retq
13101178
;
13111179
; AVX512-LABEL: sad_double_reduction_abs:
13121180
; AVX512: # %bb.0: # %bb
1313-
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1314-
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1315-
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1316-
; AVX512-NEXT: vpabsd %zmm0, %zmm0
1317-
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1318-
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1319-
; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm1
1320-
; AVX512-NEXT: vpabsd %zmm1, %zmm1
1321-
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
1322-
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1323-
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1324-
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1325-
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1181+
; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1182+
; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1183+
; AVX512-NEXT: vmovdqu (%rdx), %xmm1
1184+
; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1185+
; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
13261186
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
13271187
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
13281188
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
13291189
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
13301190
; AVX512-NEXT: vmovd %xmm0, %eax
1331-
; AVX512-NEXT: vzeroupper
13321191
; AVX512-NEXT: retq
13331192
bb:
13341193
%tmp = load <16 x i8>, <16 x i8>* %arg, align 1

0 commit comments

Comments
 (0)