@@ -1134,201 +1134,60 @@ bb:
1134
1134
define dso_local i32 @sad_double_reduction_abs (<16 x i8 >* %arg , <16 x i8 >* %arg1 , <16 x i8 >* %arg2 , <16 x i8 >* %arg3 ) {
1135
1135
; SSE2-LABEL: sad_double_reduction_abs:
1136
1136
; SSE2: # %bb.0: # %bb
1137
- ; SSE2-NEXT: movdqu (%rdi), %xmm11
1138
- ; SSE2-NEXT: movdqu (%rsi), %xmm2
1139
- ; SSE2-NEXT: pxor %xmm4, %xmm4
1140
- ; SSE2-NEXT: movdqa %xmm11, %xmm10
1141
- ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
1142
- ; SSE2-NEXT: movdqa %xmm10, %xmm8
1143
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
1144
- ; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
1145
- ; SSE2-NEXT: movdqa %xmm11, %xmm9
1146
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
1147
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1148
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
1149
- ; SSE2-NEXT: movdqa %xmm2, %xmm5
1150
- ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1151
- ; SSE2-NEXT: movdqa %xmm5, %xmm6
1152
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1153
- ; SSE2-NEXT: psubd %xmm6, %xmm8
1154
- ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1155
- ; SSE2-NEXT: movdqa %xmm2, %xmm6
1156
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1157
- ; SSE2-NEXT: psubd %xmm6, %xmm9
1158
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1159
- ; SSE2-NEXT: psubd %xmm5, %xmm10
1160
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1161
- ; SSE2-NEXT: psubd %xmm2, %xmm11
1162
- ; SSE2-NEXT: movdqa %xmm8, %xmm2
1163
- ; SSE2-NEXT: psrad $31, %xmm2
1164
- ; SSE2-NEXT: pxor %xmm2, %xmm8
1165
- ; SSE2-NEXT: psubd %xmm2, %xmm8
1166
- ; SSE2-NEXT: movdqa %xmm9, %xmm2
1167
- ; SSE2-NEXT: psrad $31, %xmm2
1168
- ; SSE2-NEXT: pxor %xmm2, %xmm9
1169
- ; SSE2-NEXT: psubd %xmm2, %xmm9
1170
- ; SSE2-NEXT: movdqa %xmm10, %xmm2
1171
- ; SSE2-NEXT: psrad $31, %xmm2
1172
- ; SSE2-NEXT: pxor %xmm2, %xmm10
1173
- ; SSE2-NEXT: psubd %xmm2, %xmm10
1174
- ; SSE2-NEXT: movdqa %xmm11, %xmm2
1175
- ; SSE2-NEXT: psrad $31, %xmm2
1176
- ; SSE2-NEXT: pxor %xmm2, %xmm11
1177
- ; SSE2-NEXT: psubd %xmm2, %xmm11
1178
- ; SSE2-NEXT: movdqu (%rdx), %xmm5
1179
- ; SSE2-NEXT: movdqu (%rcx), %xmm0
1180
- ; SSE2-NEXT: movdqa %xmm5, %xmm2
1181
- ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1182
- ; SSE2-NEXT: movdqa %xmm2, %xmm6
1183
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1184
- ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1185
- ; SSE2-NEXT: movdqa %xmm5, %xmm7
1186
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1187
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1188
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1189
- ; SSE2-NEXT: movdqa %xmm0, %xmm1
1190
- ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1191
- ; SSE2-NEXT: movdqa %xmm1, %xmm3
1192
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1193
- ; SSE2-NEXT: psubd %xmm3, %xmm6
1194
- ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1195
- ; SSE2-NEXT: movdqa %xmm0, %xmm3
1196
- ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1197
- ; SSE2-NEXT: psubd %xmm3, %xmm7
1198
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1199
- ; SSE2-NEXT: psubd %xmm1, %xmm2
1200
- ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1201
- ; SSE2-NEXT: psubd %xmm0, %xmm5
1202
- ; SSE2-NEXT: movdqa %xmm6, %xmm0
1203
- ; SSE2-NEXT: psrad $31, %xmm0
1204
- ; SSE2-NEXT: pxor %xmm0, %xmm6
1205
- ; SSE2-NEXT: psubd %xmm0, %xmm6
1206
- ; SSE2-NEXT: movdqa %xmm7, %xmm0
1207
- ; SSE2-NEXT: psrad $31, %xmm0
1208
- ; SSE2-NEXT: pxor %xmm0, %xmm7
1209
- ; SSE2-NEXT: psubd %xmm0, %xmm7
1210
- ; SSE2-NEXT: paddd %xmm9, %xmm7
1211
- ; SSE2-NEXT: paddd %xmm8, %xmm7
1212
- ; SSE2-NEXT: paddd %xmm6, %xmm7
1213
- ; SSE2-NEXT: movdqa %xmm2, %xmm0
1214
- ; SSE2-NEXT: psrad $31, %xmm0
1215
- ; SSE2-NEXT: pxor %xmm0, %xmm2
1216
- ; SSE2-NEXT: psubd %xmm0, %xmm2
1217
- ; SSE2-NEXT: movdqa %xmm5, %xmm0
1218
- ; SSE2-NEXT: psrad $31, %xmm0
1219
- ; SSE2-NEXT: pxor %xmm0, %xmm5
1220
- ; SSE2-NEXT: psubd %xmm0, %xmm5
1221
- ; SSE2-NEXT: paddd %xmm11, %xmm5
1222
- ; SSE2-NEXT: paddd %xmm10, %xmm5
1223
- ; SSE2-NEXT: paddd %xmm7, %xmm5
1224
- ; SSE2-NEXT: paddd %xmm2, %xmm5
1225
- ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
1226
- ; SSE2-NEXT: paddd %xmm5, %xmm0
1137
+ ; SSE2-NEXT: movdqu (%rdi), %xmm0
1138
+ ; SSE2-NEXT: movdqu (%rsi), %xmm1
1139
+ ; SSE2-NEXT: psadbw %xmm0, %xmm1
1140
+ ; SSE2-NEXT: movdqu (%rdx), %xmm0
1141
+ ; SSE2-NEXT: movdqu (%rcx), %xmm2
1142
+ ; SSE2-NEXT: psadbw %xmm0, %xmm2
1143
+ ; SSE2-NEXT: paddd %xmm1, %xmm2
1144
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
1145
+ ; SSE2-NEXT: paddd %xmm2, %xmm0
1227
1146
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1228
- ; SSE2-NEXT: paddd %xmm0, %xmm1
1147
+ ; SSE2-NEXT: por %xmm0, %xmm1
1229
1148
; SSE2-NEXT: movd %xmm1, %eax
1230
1149
; SSE2-NEXT: retq
1231
1150
;
1232
1151
; AVX1-LABEL: sad_double_reduction_abs:
1233
1152
; AVX1: # %bb.0: # %bb
1234
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1235
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1236
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1237
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1238
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1239
- ; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
1240
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1241
- ; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
1242
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1243
- ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
1244
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1245
- ; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
1246
- ; AVX1-NEXT: vpabsd %xmm0, %xmm0
1247
- ; AVX1-NEXT: vpabsd %xmm1, %xmm1
1248
- ; AVX1-NEXT: vpabsd %xmm2, %xmm8
1249
- ; AVX1-NEXT: vpabsd %xmm3, %xmm3
1250
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1251
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1252
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1253
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1254
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1255
- ; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
1256
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1257
- ; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
1258
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1259
- ; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
1260
- ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1261
- ; AVX1-NEXT: vpsubd %xmm6, %xmm7, %xmm6
1262
- ; AVX1-NEXT: vpabsd %xmm2, %xmm2
1263
- ; AVX1-NEXT: vpabsd %xmm4, %xmm4
1264
- ; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
1265
- ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1266
- ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
1267
- ; AVX1-NEXT: vpabsd %xmm5, %xmm1
1268
- ; AVX1-NEXT: vpabsd %xmm6, %xmm2
1269
- ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1270
- ; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
1271
- ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
1153
+ ; AVX1-NEXT: vmovdqu (%rdi), %xmm0
1154
+ ; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1155
+ ; AVX1-NEXT: vmovdqu (%rdx), %xmm1
1156
+ ; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1272
1157
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1273
1158
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1274
1159
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1275
1160
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1276
- ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1161
+ ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1277
1162
; AVX1-NEXT: vmovd %xmm0, %eax
1278
1163
; AVX1-NEXT: retq
1279
1164
;
1280
1165
; AVX2-LABEL: sad_double_reduction_abs:
1281
1166
; AVX2: # %bb.0: # %bb
1282
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1283
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1284
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1285
- ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
1286
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1287
- ; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1
1288
- ; AVX2-NEXT: vpabsd %ymm0, %ymm0
1289
- ; AVX2-NEXT: vpabsd %ymm1, %ymm1
1290
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1291
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1292
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1293
- ; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
1294
- ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1295
- ; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
1296
- ; AVX2-NEXT: vpabsd %ymm2, %ymm2
1297
- ; AVX2-NEXT: vpabsd %ymm3, %ymm3
1298
- ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
1299
- ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1300
- ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
1301
- ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1302
- ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1167
+ ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
1168
+ ; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1169
+ ; AVX2-NEXT: vmovdqu (%rdx), %xmm1
1170
+ ; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1171
+ ; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1303
1172
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1304
1173
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1305
1174
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1306
- ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1175
+ ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1307
1176
; AVX2-NEXT: vmovd %xmm0, %eax
1308
- ; AVX2-NEXT: vzeroupper
1309
1177
; AVX2-NEXT: retq
1310
1178
;
1311
1179
; AVX512-LABEL: sad_double_reduction_abs:
1312
1180
; AVX512: # %bb.0: # %bb
1313
- ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1314
- ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1315
- ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1316
- ; AVX512-NEXT: vpabsd %zmm0, %zmm0
1317
- ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1318
- ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1319
- ; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm1
1320
- ; AVX512-NEXT: vpabsd %zmm1, %zmm1
1321
- ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
1322
- ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1323
- ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1324
- ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1325
- ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1181
+ ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
1182
+ ; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
1183
+ ; AVX512-NEXT: vmovdqu (%rdx), %xmm1
1184
+ ; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
1185
+ ; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1326
1186
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1327
1187
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1328
1188
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1329
1189
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1330
1190
; AVX512-NEXT: vmovd %xmm0, %eax
1331
- ; AVX512-NEXT: vzeroupper
1332
1191
; AVX512-NEXT: retq
1333
1192
bb:
1334
1193
%tmp = load <16 x i8 >, <16 x i8 >* %arg , align 1
0 commit comments