|
21 | 21 | #include "libc/nexgen32e/x86feature.h"
|
22 | 22 | #include "libc/str/str.h"
|
23 | 23 |
|
/* pmovmskb: pack the most significant bit of each of the 16 bytes of an
   SSE register into the low 16 bits of an integer result. Applied to a
   vector comparison result (whose lanes are all-ones or all-zeros), this
   yields one mask bit per byte lane. */
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)

/* 16 chars viewed as one SSE vector; __aligned__(1) tells the compiler
   the data may be unaligned, so dereferencing casted pointers below is
   legal (emits unaligned loads). */
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
25 | 27 |
|
| 28 | +#if defined(__x86_64__) && !defined(__chibicc__) |
| 29 | + |
/**
 * Compares memory with 16-byte SSE loads (baseline x86-64 path).
 *
 * Returns the difference between the first pair of mismatching bytes
 * (as unsigned chars), or zero if the regions are equal.
 *
 * NOTE(review): assumes n >= 16 — the tail compares below read from
 * p + n - 16 / q + n - 16. The visible caller (memcmp) only takes this
 * path when n > 16; confirm no other callers exist.
 */
static dontinline antiquity int memcmp_sse(const unsigned char *p,
                                           const unsigned char *q, size_t n) {
  unsigned u;
  if (n > 32) {
    /* March forward 16 bytes at a time while more than 32 bytes remain,
       so at least 16 bytes are left for the two tail loads afterward.
       PMOVMSKB(a == b) sets one bit per EQUAL byte; xor with 0xffff
       flips it into a mismatch mask, so zero means all 16 bytes equal. */
    while (n > 16 + 16) {
      if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
        n -= 16;
        p += 16;
        q += 16;
      } else {
        u = __builtin_ctzl(u); /* lowest set bit = first mismatching byte */
        return p[u] - q[u];
      }
    }
  }
  /* At most 32 bytes left: compare the first 16, then the LAST 16
     (loads may overlap when n < 32; overlapped bytes are just compared
     twice, which is harmless). */
  if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
    if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
          0xffff)) {
      return 0;
    } else {
      u = __builtin_ctzl(u);
      return p[n - 16 + u] - q[n - 16 + u];
    }
  } else {
    u = __builtin_ctzl(u);
    return p[u] - q[u];
  }
}
| 58 | + |
/**
 * Compares memory 64 bytes per iteration (AVX-encoded path).
 *
 * Same contract as memcmp_sse: returns the difference between the first
 * mismatching bytes, or zero if equal. Selected at runtime by memcmp
 * when X86_HAVE(AVX).
 *
 * NOTE(review): assumes n >= 16 for the tail loads at p + n - 16; the
 * visible caller only takes this path when n > 16 — confirm.
 */
_Microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
                                                const unsigned char *q,
                                                size_t n) {
  uint64_t w;
  unsigned u;
  if (n > 32) {
    /* Main loop: compare 64 bytes at a time by packing four 16-bit
       per-byte equality masks into one 64-bit word. The shift counts
       are octal: 000/020/040/060 = 0/16/32/48. Loop while n >= 80 so
       at least 16 bytes remain for the tail compares below. */
    while (n >= 16 + 64) {
      w = (uint64_t)PMOVMSKB(((xmm_t *)p)[0] == ((xmm_t *)q)[0]) << 000 |
          (uint64_t)PMOVMSKB(((xmm_t *)p)[1] == ((xmm_t *)q)[1]) << 020 |
          (uint64_t)PMOVMSKB(((xmm_t *)p)[2] == ((xmm_t *)q)[2]) << 040 |
          (uint64_t)PMOVMSKB(((xmm_t *)p)[3] == ((xmm_t *)q)[3]) << 060;
      if (w == -1) { /* -1 converts to UINT64_MAX: all 64 bytes equal */
        n -= 64;
        p += 64;
        q += 64;
      } else {
        /* Invert to a mismatch mask; lowest zero bit of w is the first
           differing byte's offset within this 64-byte chunk. */
        w = __builtin_ctzll(w ^ -1);
        return p[w] - q[w];
      }
    }
    /* Mop up 16 bytes at a time until at most 32 remain. */
    while (n > 16 + 16) {
      if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
        n -= 16;
        p += 16;
        q += 16;
      } else {
        u = __builtin_ctzl(u); /* first mismatching byte */
        return p[u] - q[u];
      }
    }
  }
  /* At most 32 bytes left: compare the first 16 and the last 16; the
     two loads may overlap when n < 32, which is harmless. */
  if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
    if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
          0xffff)) {
      return 0;
    } else {
      u = __builtin_ctzl(u);
      return p[n - 16 + u] - q[n - 16 + u];
    }
  } else {
    u = __builtin_ctzl(u);
    return p[u] - q[u];
  }
}
| 103 | + |
| 104 | +#endif /* __x86_64__ */ |
| 105 | + |
26 | 106 | /**
|
27 | 107 | * Compares memory byte by byte.
|
28 | 108 | *
|
@@ -57,21 +137,64 @@ typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
57 | 137 | */
|
58 | 138 | int memcmp(const void *a, const void *b, size_t n) {
|
59 | 139 | int c;
|
| 140 | +#if defined(__x86_64__) && !defined(__chibicc__) |
| 141 | + unsigned u; |
| 142 | + uint32_t k, i, j; |
| 143 | + uint64_t w, x, y; |
| 144 | +#endif |
60 | 145 | const unsigned char *p, *q;
|
61 | 146 | if ((p = a) == (q = b) || !n) return 0;
|
62 | 147 | if ((c = *p - *q)) return c;
|
63 | 148 | #if defined(__x86_64__) && !defined(__chibicc__)
|
64 |
| - unsigned u; |
65 |
| - while (n >= 16 && (((uintptr_t)p & 0xfff) <= 0x1000 - 16 && |
66 |
| - ((uintptr_t)q & 0xfff) <= 0x1000 - 16)) { |
67 |
| - if (!(u = __builtin_ia32_pmovmskb128(*(xmm_t *)p == *(xmm_t *)q) ^ |
68 |
| - 0xffff)) { |
69 |
| - n -= 16; |
70 |
| - p += 16; |
71 |
| - q += 16; |
| 149 | + if (!IsTiny()) { |
| 150 | + if (n <= 16) { |
| 151 | + if (n >= 8) { |
| 152 | + if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 | |
| 153 | + (uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 | |
| 154 | + (uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 | |
| 155 | + (uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^ |
| 156 | + (y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 | |
| 157 | + (uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 | |
| 158 | + (uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 | |
| 159 | + (uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) { |
| 160 | + p += n - 8; |
| 161 | + q += n - 8; |
| 162 | + if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 | |
| 163 | + (uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 | |
| 164 | + (uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 | |
| 165 | + (uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^ |
| 166 | + (y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 | |
| 167 | + (uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 | |
| 168 | + (uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 | |
| 169 | + (uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) { |
| 170 | + return 0; |
| 171 | + } |
| 172 | + } |
| 173 | + u = __builtin_ctzll(w); |
| 174 | + u = u & -8; |
| 175 | + return ((x >> u) & 255) - ((y >> u) & 255); |
| 176 | + } else if (n >= 4) { |
| 177 | + if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 | |
| 178 | + (uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^ |
| 179 | + (j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 | |
| 180 | + (uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) { |
| 181 | + p += n - 4; |
| 182 | + q += n - 4; |
| 183 | + if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 | |
| 184 | + (uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^ |
| 185 | + (j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 | |
| 186 | + (uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) { |
| 187 | + return 0; |
| 188 | + } |
| 189 | + } |
| 190 | + u = __builtin_ctzl(k); |
| 191 | + u = u & -8; |
| 192 | + return ((i >> u) & 255) - ((j >> u) & 255); |
| 193 | + } |
| 194 | + } else if (LIKELY(X86_HAVE(AVX))) { |
| 195 | + return memcmp_avx(p, q, n); |
72 | 196 | } else {
|
73 |
| - u = __builtin_ctzl(u); |
74 |
| - return p[u] - q[u]; |
| 197 | + return memcmp_sse(p, q, n); |
75 | 198 | }
|
76 | 199 | }
|
77 | 200 | #endif /* __x86_64__ */
|
|
0 commit comments