Skip to content

Commit ff955aa

Browse files
committed
Make memcmp() and memchr() go fast again
Readahead within the specified size is legal, even if it overlaps a page boundary; it's the fault of the caller if that causes a segfault.
1 parent 70155df commit ff955aa

File tree

4 files changed

+137
-49
lines changed

4 files changed

+137
-49
lines changed

libc/intrin/memchr.c

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -17,11 +17,12 @@
1717
│ PERFORMANCE OF THIS SOFTWARE. │
1818
╚─────────────────────────────────────────────────────────────────────────────*/
1919
#include "libc/dce.h"
20+
#include "libc/intrin/asan.internal.h"
2021
#include "libc/nexgen32e/x86feature.h"
2122
#include "libc/str/str.h"
2223
#ifndef __aarch64__
2324

24-
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
25+
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
2526

2627
static inline const unsigned char *memchr_pure(const unsigned char *s,
2728
unsigned char c, size_t n) {
@@ -69,15 +70,8 @@ static inline const unsigned char *memchr_sse(const unsigned char *s,
6970
void *memchr(const void *s, int c, size_t n) {
7071
#if defined(__x86_64__) && !defined(__chibicc__)
7172
const void *r;
72-
const unsigned char *p = (const unsigned char *)s;
73-
while (n && ((intptr_t)p & 15)) {
74-
if (*p == (unsigned char)c) {
75-
return (void *)p;
76-
}
77-
++p;
78-
--n;
79-
}
80-
r = memchr_sse(p, c, n);
73+
if (IsAsan()) __asan_verify(s, n);
74+
r = memchr_sse(s, c, n);
8175
return (void *)r;
8276
#else
8377
return (void *)memchr_pure(s, c, n);

libc/intrin/memcmp.c

Lines changed: 133 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -21,8 +21,88 @@
2121
#include "libc/nexgen32e/x86feature.h"
2222
#include "libc/str/str.h"
2323

24+
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
25+
2426
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
2527

28+
#if defined(__x86_64__) && !defined(__chibicc__)
29+
30+
/*
 * Compares n bytes at p and q using 16-byte vector equality tests.
 *
 * Each step compares one 16-byte vector from p and q, collapses the
 * per-byte result into a 16-bit mask with PMOVMSKB, and XORs it with
 * 0xffff so that a nonzero value marks at least one differing byte.
 *
 * Returns 0 when no difference is found, otherwise p[i] - q[i] for the
 * lowest differing index i (operands are unsigned char promoted to int).
 *
 * The final tail load reads the 16 bytes ending at p + n, so this only
 * stays inside [p, p + n) when n >= 16.
 * NOTE(review): the memcmp() hunk shown on this page only dispatches
 * here when n > 16 -- confirm no other caller passes a smaller n.
 */
static dontinline antiquity int memcmp_sse(const unsigned char *p,
31+
const unsigned char *q, size_t n) {
32+
unsigned u;
33+
if (n > 32) {
34+
/* consume whole 16-byte chunks while more than 32 bytes remain */
while (n > 16 + 16) {
35+
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
36+
n -= 16;
37+
p += 16;
38+
q += 16;
39+
} else {
40+
/* lowest set bit of the inverted mask = first differing byte */
u = __builtin_ctzl(u);
41+
return p[u] - q[u];
42+
}
43+
}
44+
}
45+
/* at most 32 bytes remain when n started above 32: check the first 16 */
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
46+
/* then the last 16, which may overlap the vector just compared */
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
47+
0xffff)) {
48+
return 0;
49+
} else {
50+
u = __builtin_ctzl(u);
51+
/* u is relative to the start of the tail vector at offset n - 16 */
return p[n - 16 + u] - q[n - 16 + u];
52+
}
53+
} else {
54+
u = __builtin_ctzl(u);
55+
return p[u] - q[u];
56+
}
57+
}
58+
59+
/*
 * Compares n bytes at p and q, testing 64 bytes per iteration while at
 * least 80 bytes remain, then 16 bytes per iteration, then a head/tail
 * pair of possibly overlapping 16-byte vectors.
 *
 * Returns 0 when no difference is found, otherwise p[i] - q[i] for the
 * lowest differing index i (operands are unsigned char promoted to int).
 *
 * The final tail load reads the 16 bytes ending at p + n, so this only
 * stays inside [p, p + n) when n >= 16.
 * NOTE(review): the memcmp() hunk shown on this page only dispatches
 * here when n > 16 -- confirm no other caller passes a smaller n.
 */
_Microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
60+
const unsigned char *q,
61+
size_t n) {
62+
uint64_t w;
63+
unsigned u;
64+
if (n > 32) {
65+
while (n >= 16 + 64) {
66+
/* pack four 16-bit equality masks into one 64-bit word; the octal
   shift counts 000/020/040/060 are 0/16/32/48 bits */
w = (uint64_t)PMOVMSKB(((xmm_t *)p)[0] == ((xmm_t *)q)[0]) << 000 |
67+
(uint64_t)PMOVMSKB(((xmm_t *)p)[1] == ((xmm_t *)q)[1]) << 020 |
68+
(uint64_t)PMOVMSKB(((xmm_t *)p)[2] == ((xmm_t *)q)[2]) << 040 |
69+
(uint64_t)PMOVMSKB(((xmm_t *)p)[3] == ((xmm_t *)q)[3]) << 060;
70+
/* all 64 bits set means all 64 bytes compared equal */
if (w == -1) {
71+
n -= 64;
72+
p += 64;
73+
q += 64;
74+
} else {
75+
/* invert so set bits mark mismatches, then take the lowest one */
w = __builtin_ctzll(w ^ -1);
76+
return p[w] - q[w];
77+
}
78+
}
79+
/* fewer than 80 bytes remain: step 16 bytes at a time down to <= 32 */
while (n > 16 + 16) {
80+
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
81+
n -= 16;
82+
p += 16;
83+
q += 16;
84+
} else {
85+
u = __builtin_ctzl(u);
86+
return p[u] - q[u];
87+
}
88+
}
89+
}
90+
/* compare the first 16 remaining bytes */
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
91+
/* then the last 16, which may overlap the vector just compared */
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
92+
0xffff)) {
93+
return 0;
94+
} else {
95+
u = __builtin_ctzl(u);
96+
/* u is relative to the start of the tail vector at offset n - 16 */
return p[n - 16 + u] - q[n - 16 + u];
97+
}
98+
} else {
99+
u = __builtin_ctzl(u);
100+
return p[u] - q[u];
101+
}
102+
}
103+
104+
#endif /* __x86_64__ */
105+
26106
/**
27107
* Compares memory byte by byte.
28108
*
@@ -57,21 +137,64 @@ typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
57137
*/
58138
int memcmp(const void *a, const void *b, size_t n) {
59139
int c;
140+
#if defined(__x86_64__) && !defined(__chibicc__)
141+
unsigned u;
142+
uint32_t k, i, j;
143+
uint64_t w, x, y;
144+
#endif
60145
const unsigned char *p, *q;
61146
if ((p = a) == (q = b) || !n) return 0;
62147
if ((c = *p - *q)) return c;
63148
#if defined(__x86_64__) && !defined(__chibicc__)
64-
unsigned u;
65-
while (n >= 16 && (((uintptr_t)p & 0xfff) <= 0x1000 - 16 &&
66-
((uintptr_t)q & 0xfff) <= 0x1000 - 16)) {
67-
if (!(u = __builtin_ia32_pmovmskb128(*(xmm_t *)p == *(xmm_t *)q) ^
68-
0xffff)) {
69-
n -= 16;
70-
p += 16;
71-
q += 16;
149+
if (!IsTiny()) {
150+
if (n <= 16) {
151+
if (n >= 8) {
152+
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
153+
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
154+
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
155+
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
156+
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
157+
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
158+
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
159+
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
160+
p += n - 8;
161+
q += n - 8;
162+
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
163+
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
164+
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
165+
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
166+
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
167+
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
168+
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
169+
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
170+
return 0;
171+
}
172+
}
173+
u = __builtin_ctzll(w);
174+
u = u & -8;
175+
return ((x >> u) & 255) - ((y >> u) & 255);
176+
} else if (n >= 4) {
177+
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
178+
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
179+
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
180+
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
181+
p += n - 4;
182+
q += n - 4;
183+
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
184+
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
185+
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
186+
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
187+
return 0;
188+
}
189+
}
190+
u = __builtin_ctzl(k);
191+
u = u & -8;
192+
return ((i >> u) & 255) - ((j >> u) & 255);
193+
}
194+
} else if (LIKELY(X86_HAVE(AVX))) {
195+
return memcmp_avx(p, q, n);
72196
} else {
73-
u = __builtin_ctzl(u);
74-
return p[u] - q[u];
197+
return memcmp_sse(p, q, n);
75198
}
76199
}
77200
#endif /* __x86_64__ */

test/libc/intrin/memchr_test.c

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -16,24 +16,10 @@
1616
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
1717
│ PERFORMANCE OF THIS SOFTWARE. │
1818
╚─────────────────────────────────────────────────────────────────────────────*/
19-
#include "libc/runtime/runtime.h"
20-
#include "libc/runtime/sysconf.h"
2119
#include "libc/str/str.h"
22-
#include "libc/sysv/consts/map.h"
23-
#include "libc/sysv/consts/prot.h"
2420
#include "libc/testlib/testlib.h"
2521

2622
/* Smoke test: memchr finds 'e' at index 1 of "hello" when given 5 bytes. */
TEST(memchr, test) {
2723
const char *s = "hello";
2824
ASSERT_EQ(s + 1, memchr(s, 'e', 5));
2925
}
30-
31-
/*
 * Checks memchr when the size argument extends past readable memory.
 *
 * Maps two pages, makes the second one PROT_NONE, and copies a 9-byte
 * string (8 characters plus NUL) so its NUL lands on the last byte of
 * the first page. memchr is then asked to scan 79 bytes for the NUL and
 * is expected to return the address of that last readable byte.
 *
 * NOTE(review): the '-' diff gutter on these lines indicates the commit
 * shown on this page deletes this test.
 */
TEST(memchr, pageOverlapTorture) {
32-
long pagesz = sysconf(_SC_PAGESIZE);
33-
char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
34-
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
35-
/* second page becomes inaccessible */
ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE));
36-
/* string occupies the final 9 bytes of the first page */
strcpy(map + pagesz - 9, "12345678");
37-
EXPECT_EQ(map + pagesz - 1, memchr(map + pagesz - 9, 0, 79));
38-
EXPECT_SYS(0, 0, munmap(map, pagesz * 2));
39-
}

test/libc/intrin/memcmp_test.c

Lines changed: 0 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -113,21 +113,6 @@ TEST(memcmp, fuzz) {
113113
}
114114
}
115115

116-
/*
 * Checks memcmp when the size argument extends past readable memory.
 *
 * Creates two separate two-page mappings, makes the second page of each
 * PROT_NONE, and copies 9-byte strings that end on the last byte of each
 * first page and differ only in their eighth character ('8' vs '9').
 * memcmp is then asked to compare 79 bytes and is expected to return a
 * negative value.
 *
 * NOTE(review): the '-' diff gutter on these lines indicates the commit
 * shown on this page deletes this test.
 */
TEST(memcmp, pageOverlapTorture) {
117-
long pagesz = sysconf(_SC_PAGESIZE);
118-
char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
119-
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
120-
char *map2 = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
121-
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
122-
/* second page of each mapping becomes inaccessible */
ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE));
123-
ASSERT_SYS(0, 0, mprotect(map2 + pagesz, pagesz, PROT_NONE));
124-
/* both strings occupy the final 9 bytes of their first page */
strcpy(map + pagesz - 9, "12345678");
125-
strcpy(map2 + pagesz - 9, "12345679");
126-
EXPECT_LT(memcmp(map + pagesz - 9, map2 + pagesz - 9, 79), 0);
127-
EXPECT_SYS(0, 0, munmap(map2, pagesz * 2));
128-
EXPECT_SYS(0, 0, munmap(map, pagesz * 2));
129-
}
130-
131116
int buncmp(const void *, const void *, size_t) asm("bcmp");
132117
int funcmp(const void *, const void *, size_t) asm("memcmp");
133118

0 commit comments

Comments
 (0)