Skip to content

Commit 0a79c69

Browse files
committed
Make malloc scalable on all platforms
It turns out sched_getcpu() didn't work on many platforms. So the system call now has tests and is well documented. We now employ new workarounds on platforms where it isn't supported in our malloc() implementation. It was previously the case that malloc() was only scalable on Linux/Windows for x86-64. Now the other platforms are scalable too.
1 parent 3fd275f commit 0a79c69

File tree

9 files changed

+459
-99
lines changed

9 files changed

+459
-99
lines changed

examples/nproc.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#if 0
2+
/*─────────────────────────────────────────────────────────────────╗
3+
│ To the extent possible under law, Justine Tunney has waived │
4+
│ all copyright and related or neighboring rights to this file, │
5+
│ as it is written in the following disclaimers: │
6+
│ • http://unlicense.org/ │
7+
│ • http://creativecommons.org/publicdomain/zero/1.0/ │
8+
╚─────────────────────────────────────────────────────────────────*/
9+
#endif
10+
#include <cosmo.h>
11+
#include <stdio.h>
12+
13+
int main(int argc, char *argv[]) {
14+
printf("%d\n", __get_cpu_count());
15+
}

libc/calls/getcpu.c

Lines changed: 51 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -30,39 +30,63 @@
3030

3131
int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
3232

33+
/**
34+
* Determines ID of CPU on which thread is currently scheduled.
35+
*
36+
* This is the same as sched_getcpu(), except it also supports returning
37+
* the ID of the current NUMA node. On some platforms this functionality
38+
* isn't available, in which case `out_opt_node` is always set to 0.
39+
*/
3340
int getcpu(unsigned *out_opt_cpu, unsigned *out_opt_node) {
34-
unsigned cpu;
35-
unsigned node;
36-
if (X86_HAVE(RDTSCP)) {
41+
42+
if (IsWindows()) {
43+
struct NtProcessorNumber pn;
44+
if (out_opt_cpu) {
45+
GetCurrentProcessorNumberEx(&pn);
46+
*out_opt_cpu = 64 * pn.Group + pn.Number;
47+
}
48+
if (out_opt_node) {
49+
unsigned short node16;
50+
if (GetNumaProcessorNodeEx(&pn, &node16)) {
51+
*out_opt_node = node16;
52+
} else {
53+
return __winerr();
54+
}
55+
}
56+
return 0;
57+
}
58+
59+
#ifdef __x86_64__
60+
if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
3761
unsigned tsc_aux;
3862
rdtscp(&tsc_aux);
39-
cpu = TSC_AUX_CORE(tsc_aux);
40-
node = TSC_AUX_NODE(tsc_aux);
41-
} else if (IsWindows()) {
42-
struct NtProcessorNumber pn;
43-
GetCurrentProcessorNumberEx(&pn);
44-
cpu = 64 * pn.Group + pn.Number;
45-
unsigned short node16;
46-
if (GetNumaProcessorNodeEx(&pn, &node16)) {
47-
node = node16;
48-
} else {
49-
return __winerr();
63+
if (out_opt_cpu)
64+
*out_opt_cpu = TSC_AUX_CORE(tsc_aux);
65+
if (out_opt_node)
66+
*out_opt_node = TSC_AUX_NODE(tsc_aux);
67+
return 0;
68+
}
69+
#endif
70+
71+
if (IsXnu() || IsOpenbsd() || IsNetbsd() || IsFreebsd()) {
72+
if (out_opt_cpu) {
73+
int rc = sched_getcpu();
74+
if (rc == -1)
75+
return -1;
76+
*out_opt_cpu = rc;
5077
}
51-
} else if (IsAarch64()) {
52-
long tpidr_el0;
53-
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
54-
cpu = tpidr_el0 & 255;
55-
node = 0;
56-
} else {
57-
int rc = sys_getcpu(&cpu, &node, 0);
58-
if (rc == -1)
59-
return -1;
78+
if (out_opt_node)
79+
*out_opt_node = 0;
80+
return 0;
6081
}
61-
if (out_opt_cpu) {
82+
83+
unsigned cpu, node;
84+
int rc = sys_getcpu(&cpu, &node, 0);
85+
if (rc == -1)
86+
return -1;
87+
if (out_opt_cpu)
6288
*out_opt_cpu = cpu;
63-
}
64-
if (out_opt_node) {
89+
if (out_opt_node)
6590
*out_opt_node = node;
66-
}
6791
return 0;
6892
}

libc/calls/sched_getcpu.c

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,32 +23,82 @@
2323
#include "libc/nexgen32e/x86feature.h"
2424
#include "libc/nt/struct/processornumber.h"
2525
#include "libc/nt/synchronization.h"
26+
#include "libc/runtime/syslib.internal.h"
2627
#include "libc/sysv/errfuns.h"
2728

2829
int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
2930

3031
/**
3132
* Returns ID of CPU on which thread is currently scheduled.
33+
*
34+
* This function is supported on the following platforms:
35+
*
36+
* - x86-64
37+
*
38+
* - Linux: rdtscp
39+
* - FreeBSD: rdtscp
40+
* - Windows: win32
41+
* - OpenBSD: unsupported
42+
* - NetBSD: unsupported
43+
* - MacOS: unsupported
44+
*
45+
* - aarch64
46+
*
47+
* - Linux: syscall
48+
* - FreeBSD: syscall
49+
* - MacOS: supported
50+
*
3251
* @return cpu number on success, or -1 w/ errno
3352
*/
3453
int sched_getcpu(void) {
35-
if (X86_HAVE(RDTSCP)) {
36-
unsigned tsc_aux;
37-
rdtscp(&tsc_aux);
38-
return TSC_AUX_CORE(tsc_aux);
39-
} else if (IsAarch64()) {
40-
long tpidr_el0;
41-
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
42-
return tpidr_el0 & 255;
43-
} else if (IsWindows()) {
54+
55+
if (IsWindows()) {
4456
struct NtProcessorNumber pn;
4557
GetCurrentProcessorNumberEx(&pn);
4658
return 64 * pn.Group + pn.Number;
47-
} else {
48-
unsigned cpu = 0;
49-
int rc = sys_getcpu(&cpu, 0, 0);
50-
if (rc == -1)
51-
return -1;
52-
return cpu;
5359
}
60+
61+
#ifdef __x86_64__
62+
if (X86_HAVE(RDTSCP) && (IsLinux() || IsFreebsd())) {
63+
// Only the Linux, FreeBSD, and Windows kernels can be counted upon
64+
// to populate the TSC_AUX register with the current cpu number.
65+
unsigned tsc_aux;
66+
rdtscp(&tsc_aux);
67+
return TSC_AUX_CORE(tsc_aux);
68+
}
69+
#endif
70+
71+
#ifdef __aarch64__
72+
if (IsXnu()) {
73+
// pthread_cpu_number_np() is defined by MacOS 11.0+ (Big Sur) in
74+
// the SDK pthread.h header file, even though there's no man page
75+
if (__syslib && __syslib->__version >= 9) {
76+
errno_t err;
77+
size_t out = 0;
78+
if ((err = __syslib->__pthread_cpu_number_np(&out))) {
79+
errno = err;
80+
return -1;
81+
}
82+
return out;
83+
} else {
84+
errno = ENOSYS; // upgrade your ape loader
85+
return -1; // cc -o /usr/local/bin/ape ape/ape-m1.c
86+
}
87+
}
88+
#endif
89+
90+
#ifdef __aarch64__
91+
if (IsFreebsd()) {
92+
register int x0 asm("x0");
93+
register int x8 asm("x8") = 581; // sched_getcpu
94+
asm volatile("svc\t0" : "=r"(x0) : "r"(x8) : "memory");
95+
return x0;
96+
}
97+
#endif
98+
99+
unsigned cpu = 0;
100+
int rc = sys_getcpu(&cpu, 0, 0);
101+
if (rc == -1)
102+
return -1;
103+
return cpu;
54104
}

libc/intrin/atomic.h

Lines changed: 88 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -13,48 +13,26 @@
1313
*/
1414

1515
typedef enum {
16-
memory_order_relaxed,
17-
memory_order_consume,
18-
memory_order_acquire,
19-
memory_order_release,
20-
memory_order_acq_rel,
21-
memory_order_seq_cst,
16+
memory_order_relaxed = __ATOMIC_RELAXED,
17+
memory_order_consume = __ATOMIC_CONSUME,
18+
memory_order_acquire = __ATOMIC_ACQUIRE,
19+
memory_order_release = __ATOMIC_RELEASE,
20+
memory_order_acq_rel = __ATOMIC_ACQ_REL,
21+
memory_order_seq_cst = __ATOMIC_SEQ_CST
2222
} memory_order;
2323

24-
#define ATOMIC_VAR_INIT(...) __VA_ARGS__
24+
#if !(defined __STDC_VERSION__ && __STDC_VERSION__ > 201710L)
25+
#define ATOMIC_VAR_INIT(...) __VA_ARGS__
26+
#endif
27+
2528
#define atomic_is_lock_free(obj) ((void)(obj), sizeof(obj) <= sizeof(void *))
2629

2730
#define atomic_flag atomic_bool
28-
#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
31+
#define ATOMIC_FLAG_INIT false
2932
#define atomic_flag_test_and_set_explicit(x, order) \
3033
atomic_exchange_explicit(x, 1, order)
3134
#define atomic_flag_clear_explicit(x, order) atomic_store_explicit(x, 0, order)
3235

33-
#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
34-
atomic_compare_exchange_strong_explicit( \
35-
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
36-
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
37-
atomic_compare_exchange_weak_explicit( \
38-
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
39-
#define atomic_exchange(pObject, desired) \
40-
atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
41-
#define atomic_fetch_add(pObject, operand) \
42-
atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
43-
#define atomic_fetch_and(pObject, operand) \
44-
atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
45-
#define atomic_fetch_or(pObject, operand) \
46-
atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
47-
#define atomic_fetch_sub(pObject, operand) \
48-
atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
49-
#define atomic_fetch_xor(pObject, operand) \
50-
atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
51-
#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
52-
#define atomic_store(pObject, desired) \
53-
atomic_store_explicit(pObject, desired, memory_order_seq_cst)
54-
#define atomic_flag_test_and_set(x) \
55-
atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
56-
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
57-
5836
#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE)
5937

6038
#define atomic_init(obj, value) __c11_atomic_init(obj, value)
@@ -84,9 +62,35 @@ typedef enum {
8462
#define atomic_store_explicit(object, desired, order) \
8563
__c11_atomic_store(object, desired, order)
8664

65+
#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
66+
atomic_compare_exchange_strong_explicit( \
67+
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
68+
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
69+
atomic_compare_exchange_weak_explicit( \
70+
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
71+
#define atomic_exchange(pObject, desired) \
72+
atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
73+
#define atomic_fetch_add(pObject, operand) \
74+
atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
75+
#define atomic_fetch_and(pObject, operand) \
76+
atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
77+
#define atomic_fetch_or(pObject, operand) \
78+
atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
79+
#define atomic_fetch_sub(pObject, operand) \
80+
atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
81+
#define atomic_fetch_xor(pObject, operand) \
82+
atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
83+
#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
84+
#define atomic_store(pObject, desired) \
85+
atomic_store_explicit(pObject, desired, memory_order_seq_cst)
86+
#define atomic_flag_test_and_set(x) \
87+
atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
88+
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
89+
8790
#elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 407
8891

89-
#define atomic_init(obj, value) ((void)(*(obj) = (value)))
92+
#define atomic_init(obj, value) \
93+
atomic_store_explicit(obj, value, __ATOMIC_RELAXED)
9094
#define atomic_thread_fence(order) __atomic_thread_fence(order)
9195
#define atomic_signal_fence(order) __atomic_signal_fence(order)
9296
#define atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
@@ -111,6 +115,31 @@ typedef enum {
111115
#define atomic_store_explicit(pObject, desired, order) \
112116
__atomic_store_n(pObject, desired, order)
113117

118+
#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
119+
atomic_compare_exchange_strong_explicit(pObject, pExpected, desired, \
120+
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
121+
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
122+
atomic_compare_exchange_weak_explicit(pObject, pExpected, desired, \
123+
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
124+
#define atomic_exchange(pObject, desired) \
125+
atomic_exchange_explicit(pObject, desired, __ATOMIC_SEQ_CST)
126+
#define atomic_fetch_add(pObject, operand) \
127+
atomic_fetch_add_explicit(pObject, operand, __ATOMIC_SEQ_CST)
128+
#define atomic_fetch_and(pObject, operand) \
129+
atomic_fetch_and_explicit(pObject, operand, __ATOMIC_SEQ_CST)
130+
#define atomic_fetch_or(pObject, operand) \
131+
atomic_fetch_or_explicit(pObject, operand, __ATOMIC_SEQ_CST)
132+
#define atomic_fetch_sub(pObject, operand) \
133+
atomic_fetch_sub_explicit(pObject, operand, __ATOMIC_SEQ_CST)
134+
#define atomic_fetch_xor(pObject, operand) \
135+
atomic_fetch_xor_explicit(pObject, operand, __ATOMIC_SEQ_CST)
136+
#define atomic_load(pObject) atomic_load_explicit(pObject, __ATOMIC_SEQ_CST)
137+
#define atomic_store(pObject, desired) \
138+
atomic_store_explicit(pObject, desired, __ATOMIC_SEQ_CST)
139+
#define atomic_flag_test_and_set(x) \
140+
atomic_flag_test_and_set_explicit(x, __ATOMIC_SEQ_CST)
141+
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, __ATOMIC_SEQ_CST)
142+
114143
#elif (__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 401
115144

116145
#define atomic_init(obj, value) ((void)(*(obj) = (value)))
@@ -210,6 +239,31 @@ typedef enum {
210239
#define atomic_store_explicit(object, desired, order) \
211240
((void)atomic_exchange_explicit(object, desired, order))
212241

242+
#define atomic_compare_exchange_strong(pObject, pExpected, desired) \
243+
atomic_compare_exchange_strong_explicit( \
244+
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
245+
#define atomic_compare_exchange_weak(pObject, pExpected, desired) \
246+
atomic_compare_exchange_weak_explicit( \
247+
pObject, pExpected, desired, memory_order_seq_cst, memory_order_seq_cst)
248+
#define atomic_exchange(pObject, desired) \
249+
atomic_exchange_explicit(pObject, desired, memory_order_seq_cst)
250+
#define atomic_fetch_add(pObject, operand) \
251+
atomic_fetch_add_explicit(pObject, operand, memory_order_seq_cst)
252+
#define atomic_fetch_and(pObject, operand) \
253+
atomic_fetch_and_explicit(pObject, operand, memory_order_seq_cst)
254+
#define atomic_fetch_or(pObject, operand) \
255+
atomic_fetch_or_explicit(pObject, operand, memory_order_seq_cst)
256+
#define atomic_fetch_sub(pObject, operand) \
257+
atomic_fetch_sub_explicit(pObject, operand, memory_order_seq_cst)
258+
#define atomic_fetch_xor(pObject, operand) \
259+
atomic_fetch_xor_explicit(pObject, operand, memory_order_seq_cst)
260+
#define atomic_load(pObject) atomic_load_explicit(pObject, memory_order_seq_cst)
261+
#define atomic_store(pObject, desired) \
262+
atomic_store_explicit(pObject, desired, memory_order_seq_cst)
263+
#define atomic_flag_test_and_set(x) \
264+
atomic_flag_test_and_set_explicit(x, memory_order_seq_cst)
265+
#define atomic_flag_clear(x) atomic_flag_clear_explicit(x, memory_order_seq_cst)
266+
213267
#else /* non-gcc or old gcc w/o x86 */
214268
#error "atomic operations not supported with this compiler and/or architecture"
215269
#endif

libc/runtime/syslib.internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ struct Syslib {
8282
char *(*__dlerror)(void);
8383
/* v9 (2024-01-31) */
8484
int (*__pthread_cpu_number_np)(size_t *);
85+
/* v10 (2024-05-02) */
8586
long (*__sysctl)(int *, unsigned, void *, size_t *, void *, size_t);
8687
long (*__sysctlbyname)(const char *, void *, size_t *, void *, size_t);
8788
long (*__sysctlnametomib)(const char *, int *, size_t *);

0 commit comments

Comments
 (0)