Spaces:
Sleeping
Sleeping
Max Krasnyansky
committed on
Commit
·
aca04d5
1
Parent(s):
7349efc
threads: improve ggml_barrier scaling with large number of threads (llama/9598)
Browse files
Make sure n_barrier and n_barrier_passed do not share the cache line to avoid cache line bouncing.
This optimization shows performance improvements even for n_threads <= 8 cases.
Resurrect the TSAN (Thread Sanitizer) check so that we can avoid doing an expensive read-modify-write
in the normal case and just use a thread fence as originally intended.
- ggml/src/ggml.c +45 -14
ggml/src/ggml.c
CHANGED
|
@@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
|
|
| 63 |
#pragma warning(disable: 4702)
|
| 64 |
#endif
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
#if defined(_WIN32)
|
| 67 |
|
| 68 |
#define WIN32_LEAN_AND_MEAN
|
|
@@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
|
|
| 72 |
#include <windows.h>
|
| 73 |
|
| 74 |
#if !defined(__clang__)
|
|
|
|
|
|
|
| 75 |
typedef volatile LONG atomic_int;
|
| 76 |
typedef atomic_int atomic_bool;
|
| 77 |
typedef atomic_int atomic_flag;
|
|
@@ -2006,8 +2027,8 @@ struct ggml_threadpool {
|
|
| 2006 |
|
| 2007 |
// synchronization primitives
|
| 2008 |
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
|
| 2009 |
-
atomic_int n_barrier;
|
| 2010 |
-
atomic_int n_barrier_passed;
|
| 2011 |
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
| 2012 |
|
| 2013 |
// these are atomic as an annotation for thread-sanitizer
|
|
@@ -3195,20 +3216,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
|
|
| 3195 |
// enter barrier (full seq-cst fence)
|
| 3196 |
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
|
| 3197 |
|
| 3198 |
-
int last = 0;
|
| 3199 |
if (n_barrier == (n_threads - 1)) {
|
| 3200 |
// last thread
|
| 3201 |
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
|
| 3202 |
-
|
| 3203 |
-
|
| 3204 |
-
|
| 3205 |
-
|
| 3206 |
-
|
| 3207 |
-
|
|
|
|
|
|
|
|
|
|
| 3208 |
}
|
| 3209 |
|
| 3210 |
// exit barrier (full seq-cst fence)
|
| 3211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3212 |
#endif
|
| 3213 |
}
|
| 3214 |
|
|
@@ -20239,10 +20267,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
|
|
| 20239 |
|
| 20240 |
// sync thread state after polling
|
| 20241 |
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
|
| 20242 |
-
|
| 20243 |
-
|
| 20244 |
-
|
| 20245 |
-
|
|
|
|
|
|
|
|
|
|
| 20246 |
}
|
| 20247 |
|
| 20248 |
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
|
|
|
| 63 |
#pragma warning(disable: 4702)
|
| 64 |
#endif
|
| 65 |
|
| 66 |
+
// Note: once we move threading into a separate C++ file
|
| 67 |
+
// we will use std::hardware_destructive_interference_size instead of hardcoding it here
|
| 68 |
+
// and we'll use C++ attribute syntax.
|
| 69 |
+
#define GGML_CACHE_LINE 64
|
| 70 |
+
|
| 71 |
+
#if defined(__clang__) || defined(__GNUC__)
|
| 72 |
+
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
|
| 73 |
+
#endif
|
| 74 |
+
|
| 75 |
+
#if defined(__has_feature)
|
| 76 |
+
#if __has_feature(thread_sanitizer)
|
| 77 |
+
#define GGML_TSAN_ENABLED 1
|
| 78 |
+
#endif
|
| 79 |
+
#else // __has_feature
|
| 80 |
+
#if defined(__SANITIZE_THREAD__)
|
| 81 |
+
#define GGML_TSAN_ENABLED 1
|
| 82 |
+
#endif
|
| 83 |
+
#endif // __has_feature
|
| 84 |
+
|
| 85 |
#if defined(_WIN32)
|
| 86 |
|
| 87 |
#define WIN32_LEAN_AND_MEAN
|
|
|
|
| 91 |
#include <windows.h>
|
| 92 |
|
| 93 |
#if !defined(__clang__)
|
| 94 |
+
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
|
| 95 |
+
|
| 96 |
typedef volatile LONG atomic_int;
|
| 97 |
typedef atomic_int atomic_bool;
|
| 98 |
typedef atomic_int atomic_flag;
|
|
|
|
| 2027 |
|
| 2028 |
// synchronization primitives
|
| 2029 |
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
|
| 2030 |
+
atomic_int GGML_CACHE_ALIGN n_barrier;
|
| 2031 |
+
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
|
| 2032 |
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
| 2033 |
|
| 2034 |
// these are atomic as an annotation for thread-sanitizer
|
|
|
|
| 3216 |
// enter barrier (full seq-cst fence)
|
| 3217 |
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
|
| 3218 |
|
|
|
|
| 3219 |
if (n_barrier == (n_threads - 1)) {
|
| 3220 |
// last thread
|
| 3221 |
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
|
| 3222 |
+
|
| 3223 |
+
// exit barrier (full seq-cst fence)
|
| 3224 |
+
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
|
| 3225 |
+
return;
|
| 3226 |
+
}
|
| 3227 |
+
|
| 3228 |
+
// wait for other threads
|
| 3229 |
+
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
|
| 3230 |
+
ggml_thread_cpu_relax();
|
| 3231 |
}
|
| 3232 |
|
| 3233 |
// exit barrier (full seq-cst fence)
|
| 3234 |
+
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
|
| 3235 |
+
#ifdef GGML_TSAN_ENABLED
|
| 3236 |
+
atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
|
| 3237 |
+
#else
|
| 3238 |
+
atomic_thread_fence(memory_order_seq_cst);
|
| 3239 |
+
#endif
|
| 3240 |
#endif
|
| 3241 |
}
|
| 3242 |
|
|
|
|
| 20267 |
|
| 20268 |
// sync thread state after polling
|
| 20269 |
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
|
| 20270 |
+
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
|
| 20271 |
+
#ifdef GGML_TSAN_ENABLED
|
| 20272 |
+
atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
|
| 20273 |
+
#else
|
| 20274 |
+
atomic_thread_fence(memory_order_seq_cst);
|
| 20275 |
+
#endif
|
| 20276 |
+
UNUSED(state);
|
| 20277 |
}
|
| 20278 |
|
| 20279 |
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|