Max Krasnyansky committed
Commit aca04d5 · 1 Parent(s): 7349efc

threads: improve ggml_barrier scaling with large number of threads (llama/9598)


Make sure n_barrier and n_barrier_passed do not share a cache line, to avoid cache-line bouncing.
This optimization shows performance improvements even for n_threads <= 8.
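
When two atomics that are written by different threads sit on the same 64-byte cache line, every write by one thread forces the line out of the other thread's cache, so spinning threads keep bouncing it back and forth (false sharing). A minimal sketch of the alignment trick, assuming GCC/Clang attribute syntax; the struct below is only an illustration, not the actual ggml_threadpool layout:

#include <stdatomic.h>

#define GGML_CACHE_LINE 64

#if defined(__clang__) || defined(__GNUC__)
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
#else
#define GGML_CACHE_ALIGN   // illustrative fallback; MSVC would use __declspec(align(...))
#endif

// Each aligned member starts on its own 64-byte cache line, so a thread
// spinning on n_barrier_passed does not invalidate the line holding n_barrier.
struct barrier_counters {
    atomic_int GGML_CACHE_ALIGN n_barrier;        // threads that have entered the barrier
    atomic_int GGML_CACHE_ALIGN n_barrier_passed; // barrier "generation" counter
};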

Resurrect the TSAN (Thread Sanitizer) check so that we can avoid the expensive read-modify-write
in the normal case and just use a thread fence, as originally intended.
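
ThreadSanitizer still does not model a standalone atomic_thread_fence, which is why the fence is swapped for a dummy read-modify-write only in TSAN builds. A rough sketch of the compile-time detection and the resulting ordering point, assuming C11 atomics; ggml_sync_point and its counter argument are hypothetical names used only for illustration:

#include <stdatomic.h>

// Clang reports TSAN via __has_feature(thread_sanitizer); GCC defines __SANITIZE_THREAD__.
#if defined(__has_feature)
#  if __has_feature(thread_sanitizer)
#    define GGML_TSAN_ENABLED 1
#  endif
#elif defined(__SANITIZE_THREAD__)
#  define GGML_TSAN_ENABLED 1
#endif

// Full seq-cst ordering point: a plain fence in normal builds, a dummy fetch-add
// on a shared atomic when TSAN is enabled so the tool can see the synchronization.
static inline void ggml_sync_point(atomic_int * shared_counter) {
#ifdef GGML_TSAN_ENABLED
    atomic_fetch_add_explicit(shared_counter, 0, memory_order_seq_cst);
#else
    (void) shared_counter;
    atomic_thread_fence(memory_order_seq_cst);
#endif
}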

Files changed (1)
  1. ggml/src/ggml.c +45 -14
ggml/src/ggml.c CHANGED
@@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
 #pragma warning(disable: 4702)
 #endif
 
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE 64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
 #if defined(_WIN32)
 
 #define WIN32_LEAN_AND_MEAN
@@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
 #include <windows.h>
 
 #if !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
@@ -2006,8 +2027,8 @@ struct ggml_threadpool {
 
     // synchronization primitives
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
-    atomic_int n_barrier;
-    atomic_int n_barrier_passed;
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
     // these are atomic as an annotation for thread-sanitizer
@@ -3195,20 +3216,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
     // enter barrier (full seq-cst fence)
     int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int last = 0;
     if (n_barrier == (n_threads - 1)) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
-        last = 1;
-    } else {
-        // wait for other threads
-        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
-            ggml_thread_cpu_relax();
-        }
+
+        // exit barrier (fill seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
+    }
+
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
     }
 
     // exit barrier (full seq-cst fence)
-    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
 #endif
 }
 
@@ -20239,10 +20267,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
 
 // sync thread state after polling
 static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
-    // so instead we just use a dummy read-modify-write
-    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+    UNUSED(state);
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
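
For context, a standalone sketch of the barrier pattern the diff converges on: snapshot the generation counter, fetch-add on entry, have the last thread reset the entry count and bump the generation, and let everyone else spin on the generation before leaving through a full seq-cst fence. The names, the cpu_relax stub, and passing n_threads as a parameter are illustrative; the real ggml_barrier additionally handles the OpenMP path, the n_threads == 1 shortcut, and the TSAN variant shown above.

#include <stdatomic.h>

#define CACHE_LINE 64
#if defined(__clang__) || defined(__GNUC__)
#define CACHE_ALIGN __attribute__((aligned(CACHE_LINE)))
#else
#define CACHE_ALIGN
#endif

// Stand-in for ggml_thread_cpu_relax(); a no-op keeps the sketch portable.
#define cpu_relax() ((void) 0)

struct spin_barrier {
    atomic_int CACHE_ALIGN n_barrier;        // threads that have entered this round
    atomic_int CACHE_ALIGN n_barrier_passed; // barrier generation counter
};

static void spin_barrier_wait(struct spin_barrier * b, int n_threads) {
    // snapshot the current barrier generation before entering
    int n_passed  = atomic_load_explicit(&b->n_barrier_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    int n_barrier = atomic_fetch_add_explicit(&b->n_barrier, 1, memory_order_seq_cst);

    if (n_barrier == (n_threads - 1)) {
        // last thread: reset the entry count and release the spinners
        atomic_store_explicit(&b->n_barrier, 0, memory_order_relaxed);
        atomic_fetch_add_explicit(&b->n_barrier_passed, 1, memory_order_seq_cst);
        return;
    }

    // wait for the last thread of this generation
    while (atomic_load_explicit(&b->n_barrier_passed, memory_order_relaxed) == n_passed) {
        cpu_relax();
    }

    // exit barrier (full seq-cst fence)
    atomic_thread_fence(memory_order_seq_cst);
}

Compared with the old version, the last thread now returns right after bumping the generation instead of falling through to a shared fetch-add of last, so the exit path for waiters is a relaxed load plus a fence rather than a contended read-modify-write on n_barrier_passed.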