ggerganov committed on
Commit
fdaf59a
·
unverified ·
1 Parent(s): ed14a8b

ggml : speed-up soft max via Accelerate + unroll

Browse files
Files changed (2) hide show
  1. ggml.c +113 -53
  2. whisper.cpp +1 -1
ggml.c CHANGED
@@ -81,6 +81,7 @@ typedef void* thread_ret_t;
81
 
82
  #define GGML_DEBUG 0
83
  #define GGML_GELU_FP16
 
84
 
85
  #if UINTPTR_MAX == 0xFFFFFFFF
86
  #define GGML_MEM_ALIGN 4
@@ -310,6 +311,7 @@ int64_t ggml_cycles_per_ms(void) {
310
  return CLOCKS_PER_SEC/1000;
311
  }
312
 
 
313
  #ifdef GGML_PERF
314
  #define ggml_perf_time_ms() ggml_time_ms()
315
  #define ggml_perf_time_us() ggml_time_us()
@@ -1316,25 +1318,25 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
1316
  return GGML_TYPE_SIZE[tensor->type];
1317
  }
1318
 
1319
- bool ggml_is_scalar(const struct ggml_tensor * tensor) {
1320
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1321
 
1322
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
1323
  }
1324
 
1325
- bool ggml_is_vector(const struct ggml_tensor * tensor) {
1326
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1327
 
1328
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
1329
  }
1330
 
1331
- bool ggml_is_matrix(const struct ggml_tensor * tensor) {
1332
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1333
 
1334
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
1335
  }
1336
 
1337
- bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1338
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1339
 
1340
  return
@@ -1343,7 +1345,7 @@ bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor *
1343
  (t0->ne[3] == t1->ne[3]);
1344
  }
1345
 
1346
- bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
1347
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1348
 
1349
  return
@@ -1353,7 +1355,7 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
1353
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
1354
  }
1355
 
1356
- bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
1357
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1358
 
1359
  return
@@ -1362,7 +1364,7 @@ bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
1362
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
1363
  }
1364
 
1365
- bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1366
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1367
 
1368
  return
@@ -1373,7 +1375,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
1373
  }
1374
 
1375
  // check if t1 can be represented as a repetition of t0
1376
- bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1377
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1378
 
1379
  return
@@ -1383,14 +1385,20 @@ bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t
1383
  (t1->ne[3]%t0->ne[3] == 0);
1384
  }
1385
 
1386
- int ggml_up32(int n) {
1387
  return (n + 31) & ~31;
1388
  }
1389
 
1390
- int ggml_up64(int n) {
1391
  return (n + 63) & ~63;
1392
  }
1393
 
 
 
 
 
 
 
1394
  // assert that pointer is aligned to GGML_MEM_ALIGN
1395
  #define ggml_assert_aligned(ptr) \
1396
  assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
@@ -5094,21 +5102,19 @@ static void ggml_compute_forward_soft_max_f32(
5094
  #endif
5095
 
5096
  float max = -INFINITY;
5097
- for (int i = 0; i < nc; i++) {
5098
- max = MAX(max, p[i]);
5099
- }
5100
 
5101
  ggml_float sum = 0.0;
5102
 
5103
- uint16_t ss;
5104
  for (int i = 0; i < nc; i++) {
5105
  if (p[i] == -INFINITY) {
5106
- p[i] = 0.0;
5107
  } else {
5108
  //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
5109
  ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
5110
- memcpy(&ss, &s, sizeof(ss));
5111
- const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
5112
  sum += val;
5113
  p[i] = val;
5114
  }
@@ -5820,6 +5826,8 @@ static void ggml_compute_forward_flash_attn_f32(
5820
  const int P = nek1 - N;
5821
  const int M = P + N;
5822
 
 
 
5823
  GGML_ASSERT(ne0 == D);
5824
  GGML_ASSERT(ne1 == N);
5825
  GGML_ASSERT(P >= 0);
@@ -5872,7 +5880,11 @@ static void ggml_compute_forward_flash_attn_f32(
5872
  const int iq2 = (ir - iq3*neq2*neq1)/neq1;
5873
  const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
5874
 
5875
- float * S = (float *) params->wdata + ith*(M + CACHE_LINE_SIZE_F32);
 
 
 
 
5876
 
5877
  for (int ic = 0; ic < nek1; ++ic) {
5878
  // k indices
@@ -5903,30 +5915,50 @@ static void ggml_compute_forward_flash_attn_f32(
5903
  // softmax
5904
  {
5905
  float max = -INFINITY;
5906
- for (int i = 0; i < M; i++) {
5907
- max = MAX(max, S[i]);
5908
- }
 
 
 
 
5909
 
5910
- ggml_float sum = 0.0;
 
5911
 
5912
- uint16_t ss;
5913
- for (int i = 0; i < M; i++) {
5914
- if (S[i] == -INFINITY) {
5915
- S[i] = 0.0;
5916
- } else {
5917
- //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
5918
- ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
5919
- memcpy(&ss, &s, sizeof(ss));
5920
- const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
5921
- sum += val;
5922
- S[i] = val;
5923
  }
 
 
 
 
 
 
 
 
5924
  }
5925
 
5926
  assert(sum > 0.0f);
5927
 
5928
  sum = 1.0/sum;
5929
  ggml_vec_scale_f32(M, S, sum);
 
 
 
 
 
 
 
5930
  }
5931
 
5932
  for (int ic = 0; ic < nev1; ++ic) {
@@ -6001,6 +6033,8 @@ static void ggml_compute_forward_flash_attn_f16(
6001
  const int P = nek1 - N;
6002
  const int M = P + N;
6003
 
 
 
6004
  GGML_ASSERT(ne0 == D);
6005
  GGML_ASSERT(ne1 == N);
6006
  GGML_ASSERT(P >= 0);
@@ -6053,7 +6087,11 @@ static void ggml_compute_forward_flash_attn_f16(
6053
  const int iq2 = (ir - iq3*neq2*neq1)/neq1;
6054
  const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
6055
 
6056
- float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
 
 
 
 
6057
 
6058
  for (int ic = 0; ic < nek1; ++ic) {
6059
  // k indices
@@ -6084,30 +6122,50 @@ static void ggml_compute_forward_flash_attn_f16(
6084
  // softmax
6085
  {
6086
  float max = -INFINITY;
6087
- for (int i = 0; i < M; i++) {
6088
- max = MAX(max, S[i]);
6089
- }
 
 
 
 
6090
 
6091
- ggml_float sum = 0.0;
 
6092
 
6093
- uint16_t ss;
6094
- for (int i = 0; i < M; i++) {
6095
- if (S[i] == -INFINITY) {
6096
- S[i] = 0.0;
6097
- } else {
6098
- //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
6099
- ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
6100
- memcpy(&ss, &s, sizeof(ss));
6101
- const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
6102
- sum += val;
6103
- S[i] = val;
 
 
 
 
6104
  }
 
 
 
 
6105
  }
6106
 
6107
  assert(sum > 0.0f);
6108
 
6109
  sum = 1.0/sum;
6110
  ggml_vec_scale_f32(M, S, sum);
 
 
 
 
 
 
 
6111
  }
6112
 
6113
  ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
@@ -7188,14 +7246,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
7188
 
7189
  size_t cur = 0;
7190
 
 
 
7191
  if (node->src1->type == GGML_TYPE_F32) {
7192
- cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
7193
- cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
7194
  }
7195
 
7196
  if (node->src1->type == GGML_TYPE_F16) {
7197
- cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
7198
- cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
7199
  }
7200
 
7201
  work_size = MAX(work_size, cur);
 
81
 
82
  #define GGML_DEBUG 0
83
  #define GGML_GELU_FP16
84
+ #define GGML_SOFT_MAX_UNROLL 4
85
 
86
  #if UINTPTR_MAX == 0xFFFFFFFF
87
  #define GGML_MEM_ALIGN 4
 
311
  return CLOCKS_PER_SEC/1000;
312
  }
313
 
314
+ //#define GGML_PERF
315
  #ifdef GGML_PERF
316
  #define ggml_perf_time_ms() ggml_time_ms()
317
  #define ggml_perf_time_us() ggml_time_us()
 
1318
  return GGML_TYPE_SIZE[tensor->type];
1319
  }
1320
 
1321
+ static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
1322
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1323
 
1324
  return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
1325
  }
1326
 
1327
+ static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
1328
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1329
 
1330
  return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
1331
  }
1332
 
1333
+ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
1334
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1335
 
1336
  return tensor->ne[2] == 1 && tensor->ne[3] == 1;
1337
  }
1338
 
1339
+ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1340
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1341
 
1342
  return
 
1345
  (t0->ne[3] == t1->ne[3]);
1346
  }
1347
 
1348
+ static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
1349
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1350
 
1351
  return
 
1355
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
1356
  }
1357
 
1358
+ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
1359
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1360
 
1361
  return
 
1364
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
1365
  }
1366
 
1367
+ static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1368
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1369
 
1370
  return
 
1375
  }
1376
 
1377
  // check if t1 can be represented as a repetition of t0
1378
+ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
1379
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
1380
 
1381
  return
 
1385
  (t1->ne[3]%t0->ne[3] == 0);
1386
  }
1387
 
1388
+ static inline int ggml_up32(int n) {
1389
  return (n + 31) & ~31;
1390
  }
1391
 
1392
+ static inline int ggml_up64(int n) {
1393
  return (n + 63) & ~63;
1394
  }
1395
 
1396
+ static inline int ggml_up(int n, int m) {
1397
+ // assert m is a power of 2
1398
+ GGML_ASSERT((m & (m - 1)) == 0);
1399
+ return (n + m - 1) & ~(m - 1);
1400
+ }
1401
+
1402
  // assert that pointer is aligned to GGML_MEM_ALIGN
1403
  #define ggml_assert_aligned(ptr) \
1404
  assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 
5102
  #endif
5103
 
5104
  float max = -INFINITY;
5105
+ ggml_vec_max_f32(nc, &max, p);
 
 
5106
 
5107
  ggml_float sum = 0.0;
5108
 
5109
+ uint16_t scvt;
5110
  for (int i = 0; i < nc; i++) {
5111
  if (p[i] == -INFINITY) {
5112
+ p[i] = 0.0f;
5113
  } else {
5114
  //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
5115
  ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
5116
+ memcpy(&scvt, &s, sizeof(scvt));
5117
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
5118
  sum += val;
5119
  p[i] = val;
5120
  }
 
5826
  const int P = nek1 - N;
5827
  const int M = P + N;
5828
 
5829
+ const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
5830
+
5831
  GGML_ASSERT(ne0 == D);
5832
  GGML_ASSERT(ne1 == N);
5833
  GGML_ASSERT(P >= 0);
 
5880
  const int iq2 = (ir - iq3*neq2*neq1)/neq1;
5881
  const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
5882
 
5883
+ float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
5884
+
5885
+ for (int i = M; i < Mup; ++i) {
5886
+ S[i] = -INFINITY;
5887
+ }
5888
 
5889
  for (int ic = 0; ic < nek1; ++ic) {
5890
  // k indices
 
5915
  // softmax
5916
  {
5917
  float max = -INFINITY;
5918
+ ggml_vec_max_f32(M, &max, S);
5919
+
5920
+ float sum = 0.0f;
5921
+ {
5922
+ #ifndef GGML_USE_ACCELERATE
5923
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL];
5924
+ ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
5925
 
5926
+ for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
5927
+ float * SS = S + i;
5928
 
5929
+ for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
5930
+ if (SS[j] == -INFINITY) {
5931
+ SS[j] = 0.0f;
5932
+ } else {
5933
+ ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
5934
+ memcpy(&scvt[j], &s, sizeof(uint16_t));
5935
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
5936
+ sump[j] += val;
5937
+ SS[j] = val;
5938
+ }
5939
+ }
5940
  }
5941
+
5942
+ for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
5943
+ sum += sump[i];
5944
+ }
5945
+ #else
5946
+ vvexpf(S, S, &Mup);
5947
+ ggml_vec_sum_f32(Mup, &sum, S);
5948
+ #endif
5949
  }
5950
 
5951
  assert(sum > 0.0f);
5952
 
5953
  sum = 1.0/sum;
5954
  ggml_vec_scale_f32(M, S, sum);
5955
+
5956
+ #ifndef NDEBUG
5957
+ for (int i = 0; i < M; ++i) {
5958
+ assert(!isnan(S[i]));
5959
+ assert(!isinf(S[i]));
5960
+ }
5961
+ #endif
5962
  }
5963
 
5964
  for (int ic = 0; ic < nev1; ++ic) {
 
6033
  const int P = nek1 - N;
6034
  const int M = P + N;
6035
 
6036
+ const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
6037
+
6038
  GGML_ASSERT(ne0 == D);
6039
  GGML_ASSERT(ne1 == N);
6040
  GGML_ASSERT(P >= 0);
 
6087
  const int iq2 = (ir - iq3*neq2*neq1)/neq1;
6088
  const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
6089
 
6090
+ float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
6091
+
6092
+ for (int i = M; i < Mup; ++i) {
6093
+ S[i] = -INFINITY;
6094
+ }
6095
 
6096
  for (int ic = 0; ic < nek1; ++ic) {
6097
  // k indices
 
6122
  // softmax
6123
  {
6124
  float max = -INFINITY;
6125
+ ggml_vec_max_f32(M, &max, S);
6126
+
6127
+ float sum = 0.0f;
6128
+ {
6129
+ #ifndef GGML_USE_ACCELERATE
6130
+ uint16_t scvt[GGML_SOFT_MAX_UNROLL];
6131
+ ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
6132
 
6133
+ for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
6134
+ float * SS = S + i;
6135
 
6136
+ for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
6137
+ if (SS[j] == -INFINITY) {
6138
+ SS[j] = 0.0f;
6139
+ } else {
6140
+ ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
6141
+ memcpy(&scvt[j], &s, sizeof(uint16_t));
6142
+ const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
6143
+ sump[j] += val;
6144
+ SS[j] = val;
6145
+ }
6146
+ }
6147
+ }
6148
+
6149
+ for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
6150
+ sum += sump[i];
6151
  }
6152
+ #else
6153
+ vvexpf(S, S, &Mup);
6154
+ ggml_vec_sum_f32(Mup, &sum, S);
6155
+ #endif
6156
  }
6157
 
6158
  assert(sum > 0.0f);
6159
 
6160
  sum = 1.0/sum;
6161
  ggml_vec_scale_f32(M, S, sum);
6162
+
6163
+ #ifndef NDEBUG
6164
+ for (int i = 0; i < M; ++i) {
6165
+ assert(!isnan(S[i]));
6166
+ assert(!isinf(S[i]));
6167
+ }
6168
+ #endif
6169
  }
6170
 
6171
  ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
 
7246
 
7247
  size_t cur = 0;
7248
 
7249
+ const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
7250
+
7251
  if (node->src1->type == GGML_TYPE_F32) {
7252
+ cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
7253
+ cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
7254
  }
7255
 
7256
  if (node->src1->type == GGML_TYPE_F16) {
7257
+ cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
7258
+ cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
7259
  }
7260
 
7261
  work_size = MAX(work_size, cur);
whisper.cpp CHANGED
@@ -131,7 +131,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
131
  { "su", { 98, "sundanese", } },
132
  };
133
 
134
- static const size_t MB = 1024*1024;
135
 
136
  static const std::map<e_model, size_t> MEM_REQ_MODEL = {
137
  { MODEL_TINY, 74ull*MB },
 
131
  { "su", { 98, "sundanese", } },
132
  };
133
 
134
+ static const size_t MB = 3*1024*1024;
135
 
136
  static const std::map<e_model, size_t> MEM_REQ_MODEL = {
137
  { MODEL_TINY, 74ull*MB },