sxx-404 committed · Commit c47823e · Parent: 8614863

ggml: move fp16/bf16 conversion optimizations to CPU backend + export conversion APIs (llama/13107)


* ggml: dynamic x86_64 feature detection for FP32 <-> FP16/BF16 conversion

* move fp converter to ggml-cpu

* Switch ggml_compute_forward_get_rows_f16/bf16 to new ggml_cpu_fp16/bf16_to_fp32

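For orientation, here is a minimal sketch of how a caller might exercise the newly exported CPU-backend conversion helpers. The harness below is illustrative only, not part of the commit; it assumes a build that links the CPU backend and includes ggml-cpu.h:

#include "ggml-cpu.h"
#include <stdio.h>

int main(void) {
    // illustrative buffers; sizes and values are arbitrary
    float       src[8] = { 0.1f, -1.5f, 3.25f, 0.0f, 2.0f, -0.5f, 7.0f, 1.0f };
    ggml_fp16_t tmp[8];
    float       dst[8];

    // fp32 -> fp16 -> fp32 round trip; the helpers take the F16C/AVX512
    // paths when those are compiled in and fall back to scalar loops
    ggml_cpu_fp32_to_fp16(src, tmp, 8);
    ggml_cpu_fp16_to_fp32(tmp, dst, 8);

    for (int i = 0; i < 8; i++) {
        printf("%g -> %g\n", src[i], dst[i]);
    }
    return 0;
}
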
ggml/include/ggml-cpu.h CHANGED
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
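
Two details of the new ggml-cpu.c code worth noting: the conversion loops cascade through decreasing vector widths (16, then 8, then 4 elements per iteration) so any remainder of n is finished by the scalar tail without reading past the buffer, and the bf16 widening is pure bit manipulation. A scalar sketch of that bit trick (hypothetical helper, shown only to make the _mm*_cvtepu16_epi32 / _mm*_slli_epi32 pairs readable):

#include <stdint.h>
#include <string.h>

// bf16 keeps the top 16 bits of an IEEE-754 fp32 value, so widening is a
// zero-extend followed by a 16-bit left shift -- the same operation the
// AVX2/AVX512 paths above apply to 8 or 16 lanes at a time
static float bf16_bits_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float    f;
    memcpy(&f, &bits, sizeof(f)); // bit-level reinterpretation without aliasing UB
    return f;
}
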
ggml/src/ggml-cpu/ops.cpp CHANGED
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
 
     GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-    ggml_fp16_to_fp32_row(
+    ggml_cpu_fp16_to_fp32(
         (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
         (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
 }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
 
     GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-    ggml_bf16_to_fp32_row(
+    ggml_cpu_bf16_to_fp32(
         (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
         (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
 }
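
For context, the two renamed call sites live in the get-rows kernels, which gather one source row per index and widen it to fp32. A heavily simplified sketch of that pattern (illustrative names; the real code also walks the i11/i12 batch strides shown above):

#include "ggml-cpu.h"

// gather n_rows rows selected by ids from a contiguous fp16 matrix and
// widen each one to fp32 with the new CPU-backend helper
static void get_rows_f16_sketch(const ggml_fp16_t * src, const int32_t * ids,
                                float * dst, int64_t n_rows, int64_t n_cols) {
    for (int64_t r = 0; r < n_rows; r++) {
        const int64_t i01 = ids[r]; // row index from the index tensor
        ggml_cpu_fp16_to_fp32(src + i01*n_cols, dst + r*n_cols, n_cols);
    }
}
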
ggml/src/ggml.c CHANGED
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 // FIXME: required here for quantization functions
@@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-// currently, the ggml_cpu_has_* functions are entirely compile-time
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for(; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
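
Net effect of the ggml.c side: the core library keeps only portable scalar loops in ggml_fp32_to_fp16_row and ggml_bf16_to_fp32_row and no longer carries ISA-specific intrinsics, which resolves the deleted FIXME; the vectorized paths now live behind the ggml_cpu_* entry points in the CPU backend, where per-architecture builds and runtime feature detection can apply.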