ggml: move fp16/bf16 conversion optimizations to CPU backend + export conversion APIs (llama/13107)
* ggml: dynamic x86_64 feature detection for FP32 <-> FP16/BF16 conversion
* move fp converter to ggml-cpu
* Switch ggml_compute_forward_get_rows_f16/bf16 to new ggml_cpu_fp16/bf16_to_fp32

Files changed:
- ggml/include/ggml-cpu.h +5 -0
- ggml/src/ggml-cpu/ggml-cpu.c +89 -2
- ggml/src/ggml-cpu/ops.cpp +2 -2
- ggml/src/ggml.c +5 -46
ggml/include/ggml-cpu.h
CHANGED
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
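The four new exports follow the existing row-converter convention: source pointer, destination pointer, element count. A minimal usage sketch, assuming a build where ggml.h and ggml-cpu.h are on the include path and the CPU backend is linked (the buffer contents are illustrative):

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main(void) {
        float       src[4] = { 0.5f, -1.0f, 3.25f, 65504.0f };
        ggml_fp16_t f16[4];
        float       out[4];

        ggml_cpu_fp32_to_fp16(src, f16, 4);  // narrow; uses the F16C/AVX512 paths when compiled in
        ggml_cpu_fp16_to_fp32(f16, out, 4);  // widen back; out[i] matches src[i] up to fp16 rounding

        return 0;
    }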
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
 
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
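The vector path in ggml_cpu_bf16_to_fp32 above works because bf16 is defined as the upper 16 bits of an IEEE-754 binary32: zero-extending each 16-bit value and shifting it left by 16 reproduces the exact fp32 bit pattern. A scalar equivalent of that shift, for illustration (the helper name is hypothetical, not part of the commit):

    #include <stdint.h>
    #include <string.h>

    static float bf16_bits_to_fp32(uint16_t h) {
        uint32_t bits = (uint32_t) h << 16;  // bf16 payload becomes the high half of the fp32 bits
        float f;
        memcpy(&f, &bits, sizeof f);         // bit-cast without strict-aliasing UB
        return f;
    }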
ggml/src/ggml-cpu/ops.cpp
CHANGED
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            ggml_fp16_to_fp32_row(
+            ggml_cpu_fp16_to_fp32(
                 (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            ggml_bf16_to_fp32_row(
+            ggml_cpu_bf16_to_fp32(
                 (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }
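Both call sites keep the same addressing: ggml tensors carry per-dimension byte strides (nb01, nb02, nb03), so the source row selected by indices i01/i11/i12 starts at src0->data + i01*nb01 + i11*nb02 + i12*nb03. A reduced sketch of that arithmetic, with a hypothetical helper name:

    #include <stddef.h>
    #include <stdint.h>

    // Byte address of one source row, given per-dimension byte strides.
    static const void * get_row_ptr(const void * data,
                                    int64_t i01, int64_t i11, int64_t i12,
                                    size_t nb01, size_t nb02, size_t nb03) {
        return (const char *) data + i01*nb01 + i11*nb02 + i12*nb03;
    }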
ggml/src/ggml.c
CHANGED
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 // FIXME: required here for quantization functions
@@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-// currently, the ggml_cpu_has_* functions are entirely compile-time
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for(; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
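The deleted FIXME explains the motivation: these row helpers live in the core ggml library, where the ggml_cpu_has_* checks are compile-time only, so the SIMD paths could never be selected per machine at run time; moving them into the ggml-cpu backend lets per-architecture builds carry the right variants. For contrast, one common runtime-detection pattern on GCC/Clang looks like the sketch below (illustrative only, and explicitly not what this commit does):

    #include <stdbool.h>

    static bool runtime_has_f16c(void) {
    #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
        return __builtin_cpu_supports("f16c");  // CPUID-based check at run time
    #else
        return false;
    #endif
    }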