ggml : speed-up soft max via Accelerate + unroll
- ggml.c +113 -53
- whisper.cpp +1 -1
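In brief: the scalar max loop in ggml_compute_forward_soft_max_f32 becomes a ggml_vec_max_f32 call, and the softmax inside both flash-attention kernels is unrolled by GGML_SOFT_MAX_UNROLL (4) with one partial sum per lane. Rows are padded up to a multiple of the unroll factor with -INFINITY, which exponentiates to zero, so the padding leaves the result unchanged; on Accelerate builds (GGML_USE_ACCELERATE) the exponentials go through vvexpf instead. Supporting changes: several tensor predicates become static inline, a ggml_up(n, m) round-up helper is added, the flash-attention work buffers are sized from the padded length, and whisper.cpp's MB constant grows to cover the larger buffers.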
ggml.c
CHANGED
@@ -81,6 +81,7 @@ typedef void* thread_ret_t;
 
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
+#define GGML_SOFT_MAX_UNROLL 4
 
 #if UINTPTR_MAX == 0xFFFFFFFF
 #define GGML_MEM_ALIGN 4
@@ -310,6 +311,7 @@ int64_t ggml_cycles_per_ms(void) {
     return CLOCKS_PER_SEC/1000;
 }
 
+//#define GGML_PERF
 #ifdef GGML_PERF
 #define ggml_perf_time_ms() ggml_time_ms()
 #define ggml_perf_time_us() ggml_time_us()
@@ -1316,25 +1318,25 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
 
-bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_is_vector(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 
-bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1343,7 +1345,7 @@ bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor *
         (t0->ne[3] == t1->ne[3]);
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1353,7 +1355,7 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
+static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1362,7 +1364,7 @@ bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1373,7 +1375,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
 }
 
 // check if t1 can be represented as a repeatition of t0
-bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -1383,14 +1385,20 @@ bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t
         (t1->ne[3]%t0->ne[3] == 0);
 }
 
-int ggml_up32(int n) {
+static inline int ggml_up32(int n) {
     return (n + 31) & ~31;
 }
 
-int ggml_up64(int n) {
+static inline int ggml_up64(int n) {
     return (n + 63) & ~63;
 }
 
+static inline int ggml_up(int n, int m) {
+    // assert m is a power of 2
+    GGML_ASSERT((m & (m - 1)) == 0);
+    return (n + m - 1) & ~(m - 1);
+}
+
 // assert that pointer is aligned to GGML_MEM_ALIGN
 #define ggml_assert_aligned(ptr) \
     assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
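The new ggml_up generalizes ggml_up32/ggml_up64: adding m - 1 overshoots into the next block, and the mask clears the low bits, rounding n up to a multiple of the power-of-two m. A minimal standalone check (hypothetical harness, not part of the commit):

#include <assert.h>
#include <stdio.h>

// round n up to the next multiple of m; m must be a power of 2
static int up(int n, int m) {
    assert((m & (m - 1)) == 0);
    return (n + m - 1) & ~(m - 1);
}

int main(void) {
    printf("%d\n", up(5, 4));   // 8  - padded up to the unroll factor
    printf("%d\n", up(8, 4));   // 8  - already a multiple, unchanged
    printf("%d\n", up(33, 32)); // 64 - matches ggml_up32's (n + 31) & ~31
    return 0;
}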
@@ -5094,21 +5102,19 @@ static void ggml_compute_forward_soft_max_f32(
 #endif
 
         float max = -INFINITY;
-        for (int i = 0; i < nc; i++) {
-            max = MAX(max, p[i]);
-        }
+        ggml_vec_max_f32(nc, &max, p);
 
         ggml_float sum = 0.0;
 
-        uint16_t ss;
+        uint16_t scvt;
         for (int i = 0; i < nc; i++) {
             if (p[i] == -INFINITY) {
-                p[i] = 0.0;
+                p[i] = 0.0f;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
                 ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
-                memcpy(&ss, &s, sizeof(ss));
-                const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
+                memcpy(&scvt, &s, sizeof(scvt));
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
                 sum += val;
                 p[i] = val;
             }
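For reference, both this kernel and the flash-attention ones compute the numerically stable softmax,

    softmax(x)_i = exp(x_i - m) / sum_j exp(x_j - m),    m = max_k x_k,

which equals the plain definition because the common factor exp(-m) cancels between numerator and denominator; it also keeps every exponent at or below zero, so the fp16 value whose bit pattern indexes the precomputed table_exp_f16 stays in range.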
@@ -5820,6 +5826,8 @@ static void ggml_compute_forward_flash_attn_f32(
     const int P = nek1 - N;
     const int M = P + N;
 
+    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne1 == N);
     GGML_ASSERT(P >= 0);
@@ -5872,7 +5880,11 @@ static void ggml_compute_forward_flash_attn_f32(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float * S = (float *) params->wdata + ith*(M + CACHE_LINE_SIZE_F32);
+        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
+
+        for (int i = M; i < Mup; ++i) {
+            S[i] = -INFINITY;
+        }
 
         for (int ic = 0; ic < nek1; ++ic) {
             // k indices
@@ -5903,30 +5915,50 @@ static void ggml_compute_forward_flash_attn_f32(
         // softmax
         {
             float max = -INFINITY;
-            for (int i = 0; i < M; ++i) {
-                max = MAX(max, S[i]);
-            }
+            ggml_vec_max_f32(M, &max, S);
+
+            float sum = 0.0f;
+            {
+#ifndef GGML_USE_ACCELERATE
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
-            ggml_float sum = 0.0;
+                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
+                    float * SS = S + i;
 
-            uint16_t ss;
-            for (int i = 0; i < M; i++) {
-                if (S[i] == -INFINITY) {
-                    S[i] = 0.0f;
-                } else {
-                    //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
-                    memcpy(&ss, &s, sizeof(ss));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
-                    sum += val;
-                    S[i] = val;
+                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
+                        if (SS[j] == -INFINITY) {
+                            SS[j] = 0.0f;
+                        } else {
+                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
+                            memcpy(&scvt[j], &s, sizeof(uint16_t));
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            sump[j] += val;
+                            SS[j] = val;
+                        }
+                    }
                 }
+
+                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
+                    sum += sump[i];
+                }
+#else
+                vvexpf(S, S, &Mup);
+                ggml_vec_sum_f32(Mup, &sum, S);
+#endif
             }
 
             assert(sum > 0.0f);
 
             sum = 1.0/sum;
             ggml_vec_scale_f32(M, S, sum);
+
+#ifndef NDEBUG
+            for (int i = 0; i < M; ++i) {
+                assert(!isnan(S[i]));
+                assert(!isinf(S[i]));
+            }
+#endif
         }
 
         for (int ic = 0; ic < nev1; ++ic) {
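A minimal standalone sketch of the unrolled pattern above, with expf standing in for ggml's fp16 table lookup (names are assumed, not from the commit): padding the row to a multiple of the unroll factor means the blocked loop needs no tail handling, and giving each lane its own partial sum removes the serial dependency on a single accumulator.

#include <math.h>
#include <stdio.h>

#define UNROLL 4

// Softmax over the first n entries of S, which must hold nup floats,
// where nup is n rounded up to a multiple of UNROLL.
static void softmax_unrolled(int n, int nup, float * S) {
    for (int i = n; i < nup; ++i) {
        S[i] = -INFINITY; // padding lanes exponentiate to zero
    }

    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = fmaxf(max, S[i]);
    }

    float sump[UNROLL] = { 0.0f }; // one partial sum per lane
    for (int i = 0; i < nup; i += UNROLL) {
        float * SS = S + i;
        for (int j = 0; j < UNROLL; ++j) {
            if (SS[j] == -INFINITY) {
                SS[j] = 0.0f;
            } else {
                const float val = expf(SS[j] - max);
                sump[j] += val;
                SS[j] = val;
            }
        }
    }

    float sum = 0.0f;
    for (int i = 0; i < UNROLL; ++i) {
        sum += sump[i]; // combine the lanes once, after the loop
    }

    for (int i = 0; i < n; ++i) {
        S[i] /= sum;
    }
}

int main(void) {
    float S[8] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f }; // n = 5 padded to nup = 8
    softmax_unrolled(5, 8, S);
    for (int i = 0; i < 5; ++i) {
        printf("%.6f\n", S[i]); // the five values sum to 1
    }
    return 0;
}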
@@ -6001,6 +6033,8 @@ static void ggml_compute_forward_flash_attn_f16(
     const int P = nek1 - N;
     const int M = P + N;
 
+    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
+
     GGML_ASSERT(ne0 == D);
     GGML_ASSERT(ne1 == N);
     GGML_ASSERT(P >= 0);
@@ -6053,7 +6087,11 @@ static void ggml_compute_forward_flash_attn_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
+        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
+
+        for (int i = M; i < Mup; ++i) {
+            S[i] = -INFINITY;
+        }
 
         for (int ic = 0; ic < nek1; ++ic) {
             // k indices
@@ -6084,30 +6122,50 @@ static void ggml_compute_forward_flash_attn_f16(
         // softmax
         {
             float max = -INFINITY;
-            for (int i = 0; i < M; ++i) {
-                max = MAX(max, S[i]);
-            }
+            ggml_vec_max_f32(M, &max, S);
+
+            float sum = 0.0f;
+            {
+#ifndef GGML_USE_ACCELERATE
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
-            ggml_float sum = 0.0;
+                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
+                    float * SS = S + i;
 
-            uint16_t ss;
-            for (int i = 0; i < M; i++) {
-                if (S[i] == -INFINITY) {
-                    S[i] = 0.0f;
-                } else {
-                    //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
-                    memcpy(&ss, &s, sizeof(ss));
-                    const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
-                    sum += val;
-                    S[i] = val;
+                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
+                        if (SS[j] == -INFINITY) {
+                            SS[j] = 0.0f;
+                        } else {
+                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
+                            memcpy(&scvt[j], &s, sizeof(uint16_t));
+                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+                            sump[j] += val;
+                            SS[j] = val;
+                        }
+                    }
                 }
+
+                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
+                    sum += sump[i];
+                }
+#else
+                vvexpf(S, S, &Mup);
+                ggml_vec_sum_f32(Mup, &sum, S);
+#endif
             }
 
             assert(sum > 0.0f);
 
             sum = 1.0/sum;
             ggml_vec_scale_f32(M, S, sum);
+
+#ifndef NDEBUG
+            for (int i = 0; i < M; ++i) {
+                assert(!isnan(S[i]));
+                assert(!isinf(S[i]));
+            }
+#endif
         }
 
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
@@ -7188,14 +7246,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
+                        const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
+
                         if (node->src1->type == GGML_TYPE_F32) {
-                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks;
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks;
+                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
                         }
 
                         if (node->src1->type == GGML_TYPE_F16) {
-                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks;
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks;
+                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
+                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
                         }
 
                         work_size = MAX(work_size, cur);
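A worked example of this sizing, with hypothetical numbers: for node->src1->ne[1] = 1501 and node->n_tasks = 8, ne11 = ggml_up(1501, 4) = 1504, so the F32 path reserves cur = 2 * sizeof(float) * 1504 * 8 = 96256 bytes, roughly 94 KiB of scratch.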
|
| 81 |
|
| 82 |
#define GGML_DEBUG 0
|
| 83 |
#define GGML_GELU_FP16
|
| 84 |
+
#define GGML_SOFT_MAX_UNROLL 4
|
| 85 |
|
| 86 |
#if UINTPTR_MAX == 0xFFFFFFFF
|
| 87 |
#define GGML_MEM_ALIGN 4
|
|
|
|
| 311 |
return CLOCKS_PER_SEC/1000;
|
| 312 |
}
|
| 313 |
|
| 314 |
+
//#define GGML_PERF
|
| 315 |
#ifdef GGML_PERF
|
| 316 |
#define ggml_perf_time_ms() ggml_time_ms()
|
| 317 |
#define ggml_perf_time_us() ggml_time_us()
|
|
|
|
| 1318 |
return GGML_TYPE_SIZE[tensor->type];
|
| 1319 |
}
|
| 1320 |
|
| 1321 |
+
static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
|
| 1322 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1323 |
|
| 1324 |
return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 1325 |
}
|
| 1326 |
|
| 1327 |
+
static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
|
| 1328 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1329 |
|
| 1330 |
return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 1331 |
}
|
| 1332 |
|
| 1333 |
+
static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
|
| 1334 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1335 |
|
| 1336 |
return tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
| 1337 |
}
|
| 1338 |
|
| 1339 |
+
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
| 1340 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1341 |
|
| 1342 |
return
|
|
|
|
| 1345 |
(t0->ne[3] == t1->ne[3]);
|
| 1346 |
}
|
| 1347 |
|
| 1348 |
+
static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
| 1349 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1350 |
|
| 1351 |
return
|
|
|
|
| 1355 |
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
| 1356 |
}
|
| 1357 |
|
| 1358 |
+
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
|
| 1359 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1360 |
|
| 1361 |
return
|
|
|
|
| 1364 |
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
| 1365 |
}
|
| 1366 |
|
| 1367 |
+
static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
| 1368 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1369 |
|
| 1370 |
return
|
|
|
|
| 1375 |
}
|
| 1376 |
|
| 1377 |
// check if t1 can be represented as a repeatition of t0
|
| 1378 |
+
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
| 1379 |
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
| 1380 |
|
| 1381 |
return
|
|
|
|
| 1385 |
(t1->ne[3]%t0->ne[3] == 0);
|
| 1386 |
}
|
| 1387 |
|
| 1388 |
+
static inline int ggml_up32(int n) {
|
| 1389 |
return (n + 31) & ~31;
|
| 1390 |
}
|
| 1391 |
|
| 1392 |
+
static inline int ggml_up64(int n) {
|
| 1393 |
return (n + 63) & ~63;
|
| 1394 |
}
|
| 1395 |
|
| 1396 |
+
static inline int ggml_up(int n, int m) {
|
| 1397 |
+
// assert m is a power of 2
|
| 1398 |
+
GGML_ASSERT((m & (m - 1)) == 0);
|
| 1399 |
+
return (n + m - 1) & ~(m - 1);
|
| 1400 |
+
}
|
| 1401 |
+
|
| 1402 |
// assert that pointer is aligned to GGML_MEM_ALIGN
|
| 1403 |
#define ggml_assert_aligned(ptr) \
|
| 1404 |
assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
|
|
|
|
| 5102 |
#endif
|
| 5103 |
|
| 5104 |
float max = -INFINITY;
|
| 5105 |
+
ggml_vec_max_f32(nc, &max, p);
|
|
|
|
|
|
|
| 5106 |
|
| 5107 |
ggml_float sum = 0.0;
|
| 5108 |
|
| 5109 |
+
uint16_t scvt;
|
| 5110 |
for (int i = 0; i < nc; i++) {
|
| 5111 |
if (p[i] == -INFINITY) {
|
| 5112 |
+
p[i] = 0.0f;
|
| 5113 |
} else {
|
| 5114 |
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
| 5115 |
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
|
| 5116 |
+
memcpy(&scvt, &s, sizeof(scvt));
|
| 5117 |
+
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
| 5118 |
sum += val;
|
| 5119 |
p[i] = val;
|
| 5120 |
}
|
|
|
|
| 5826 |
const int P = nek1 - N;
|
| 5827 |
const int M = P + N;
|
| 5828 |
|
| 5829 |
+
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
| 5830 |
+
|
| 5831 |
GGML_ASSERT(ne0 == D);
|
| 5832 |
GGML_ASSERT(ne1 == N);
|
| 5833 |
GGML_ASSERT(P >= 0);
|
|
|
|
| 5880 |
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
| 5881 |
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
| 5882 |
|
| 5883 |
+
float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
|
| 5884 |
+
|
| 5885 |
+
for (int i = M; i < Mup; ++i) {
|
| 5886 |
+
S[i] = -INFINITY;
|
| 5887 |
+
}
|
| 5888 |
|
| 5889 |
for (int ic = 0; ic < nek1; ++ic) {
|
| 5890 |
// k indices
|
|
|
|
| 5915 |
// softmax
|
| 5916 |
{
|
| 5917 |
float max = -INFINITY;
|
| 5918 |
+
ggml_vec_max_f32(M, &max, S);
|
| 5919 |
+
|
| 5920 |
+
float sum = 0.0f;
|
| 5921 |
+
{
|
| 5922 |
+
#ifndef GGML_USE_ACCELERATE
|
| 5923 |
+
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
| 5924 |
+
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
| 5925 |
|
| 5926 |
+
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
| 5927 |
+
float * SS = S + i;
|
| 5928 |
|
| 5929 |
+
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
|
| 5930 |
+
if (SS[j] == -INFINITY) {
|
| 5931 |
+
SS[j] = 0.0f;
|
| 5932 |
+
} else {
|
| 5933 |
+
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
| 5934 |
+
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
| 5935 |
+
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
| 5936 |
+
sump[j] += val;
|
| 5937 |
+
SS[j] = val;
|
| 5938 |
+
}
|
| 5939 |
+
}
|
| 5940 |
}
|
| 5941 |
+
|
| 5942 |
+
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
|
| 5943 |
+
sum += sump[i];
|
| 5944 |
+
}
|
| 5945 |
+
#else
|
| 5946 |
+
vvexpf(S, S, &Mup);
|
| 5947 |
+
ggml_vec_sum_f32(Mup, &sum, S);
|
| 5948 |
+
#endif
|
| 5949 |
}
|
| 5950 |
|
| 5951 |
assert(sum > 0.0f);
|
| 5952 |
|
| 5953 |
sum = 1.0/sum;
|
| 5954 |
ggml_vec_scale_f32(M, S, sum);
|
| 5955 |
+
|
| 5956 |
+
#ifndef NDEBUG
|
| 5957 |
+
for (int i = 0; i < M; ++i) {
|
| 5958 |
+
assert(!isnan(S[i]));
|
| 5959 |
+
assert(!isinf(S[i]));
|
| 5960 |
+
}
|
| 5961 |
+
#endif
|
| 5962 |
}
|
| 5963 |
|
| 5964 |
for (int ic = 0; ic < nev1; ++ic) {
|
|
|
|
| 6033 |
const int P = nek1 - N;
|
| 6034 |
const int M = P + N;
|
| 6035 |
|
| 6036 |
+
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
| 6037 |
+
|
| 6038 |
GGML_ASSERT(ne0 == D);
|
| 6039 |
GGML_ASSERT(ne1 == N);
|
| 6040 |
GGML_ASSERT(P >= 0);
|
|
|
|
| 6087 |
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
| 6088 |
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
| 6089 |
|
| 6090 |
+
float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
|
| 6091 |
+
|
| 6092 |
+
for (int i = M; i < Mup; ++i) {
|
| 6093 |
+
S[i] = -INFINITY;
|
| 6094 |
+
}
|
| 6095 |
|
| 6096 |
for (int ic = 0; ic < nek1; ++ic) {
|
| 6097 |
// k indices
|
|
|
|
| 6122 |
// softmax
|
| 6123 |
{
|
| 6124 |
float max = -INFINITY;
|
| 6125 |
+
ggml_vec_max_f32(M, &max, S);
|
| 6126 |
+
|
| 6127 |
+
float sum = 0.0f;
|
| 6128 |
+
{
|
| 6129 |
+
#ifndef GGML_USE_ACCELERATE
|
| 6130 |
+
uint16_t scvt[GGML_SOFT_MAX_UNROLL];
|
| 6131 |
+
ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
|
| 6132 |
|
| 6133 |
+
for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
|
| 6134 |
+
float * SS = S + i;
|
| 6135 |
|
| 6136 |
+
for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
|
| 6137 |
+
if (SS[j] == -INFINITY) {
|
| 6138 |
+
SS[j] = 0.0f;
|
| 6139 |
+
} else {
|
| 6140 |
+
ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
|
| 6141 |
+
memcpy(&scvt[j], &s, sizeof(uint16_t));
|
| 6142 |
+
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
|
| 6143 |
+
sump[j] += val;
|
| 6144 |
+
SS[j] = val;
|
| 6145 |
+
}
|
| 6146 |
+
}
|
| 6147 |
+
}
|
| 6148 |
+
|
| 6149 |
+
for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
|
| 6150 |
+
sum += sump[i];
|
| 6151 |
}
|
| 6152 |
+
#else
|
| 6153 |
+
vvexpf(S, S, &Mup);
|
| 6154 |
+
ggml_vec_sum_f32(Mup, &sum, S);
|
| 6155 |
+
#endif
|
| 6156 |
}
|
| 6157 |
|
| 6158 |
assert(sum > 0.0f);
|
| 6159 |
|
| 6160 |
sum = 1.0/sum;
|
| 6161 |
ggml_vec_scale_f32(M, S, sum);
|
| 6162 |
+
|
| 6163 |
+
#ifndef NDEBUG
|
| 6164 |
+
for (int i = 0; i < M; ++i) {
|
| 6165 |
+
assert(!isnan(S[i]));
|
| 6166 |
+
assert(!isinf(S[i]));
|
| 6167 |
+
}
|
| 6168 |
+
#endif
|
| 6169 |
}
|
| 6170 |
|
| 6171 |
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
|
|
|
| 7246 |
|
| 7247 |
size_t cur = 0;
|
| 7248 |
|
| 7249 |
+
const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
| 7250 |
+
|
| 7251 |
if (node->src1->type == GGML_TYPE_F32) {
|
| 7252 |
+
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
| 7253 |
+
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
|
| 7254 |
}
|
| 7255 |
|
| 7256 |
if (node->src1->type == GGML_TYPE_F16) {
|
| 7257 |
+
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
| 7258 |
+
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
|
| 7259 |
}
|
| 7260 |
|
| 7261 |
work_size = MAX(work_size, cur);
|
whisper.cpp
CHANGED
@@ -131,7 +131,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "su", { 98, "sundanese", } },
 };
 
-static const size_t MB = 1024*1024;
+static const size_t MB = 3*1024*1024;
 
 static const std::map<e_model, size_t> MEM_REQ_MODEL = {
     { MODEL_TINY, 74ull*MB },
|