ggerganov committed
Commit 1b62c96 · 1 parent: 524a01b

ggml : refactoring (llama/#0)

ggml/include/ggml.h CHANGED
@@ -358,6 +358,7 @@ extern "C" {
 
     struct ggml_object;
     struct ggml_context;
+    struct ggml_cgraph;
 
     // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
@@ -579,23 +580,9 @@ extern "C" {
        GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type type;
 
        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
 
@@ -659,7 +646,7 @@ extern "C" {
 
    struct ggml_threadpool; // forward declaration, see ggml.c
 
-    typedef struct ggml_threadpool * ggml_threadpool_t;
+    typedef struct ggml_threadpool * ggml_threadpool_t;
 
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
@@ -675,35 +662,6 @@ extern "C" {
        void * abort_callback_data;
     };
 
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    typedef uint32_t ggml_bitset_t;
-
-    struct ggml_hash_set {
-        size_t size;
-        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
-        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_set;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
    // scratch buffer
    struct ggml_scratch {
        size_t offs;
@@ -2021,8 +1979,6 @@ extern "C" {
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
 
-    #define GGML_N_TASKS_MAX -1
-
    GGML_API struct ggml_tensor * ggml_map_custom1(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -2103,7 +2059,6 @@ extern "C" {
    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
 
-
    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep);
 
@@ -2118,25 +2073,31 @@ extern "C" {
            float wd); // weight decay
 
    // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
-    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
+    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
+
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params *p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_threadpool*       ggml_threadpool_new           (struct ggml_threadpool_params * params);
-    GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
+    GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
 
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
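
With struct ggml_cgraph now only forward-declared in the public header (its definition moves to ggml-impl.h, see below), callers are expected to go through the accessor functions above instead of touching graph fields. A minimal migration sketch — inspect_graph is a hypothetical helper, and the graph is assumed to be already built with at least one node:

    #include "ggml.h"

    static void inspect_graph(struct ggml_cgraph * gf) {
        // before: gf->nodes[gf->n_nodes - 1]
        // now: a negative index counts back from the end of the node list
        struct ggml_tensor * last = ggml_graph_node(gf, -1);
        (void) last;

        // before: for (int i = 0; i < gf->n_nodes; i++) ... gf->nodes[i] ...
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            struct ggml_tensor * node = ggml_graph_node(gf, i);
            (void) node; // e.g. inspect node->name or node->op here
        }
    }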
ggml/src/ggml-aarch64.c CHANGED
@@ -4,6 +4,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
 
 #include <math.h>
 #include <string.h>
ggml/src/ggml-blas.cpp CHANGED
@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"
 
ggml/src/ggml-cann.cpp CHANGED
@@ -30,6 +30,7 @@
 #include <cstring>
 #include <mutex>
 
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
ggml/src/ggml-cuda.cu CHANGED
@@ -1,5 +1,5 @@
 #include "ggml-cuda.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include "ggml-cuda/common.cuh"
ggml/src/ggml-impl.h CHANGED
@@ -1,15 +1,17 @@
 #pragma once
 
-#include "ggml.h"
-
 // GGML internal header
 
+#include "ggml.h"
+
 #include <assert.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
 #include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #undef MIN
 #undef MAX
@@ -17,96 +19,6 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#if defined(_MSC_VER)
-
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───┐
- *     0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───────────────────┐
- *     0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- *       ┌sign
- *       │
- *       │  ┌exponent
- *       │  │
- *       │  │    ┌mantissa
- *       │  │    │
- *       │┌─┴─┐┌─┴──────┐
- *     0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This is binary identical with Google Brain float conversion.
- * Floats shall round to nearest even, and NANs shall be quiet.
- * Subnormals aren't flushed to zero, except perhaps when used.
- * This code should vectorize nicely if using modern compilers.
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
-    ggml_bf16_t h;
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.f = s;
-    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
-        h.bits = (u.i >> 16) | 64; /* force to quiet */
-        return h;
-    }
-    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
-    return h;
-}
-
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -121,516 +33,10 @@ extern "C" {
 #endif
 #endif
 
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#include <sys/prctl.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddlvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddlvq_s16(int16x8_t v) {
-    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
-    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-#define ggml_vqtbl1q_s8   vqtbl1q_s8
-#define ggml_vqtbl1q_u8   vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
-/* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
-}
-
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
-}
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
-
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif // __ARM_FEATURE_SVE
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
 // bitset
 
+typedef uint32_t ggml_bitset_t;
+
 static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
@@ -656,6 +62,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
 #define GGML_HASHSET_FULL ((size_t)-1)
 #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
+struct ggml_hash_set {
+    size_t size;
+    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
 struct ggml_hash_set ggml_hash_set_new(size_t size);
 void ggml_hash_set_free(struct ggml_hash_set * hash_set);
 
@@ -745,6 +157,30 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
     GGML_ABORT("fatal error");
 }
 
+// computation graph
+
+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
+struct ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct ggml_tensor ** nodes;
+    struct ggml_tensor ** grads;
+    struct ggml_tensor ** leafs;
+
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+};
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
 #ifdef __cplusplus
 }
 #endif
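
The hash set above tracks occupancy with a bitset indexed through the BITSET_SHR/BITSET_MASK constants. A standalone sketch of the indexing scheme — bitset_get is a local stand-in for ggml_bitset_get, written here only for illustration:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t ggml_bitset_t;
    #define BITSET_SHR  5                             // log2(32 bits per word)
    #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) // 31

    static int bitset_get(const ggml_bitset_t * bitset, size_t i) {
        // word index = i / 32, bit index = i % 32
        return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
    }

    int main(void) {
        ggml_bitset_t words[2] = {0, 0};
        words[40 >> BITSET_SHR] |= 1u << (40 & BITSET_MASK); // set bit 40
        printf("%d %d\n", bitset_get(words, 40), bitset_get(words, 41)); // prints: 1 0
        return 0;
    }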
ggml/src/ggml-kompute.cpp CHANGED
@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-kompute.h"
ggml/src/ggml-metal.m CHANGED
@@ -1,7 +1,7 @@
 #import "ggml-metal.h"
 
+#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
-#import "ggml.h"
 
 #import <Foundation/Foundation.h>
 
@@ -885,7 +885,7 @@ static enum ggml_status ggml_metal_graph_compute(
    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel
 
-    const int n_nodes = gf->n_nodes;
+    const int n_nodes = gf->n_nodes;
    const int n_cb = ctx->n_cb;
    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
ggml/src/ggml-quants.c CHANGED
@@ -3,6 +3,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
 
 
 #include <math.h>
ggml/src/ggml-rpc.cpp CHANGED
@@ -1,5 +1,5 @@
 #include "ggml-rpc.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include <cinttypes>
ggml/src/ggml-sycl.cpp CHANGED
@@ -33,7 +33,7 @@
 #include <sycl/half_type.hpp>
 
 #include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include "ggml-sycl/backend.hpp"
ggml/src/ggml-vulkan.cpp CHANGED
@@ -21,7 +21,7 @@
 #include <memory>
 #include <mutex>
 
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
 #include "ggml-vulkan-shaders.hpp"
ggml/src/ggml.c CHANGED
@@ -3,6 +3,7 @@
 
 #include "ggml-backend.h"
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
 #include "ggml-aarch64.h"
@@ -288,6 +289,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
+#define GGML_N_TASKS_MAX (-1)
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2
@@ -1121,21 +1123,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F32x4_ADD          vaddq_f32
 #define GGML_F32x4_MUL          vmulq_f32
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);         \
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);         \
 }
 
 #define GGML_F32_VEC        GGML_F32x4
@@ -1162,30 +1164,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
 #define GGML_F16x8_ADD          vaddq_f16
 #define GGML_F16x8_MUL          vmulq_f16
-#define GGML_F16x8_REDUCE(res, x)                             \
-do {                                                          \
-    int offset = GGML_F16_ARR >> 1;                           \
-    for (int i = 0; i < offset; ++i) {                        \
-        x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-    }                                                         \
-    offset >>= 1;                                             \
-    for (int i = 0; i < offset; ++i) {                        \
-        x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-    }                                                         \
-    offset >>= 1;                                             \
-    for (int i = 0; i < offset; ++i) {                        \
-        x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-    }                                                         \
-    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-    res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+#define GGML_F16x8_REDUCE(res, x)                               \
+do {                                                            \
+    int offset = GGML_F16_ARR >> 1;                             \
+    for (int i = 0; i < offset; ++i) {                          \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+    }                                                           \
+    offset >>= 1;                                               \
+    for (int i = 0; i < offset; ++i) {                          \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+    }                                                           \
+    offset >>= 1;                                               \
+    for (int i = 0; i < offset; ++i) {                          \
+        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+    }                                                           \
+    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
 } while (0)
 
 #define GGML_F16_VEC                GGML_F16x8
 #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
 #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
 #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
 #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
 #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
 #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
@@ -1894,6 +1896,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif
 
+//
+// ggml object
+//
+
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    enum ggml_object_type type;
+
+    char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 //
 // ggml context
 //
@@ -19408,6 +19427,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     ggml_hash_set_reset(&cgraph->visited_hash_set);
 }
 
+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+    return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+    if (i < 0) {
+        GGML_ASSERT(cgraph->n_nodes + i >= 0);
+        return cgraph->nodes[cgraph->n_nodes + i];
+    }
+
+    GGML_ASSERT(i < cgraph->n_nodes);
+    return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+    cgraph->nodes[cgraph->n_nodes] = tensor;
+    cgraph->n_nodes++;
+}
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
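
Aside from relocating struct ggml_object and adding the graph accessors, the only change to the NEON reduce macros above is hygiene: the x and res parameters are now parenthesized at every expansion site. A self-contained illustration of why that matters — the SQUARE macros are hypothetical and not from this commit:

    #include <stdio.h>

    #define SQUARE_UNSAFE(v) (v * v)     // 1 + 2 expands to 1 + 2 * 1 + 2
    #define SQUARE_SAFE(v)   ((v) * (v)) // 1 + 2 expands to (1 + 2) * (1 + 2)

    int main(void) {
        printf("%d\n", SQUARE_UNSAFE(1 + 2)); // prints 5
        printf("%d\n", SQUARE_SAFE(1 + 2));   // prints 9
        return 0;
    }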
src/ggml-cpu-impl.h ADDED
@@ -0,0 +1,614 @@
+#pragma once
+
+// GGML CPU internal header
+
+#include "ggml.h"
+#include "ggml-impl.h"
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
+//#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_MSC_VER)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+#include <sys/prctl.h>
+#endif
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+#if defined(__ARM_NEON)
+
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+
+typedef uint16_t ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+typedef __fp16 ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddlvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+    return vcombine_s32(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8
+#define ggml_vqtbl1q_u8   vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    ggml_fp16_internal_t tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    ggml_fp16_internal_t tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=f"(f):
+        /* in */   "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */  "=r"(r):
+        /* in */   "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
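
The bf16 helpers carried into this header are plain bit manipulation, so their rounding behavior is easy to check in isolation. A small standalone harness — bf16_t and the two functions are local copies of ggml_bf16_t and the ggml_compute_* helpers above, renamed to keep the example self-contained:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint16_t bits; } bf16_t; // stand-in for ggml_bf16_t

    static float bf16_to_fp32(bf16_t h) {
        union { float f; uint32_t i; } u;
        u.i = (uint32_t) h.bits << 16; // bf16 is the top 16 bits of binary32
        return u.f;
    }

    static bf16_t fp32_to_bf16(float s) {
        bf16_t h;
        union { float f; uint32_t i; } u;
        u.f = s;
        if ((u.i & 0x7fffffff) > 0x7f800000) { // NaN: force to quiet
            h.bits = (u.i >> 16) | 64;
            return h;
        }
        h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; // round to nearest even
        return h;
    }

    int main(void) {
        float x = 3.14159f;
        bf16_t h = fp32_to_bf16(x);
        // e.g. 3.141590 -> 0x4049 -> 3.140625
        printf("%f -> 0x%04x -> %f\n", x, h.bits, bf16_to_fp32(h));
        return 0;
    }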