ggerganov commited on
Commit
db4407f
·
1 Parent(s): c768824

ggml : repack block_iq4_nlx8 (llama/14904)

Browse files
ggml/src/ggml-cpu/arch-fallback.h CHANGED
@@ -40,18 +40,22 @@
40
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
41
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
42
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 
43
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
44
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
45
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
46
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
47
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
48
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 
49
  #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
50
  // repack.cpp
51
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
52
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
 
53
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
54
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
 
55
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
56
  #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
57
  // repack.cpp
@@ -80,12 +84,14 @@
80
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
81
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
82
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 
83
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
84
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
85
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
86
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
87
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
88
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 
89
  #elif defined(__loongarch64)
90
  // quants.c
91
  #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -103,12 +109,14 @@
103
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
104
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
105
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 
106
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
107
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
108
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
109
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
110
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
111
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 
112
  #elif defined(__riscv)
113
  // quants.c
114
  #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -133,11 +141,13 @@
133
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
134
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
135
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 
136
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
137
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
138
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
139
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
140
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 
141
  #elif defined(__s390x__)
142
  // quants.c
143
  #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -164,12 +174,14 @@
164
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
165
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
166
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 
167
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
168
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
169
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
170
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
171
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
172
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 
173
  #elif defined(__wasm__)
174
  // quants.c
175
  #define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -195,10 +207,12 @@
195
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
196
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
197
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 
198
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
199
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
200
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
201
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
202
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
203
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 
204
  #endif
 
40
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
41
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
42
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
43
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
44
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
45
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
46
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
47
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
48
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
49
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
50
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
51
  #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
52
  // repack.cpp
53
  #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
54
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
55
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
56
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
57
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
58
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
59
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
60
  #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
61
  // repack.cpp
 
84
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
85
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
86
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
87
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
88
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
89
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
90
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
91
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
92
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
93
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
94
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
95
  #elif defined(__loongarch64)
96
  // quants.c
97
  #define quantize_row_q8_K_generic quantize_row_q8_K
 
109
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
110
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
111
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
112
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
113
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
114
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
115
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
116
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
117
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
118
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
119
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
120
  #elif defined(__riscv)
121
  // quants.c
122
  #define quantize_row_q8_K_generic quantize_row_q8_K
 
141
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
142
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
143
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
144
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
145
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
146
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
147
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
148
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
149
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
150
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
151
  #elif defined(__s390x__)
152
  // quants.c
153
  #define quantize_row_q8_K_generic quantize_row_q8_K
 
174
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
175
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
176
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
177
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
178
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
179
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
180
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
181
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
182
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
183
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
184
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
185
  #elif defined(__wasm__)
186
  // quants.c
187
  #define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
 
207
  #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
208
  #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
209
  #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
210
+ #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
211
  #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
212
  #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
213
  #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
214
  #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
215
  #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
216
  #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
217
+ #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
218
  #endif
ggml/src/ggml-cpu/arch/x86/repack.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
ggml/src/ggml-cpu/repack.cpp CHANGED
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
206
  const int ncols_interleaved = 4;
207
  const int blocklen = 4;
208
 
209
- assert (n % qk == 0);
210
- assert (nc % ncols_interleaved == 0);
 
211
 
212
  UNUSED(s);
213
  UNUSED(bs);
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
307
  UNUSED(ncols_interleaved);
308
  UNUSED(blocklen);
309
 
310
- {
311
- float sumf[8];
312
- int sumi;
313
 
314
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
315
- for (int x = 0; x < nc / ncols_interleaved; x++) {
316
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
317
 
318
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
319
- for (int l = 0; l < nb; l++) {
320
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
321
- for (int j = 0; j < ncols_interleaved; j++) {
322
- sumi = 0;
323
- for (int i = 0; i < blocklen; ++i) {
324
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
325
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
326
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
327
- }
328
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
329
  }
 
330
  }
331
  }
332
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
333
  }
 
334
  }
335
  }
336
 
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
494
  const int ncols_interleaved = 4;
495
  const int blocklen = 4;
496
 
497
- assert (n % qk == 0);
498
- assert (nc % ncols_interleaved == 0);
 
499
 
500
- UNUSED(s);
501
  UNUSED(bs);
502
- UNUSED(vx);
503
- UNUSED(vy);
504
  UNUSED(nr);
505
- UNUSED(nc);
506
- UNUSED(nb);
507
- UNUSED(ncols_interleaved);
508
- UNUSED(blocklen);
509
 
510
- {
511
- float sumf[4];
512
- int sumi;
513
 
514
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
515
- for (int x = 0; x < nc / ncols_interleaved; x++) {
516
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
517
 
518
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
519
- for (int l = 0; l < nb; l++) {
520
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
521
- for (int j = 0; j < ncols_interleaved; j++) {
522
- sumi = 0;
523
- for (int i = 0; i < blocklen; ++i) {
524
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
525
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
526
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
527
- }
528
- sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
529
  }
 
530
  }
531
  }
532
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
533
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
  }
535
  }
536
 
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
934
  }
935
  }
936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
937
  } // extern "C"
938
 
939
  static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
1285
 
1286
  static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1287
  GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
1288
- //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
1289
  GGML_ASSERT(interleave_block == 4);
1290
 
1291
- block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
1292
- const block_iq4_nl * src = (const block_iq4_nl *)data;
 
1293
  block_iq4_nl dst_tmp[4];
 
1294
  int nrow = ggml_nrows(t);
1295
  int nrows_interleaved = 4;
1296
- int nblocks = t->ne[0] / QK4_0;
1297
 
1298
  GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1299
 
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
1315
  GGML_UNUSED(data_size);
1316
  }
1317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1318
  namespace ggml::cpu::repack {
1319
  // repack
1320
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
1350
  // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
1351
  //}
1352
 
 
 
 
 
1353
  // gemv
1354
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1355
  void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
1378
  ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1379
  }
1380
 
 
 
 
 
1381
  // gemm
1382
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1383
  void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
1406
  ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1407
  }
1408
 
 
 
 
 
1409
  class tensor_traits_base : public ggml::cpu::tensor_traits {
1410
  public:
1411
  virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
1680
 
1681
  // instance for IQ4
1682
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
 
1683
 
1684
  if (cur->type == GGML_TYPE_Q4_0) {
1685
  if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
1710
  }
1711
  }
1712
  } else if (cur->type == GGML_TYPE_IQ4_NL) {
 
 
 
 
 
1713
  if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
1714
  if (cur->ne[1] % 4 == 0) {
1715
  return &iq4_nl_4x4_q8_0;
 
206
  const int ncols_interleaved = 4;
207
  const int blocklen = 4;
208
 
209
+ assert(nr == 1);
210
+ assert(n % qk == 0);
211
+ assert(nc % ncols_interleaved == 0);
212
 
213
  UNUSED(s);
214
  UNUSED(bs);
 
308
  UNUSED(ncols_interleaved);
309
  UNUSED(blocklen);
310
 
311
+ float sumf[8];
312
+ int sumi;
 
313
 
314
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
315
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
316
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
317
 
318
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
319
+ for (int l = 0; l < nb; l++) {
320
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
321
+ for (int j = 0; j < ncols_interleaved; j++) {
322
+ sumi = 0;
323
+ for (int i = 0; i < blocklen; ++i) {
324
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
325
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
326
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 
 
327
  }
328
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
329
  }
330
  }
 
331
  }
332
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
333
  }
334
  }
335
 
 
493
  const int ncols_interleaved = 4;
494
  const int blocklen = 4;
495
 
496
+ assert(nr == 1);
497
+ assert(n % qk == 0);
498
+ assert(nc % ncols_interleaved == 0);
499
 
 
500
  UNUSED(bs);
 
 
501
  UNUSED(nr);
 
 
 
 
502
 
503
+ float sumf[4];
504
+ int sumi;
 
505
 
506
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
507
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
508
+ const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
509
 
510
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
511
+ for (int l = 0; l < nb; l++) {
512
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
513
+ for (int j = 0; j < ncols_interleaved; j++) {
514
+ sumi = 0;
515
+ for (int i = 0; i < blocklen; ++i) {
516
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
517
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
518
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
 
 
519
  }
520
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
521
  }
522
  }
 
523
  }
524
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
525
+ }
526
+ }
527
+
528
+ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
529
+ const int qk = QK8_0;
530
+ const int nb = n / qk;
531
+ const int ncols_interleaved = 8;
532
+ const int blocklen = 8;
533
+
534
+ assert(nr == 1);
535
+ assert(n % qk == 0);
536
+ assert(nc % ncols_interleaved == 0);
537
+
538
+ UNUSED(bs);
539
+ UNUSED(nr);
540
+
541
+ float sumf[8];
542
+ int sumi;
543
+
544
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
545
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
546
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
547
+
548
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
549
+ for (int l = 0; l < nb; l++) {
550
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
551
+ for (int j = 0; j < ncols_interleaved; j++) {
552
+ sumi = 0;
553
+ for (int i = 0; i < blocklen; ++i) {
554
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
555
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
556
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
557
+ }
558
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
559
+ }
560
+ }
561
+ }
562
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
563
  }
564
  }
565
 
 
963
  }
964
  }
965
 
966
+ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
967
+ const int qk = QK8_0;
968
+ const int nb = n / qk;
969
+ const int ncols_interleaved = 8;
970
+ const int blocklen = 8;
971
+
972
+ assert(n % qk == 0);
973
+ assert(nr % 4 == 0);
974
+ assert(nc % ncols_interleaved == 0);
975
+
976
+ float sumf[4][8];
977
+ int sumi;
978
+
979
+ for (int y = 0; y < nr / 4; y++) {
980
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
981
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
982
+ const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
983
+ for (int m = 0; m < 4; m++) {
984
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
985
+ }
986
+ for (int l = 0; l < nb; l++) {
987
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
988
+ for (int m = 0; m < 4; m++) {
989
+ for (int j = 0; j < ncols_interleaved; j++) {
990
+ sumi = 0;
991
+ for (int i = 0; i < blocklen; ++i) {
992
+ const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
993
+ const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
994
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
995
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
996
+ }
997
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
998
+ }
999
+ }
1000
+ }
1001
+ }
1002
+ for (int m = 0; m < 4; m++) {
1003
+ for (int j = 0; j < ncols_interleaved; j++)
1004
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1005
+ }
1006
+ }
1007
+ }
1008
+ }
1009
+
1010
  } // extern "C"
1011
 
1012
  static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
 
1358
 
1359
  static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1360
  GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
 
1361
  GGML_ASSERT(interleave_block == 4);
1362
 
1363
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
1364
+ block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
1365
+
1366
  block_iq4_nl dst_tmp[4];
1367
+
1368
  int nrow = ggml_nrows(t);
1369
  int nrows_interleaved = 4;
1370
+ int nblocks = t->ne[0] / QK4_NL;
1371
 
1372
  GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1373
 
 
1389
  GGML_UNUSED(data_size);
1390
  }
1391
 
1392
+ static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
1393
+ block_iq4_nlx8 out;
1394
+
1395
+ for (int i = 0; i < 8; i++) {
1396
+ out.d[i] = in[i].d;
1397
+ }
1398
+
1399
+ const int end = QK4_NL * 4 / blck_size_interleave;
1400
+
1401
+ if (blck_size_interleave == 8) {
1402
+ for (int i = 0; i < end; ++i) {
1403
+ int src_id = i % 8;
1404
+ int src_offset = (i / 8) * blck_size_interleave;
1405
+ int dst_offset = i * blck_size_interleave;
1406
+
1407
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
1408
+ }
1409
+ } else {
1410
+ GGML_ASSERT(false);
1411
+ }
1412
+
1413
+ return out;
1414
+ }
1415
+
1416
+ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
1417
+ GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
1418
+ GGML_ASSERT(interleave_block == 8);
1419
+
1420
+ const block_iq4_nl * src = (const block_iq4_nl *)data;
1421
+ block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
1422
+
1423
+ block_iq4_nl dst_tmp[8];
1424
+
1425
+ int nrow = ggml_nrows(t);
1426
+ int nrows_interleaved = 8;
1427
+ int nblocks = t->ne[0] / QK4_NL;
1428
+
1429
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
1430
+
1431
+ if (t->ne[1] % nrows_interleaved != 0) {
1432
+ return -1;
1433
+ }
1434
+
1435
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
1436
+ for (int64_t x = 0; x < nblocks; x++) {
1437
+ for (int i = 0; i < nrows_interleaved; i++) {
1438
+ dst_tmp[i] = src[x + i * nblocks];
1439
+ }
1440
+ *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
1441
+ }
1442
+ src += nrows_interleaved * nblocks;
1443
+ }
1444
+ return 0;
1445
+
1446
+ GGML_UNUSED(data_size);
1447
+ }
1448
+
1449
  namespace ggml::cpu::repack {
1450
  // repack
1451
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
 
1481
  // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
1482
  //}
1483
 
1484
+ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
1485
+ return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
1486
+ }
1487
+
1488
  // gemv
1489
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1490
  void gemv(int, float *, size_t, const void *, const void *, int, int);
 
1513
  ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1514
  }
1515
 
1516
+ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1517
+ ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1518
+ }
1519
+
1520
  // gemm
1521
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
1522
  void gemm(int, float *, size_t, const void *, const void *, int, int);
 
1545
  ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
1546
  }
1547
 
1548
+ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
1549
+ ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
1550
+ }
1551
+
1552
  class tensor_traits_base : public ggml::cpu::tensor_traits {
1553
  public:
1554
  virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
 
1823
 
1824
  // instance for IQ4
1825
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
1826
+ static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
1827
 
1828
  if (cur->type == GGML_TYPE_Q4_0) {
1829
  if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
 
1854
  }
1855
  }
1856
  } else if (cur->type == GGML_TYPE_IQ4_NL) {
1857
+ if (ggml_cpu_has_avx2()) {
1858
+ if (cur->ne[1] % 8 == 0) {
1859
+ return &iq4_nl_8x8_q8_0;
1860
+ }
1861
+ }
1862
  if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
1863
  if (cur->ne[1] % 4 == 0) {
1864
  return &iq4_nl_4x4_q8_0;
ggml/src/ggml-cpu/repack.h CHANGED
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {
67
 
68
  static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
69
 
 
 
 
 
 
 
 
70
  #if defined(__cplusplus)
71
  extern "C" {
72
  #endif
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
80
  void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
81
  void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
82
  void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
83
  void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
84
  void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
85
  void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
86
  void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
87
  void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
88
  void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
89
 
90
  // Native implementations
91
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
97
  void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
98
  void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
99
  void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
100
  void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
101
  void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
102
  void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
103
  void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
104
  void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
105
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
106
 
107
  #if defined(__cplusplus)
108
  } // extern "C"
 
67
 
68
  static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
69
 
70
+ struct block_iq4_nlx8 {
71
+ ggml_half d[8]; // deltas for 8 iq4_nl blocks
72
+ uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
73
+ };
74
+
75
+ static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
76
+
77
  #if defined(__cplusplus)
78
  extern "C" {
79
  #endif
 
87
  void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
88
  void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
89
  void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
90
+ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
91
  void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
92
  void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
93
  void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
94
  void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
95
  void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
96
  void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
97
+ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
98
 
99
  // Native implementations
100
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
 
106
  void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
107
  void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
108
  void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
109
+ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
110
  void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
111
  void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
112
  void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
113
  void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
114
  void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
115
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
116
+ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
117
 
118
  #if defined(__cplusplus)
119
  } // extern "C"