Akarshan Biswas committed
Commit 1377b05 · 1 parent: ff22836

SYCL: Refactor and enable FP16 in binary broadcast OPs (llama/12975)


* SYCL: refactor move to a separate file

* Fix binbcast

* Remove duplicates

* fix include formatting

* fix typo

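The user-visible change is that the SYCL backend now accepts FP16 tensors in the binary broadcast ops (ADD/SUB/MUL/DIV, plus REPEAT). Below is a minimal sketch, not part of this commit, of how that path can be exercised through the public ggml backend API; it assumes a ggml build with the SYCL backend enabled and a SYCL device at index 0.

    // Sketch only: drives the F16 + F16 -> F16 branch that
    // ggml_sycl_op_bin_bcast() in binbcast.cpp now dispatches.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-sycl.h"

    int main() {
        ggml_backend_t backend = ggml_backend_sycl_init(0);   // SYCL device 0 (assumed present)

        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,   // tensor data lives in backend memory
        };
        struct ggml_context * ctx = ggml_init(params);

        // src1 has ne1 == 1, so it is broadcast across the rows of src0
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 64, 4);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 64, 1);
        struct ggml_tensor * c = ggml_add(ctx, a, b);          // F16 result -> ggml_sycl_add

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
        // (real code would fill a and b here, e.g. with ggml_backend_tensor_set)

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);
        ggml_backend_graph_compute(backend, gf);               // runs the FP16 binbcast kernel

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        ggml_backend_free(backend);
        return 0;
    }

Before this change, supports_op reported ADD/SUB/MUL/DIV only for F32 src0 (see the last hunk of ggml-sycl.cpp below), so a graph like the one above would not have been offloaded to the SYCL backend.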
ggml/src/ggml-sycl/backend.hpp CHANGED
@@ -13,6 +13,7 @@
  #ifndef GGML_SYCL_BACKEND_HPP
  #define GGML_SYCL_BACKEND_HPP
 
+ #include "binbcast.hpp"
  #include "concat.hpp"
  #include "common.hpp"
  #include "conv.hpp"
ggml/src/ggml-sycl/binbcast.cpp ADDED
@@ -0,0 +1,350 @@
+ #include "binbcast.hpp"
+
+ #include <cstddef>
+ #include <cstdint>
+ #include <sycl/sycl.hpp>
+
+ #include "ggml.h"
+
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s00,*/ int s01, int s02, int s03,
+ /*int s10,*/ int s11, int s12, int s13,
+ const sycl::nd_item<3> &item_ct1) {
+ const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+ item_ct1.get_local_id(2);
+ const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+ item_ct1.get_local_id(1));
+ const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+ item_ct1.get_local_id(0)) /
+ ne3;
+ const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+ item_ct1.get_local_id(0)) %
+ ne3;
+
+ if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+ return;
+ }
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ for (int i0 = i0s; i0 < ne0;
+ i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+ }
+ }
+
+ template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+ int ne0, int ne1, int ne2, int ne3,
+ int ne10, int ne11, int ne12, int ne13,
+ /*int s0, */ int s1, int s2, int s3,
+ /*int s00,*/ int s01, int s02, int s03,
+ /*int s10,*/ int s11, int s12, int s13,
+ const sycl::nd_item<3> &item_ct1) {
+
+ const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+ item_ct1.get_local_id(2);
+
+ const int i3 = i/(ne2*ne1*ne0);
+ const int i2 = (i/(ne1*ne0)) % ne2;
+ const int i1 = (i/ne0) % ne1;
+ const int i0 = i % ne0;
+
+ if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+ return;
+ }
+
+ const int i11 = i1 % ne11;
+ const int i12 = i2 % ne12;
+ const int i13 = i3 % ne13;
+
+ const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
+ const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+ const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
+
+ const src0_t * src0_row = src0 + i_src0;
+ const src1_t * src1_row = src1 + i_src1;
+ dst_t * dst_row = dst + i_dst;
+
+ const int i10 = i0 % ne10;
+ dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+ }
+
+
+ template<float (*bin_op)(const float, const float)>
+ struct bin_bcast_sycl {
+ template <typename src0_t, typename src1_t, typename dst_t>
+ void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
+ const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
+ const int64_t ne12, const int64_t ne13, const int64_t ne0, const int64_t ne1, const int64_t ne2,
+ const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
+ const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
+ const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
+ const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
+ int nr0 = ne10 / ne0;
+ int nr1 = ne11/ne1;
+ int nr2 = ne12/ne2;
+ int nr3 = ne13/ne3;
+
+ int nr[4] = { nr0, nr1, nr2, nr3 };
+
+ // collapse dimensions until first broadcast dimension
+ int64_t cne[] = {ne0, ne1, ne2, ne3};
+ int64_t cne0[] = {ne00, ne01, ne02, ne03};
+ int64_t cne1[] = {ne10, ne11, ne12, ne13};
+ size_t cnb[] = {nb0, nb1, nb2, nb3};
+ size_t cnb0[] = {nb00, nb01, nb02, nb03};
+ size_t cnb1[] = {nb10, nb11, nb12, nb13};
+ auto collapse = [](int64_t cne[]) {
+ cne[0] *= cne[1];
+ cne[1] = cne[2];
+ cne[2] = cne[3];
+ cne[3] = 1;
+ };
+
+ auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+ cnb[1] *= cne[1];
+ cnb[2] *= cne[2];
+ cnb[3] *= cne[3];
+ };
+
+ if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
+ for (int i = 0; i < 4; i++) {
+ if (nr[i] != 1) {
+ break;
+ }
+ if (i > 0) {
+ collapse_nb(cnb, cne);
+ collapse_nb(cnb0, cne0);
+ collapse_nb(cnb1, cne1);
+ collapse(cne);
+ collapse(cne0);
+ collapse(cne1);
+ }
+ }
+ }
+ {
+ int64_t ne0 = cne[0];
+ int64_t ne1 = cne[1];
+ int64_t ne2 = cne[2];
+ int64_t ne3 = cne[3];
+
+ int64_t ne10 = cne1[0];
+ int64_t ne11 = cne1[1];
+ int64_t ne12 = cne1[2];
+ int64_t ne13 = cne1[3];
+
+ size_t nb0 = cnb[0];
+ size_t nb1 = cnb[1];
+ size_t nb2 = cnb[2];
+ size_t nb3 = cnb[3];
+
+ size_t nb00 = cnb0[0];
+ size_t nb01 = cnb0[1];
+ size_t nb02 = cnb0[2];
+ size_t nb03 = cnb0[3];
+
+ size_t nb10 = cnb1[0];
+ size_t nb11 = cnb1[1];
+ size_t nb12 = cnb1[2];
+ size_t nb13 = cnb1[3];
+
+ size_t s0 = nb0 / sizeof(dst_t);
+ size_t s1 = nb1 / sizeof(dst_t);
+ size_t s2 = nb2 / sizeof(dst_t);
+ size_t s3 = nb3 / sizeof(dst_t);
+
+ size_t s10 = nb10 / sizeof(src1_t);
+ size_t s11 = nb11 / sizeof(src1_t);
+ size_t s12 = nb12 / sizeof(src1_t);
+ size_t s13 = nb13 / sizeof(src1_t);
+
+ size_t s00 = nb00 / sizeof(src0_t);
+ size_t s01 = nb01 / sizeof(src0_t);
+ size_t s02 = nb02 / sizeof(src0_t);
+ size_t s03 = nb03 / sizeof(src0_t);
+
+ GGML_UNUSED(s00);
+
+ GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+ GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+ GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+ GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+ GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+ GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+ GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+ GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+ GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+ GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+ GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+ GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+ GGML_ASSERT(s0 == 1);
+ GGML_ASSERT(s10 == 1);
+
+ const int block_size = 128;
+
+ int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+ sycl::range<3> block_dims(1, 1, 1);
+ block_dims[2] = std::min<unsigned int>(hne0, block_size);
+ block_dims[1] = std::min<unsigned int>(
+ ne1, block_size / (unsigned int)block_dims[2]);
+ block_dims[0] = std::min(
+ std::min<unsigned int>(
+ ne2 * ne3, block_size / (unsigned int)block_dims[2] /
+ (unsigned int)block_dims[1]),
+ 64U);
+
+ sycl::range<3> block_nums(
+ (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+ (ne1 + block_dims[1] - 1) / block_dims[1],
+ (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+ if (block_nums[0] > 65535) {
+ // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+ int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+ {
+ dpct::has_capability_or_fail(stream->get_device(),
+ {sycl::aspect::fp16});
+
+ stream->parallel_for(
+ sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+ sycl::range<3>(1, 1, block_size),
+ sycl::range<3>(1, 1, block_size)),
+ [=](sycl::nd_item<3> item_ct1) {
+ k_bin_bcast_unravel<bin_op>(
+ src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+ ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
+ s03, s11, s12, s13, item_ct1);
+ });
+ }
+ } else {
+ /*
+ DPCT1049:16: The work-group size passed to the SYCL kernel may
+ exceed the limit. To get the device limit, query
+ info::device::max_work_group_size. Adjust the work-group size if
+ needed.
+ */
+ dpct::has_capability_or_fail(stream->get_device(),
+ {sycl::aspect::fp16});
+
+ stream->parallel_for(
+ sycl::nd_range<3>(block_nums * block_dims, block_dims),
+ [=](sycl::nd_item<3> item_ct1) {
+ k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+ ne2, ne3, ne10, ne11, ne12, ne13,
+ s1, s2, s3, s01, s02, s03, s11, s12, s13,
+ item_ct1);
+ });
+ }
+ }
+ }
+ };
+
+ template <class op>
+ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+ ggml_tensor * dst) {
+ dpct::queue_ptr main_stream = ctx.stream();
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+ op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
+ ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
+ ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+ op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
+ ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
+ nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
+ main_stream);
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+ op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
+ ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
+ nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+ } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+ op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
+ ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
+ nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+ } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+ op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
+ ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
+ nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+ } else {
+ fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
+ ggml_type_name(src0->type), ggml_type_name(src1->type));
+ GGML_ABORT("fatal error");
+ }
+ }
+
+ inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+ ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
+ }
+
+ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+ ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
+ }
+
+ inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+ ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
+ }
+
+ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+ ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
+ }
+
+ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+ ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
+ }
+
+
+ void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ GGML_SYCL_DEBUG("call %s\n", __func__);
+ ggml_sycl_op_add(ctx, dst);
+ GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ GGML_SYCL_DEBUG("call %s\n", __func__);
+ ggml_sycl_op_sub(ctx, dst);
+ GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ GGML_SYCL_DEBUG("call %s\n", __func__);
+ ggml_sycl_op_mul(ctx, dst);
+ GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ GGML_SYCL_DEBUG("call %s\n", __func__);
+ ggml_sycl_op_div(ctx, dst);
+ GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ GGML_SYCL_DEBUG("call %s\n", __func__);
+ ggml_sycl_op_repeat(ctx, dst);
+ GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
ggml/src/ggml-sycl/binbcast.hpp ADDED
@@ -0,0 +1,39 @@
+ #ifndef GGML_SYCL_BINBCAST_HPP
+ #define GGML_SYCL_BINBCAST_HPP
+ #include "common.hpp"
+
+
+ static __dpct_inline__ float op_repeat(const float a, const float b) {
+ return b;
+ GGML_UNUSED(a);
+ }
+
+ static __dpct_inline__ float op_add(const float a, const float b) {
+ return a + b;
+ }
+
+ static __dpct_inline__ float op_sub(const float a, const float b) {
+ return a - b;
+ }
+
+ static __dpct_inline__ float op_mul(const float a, const float b) {
+ return a * b;
+ }
+
+ static __dpct_inline__ float op_div(const float a, const float b) {
+ return a / b;
+ }
+
+ void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+ void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+ void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+ void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+ void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+
+ #endif //GGML_SYCL_BINBCAST_HPP
+
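The op functors stay in the header as small inline helpers. As a hypothetical illustration (not part of this commit), wiring up one more broadcast op would only need a functor here plus a thin wrapper next to the others in binbcast.cpp; the type dispatch in ggml_sycl_op_bin_bcast (F32, F16, F16 with F32 src1, I32, I16) would be reused unchanged:

    // Hypothetical op_max, following the same pattern as op_add/op_mul above.
    static __dpct_inline__ float op_max(const float a, const float b) {
        return a > b ? a : b;
    }

    // Would live in binbcast.cpp next to ggml_sycl_op_add() etc.
    inline void ggml_sycl_op_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_max>>(ctx, dst->src[0], dst->src[1], dst);
    }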
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -494,286 +494,5 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
 
  int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
 
- template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
- static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
- int ne0, int ne1, int ne2, int ne3,
- int ne10, int ne11, int ne12, int ne13,
- /*int s0, */ int s1, int s2, int s3,
- /*int s00,*/ int s01, int s02, int s03,
- /*int s10,*/ int s11, int s12, int s13,
- const sycl::nd_item<3> &item_ct1) {
- const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
- const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
- item_ct1.get_local_id(1));
- const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
- item_ct1.get_local_id(0)) /
- ne3;
- const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
- item_ct1.get_local_id(0)) %
- ne3;
-
- if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
- return;
- }
-
- const int i11 = i1 % ne11;
- const int i12 = i2 % ne12;
- const int i13 = i3 % ne13;
-
- const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
- const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
- const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
-
- const src0_t * src0_row = src0 + i_src0;
- const src1_t * src1_row = src1 + i_src1;
- dst_t * dst_row = dst + i_dst;
-
- for (int i0 = i0s; i0 < ne0;
- i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
- const int i10 = i0 % ne10;
- dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
- }
- }
-
- template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
- static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
- int ne0, int ne1, int ne2, int ne3,
- int ne10, int ne11, int ne12, int ne13,
- /*int s0, */ int s1, int s2, int s3,
- /*int s00,*/ int s01, int s02, int s03,
- /*int s10,*/ int s11, int s12, int s13,
- const sycl::nd_item<3> &item_ct1) {
-
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
- item_ct1.get_local_id(2);
-
- const int i3 = i/(ne2*ne1*ne0);
- const int i2 = (i/(ne1*ne0)) % ne2;
- const int i1 = (i/ne0) % ne1;
- const int i0 = i % ne0;
-
- if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
- return;
- }
-
- const int i11 = i1 % ne11;
- const int i12 = i2 % ne12;
- const int i13 = i3 % ne13;
-
- const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
- const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
- const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
-
- const src0_t * src0_row = src0 + i_src0;
- const src1_t * src1_row = src1 + i_src1;
- dst_t * dst_row = dst + i_dst;
-
- const int i10 = i0 % ne10;
- dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
- }
-
-
- template<float (*bin_op)(const float, const float)>
- struct bin_bcast_sycl {
- template <typename src0_t, typename src1_t, typename dst_t>
- void operator()(ggml_backend_sycl_context & ctx,
- const struct ggml_tensor *src0,
- const struct ggml_tensor *src1, struct ggml_tensor *dst,
- const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
- queue_ptr stream) {
-
- GGML_TENSOR_BINARY_OP_LOCALS
-
- int nr0 = ne10/ne0;
- int nr1 = ne11/ne1;
- int nr2 = ne12/ne2;
- int nr3 = ne13/ne3;
-
- int nr[4] = { nr0, nr1, nr2, nr3 };
-
- // collapse dimensions until first broadcast dimension
- int64_t cne[] = {ne0, ne1, ne2, ne3};
- int64_t cne0[] = {ne00, ne01, ne02, ne03};
- int64_t cne1[] = {ne10, ne11, ne12, ne13};
- size_t cnb[] = {nb0, nb1, nb2, nb3};
- size_t cnb0[] = {nb00, nb01, nb02, nb03};
- size_t cnb1[] = {nb10, nb11, nb12, nb13};
- auto collapse = [](int64_t cne[]) {
- cne[0] *= cne[1];
- cne[1] = cne[2];
- cne[2] = cne[3];
- cne[3] = 1;
- };
-
- auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
- cnb[1] *= cne[1];
- cnb[2] *= cne[2];
- cnb[3] *= cne[3];
- };
-
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
- for (int i = 0; i < 4; i++) {
- if (nr[i] != 1) {
- break;
- }
- if (i > 0) {
- collapse_nb(cnb, cne);
- collapse_nb(cnb0, cne0);
- collapse_nb(cnb1, cne1);
- collapse(cne);
- collapse(cne0);
- collapse(cne1);
- }
- }
- }
- {
- int64_t ne0 = cne[0];
- int64_t ne1 = cne[1];
- int64_t ne2 = cne[2];
- int64_t ne3 = cne[3];
-
- int64_t ne10 = cne1[0];
- int64_t ne11 = cne1[1];
- int64_t ne12 = cne1[2];
- int64_t ne13 = cne1[3];
-
- size_t nb0 = cnb[0];
- size_t nb1 = cnb[1];
- size_t nb2 = cnb[2];
- size_t nb3 = cnb[3];
-
- size_t nb00 = cnb0[0];
- size_t nb01 = cnb0[1];
- size_t nb02 = cnb0[2];
- size_t nb03 = cnb0[3];
-
- size_t nb10 = cnb1[0];
- size_t nb11 = cnb1[1];
- size_t nb12 = cnb1[2];
- size_t nb13 = cnb1[3];
-
- size_t s0 = nb0 / sizeof(dst_t);
- size_t s1 = nb1 / sizeof(dst_t);
- size_t s2 = nb2 / sizeof(dst_t);
- size_t s3 = nb3 / sizeof(dst_t);
-
- size_t s10 = nb10 / sizeof(src1_t);
- size_t s11 = nb11 / sizeof(src1_t);
- size_t s12 = nb12 / sizeof(src1_t);
- size_t s13 = nb13 / sizeof(src1_t);
-
- size_t s00 = nb00 / sizeof(src0_t);
- size_t s01 = nb01 / sizeof(src0_t);
- size_t s02 = nb02 / sizeof(src0_t);
- size_t s03 = nb03 / sizeof(src0_t);
-
- GGML_UNUSED(s00);
-
- GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
- GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
- GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
- GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
- GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
- GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
- GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
- GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
- GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
- GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
- GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
- GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
- GGML_ASSERT(s0 == 1);
- GGML_ASSERT(s10 == 1);
-
- const int block_size = 128;
-
- int64_t hne0 = std::max(ne0/2LL, 1LL);
-
- sycl::range<3> block_dims(1, 1, 1);
- block_dims[2] = std::min<unsigned int>(hne0, block_size);
- block_dims[1] = std::min<unsigned int>(
- ne1, block_size / (unsigned int)block_dims[2]);
- block_dims[0] = std::min(
- std::min<unsigned int>(
- ne2 * ne3, block_size / (unsigned int)block_dims[2] /
- (unsigned int)block_dims[1]),
- 64U);
-
- sycl::range<3> block_nums(
- (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
- (ne1 + block_dims[1] - 1) / block_dims[1],
- (hne0 + block_dims[2] - 1) / block_dims[2]);
-
- if (block_nums[0] > 65535) {
- // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
- int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
- {
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(
- sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
- sycl::range<3>(1, 1, block_size),
- sycl::range<3>(1, 1, block_size)),
- [=](sycl::nd_item<3> item_ct1) {
- k_bin_bcast_unravel<bin_op>(
- src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
- ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
- s03, s11, s12, s13, item_ct1);
- });
- }
- } else {
- /*
- DPCT1049:16: The work-group size passed to the SYCL kernel may
- exceed the limit. To get the device limit, query
- info::device::max_work_group_size. Adjust the work-group size if
- needed.
- */
- dpct::has_capability_or_fail(stream->get_device(),
- {sycl::aspect::fp16});
-
- stream->parallel_for(
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) {
- k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
- ne2, ne3, ne10, ne11, ne12, ne13,
- s1, s2, s3, s01, s02, s03, s11, s12, s13,
- item_ct1);
- });
- }
- }
- GGML_UNUSED(ctx);
- }
- };
-
- template <class op>
- inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
- const ggml_tensor *src1, ggml_tensor *dst) {
- dpct::queue_ptr main_stream = ctx.stream();
-
- if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- op()(ctx, src0, src1, dst, (const float *)src0->data, (const float *)src1->data, (float *)dst->data, main_stream);
- } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data,
- (sycl::half *)dst->data, main_stream);
- } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
- op()(ctx, src0, src1, dst, (const sycl::half *)src0->data, (const float *)src1->data, (float *)dst->data,
- main_stream);
- } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
- op()(ctx, src0, src1, dst, (const int32_t *)src0->data, (const int32_t *)src1->data, (int32_t *)dst->data,
- main_stream);
- } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
- op()(ctx, src0, src1, dst, (const int16_t *)src0->data, (const int16_t *)src1->data, (int16_t *)dst->data,
- main_stream);
- } else {
- fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
- ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
- GGML_ABORT("fatal error");
- }
- }
-
  bool gpu_has_xmx(sycl::device &dev);
  #endif // GGML_SYCL_COMMON_HPP
ggml/src/ggml-sycl/element_wise.cpp CHANGED
@@ -1261,27 +1261,6 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
  }
 
 
- inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
- }
-
- inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
- }
-
- inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
- }
-
- inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
- }
-
-
  void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
  ggml_sycl_op_sqrt(ctx, dst);
@@ -1409,29 +1388,3 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_SYCL_DEBUG("call %s done\n", __func__);
  }
 
-
-
-
- void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
- ggml_sycl_op_add(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
- }
-
- void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
- ggml_sycl_op_sub(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
- }
-
- void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
- ggml_sycl_op_mul(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
- }
-
- void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
- ggml_sycl_op_div(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
- }
ggml/src/ggml-sycl/element_wise.hpp CHANGED
@@ -10,27 +10,6 @@ T neg_infinity() {
  return -std::numeric_limits<T>::infinity();
  }
 
- static __dpct_inline__ float op_repeat(const float a, const float b) {
- return b;
- GGML_UNUSED(a);
- }
-
- static __dpct_inline__ float op_add(const float a, const float b) {
- return a + b;
- }
-
- static __dpct_inline__ float op_sub(const float a, const float b) {
- return a - b;
- }
-
- static __dpct_inline__ float op_mul(const float a, const float b) {
- return a * b;
- }
-
- static __dpct_inline__ float op_div(const float a, const float b) {
- return a / b;
- }
-
  template<typename T>
  struct typed_data {
  const T * src;
@@ -87,14 +66,5 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
  void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
- // ---------
-
- void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
- void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
- void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
- void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
  #endif // GGML_SYCL_ELEMENTWISE_HPP
+
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -1967,11 +1967,6 @@ catch (sycl::exception const &exc) {
  std::exit(1);
  }
 
- static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
- }
-
-
  inline void ggml_sycl_op_mul_mat_sycl(
  ggml_backend_sycl_context & ctx,
  const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
@@ -2600,12 +2595,6 @@ catch (sycl::exception const &exc) {
  }
 
 
- static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
- ggml_sycl_op_repeat(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
- }
-
  static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_SYCL_DEBUG("call %s\n", __func__);
  ggml_sycl_op_get_rows(ctx, dst);
@@ -3972,7 +3961,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
  case GGML_OP_ARGMAX:
  case GGML_OP_NONE:
  case GGML_OP_RESHAPE:
- case GGML_OP_REPEAT:
  case GGML_OP_VIEW:
  case GGML_OP_PERMUTE:
  case GGML_OP_TRANSPOSE:
@@ -3982,7 +3970,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
  case GGML_OP_SUB:
  case GGML_OP_MUL:
  case GGML_OP_DIV:
- return (op->src[0]->type == GGML_TYPE_F32);
+ case GGML_OP_REPEAT:
+ return true;
  case GGML_OP_SQR:
  case GGML_OP_SQRT:
  case GGML_OP_SIN: