Romain Biessy committed
Commit 4da3fb6 · Parent: 09c03ad

sycl: Add more debug prints (llama/13640)

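The pattern repeated throughout this commit replaces paired entry/exit prints with a single RAII guard, so the exit line is emitted on every path out of the function, including early returns and exceptions. Below is a minimal standalone sketch of that idea; it is illustrative only and is not the actual helper (the real one, `scope_op_debug_print`, is added to `ggml/src/ggml-sycl/common.hpp` in this commit and also prints tensor details):

```cpp
#include <cstdio>
#include <string_view>

static int g_debug = 1;  // stand-in for g_ggml_sycl_debug

// RAII guard: the constructor logs entry, the destructor logs exit. Since the
// destructor runs when the scope unwinds for any reason, the "done" line
// cannot be forgotten or skipped the way a second fprintf call can.
struct scope_logger {
    explicit scope_logger(std::string_view func) : func(func) {
        if (g_debug) {
            fprintf(stderr, "call %.*s\n", (int) func.size(), func.data());
        }
    }
    ~scope_logger() {
        if (g_debug) {
            fprintf(stderr, "call %.*s done\n", (int) func.size(), func.data());
        }
    }
  private:
    std::string_view func;  // points at __func__, a string literal, so it outlives the guard
};

static void some_op() {
    scope_logger dbg(__func__);  // prints "call some_op"
    // ... do the work; every return path still prints "call some_op done"
}

int main() {
    some_op();
    return 0;
}
```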
ggml/src/ggml-sycl/binbcast.cpp CHANGED
@@ -319,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst
 
 
 void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     ggml_sycl_op_add(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     ggml_sycl_op_sub(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     ggml_sycl_op_mul(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     ggml_sycl_op_div(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_repeat(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
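Given the helper's format strings (defined in `common.hpp` below), each wrapper above now emits one entry line describing `dst` and its `num_src` sources, and one exit line from the destructor. A sketch of the expected trace for one of these wrappers; the tensor name, type, and shape values are invented for illustration:

```cpp
// Hypothetical trace for one ADD node (sources are appended, tab-separated,
// on the same entry line; all values below are made up):
//
//   [SYCL][OP] call ggml_sycl_add: dst='node_12':type=f32;ne=[4096, 8, 1, 1];nb=[4, 16384, 131072, 131072]
//       src0='attn_out':type=f32;ne=[4096, 8, 1, 1];nb=[4, 16384, 131072, 131072]
//       src1='inp_res':type=f32;ne=[4096, 8, 1, 1];nb=[4, 16384, 131072, 131072]
//   [SYCL][OP] call ggml_sycl_add done

void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    // One guard per wrapper: entry line now, exit line when the scope unwinds.
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_add(ctx, dst);
}
```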
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -15,6 +15,7 @@
 
 #include <fstream>
 #include <iostream>
+#include <string>
 
 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
@@ -44,11 +45,20 @@ extern int g_ggml_sycl_debug;
 extern int g_ggml_sycl_disable_optimize;
 extern int g_ggml_sycl_prioritize_dmmv;
 
-#define GGML_SYCL_DEBUG(...) \
-    do { \
-        if (g_ggml_sycl_debug) \
-            fprintf(stderr, __VA_ARGS__); \
-    } while (0)
+#if defined(__clang__) && __has_builtin(__builtin_expect)
+// Hint the optimizer to pipeline the more likely following instruction in branches
+#    define LIKELY(expr) __builtin_expect(expr, true)
+#    define UNLIKELY(expr) __builtin_expect(expr, false)
+#else
+#    define LIKELY(expr) (expr)
+#    define UNLIKELY(expr) (expr)
+#endif
+
+#define GGML_SYCL_DEBUG(...) \
+    do { \
+        if (UNLIKELY(g_ggml_sycl_debug)) \
+            fprintf(stderr, __VA_ARGS__); \
+    } while (0)
 
 #define CHECK_TRY_ERROR(expr) \
     [&]() { \
@@ -490,4 +500,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
 }
 
 bool gpu_has_xmx(sycl::device &dev);
+
+template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
+    if (LIKELY(!g_ggml_sycl_debug)) {
+        return;
+    }
+    std::stringstream ss;
+    ss << prefix << "=[";
+    for (std::size_t i = 0; i < N - 1; ++i) {
+        ss << array[i] << ", ";
+    }
+    if constexpr (N > 0) {
+        ss << array[N - 1];
+    }
+    ss << "]";
+    GGML_SYCL_DEBUG("%s", ss.str().c_str());
+}
+
+inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
+                               const std::string & suffix = "") {
+    if (LIKELY(!g_ggml_sycl_debug)) {
+        return;
+    }
+    GGML_SYCL_DEBUG("%s=", prefix.c_str());
+    if (tensor) {
+        GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
+        debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
+        debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
+        if (!ggml_is_contiguous(tensor)) {
+            GGML_SYCL_DEBUG(";strided");
+        }
+        if (ggml_is_permuted(tensor)) {
+            GGML_SYCL_DEBUG(";permuted");
+        }
+    } else {
+        GGML_SYCL_DEBUG("nullptr");
+    }
+    GGML_SYCL_DEBUG("%s", suffix.c_str());
+}
+
+// Use scope_op_debug_print to log operations coming from running a model
+struct scope_op_debug_print {
+    // Use string_views to avoid the cost of creating a string and concatenating them
+    // string_views must be alive for as long as the object is alive
+    // scope_op_debug_print is used with string literals in practice, which are stored in constant space and so always accessible
+    scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
+                         std::size_t num_src, const std::string_view & suffix = "") :
+        func(func),
+        func_suffix(func_suffix) {
+        if (LIKELY(!g_ggml_sycl_debug)) {
+            return;
+        }
+        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
+        debug_print_tensor(" dst", dst);
+        if (dst) {
+            for (std::size_t i = 0; i < num_src; ++i) {
+                debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
+            }
+        }
+        GGML_SYCL_DEBUG("%s\n", suffix.data());
+    }
+
+    scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
+                         const std::string_view & suffix = "") :
+        scope_op_debug_print(func, "", dst, num_src, suffix) {}
+
+    ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
+
+  private:
+    std::string_view func;
+    std::string_view func_suffix;
+};
+
 #endif // GGML_SYCL_COMMON_HPP
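The new `LIKELY`/`UNLIKELY` macros exist so the guard clauses above stay cheap when debugging is off: the flag check compiles to a branch the compiler lays out as not-taken. A standalone sketch of the same fallback-macro pattern, mirroring the diff (the function and variable names here are illustrative):

```cpp
#include <cstdio>

// Same shape as the diff: use the branch hint when the compiler offers it,
// otherwise compile the macro away to a plain expression.
#if defined(__clang__) && __has_builtin(__builtin_expect)
#    define UNLIKELY(expr) __builtin_expect(expr, false)
#else
#    define UNLIKELY(expr) (expr)
#endif

static int g_debug = 0;  // stand-in for g_ggml_sycl_debug; off by default

static void hot_path_step(int i) {
    // Predicted not-taken: with logging disabled, the cost per call is a
    // single well-predicted test, which is why the prints can stay compiled
    // in behind a runtime flag.
    if (UNLIKELY(g_debug)) {
        fprintf(stderr, "step %d\n", i);
    }
    // ... real work ...
}

int main() {
    for (int i = 0; i < 4; ++i) {
        hot_path_step(i);
    }
    return 0;
}
```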
ggml/src/ggml-sycl/concat.cpp CHANGED
@@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont(
 }
 
 void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-  const ggml_tensor *src0 = dst->src[0];
-  const ggml_tensor *src1 = dst->src[1];
-  queue_ptr stream = ctx.stream();
-
-  const int32_t dim = ((int32_t *)dst->op_params)[0];
-
-  if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-    const float *src0_d = (const float *)src0->data;
-    const float *src1_d = (const float *)src1->data;
-
-    float *dst_d = (float *)dst->data;
-
-    if (dim != 3) {
-      for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_sycl(
-            src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
-            dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
-            src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
-      }
-    } else {
-      const size_t size0 = ggml_nbytes(src0);
-      const size_t size1 = ggml_nbytes(src1);
-
-      SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
-      SYCL_CHECK(CHECK_TRY_ERROR(
-          stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
-    }
-  } else
-    concat_f32_sycl_non_cont(
-        stream, (const char *)src0->data, (const char *)src1->data,
-        (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
-        src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
-        src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
-        dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    queue_ptr stream = ctx.stream();
+
+    const int32_t dim = ((int32_t *) dst->op_params)[0];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        const float * src0_d = (const float *) src0->data;
+        const float * src1_d = (const float *) src1->data;
+
+        float * dst_d = (float *) dst->data;
+
+        if (dim != 3) {
+            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
+                                dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
+                                dst->ne[1], dst->ne[2], dim, stream);
+            }
+        } else {
+            const size_t size0 = ggml_nbytes(src0);
+            const size_t size1 = ggml_nbytes(src1);
+
+            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
+            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
+        }
+    } else {
+        concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
+                                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
+                                 src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                                 src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
+                                 dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+    }
 }
ggml/src/ggml-sycl/conv.cpp CHANGED
@@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl(
 }
 
 void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     const ggml_tensor *src0 = dst->src[0];
     const ggml_tensor *src1 = dst->src[1];
     const float * src0_d = (const float *)src0->data;
ggml/src/ggml-sycl/cpy.cpp CHANGED
@@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
 }
 
 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
+    // Unlike other operators, ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor relying on its src field
+    scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
+                                         std::string(" src0 type=") + ggml_type_name(src0->type));
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
 
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
-    GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
-                    ggml_type_name(src1->type));
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
 }
 
 void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_cpy(ctx, dst->src[0], dst);
-    GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
 }
ggml/src/ggml-sycl/dmmv.cpp CHANGED
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
+        scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
+                                             " : converting src1 to fp16");
         src1_dfloat = src1_dfloat_a.alloc(ne00);
         const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
         GGML_ASSERT(to_fp16_sycl != nullptr);
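This hunk uses the constructor overload that takes a `func_suffix` plus a trailing `suffix` message, so internal conversion steps show up as tagged sub-scopes nested inside the enclosing op's own entry/exit lines. Roughly, under the same illustrative assumptions as before (tensor fields elided with `...`):

```cpp
// Expected shape of the extra trace lines, nested inside the enclosing op:
//
//   [SYCL][OP] call ggml_sycl_op_dequantize_mul_mat_vec/to_fp16_sycl: dst=... src0=... src1=... : converting src1 to fp16
//   [SYCL][OP] call ggml_sycl_op_dequantize_mul_mat_vec/to_fp16_sycl done
//
// "/to_fp16_sycl" is the func_suffix argument and " : converting src1 to fp16"
// is the suffix argument from the hunk above.
```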
ggml/src/ggml-sycl/element_wise.cpp CHANGED
@@ -1391,146 +1391,121 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
 
 
 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_sqrt(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sin(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_cos(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     ggml_sycl_op_acc(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_gelu(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_silu(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_gelu_quick(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_tanh(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_relu(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_sigmoid(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_hardsigmoid(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_hardswish(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
-
 void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_exp(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_log(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_neg(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_step(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_leaky_relu(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_sqr(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_upscale(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_pad(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_clamp(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_sgn(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_abs(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
 void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_elu(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
ggml/src/ggml-sycl/getrows.cpp CHANGED
@@ -257,8 +257,7 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
     GGML_UNUSED(ctx);
 }
 
-void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
+void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
@@ -308,4 +307,3 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
             GGML_ABORT("fatal error");
     }
 }
-
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -346,6 +346,8 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
346
  static enum ggml_status
347
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
348
  ggml_tensor *tensor) try {
 
 
349
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
350
 
351
  if (tensor->view_src != NULL) {
@@ -381,7 +383,9 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
381
  ggml_tensor *tensor,
382
  const void *data, size_t offset,
383
  size_t size) try {
384
-
 
 
385
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
386
  ggml_sycl_set_device(ctx->device);
387
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
@@ -407,7 +411,9 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
407
  const ggml_tensor *tensor,
408
  void *data, size_t offset,
409
  size_t size) try {
410
-
 
 
411
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
412
 
413
  ggml_sycl_set_device(ctx->device);
@@ -435,7 +441,12 @@ static bool
435
  ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
436
  const ggml_tensor *src,
437
  ggml_tensor *dst) try {
438
- if (ggml_backend_buffer_is_sycl(src->buffer)) {
 
 
 
 
 
439
  ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
440
  ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
441
 
@@ -492,7 +503,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
492
 
493
  static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
494
  uint8_t value) try {
495
- ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
496
 
497
  ggml_sycl_set_device(ctx->device);
498
  queue_ptr stream = ctx->stream;
@@ -511,7 +523,9 @@ catch (sycl::exception const &exc) {
511
 
512
  static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
513
  size_t offset, size_t size) {
514
- GGML_SYCL_DEBUG(" [SYCL] call %s\n", __func__);
 
 
515
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
516
  SYCL_CHECK(ggml_sycl_set_device(ctx->device));
517
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
@@ -789,6 +803,8 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
789
  static enum ggml_status
790
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
791
  ggml_tensor *tensor) try {
 
 
792
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
793
 
794
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -873,6 +889,9 @@ static void
873
  ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
874
  ggml_tensor *tensor, const void *data,
875
  size_t offset, size_t size) try {
 
 
 
876
  // split tensors must always be set in their entirety at once
877
  GGML_ASSERT(offset == 0);
878
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -926,6 +945,9 @@ static void
926
  ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
927
  const ggml_tensor *tensor, void *data,
928
  size_t offset, size_t size) try {
 
 
 
929
  // split tensors must always be set in their entirety at once
930
  GGML_ASSERT(offset == 0);
931
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -2015,12 +2037,12 @@ inline void ggml_sycl_op_mul_mat_sycl(
2015
  #else
2016
  bool use_fp16 = false;
2017
  #endif
2018
- if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
2019
- use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
2020
- dst->op_params[0] == GGML_PREC_DEFAULT) {
2021
- // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
2022
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
2023
  if (src0->type != GGML_TYPE_F16) {
 
 
2024
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
2025
  GGML_ASSERT(to_fp16_sycl != nullptr);
2026
  size_t ne = row_diff*ne00;
@@ -2033,6 +2055,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
2033
 
2034
  ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
2035
  if (src1->type != GGML_TYPE_F16) {
 
 
2036
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
2037
  GGML_ASSERT(to_fp16_sycl != nullptr);
2038
  size_t ne = src1_ncols*ne10;
@@ -2049,6 +2073,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
2049
  DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
2050
  DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2051
  dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
 
 
2052
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2053
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
2054
  }
@@ -2064,21 +2090,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
2064
  src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
2065
  dst_f16.get(), dpct::library_data_t::real_half, ldc,
2066
  dpct::library_data_t::real_half)));
 
 
2067
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2068
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
2069
  }
2070
- }
2071
- else {
2072
- // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
2073
  ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
2074
  ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
2075
  if (src0->type != GGML_TYPE_F32) {
 
 
2076
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
2077
  GGML_ASSERT(to_fp32_sycl != nullptr);
2078
  src0_ddq_as_f32.alloc(row_diff*ne00);
2079
  to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
2080
  }
2081
  if (src1->type != GGML_TYPE_F32) {
 
 
2082
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
2083
  GGML_ASSERT(to_fp32_sycl != nullptr);
2084
  src1_ddq_as_f32.alloc(src1_ncols*ne10);
@@ -2114,8 +2144,7 @@ catch (sycl::exception const &exc) {
2114
  std::exit(1);
2115
  }
2116
 
2117
- static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
2118
-
2119
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2120
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2121
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2167,8 +2196,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
2167
  sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
2168
  }
2169
 
2170
- inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
2171
-
2172
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2173
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2174
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2199,8 +2227,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
2199
  argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
2200
  }
2201
 
2202
- inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
2203
-
2204
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2205
  GGML_ASSERT( dst->type == GGML_TYPE_I32);
2206
 
@@ -2215,8 +2242,7 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *ds
2215
  argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
2216
  }
2217
 
2218
- inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {
2219
-
2220
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2221
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2222
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2233,8 +2259,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tens
2233
  diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
2234
  }
2235
 
2236
- inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
2237
-
2238
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2239
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2240
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2421,6 +2446,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2421
  dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
2422
 
2423
  if (src1_on_device && src1_is_contiguous) {
 
 
2424
  quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
2425
  /*
2426
  DPCT1010:90: SYCL uses exceptions to report errors and does not
@@ -2525,6 +2552,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2525
  }
2526
 
2527
  if (convert_src1_to_q8_1 && !src1_is_contiguous) {
 
 
2528
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
2529
  /*
2530
  DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -2619,33 +2648,28 @@ catch (sycl::exception const &exc) {
2619
 
2620
 
2621
  static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2622
- GGML_SYCL_DEBUG("call %s\n", __func__);
2623
  ggml_sycl_op_get_rows(ctx, dst);
2624
- GGML_SYCL_DEBUG("call %s done\n", __func__);
2625
  }
2626
 
2627
  static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2628
- GGML_SYCL_DEBUG("call %s\n", __func__);
2629
  ggml_sycl_op_norm(ctx, dst);
2630
- GGML_SYCL_DEBUG("call %s done\n", __func__);
2631
  }
2632
 
2633
  static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2634
- GGML_SYCL_DEBUG("call %s\n", __func__);
2635
  ggml_sycl_op_rms_norm(ctx, dst);
2636
- GGML_SYCL_DEBUG("call %s done\n", __func__);
2637
  }
2638
 
2639
  static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2640
- GGML_SYCL_DEBUG("call %s\n", __func__);
2641
  ggml_sycl_op_l2_norm(ctx, dst);
2642
- GGML_SYCL_DEBUG("call %s done\n", __func__);
2643
  }
2644
 
2645
  static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2646
- GGML_SYCL_DEBUG("call %s\n", __func__);
2647
  ggml_sycl_op_group_norm(ctx, dst);
2648
- GGML_SYCL_DEBUG("call %s done\n", __func__);
2649
  }
2650
 
2651
  static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2773,6 +2797,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
2773
 
2774
  // convert src1 to fp16
2775
  if (src1->type != GGML_TYPE_F16) {
 
 
2776
  const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
2777
  GGML_ASSERT(to_fp16_nc_sycl != nullptr);
2778
  const int64_t ne_src1 = ggml_nelements(src1);
@@ -3076,6 +3102,7 @@ static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor *
3076
  }
3077
 
3078
  static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3079
  const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
3080
  int64_t min_compute_capability = INT_MAX;
3081
 
@@ -3153,7 +3180,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
3153
  constexpr bool convert_src1_to_q8_1 = false;
3154
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
3155
  }
3156
- GGML_SYCL_DEBUG("call %s done\n", __func__);
3157
  }
3158
 
3159
 
@@ -3224,6 +3250,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
3224
 
3225
  static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3226
  ggml_tensor *dst) try {
 
3227
  const ggml_tensor *src0 = dst->src[0];
3228
  const ggml_tensor *src1 = dst->src[1];
3229
  GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
@@ -3392,37 +3419,45 @@ catch (sycl::exception const &exc) {
3392
  }
3393
 
3394
  static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3395
  ggml_sycl_op_scale(ctx, dst);
3396
  }
3397
 
3398
  static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3399
  ggml_sycl_op_diag_mask_inf(ctx, dst);
3400
  }
3401
 
3402
  static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3403
  ggml_sycl_op_pool2d(ctx, dst);
3404
  }
3405
 
3406
  static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3407
  ggml_sycl_op_im2col(ctx, dst);
3408
  }
3409
 
3410
  static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3411
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
3412
  ggml_sycl_op_sum(ctx, dst);
3413
  }
3414
 
3415
  static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3416
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
3417
  ggml_sycl_op_sum_rows(ctx, dst);
3418
  }
3419
 
3420
  static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3421
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
3422
  ggml_sycl_op_argsort(ctx, dst);
3423
  }
3424
 
3425
  static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
3426
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
3427
  ggml_sycl_op_argmax(ctx, dst);
3428
  }
@@ -3716,6 +3751,9 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
3716
  ggml_tensor *tensor,
3717
  const void *data, size_t offset,
3718
  size_t size) try {
 
 
 
3719
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3720
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
3721
 
@@ -3734,6 +3772,9 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
3734
  const ggml_tensor *tensor,
3735
  void *data, size_t offset,
3736
  size_t size) try {
 
 
 
3737
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3738
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
3739
 
@@ -3752,7 +3793,13 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
3752
  const ggml_tensor *src,
3753
  ggml_tensor *dst) try {
3754
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3755
- if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
 
 
 
 
 
 
3756
  /*
3757
  DPCT1009:215: SYCL uses exceptions to report errors and does not use the
3758
  error codes. The original code was commented out and a warning string
@@ -3773,6 +3820,7 @@ catch (sycl::exception const &exc) {
3773
  }
3774
 
3775
  static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
 
3776
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3777
  const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
3778
  SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
@@ -3906,7 +3954,7 @@ catch (sycl::exception const &exc)
3906
  }
3907
 
3908
  static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
3909
-
3910
  sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
3911
 
3912
  if (ggml_backend_is_sycl(backend)) {
@@ -4301,6 +4349,7 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac
4301
 
4302
  static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
4303
  GGML_UNUSED(dev);
 
4304
 
4305
  sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
4306
  SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));
 
346
  static enum ggml_status
347
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
348
  ggml_tensor *tensor) try {
349
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
350
+ debug_print_tensor(": tensor=", tensor, "\n");
351
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
352
 
353
  if (tensor->view_src != NULL) {
 
383
  ggml_tensor *tensor,
384
  const void *data, size_t offset,
385
  size_t size) try {
386
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
387
+ debug_print_tensor(": tensor=", tensor);
388
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
389
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
390
  ggml_sycl_set_device(ctx->device);
391
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
 
411
  const ggml_tensor *tensor,
412
  void *data, size_t offset,
413
  size_t size) try {
414
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
415
+ debug_print_tensor(": tensor=", tensor);
416
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
417
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
418
 
419
  ggml_sycl_set_device(ctx->device);
 
441
  ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
442
  const ggml_tensor *src,
443
  ggml_tensor *dst) try {
444
+ bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
445
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
446
+ debug_print_tensor(": dst=", dst);
447
+ debug_print_tensor(" src=", src);
448
+ GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
449
+ if (is_cpy_supported) {
450
  ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
451
  ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
452
 
 
503
 
504
  static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
505
  uint8_t value) try {
506
+ GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size);
507
+ ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
508
 
509
  ggml_sycl_set_device(ctx->device);
510
  queue_ptr stream = ctx->stream;
 
523
 
524
  static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
525
  size_t offset, size_t size) {
526
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
527
+ debug_print_tensor(": tensor=", tensor);
528
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
529
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
530
  SYCL_CHECK(ggml_sycl_set_device(ctx->device));
531
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
 
803
  static enum ggml_status
804
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
805
  ggml_tensor *tensor) try {
806
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
807
+ debug_print_tensor(": tensor=", tensor, "\n");
808
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
809
 
810
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
 
889
  ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
890
  ggml_tensor *tensor, const void *data,
891
  size_t offset, size_t size) try {
892
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
893
+ debug_print_tensor(": tensor=", tensor);
894
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
895
  // split tensors must always be set in their entirety at once
896
  GGML_ASSERT(offset == 0);
897
  GGML_ASSERT(size == ggml_nbytes(tensor));
 
945
  ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
946
  const ggml_tensor *tensor, void *data,
947
  size_t offset, size_t size) try {
948
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
949
+ debug_print_tensor(": tensor=", tensor);
950
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
951
  // split tensors must always be set in their entirety at once
952
  GGML_ASSERT(offset == 0);
953
  GGML_ASSERT(size == ggml_nbytes(tensor));
 
2037
  #else
2038
  bool use_fp16 = false;
2039
  #endif
2040
+ if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
2041
+ row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
 
 
2042
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
2043
  if (src0->type != GGML_TYPE_F16) {
2044
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
2045
+ " : converting src0 to fp16");
2046
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
2047
  GGML_ASSERT(to_fp16_sycl != nullptr);
2048
  size_t ne = row_diff*ne00;
 
2055
 
2056
  ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
2057
  if (src1->type != GGML_TYPE_F16) {
2058
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
2059
+ " : converting src1 to fp16");
2060
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
2061
  GGML_ASSERT(to_fp16_sycl != nullptr);
2062
  size_t ne = src1_ncols*ne10;
 
2073
  DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
2074
  DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2075
  dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
2076
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
2077
+ " : converting dst to fp32");
2078
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2079
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
2080
  }
 
2090
  src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
2091
  dst_f16.get(), dpct::library_data_t::real_half, ldc,
2092
  dpct::library_data_t::real_half)));
2093
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
2094
+ " : converting dst to fp32");
2095
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2096
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
2097
  }
2098
+ } else {
 
 
2099
  ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
2100
  ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
2101
  if (src0->type != GGML_TYPE_F32) {
2102
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
2103
+ " : converting src0 to fp32");
2104
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
2105
  GGML_ASSERT(to_fp32_sycl != nullptr);
2106
  src0_ddq_as_f32.alloc(row_diff*ne00);
2107
  to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
2108
  }
2109
  if (src1->type != GGML_TYPE_F32) {
2110
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
2111
+ " : converting src1 to fp32");
2112
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
2113
  GGML_ASSERT(to_fp32_sycl != nullptr);
2114
  src1_ddq_as_f32.alloc(src1_ncols*ne10);
 
2144
  std::exit(1);
2145
  }
2146
 
2147
+ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
2148
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2149
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2150
  dpct::queue_ptr main_stream = ctx.stream();
 
2196
  sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
2197
  }
2198
 
2199
+ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
2200
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2201
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2202
  dpct::queue_ptr main_stream = ctx.stream();
 
2227
  argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
2228
  }
2229
 
2230
+ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
2231
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2232
  GGML_ASSERT( dst->type == GGML_TYPE_I32);
2233
 
 
2242
  argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
2243
  }
2244
 
2245
+ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
2246
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2247
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2248
  dpct::queue_ptr main_stream = ctx.stream();
 
2259
  diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
2260
  }
2261
 
2262
+ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 
2263
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
2264
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
2265
  dpct::queue_ptr main_stream = ctx.stream();
 
2446
  dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
2447
 
2448
  if (src1_on_device && src1_is_contiguous) {
2449
+ scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2450
+ /*num_src=*/2, " : converting src1 to Q8_1");
2451
  quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
2452
  /*
2453
  DPCT1010:90: SYCL uses exceptions to report errors and does not
 
2552
  }
2553
 
2554
  if (convert_src1_to_q8_1 && !src1_is_contiguous) {
2555
+ scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2556
+ /*num_src=*/2, " : converting src1 to Q8_1");
2557
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
2558
  /*
2559
  DPCT1010:92: SYCL uses exceptions to report errors and does
 
2648
 
2649
 
2650
  static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2651
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
2652
  ggml_sycl_op_get_rows(ctx, dst);
 
2653
  }
2654
 
2655
  static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2656
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
2657
  ggml_sycl_op_norm(ctx, dst);
 
2658
  }
2659
 
2660
  static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2661
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
2662
  ggml_sycl_op_rms_norm(ctx, dst);
 
2663
  }
2664
 
2665
  static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2666
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
2667
  ggml_sycl_op_l2_norm(ctx, dst);
 
2668
  }
2669
 
2670
  static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2671
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
2672
  ggml_sycl_op_group_norm(ctx, dst);
 
2673
  }
2674
 
2675
  static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
 
2797
 
2798
  // convert src1 to fp16
2799
  if (src1->type != GGML_TYPE_F16) {
2800
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
2801
+ " : converting src1 to fp16");
2802
  const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
2803
  GGML_ASSERT(to_fp16_nc_sycl != nullptr);
2804
  const int64_t ne_src1 = ggml_nelements(src1);
 
3102
  }
3103
 
3104
  static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3105
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
3106
  const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
3107
  int64_t min_compute_capability = INT_MAX;
3108
 
 
3180
  constexpr bool convert_src1_to_q8_1 = false;
3181
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
3182
  }
 
3183
  }
3184
 
3185
 
 
3250
 
3251
  static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3252
  ggml_tensor *dst) try {
3253
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
3254
  const ggml_tensor *src0 = dst->src[0];
3255
  const ggml_tensor *src1 = dst->src[1];
3256
  GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
 
 }

 static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_scale(ctx, dst);
 }

 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_diag_mask_inf(ctx, dst);
 }

 static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_pool2d(ctx, dst);
 }

 static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     ggml_sycl_op_im2col(ctx, dst);
 }

 static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
     ggml_sycl_op_sum(ctx, dst);
 }

 static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
     ggml_sycl_op_sum_rows(ctx, dst);
 }

 static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
     ggml_sycl_op_argsort(ctx, dst);
 }

 static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
     ggml_sycl_op_argmax(ctx, dst);
 }
 
                                               ggml_tensor *tensor,
                                               const void *data, size_t offset,
                                               size_t size) try {
+    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+    debug_print_tensor(": tensor=", tensor);
+    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

                                               const ggml_tensor *tensor,
                                               void *data, size_t offset,
                                               size_t size) try {
+    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+    debug_print_tensor(": tensor=", tensor);
+    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
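Both async tensor-I/O hooks assemble one trace line from three calls: GGML_SYCL_DEBUG for the "call %s" prefix, debug_print_tensor for the tensor summary, and a final GGML_SYCL_DEBUG carrying the size/offset fields plus the terminating newline. A plausible shape for the helper, assuming it is the one declared in common.hpp (names and formats here are illustrative, not copied from the source):

    #include <cinttypes>

    // Illustrative sketch of debug_print_tensor. It deliberately emits no trailing
    // newline, so callers can append fields such as size=... offset=... to the line.
    static void debug_print_tensor(const char * prefix, const ggml_tensor * tensor) {
        GGML_SYCL_DEBUG("%s", prefix);
        if (tensor == nullptr) {
            GGML_SYCL_DEBUG("nullptr");
            return;
        }
        GGML_SYCL_DEBUG("'%s' [type=%s, ne=%" PRId64 "x%" PRId64 "x%" PRId64 "x%" PRId64 "]",
                        tensor->name, ggml_type_name(tensor->type),
                        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    }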
 
 
                                              const ggml_tensor *src,
                                              ggml_tensor *dst) try {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
+                            ggml_backend_buffer_is_sycl(src->buffer);
+    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+    debug_print_tensor(": dst=", dst);
+    debug_print_tensor(" src=", src);
+    GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
+    if (is_cpy_supported) {
         /*
         DPCT1009:215: SYCL uses exceptions to report errors and does not use the
         error codes. The original code was commented out and a warning string
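Design note on this hunk: the buffer-support test is hoisted into the named is_cpy_supported flag so that dst, src, and the outcome of the test can all be logged on one line before the code branches on it, rather than leaving the condition inline in the if.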
 
 }

 static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
+    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
     SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
 
 }

 static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
+    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
     sycl::event* sycl_event = static_cast<sycl::event*>(event->context);

     if (ggml_backend_is_sycl(backend)) {
 
 static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
     GGML_UNUSED(dev);
+    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);

     sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
     SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));

ggml/src/ggml-sycl/gla.cpp CHANGED
@@ -76,6 +76,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
 }

 void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5);
     const float * k_d = static_cast<const float *>(dst->src[0]->data);
     const float * v_d = static_cast<const float *>(dst->src[1]->data);
     const float * r_d = static_cast<const float *>(dst->src[2]->data);

ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -1059,8 +1059,10 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
     case GGML_TYPE_Q4_K:
         if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
             ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+            GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
             reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
         } else {
+            GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
             mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
         }
         break;
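These two prints make it directly observable which Q4_K matrix-vector kernel ran: the reorder-optimized layout (reorder_mul_mat_vec_q4_k_q8_1_sycl) or the standard one. That is handy when verifying whether the reorder optimization, gated elsewhere by g_ggml_sycl_disable_optimize, actually took effect for a given weight tensor.
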
ggml/src/ggml-sycl/outprod.cpp CHANGED
@@ -1,6 +1,7 @@
 #include "outprod.hpp"

 void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     const ggml_tensor *src0 = dst->src[0];
     const ggml_tensor *src1 = dst->src[1];

ggml/src/ggml-sycl/rope.cpp CHANGED
@@ -355,8 +355,7 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
 }

 void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    GGML_SYCL_DEBUG("call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
     ggml_sycl_op_rope(ctx, dst);
-    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
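The /*num_src=*/3 here matches the up-to-three inputs RoPE can read: the activations in src[0], the position ids in src[1], and the optional frequency-factor tensor in src[2], which may be null, so the scoped printer has to tolerate null sources.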
 
ggml/src/ggml-sycl/softmax.cpp CHANGED
@@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
 }

 void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -249,16 +249,13 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

     if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
         const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
-        GGML_SYCL_DEBUG("%s: F16 mask\n", __func__);
         soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
                                       main_stream, ctx.device);
     } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
         const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
-        GGML_SYCL_DEBUG("%s: F32 mask\n", __func__);
         soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
     } else {
         /* mask unavailable */
-        GGML_SYCL_DEBUG("%s: No mask\n", __func__);
         soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
     }
 }
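The per-branch "F16 mask" / "F32 mask" / "No mask" prints are dropped as redundant: the scoped print at the top of the function already records src[1] with its type, or its absence, so the mask variant can be read straight off the op trace instead of from a second debug line.
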
ggml/src/ggml-sycl/tsembd.cpp CHANGED
@@ -56,8 +56,8 @@ static void timestep_embedding_f32_sycl(
 }

 void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor *src0 = dst->src[0];
-    const ggml_tensor *src1 = dst->src[1];
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
     float * dst_d = (float *)dst->data;
     dpct::queue_ptr stream = ctx.stream();
@@ -69,5 +69,4 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso
     const int max_period = dst->op_params[1];

     timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
-    GGML_UNUSED(src1);
 }

ggml/src/ggml-sycl/wkv.cpp CHANGED
@@ -180,10 +180,7 @@ static void rwkv_wkv7_f32_kernel(
 }

 void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
-
-    const ggml_tensor *src0 = dst->src[0];
-    const ggml_tensor *src1 = dst->src[1];
-
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6);
     const float* k_d = (const float*)dst->src[0]->data;
     const float* v_d = (const float*)dst->src[1]->data;
     const float* r_d = (const float*)dst->src[2]->data;
@@ -236,16 +233,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
             });
         });
     }
-
-    GGML_UNUSED(src0);
-    GGML_UNUSED(src1);
 }

 void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
-
-    const ggml_tensor *src0 = dst->src[0];
-    const ggml_tensor *src1 = dst->src[1];
-
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/7);
     const float* r_d = (const float*)dst->src[0]->data;
     const float* w_d = (const float*)dst->src[1]->data;
     const float* k_d = (const float*)dst->src[2]->data;
@@ -299,7 +290,4 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
             });
         });
     }
-
-    GGML_UNUSED(src0);
-    GGML_UNUSED(src1);
 }
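The WKV ops carry the largest source counts in this change: num_src=6 for WKV6 and num_src=7 for WKV7, matching how many dst->src[] operands each op reads (k, v, r plus decay and state tensors for WKV6; WKV7's layout starts r, w, k as seen above). The src0/src1 locals existed only to feed the old GGML_UNUSED markers, so both go away together.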