qnixsynapse committed on
Commit a67a8ec · 1 Parent(s): c90c972

SYCL: Migrate away from deprecated ggml_tensor->backend (llama/10840)


* Migrate to tensor->buffer for checking backend buffer type: 1

* SYCL: common.cpp try to migrate away from tensor->backend

* SYCL: fix assertions and add proper comments

* SYCL: remove extra space

* SYCL: Add back static to ggml_backend_buffer_is_sycl_split function

* SYCL: Add pragma directive to suppress warning spam

* SYCL: Integrate debug logs with GGML_LOG and other fixes

* Revert "SYCL: Integrate debug logs with GGML_LOG and other fixes"

This reverts commit 2607b7de0f0d2f4f1f690226f86fa861aa39cb97.
Let's keep the current SYCL specific logging mechanism for now

* SYCL: Use GGML_SYCL_DEBUG after reverting

* SYCL: reg_get_proc_address func, update to the current func signature

* SYCL: Refactor SYCL buffer checks in ggml_sycl_cpy_tensor_2d
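For orientation before the diff, a minimal sketch (not part of the patch) of the pattern this commit adopts: tensor placement is decided from the tensor's backend buffer rather than the deprecated ggml_tensor::backend enum. ggml_backend_buffer_is_host is the public ggml-backend helper; the split-buffer check reproduces the strcmp against GGML_SYCL_NAME "_Split" added to common.cpp below. The helper names with a _sketch suffix and the exact includes are assumptions for illustration only.

// Sketch only: classify a tensor by its buffer instead of tensor->backend.
// Assumes the headers visible inside ggml-sycl.cpp (ggml-backend-impl.h, common.hpp).
#include <cstring>

enum class copy_kind_sketch { host_to_device, device_to_device, device_to_device_split };

static bool tensor_in_split_buffer_sketch(const ggml_tensor * t) {
    // same comparison as the assertion added to ggml_sycl_op_flatten in common.cpp
    ggml_backend_buffer_type_t buft = t->buffer->buft;
    return strcmp(buft->iface.get_name(buft), GGML_SYCL_NAME "_Split") == 0;
}

static copy_kind_sketch pick_copy_kind_sketch(const ggml_tensor * src) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        return copy_kind_sketch::host_to_device;          // data lives in host memory
    }
    if (tensor_in_split_buffer_sketch(src)) {
        return copy_kind_sketch::device_to_device_split;  // rows sharded across devices, reached via tensor->extra
    }
    return copy_kind_sketch::device_to_device;            // plain SYCL device buffer
}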

ggml/src/ggml-sycl/common.cpp CHANGED
@@ -11,6 +11,8 @@
 //
 
 #include "common.hpp"
+
+#include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 
 int get_current_device_id() {
@@ -65,9 +67,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                           const ggml_sycl_op_flatten_t op) try {
 
     const bool use_src1 = src1 != nullptr;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    if(use_src1)
+        GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
 
     // dd = data device
     float * src0_ddf = (float *) src0->data;
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -26,7 +26,11 @@
 
 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
+/* suppress warning spam */
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
+#pragma clang diagnostic pop
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -288,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                      ggml_tensor *tensor) try {
     ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
 
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+    if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
         return;
     }
 
@@ -539,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     auto dev_count = ggml_backend_sycl_get_device_count();
 
     if (device>=dev_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
             device, dev_count-1);
         GGML_ASSERT(device<dev_count);
     }
@@ -567,7 +565,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
 
     int device = ctx->device;
     if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
            device, ggml_sycl_info().device_count-1);
         GGML_ASSERT(device<ggml_sycl_info().device_count);
     }
@@ -746,7 +744,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
 
-        // FIXME: do not crash if cudaMalloc fails
+        // FIXME: do not crash if SYCL Buffer alloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_sycl_set_device(i);
         const queue_ptr stream = ctx->streams[i];
@@ -788,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                 CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
         }
     }
-    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
    tensor->extra = extra;
 }
 catch (sycl::exception const &exc) {
@@ -2349,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
 
     dpct::memcpy_direction kind;
     char * src_ptr;
-    if (src->backend == GGML_BACKEND_TYPE_CPU) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
         kind = dpct::host_to_device;
+        //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
         src_ptr = (char *) src->data;
         // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
-    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+    } else if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        // If buffer is a SYCL buffer
+        //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
+        kind = dpct::device_to_device;
+        src_ptr = (char *) src->data;
+    } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
+        /*
+        If buffer is a SYCL split buffer
+        */
+        //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
+        GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -2857,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -2878,7 +2885,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
     int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
@@ -3198,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3231,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -3293,7 +3300,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
                                            ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
     GGML_TENSOR_BINARY_OP_LOCALS
@@ -4638,10 +4645,9 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
 static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
     GGML_UNUSED(reg);
 
-    // TODO: update to the current function signature
-    //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
-    //    return (void *)ggml_backend_sycl_split_buffer_type;
-    //}
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_sycl_split_buffer_type;
+    }
 
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
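The last hunk exposes ggml_backend_sycl_split_buffer_type through the backend registry. A hedged usage sketch of how generic code could retrieve it via the public registry API, without referencing the SYCL backend directly; the function-pointer type it must be cast to is not shown here and depends on the current signature of ggml_backend_sycl_split_buffer_type, so only the lookup itself is illustrated:

// Sketch only: querying the proc address registered above through the generic
// ggml-backend registry API (ggml_backend_reg_by_name / _get_proc_address).
#include "ggml-backend.h"

static void * lookup_sycl_split_buffer_type_factory(void) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("SYCL");
    if (reg == NULL) {
        return NULL; // SYCL backend not compiled in or not registered
    }
    // returns a pointer to ggml_backend_sycl_split_buffer_type, to be cast by the caller
    return ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
}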