JohannesGaessler committed
Commit afc137c · Parent: 73a16f3

CUDA: fix partial offloading for ne0 % 256 != 0 (llama/8572)

ggml/include/ggml-backend.h CHANGED

@@ -29,21 +29,23 @@ extern "C" {
     enum ggml_backend_buffer_usage {
         GGML_BACKEND_BUFFER_USAGE_ANY = 0,
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
     };
 
-    GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
-    GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
-    GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
-    GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
-    GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+    GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
+    GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+    GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+    GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
+    GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
+    GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
 
     //
     // Backend
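
The new GGML_BACKEND_BUFFER_USAGE_COMPUTE value, together with the ggml_backend_buffer_get_usage() getter declared above, lets a backend tell temporary compute buffers apart from weight buffers. A minimal sketch of how a backend could branch on the flag; buffer_needs_padding_init is a hypothetical helper, not part of this commit:

// Hypothetical helper (not in this commit): quantized-tensor padding only has to be
// zeroed up front for non-compute buffers; compute buffers get their padding cleared
// lazily, right before the matrix multiplication (see the ggml-cuda.cu hunk below).
#include <stdbool.h>
#include "ggml-backend.h"

static bool buffer_needs_padding_init(ggml_backend_buffer_t buffer) {
    return ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE;
}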
ggml/src/ggml-alloc.c CHANGED

@@ -776,6 +776,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
                 fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
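
With this one-line change, every buffer that ggml_gallocr_reserve_n() allocates is tagged as a compute buffer. An illustrative sketch (assuming the graph allocator and graph were already set up; not part of the commit) of how the flag becomes visible on allocated graph tensors:

// Illustrative only: after allocating a graph, tensors placed in the graph allocator's
// buffers report GGML_BACKEND_BUFFER_USAGE_COMPUTE; preallocated tensors (e.g. weights)
// keep whatever usage their own buffer has.
#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void print_node_buffer_usage(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    if (!ggml_gallocr_alloc_graph(galloc, graph)) {
        return;
    }
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (node->buffer != NULL) {
            printf("%s: usage = %d\n", node->name, (int) ggml_backend_buffer_get_usage(node->buffer));
        }
    }
}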
ggml/src/ggml-backend.c CHANGED

@@ -134,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
     }
 }
 
+enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
+    return buffer->usage;
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
ggml/src/ggml-cuda.cu CHANGED

@@ -464,12 +464,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
         return;
     }
 
-    if (ggml_is_quantized(tensor->type)) {
+    if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
         size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
-        if (padded_size > original_size && tensor->view_src == nullptr) {
+        if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
             CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }

@@ -1485,6 +1485,13 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
         }
 
+        // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
+        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+            const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
+            const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+        }
+
         if (src1_on_device && src1_is_contiguous) {
             dev[id].src1_ddf = (float *) src1->data;
         } else {
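
The first hunk stops zero-initializing the padding of quantized tensors that live in compute buffers; the second hunk clears that padding just before the matrix multiplication instead, since the quantized kernels may read up to MATRIX_ROW_PADDING elements past the end of the last row. A small worked example of the size arithmetic (a sketch; the padding value of 512 elements, the tensor type, and the shape are assumptions, not taken from the commit):

// Standalone illustration of the nbytes_data / nbytes_padding arithmetic used above.
// ROW_PADDING stands in for the CUDA backend's MATRIX_ROW_PADDING constant
// (assumed to be 512 here); the tensor type and shape are made up.
#include <stdio.h>
#include <stdint.h>
#include "ggml.h"

int main(void) {
    const int64_t ne00        = 4000; // row length, NOT a multiple of the padding
    const int64_t nrows       = 32;   // rows assigned to this device (row_high - row_low)
    const int64_t ROW_PADDING = 512;  // assumed value of MATRIX_ROW_PADDING

    const size_t nbytes_data    = ggml_row_size(GGML_TYPE_Q4_0, nrows*ne00);
    const size_t nbytes_padding = ggml_row_size(GGML_TYPE_Q4_0, ROW_PADDING - ne00 % ROW_PADDING);

    // The commit clears nbytes_padding bytes right after the copied data so that the
    // padded tail of the last row never contains uninitialized memory.
    printf("data: %zu bytes, padding to clear: %zu bytes\n", nbytes_data, nbytes_padding);
    return 0;
}

For Q4_0 (18 bytes per 32 elements) this prints 72000 bytes of data and 54 bytes of padding to clear.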