Commit afc137c · parent: 73a16f3

CUDA: fix partial offloading for ne0 % 256 != 0 (llama/8572)

Files changed:
- ggml/include/ggml-backend.h  +15 -13
- ggml/src/ggml-alloc.c        +1 -0
- ggml/src/ggml-backend.c      +4 -0
- ggml/src/ggml-cuda.cu        +9 -2
ggml/include/ggml-backend.h
CHANGED

The buffer-usage enum gains a value for temporary compute buffers, and a getter for the usage flag is added. The surrounding declarations were only re-aligned to fit the wider return type, which accounts for most of the +15/-13 line count; the functional additions are the two lines marked with + below.

@@ -29,21 +29,23 @@ extern "C" {
     enum ggml_backend_buffer_usage {
         GGML_BACKEND_BUFFER_USAGE_ANY = 0,
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
     };
 
     GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
     GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
     GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void                 ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
     GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
     GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
 
     //
     // Backend
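Previously the usage flag set with ggml_backend_buffer_set_usage could only be written, not read back. The new getter plus the GGML_BACKEND_BUFFER_USAGE_COMPUTE value let backend code tell long-lived weight buffers apart from recycled compute buffers. A minimal sketch of such a check, built only on the public header above (the helper name is hypothetical, not part of the commit):

#include <stdbool.h>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: weight buffers are filled once, so padding can be
// zeroed when a tensor is initialized; compute buffers are reused for every
// graph evaluation, so padding has to be cleared at the point of use instead.
static bool example_clear_padding_on_init(const struct ggml_tensor * tensor) {
    return ggml_backend_buffer_get_usage(tensor->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE;
}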
ggml/src/ggml-alloc.c
CHANGED

@@ -776,6 +776,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
                 fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
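The single added line tags every buffer that the graph allocator creates for itself as a compute buffer. A self-contained sketch of where that happens, assuming the CPU backend and the public ggml headers (nothing here is specific to CUDA):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // Build a tiny graph with no_alloc set, so tensor data is owned by the
    // graph allocator rather than the context.
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_add(ctx, a, b));

    // The buffers allocated inside ggml_gallocr_reserve are the ones that the
    // patched ggml_gallocr_reserve_n now marks as GGML_BACKEND_BUFFER_USAGE_COMPUTE.
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_reserve(galloc, gf);

    ggml_gallocr_free(galloc);
    ggml_free(ctx);
    return 0;
}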
ggml/src/ggml-backend.c
CHANGED

@@ -134,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
     }
 }
 
+enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
+    return buffer->usage;
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
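The getter is a one-line accessor over the buffer's existing usage field, so set_usage and get_usage now form a round trip. A minimal sketch, assuming the CPU backend is compiled in:

#include <assert.h>

#include "ggml-backend.h"

int main(void) {
    // Allocate a small buffer from the CPU backend's default buffer type and
    // tag it the way the graph allocator now tags its own buffers.
    ggml_backend_buffer_t buf =
        ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 1024);

    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
    assert(ggml_backend_buffer_get_usage(buf) == GGML_BACKEND_BUFFER_USAGE_COMPUTE);

    ggml_backend_buffer_free(buf);
    return 0;
}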
ggml/src/ggml-cuda.cu
CHANGED

@@ -464,12 +464,12 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
         return;
     }
 
-    if (ggml_is_quantized(tensor->type)) {
+    if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
         // initialize padding to 0 to avoid possible NaN values
         size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
-        if (padded_size > original_size && tensor->view_src == nullptr) {
+        if (padded_size > original_size) {
             ggml_cuda_set_device(ctx->device);
             CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }

@@ -1485,6 +1485,13 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
         }
 
+        // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
+        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+            const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
+            const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+            CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
+        }
+
         if (src1_on_device && src1_is_contiguous) {
             dev[id].src1_ddf = (float *) src1->data;
         } else {
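The new block in ggml_cuda_op_mul_mat clears only the padded tail after the row slice assigned to each device. The same byte counts can be reproduced on the host with ggml_row_size; here is an illustrative sketch with made-up sizes, using a stand-in constant for MATRIX_ROW_PADDING (the real value is defined in ggml-cuda.cu):

#include <stdint.h>
#include <stdio.h>

#include "ggml.h"

// Stand-in for the CUDA backend's MATRIX_ROW_PADDING; the actual constant
// lives in ggml-cuda.cu and is what the ne00 % MATRIX_ROW_PADDING check uses.
#define EXAMPLE_ROW_PADDING 512

int main(void) {
    // Illustrative shapes: a quantized src0 slice whose row length is not a
    // multiple of the padding, matching the commit title (4000 % 256 != 0).
    const enum ggml_type type  = GGML_TYPE_Q4_0;
    const int64_t        ne00  = 4000; // elements per row
    const int64_t        nrows = 32;   // rows assigned to this device (row_high - row_low)

    const size_t nbytes_data    = ggml_row_size(type, nrows*ne00);
    const size_t nbytes_padding = ggml_row_size(type, EXAMPLE_ROW_PADDING - ne00 % EXAMPLE_ROW_PADDING);

    // The CUDA backend zeroes nbytes_padding bytes right after the data with
    // cudaMemsetAsync, so the padded tail of the last row never holds garbage
    // that could turn into NaN values in the matrix multiplication.
    printf("data bytes: %zu, padding bytes to clear: %zu\n", nbytes_data, nbytes_padding);
    return 0;
}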