Commit · 23e676b
1 Parent(s): 1136116
CUDA: fix bad asserts for partial offload (llama/13337)
- ggml/include/ggml.h +4 -0
- ggml/src/ggml-cuda/fattn-common.cuh +2 -0
- ggml/src/ggml-cuda/ggml-cuda.cu +7 -4
- ggml/src/ggml-cuda/mmq.cu +2 -1
- ggml/src/ggml-cuda/mmvq.cu +2 -1
- ggml/src/ggml.c +4 -0
ggml/include/ggml.h
CHANGED
@@ -673,11 +673,15 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
+    // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
     GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
+    GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
+
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
 
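Note: the difference between the two predicates declared above shows up with a transposed view. A minimal, hypothetical C sketch against the public ggml API (not part of the patch): the transposed tensor fails ggml_is_contiguous() because its strides are permuted, but still passes the new ggml_is_contiguously_allocated() because its bytes form one gap-free block.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { .mem_size = 1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4); // row-major, gap-free
    struct ggml_tensor * at = ggml_transpose(ctx, a);                       // swaps strides, same bytes

    printf("a : contiguous=%d contiguously_allocated=%d\n",
           ggml_is_contiguous(a),  ggml_is_contiguously_allocated(a));      // 1 1
    printf("at: contiguous=%d contiguously_allocated=%d\n",
           ggml_is_contiguous(at), ggml_is_contiguously_allocated(at));     // 0 1

    ggml_free(ctx);
    return 0;
}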
ggml/src/ggml-cuda/fattn-common.cuh
CHANGED
@@ -719,6 +719,7 @@ void launch_fattn(
     size_t nb23 = V->nb[3];
 
     if (need_f16_K && K->type != GGML_TYPE_F16) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(K));
         K_f16.alloc(ggml_nelements(K));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
         to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -733,6 +734,7 @@ void launch_fattn(
     }
 
     if (need_f16_V && V->type != GGML_TYPE_F16) {
+        GGML_ASSERT(ggml_is_contiguously_allocated(V));
         V_f16.alloc(ggml_nelements(V));
         to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
         to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
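Note: the new asserts in launch_fattn() guard the F16 conversion of K and V, which treats each tensor as a flat array of ggml_nelements() values starting at ->data. A permuted layout is harmless for that copy (every element is still visited once), but a gap in the allocation would be converted as if it were data. A hypothetical host-side stand-in for the conversion (plain C, not the real CUDA kernel; the helper name is made up):

#include <stdint.h>
#include "ggml.h"

// Flat element-by-element conversion, analogous to what launch_fattn() does on the
// GPU via ggml_get_to_fp16_cuda(): only valid when the source bytes are one gap-free block.
static void to_fp16_flat_sketch(const float * src, ggml_fp16_t * dst, int64_t nelements) {
    for (int64_t i = 0; i < nelements; ++i) {
        dst[i] = ggml_fp32_to_fp16(src[i]);
    }
}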
ggml/src/ggml-cuda/ggml-cuda.cu
CHANGED
@@ -1536,6 +1536,8 @@ static void ggml_cuda_op_mul_mat(
 
         // If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
         if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
             const size_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
@@ -2067,10 +2069,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         }
 
         ggml_tensor src0_slice = *src0;
-        src0_slice.ne[2] = 1;
-        src0_slice.nb[3] = src0_slice.nb[2];
-        src0_slice.data  = (char *) src0->data + i02*nb02;
-
+        src0_slice.ne[2]    = 1;
+        src0_slice.nb[3]    = src0_slice.nb[2];
+        src0_slice.op       = GGML_OP_VIEW;
+        src0_slice.view_src = dst->src[0]; // non-const pointer to src0
+        src0_slice.data     = (char *) src0->data + i02*nb02;
 
         ggml_tensor src1_slice;
         memset(&src1_slice, 0, sizeof(src1_slice));
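Note: the mul_mat_id change and the new asserts are two sides of the same fix: the padding-clearing path in ggml_cuda_op_mul_mat() must only run for tensors that own their allocation, and marking src0_slice as a GGML_OP_VIEW with view_src set is what keeps the per-expert slice out of that path. A hedged sketch of the combined precondition (hypothetical helper, not code from the patch):

#include <stdbool.h>
#include <stddef.h>
#include "ggml.h"

// When is it safe to memset the trailing row padding of a matrix?
// - only quantized matrices carry MATRIX_ROW_PADDING padding,
// - a view does not own the padding of its parent's allocation,
// - the bytes must be one gap-free block so data + nbytes really is the padding.
static bool may_clear_trailing_padding(const struct ggml_tensor * t) {
    return ggml_is_quantized(t->type)
        && t->view_src == NULL
        && ggml_is_contiguously_allocated(t);
}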
ggml/src/ggml-cuda/mmq.cu
CHANGED
@@ -91,7 +91,8 @@ void ggml_cuda_mul_mat_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguous(src0));
+        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+        GGML_ASSERT(!src0->view_src);
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
ggml/src/ggml-cuda/mmvq.cu
CHANGED
@@ -515,7 +515,8 @@ void ggml_cuda_mul_mat_vec_q(
 
     // If src0 is a temporary compute buffer, clear any potential padding.
     if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguous(src0));
+        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+        GGML_ASSERT(!src0->view_src);
         const size_t size_data  = ggml_nbytes(src0);
         const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
         if (size_alloc > size_data) {
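Note: mmq.cu and mmvq.cu now share the same guard. The padding they clear is the byte range between ggml_nbytes(src0) and the buffer's allocated size, which only lines up with real padding when src0 is not a view and its data is one gap-free block. A host-side sketch of the pattern, assuming a hypothetical clear_padding_sketch() helper and plain memset in place of cudaMemsetAsync():

#include <string.h>
#include "ggml.h"
#include "ggml-backend.h"

static void clear_padding_sketch(struct ggml_tensor * src0) {
    if (ggml_backend_buffer_get_usage(src0->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
        return; // only temporary compute buffers carry this padding
    }
    GGML_ASSERT(ggml_is_contiguously_allocated(src0)); // bytes form one gap-free block
    GGML_ASSERT(!src0->view_src);                      // the padding belongs to this tensor, not to a parent
    const size_t size_data  = ggml_nbytes(src0);
    const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
    if (size_alloc > size_data) {
        memset((char *) src0->data + size_data, 0, size_alloc - size_data);
    }
}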
ggml/src/ggml.c
CHANGED
@@ -1299,6 +1299,10 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
     return ggml_is_contiguous_n(tensor, 2);
 }
 
+bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
+    return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+}
+
 bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
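Note: the implementation compares the tensor's actual byte span (ggml_nbytes) with the size a gap-free layout would need: elements divided by the block size, times the per-block byte size. A hedged, runnable check of that arithmetic for a quantized type (Q4_0 packs 32 elements into 18 bytes); the program uses only the public API and is not part of the patch:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { .mem_size = 32*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(params);

    // 4096*4096 elements / 32 per block * 18 bytes per block = 9437184 bytes
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);
    printf("nbytes  = %zu\n", ggml_nbytes(t));
    printf("formula = %zu\n", (size_t) (ggml_nelements(t)*ggml_type_size(t->type)/ggml_blck_size(t->type)));
    printf("contiguously allocated: %d\n", ggml_is_contiguously_allocated(t));

    ggml_free(ctx);
    return 0;
}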