Spaces:
Sleeping
Sleeping
slaren
committed on
Commit
·
73e80d1
1
Parent(s):
314d58a
cuda : fix dmmv cols requirement to 2*GGML_CUDA_DMMV_X (llama/8800)
Browse files
* cuda : fix dmmv cols requirement to 2*GGML_CUDA_DMMV_X
* update asserts
* only use dmmv for supported types
* add test
- ggml/src/ggml-cuda.cu +2 -3
- ggml/src/ggml-cuda/dmmv.cu +15 -6
- ggml/src/ggml-cuda/dmmv.cuh +2 -0
ggml/src/ggml-cuda.cu
CHANGED
|
@@ -1885,10 +1885,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
|
| 1885 |
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 1886 |
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
| 1887 |
|
| 1888 |
-
bool use_dequantize_mul_mat_vec = (
|
| 1889 |
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
| 1890 |
-
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 &&
|
| 1891 |
-
&& src1->ne[1] == 1;
|
| 1892 |
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
| 1893 |
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
| 1894 |
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
|
|
|
| 1885 |
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 1886 |
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
| 1887 |
|
| 1888 |
+
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
| 1889 |
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
| 1890 |
+
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
|
|
|
|
| 1891 |
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
| 1892 |
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
| 1893 |
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
ggml/src/ggml-cuda/dmmv.cu
CHANGED
|
@@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
|
| 500 |
}
|
| 501 |
|
| 502 |
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 503 |
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
| 504 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 505 |
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
| 506 |
const dim3 block_nums(block_num_y, 1, 1);
|
|
@@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
|
|
| 510 |
}
|
| 511 |
|
| 512 |
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 513 |
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
| 514 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 515 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 516 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
@@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
|
|
| 519 |
}
|
| 520 |
|
| 521 |
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 522 |
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
| 523 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 524 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 525 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
@@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
|
|
| 528 |
}
|
| 529 |
|
| 530 |
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 531 |
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
| 532 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 533 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 534 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
@@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
|
|
| 537 |
}
|
| 538 |
|
| 539 |
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 540 |
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
| 541 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 542 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 543 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
@@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
|
| 588 |
}
|
| 589 |
|
| 590 |
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 591 |
-
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
| 592 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 593 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 594 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
@@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
| 672 |
GGML_UNUSED(src1_ncols);
|
| 673 |
GGML_UNUSED(src1_padded_row_size);
|
| 674 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
}
|
| 501 |
|
| 502 |
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 503 |
+
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
| 504 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 505 |
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
| 506 |
const dim3 block_nums(block_num_y, 1, 1);
|
|
|
|
| 510 |
}
|
| 511 |
|
| 512 |
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 513 |
+
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
| 514 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 515 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 516 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
|
|
| 519 |
}
|
| 520 |
|
| 521 |
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 522 |
+
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
| 523 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 524 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 525 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
|
|
| 528 |
}
|
| 529 |
|
| 530 |
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 531 |
+
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
| 532 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 533 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 534 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
|
|
| 537 |
}
|
| 538 |
|
| 539 |
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 540 |
+
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
| 541 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 542 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 543 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
|
|
| 588 |
}
|
| 589 |
|
| 590 |
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
| 591 |
+
GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
|
| 592 |
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
| 593 |
const dim3 block_nums(block_num_y, 1, 1);
|
| 594 |
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
|
|
|
| 672 |
GGML_UNUSED(src1_ncols);
|
| 673 |
GGML_UNUSED(src1_padded_row_size);
|
| 674 |
}
|
| 675 |
+
|
| 676 |
+
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
|
| 677 |
+
return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
|
| 678 |
+
src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
|
| 679 |
+
src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
|
| 680 |
+
src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
|
| 681 |
+
src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
|
| 682 |
+
src0_type == GGML_TYPE_F16;
|
| 683 |
+
}
|
ggml/src/ggml-cuda/dmmv.cuh
CHANGED
|
@@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
| 16 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 17 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 18 |
const int64_t src1_padded_row_size, cudaStream_t stream);
|
|
|
|
|
|
|
|
|
| 16 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 17 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
| 18 |
const int64_t src1_padded_row_size, cudaStream_t stream);
|
| 19 |
+
|
| 20 |
+
bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
|