lhez, Shawn Gu, Skyler Szot committed
Commit 4532dc6 · 1 Parent(s): 14fd317
opencl: fix for small models (llama/11950)

* opencl: fix small shape gemv, remove unused extensions
* opencl: fix `transpose_16`, `dump_tensor`, enforce subgroup size
* opencl: fix for token length < 4
* opencl: use wave size of 64 for all Adreno GPUs

---------

Co-authored-by: Shawn Gu <[email protected]>
Co-authored-by: Skyler Szot <[email protected]>
- ggml/src/ggml-opencl/ggml-opencl.cpp +25 -29
- ggml/src/ggml-opencl/kernels/ggml-opencl.cl +3 -0
- ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl +8 -5
- ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl +8 -5
- ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl +10 -1
- ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl +13 -19
ggml/src/ggml-opencl/ggml-opencl.cpp
CHANGED
@@ -444,19 +444,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
         backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
 
-        // Default wave size is 128, A8x uses 64.
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
-            backend_ctx->adreno_wave_size = 64;
-        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-            backend_ctx->adreno_wave_size = 128;
-        } else {
-            backend_ctx->adreno_wave_size = 128;
-            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                "using wave size %d, "
-                "may not work as expected\n",
-                backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-        }
+        // Use wave size of 64 for all Adreno GPUs.
+        backend_ctx->adreno_wave_size = 64;
     } else if (strstr(default_device->name, "Intel")) {
         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
     } else {
@@ -1376,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     int M = tensor->ne[1]; // ne01
     int K = tensor->ne[0]; // ne00
 
+    //For matrix-vector multiplication kernel, we assume K is a multiple of 32
+    GGML_ASSERT(K % 32 == 0);
+    //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
+    GGML_ASSERT(M % 4 == 0);
+
     // transpose is out of place, so we need to allocate transposed buffers
     // <----------------------------------------------------------------------------------> //
     // use sub_buffer of max buffer size instead
@@ -1416,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     cl_mem qT_d_image1D;
     cl_mem dT_d_image1D;
 
-    cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     cl_image_desc img_desc_1d;
 
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 4 / 4 / 2;
+    img_desc_1d.image_width = M * K / 4 / 4;
     img_desc_1d.buffer = extra->q;
     q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 4 / 4 / 2;
+    img_desc_1d.image_width = M * K / 4 / 4;
     img_desc_1d.buffer = qT_d;
     qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4 / 2;
+    img_desc_1d.image_width = M * K / 32 / 4;
     img_desc_1d.buffer = extra->d;
     d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4 / 2;
+    img_desc_1d.image_width = M * K / 32 / 4;
     img_desc_1d.buffer = dT_d;
     dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
@@ -1454,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     // set up and call the transpose kernels
    // <----------------------------------------------------------------------------------> //
     // weights
-    int height_q = M / 4 / 2;
-    int width_q = K / 4 / 4 / 2;
+    int height_q = M / 4;
+    int width_q = K / 4 / 4;
     kernel = backend_ctx->kernel_transpose_16;
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1469,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     CL_CHECK(clWaitForEvents(1, &evt));
 
     // scales
-    int height_s = M / 4 / 2;
-    int width_s = K / 32 / 4 / 2;
+    int height_s = M / 4;
+    int width_s = K / 32 / 4;
 
     kernel = backend_ctx->kernel_transpose_16;
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1864,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
     void * buf_d;
 #endif
 
-#ifdef GGML_USE_OPENCL
     // Make sure everything is done.
     CL_CHECK(clFinish(queue));
 
@@ -1900,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
         extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
     CL_CHECK(clFinish(queue));
 #endif // GGML_OPENCL_SOA_Q
-#endif // GGML_USE_OPENCL
 
     // Open file and dump.
     char fname[512];
@@ -2865,6 +2857,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     CL_CHECK(status);
 
     int height_B = N/4;
+    if (height_B == 0) {
+        height_B = 1;
+    }
     int width_B = K/4;
     int padded_height_B = (N + padding)/4;
 
@@ -3013,11 +3008,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     }
 
     if (N == 1) {
-        local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+        size_t wavesize = backend_ctx->adreno_wave_size;
+        local_work_size[0] = wavesize; // localsize
         local_work_size[1] = 4; // reduce factor
         local_work_size[2] = 1;
 
-        global_work_size[0] = M / 2;
+        global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
         global_work_size[1] = 4; // reduce factor
         global_work_size[2] = 1;
     }
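The N == 1 branch above is the small-shape gemv fix: when a non-NULL local work size is passed, clEnqueueNDRangeKernel requires every global dimension to be a multiple of the corresponding local dimension, so M / 2 is rounded up to the wave size. A minimal sketch of that rounding, with an illustrative helper name that is not from the patch:

    // Round n up to the next multiple of m (here m is the wave size, 64).
    static size_t round_up_to_multiple(size_t n, size_t m) {
        return ((n + m - 1) / m) * m;
    }

    // Example: M = 100 -> M / 2 = 50 -> rounded up to 64, so a tensor too
    // small to fill one wave still launches exactly one full work-group.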
ggml/src/ggml-opencl/kernels/ggml-opencl.cl
CHANGED
@@ -1797,6 +1797,9 @@ kernel void kernel_mul_mat_f16_f16(
 //------------------------------------------------------------------------------
 // mul_mat_f16_f32_1row
 //------------------------------------------------------------------------------
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
 kernel void kernel_mul_mat_f16_f32_1row(
     global char * src0,
     ulong offset0,
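REQD_SUBGROUP_SIZE_64 expands under the same guarded pattern this commit adds to the gemv files below. On a device whose OpenCL C compiler defines cl_qcom_reqd_sub_group_size, the annotated declaration effectively preprocesses to the sketch below; on any other device ADRENO_GPU is never defined and the kernel is unchanged:

    /* preprocessed view on an Adreno device (sketch) */
    __attribute__((qcom_reqd_sub_group_size("half")))  /* "half" wave = 64 lanes */
    kernel void kernel_mul_mat_f16_f32_1row(
        global char * src0,
        ulong offset0
        /* ... remaining arguments unchanged ... */ );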
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
CHANGED
@@ -1,9 +1,11 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
-#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
 #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif
 
 // assume
 #define QK4_0 32
@@ -186,8 +188,9 @@
     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
 
-
-__attribute__((qcom_reqd_sub_group_size("half")))
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
 __kernel void kernel_gemv_noshuffle(
     __read_only image1d_buffer_t src0_q, // quantized A
     global half2 * src0_d, // A scales
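The #ifdef guard works because an OpenCL C compiler predefines a macro named after every extension it supports, so the Qualcomm attribute is only emitted where it can compile, and the unused cl_qcom_* pragmas could simply be dropped. Host code can make the equivalent check when picking build options; a minimal sketch using the standard CL_DEVICE_EXTENSIONS query, with an illustrative helper name not taken from ggml:

    #include <CL/cl.h>
    #include <stdlib.h>
    #include <string.h>

    // Return nonzero if ext (e.g. "cl_qcom_reqd_sub_group_size") appears
    // in the device extension string.
    static int device_has_extension(cl_device_id dev, const char * ext) {
        size_t sz = 0;
        if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, NULL, &sz) != CL_SUCCESS) {
            return 0;
        }
        char * buf = (char *) malloc(sz);
        if (buf == NULL) {
            return 0;
        }
        clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sz, buf, NULL);
        int found = strstr(buf, ext) != NULL;
        free(buf);
        return found;
    }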
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
CHANGED
@@ -1,9 +1,11 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
-#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
 #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif
 
 // assume
 #define QK4_0 32
@@ -186,8 +188,9 @@
     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
 
-
-__attribute__((qcom_reqd_sub_group_size("half")))
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
 __kernel void kernel_gemv_noshuffle(
     __read_only image1d_buffer_t src0_q, // quantized A
     global half2 * src0_d, // A scales
ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
CHANGED
@@ -7,7 +7,16 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
 
-__attribute__((qcom_reqd_sub_group_size("full")))
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_128
+#endif
+
 kernel void kernel_mul_mat_Ab_Bi_8x4(
     global const ushort * src0_q, // quantized A
     global const half * src0_d, // A scales
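Unlike the gemv kernels, this GEMM keeps a required subgroup size of "full" (128 lanes); the Qualcomm attribute takes the fraction names "half" and "full" rather than numeric sizes, as the two macro definitions show. If needed, the host can check what a built kernel actually gets; a sketch assuming an OpenCL 2.1+ platform where clGetKernelSubGroupInfo is available:

    // Query the subgroup size for the local size used by the gemv launch above.
    size_t local_size[3] = {64, 4, 1};
    size_t sub_group_size = 0;
    CL_CHECK(clGetKernelSubGroupInfo(kernel, device,
        CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
        sizeof(local_size), local_size,
        sizeof(sub_group_size), &sub_group_size, NULL));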
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
CHANGED
@@ -1,4 +1,6 @@
-// 16-bit transpose, loading/storing an 8x8 tile of elements
+// 16-bit transpose, loading/storing a 4x4 tile of elements
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 kernel void kernel_transpose_16(
     __read_only image1d_buffer_t input,
@@ -9,24 +11,16 @@ kernel void kernel_transpose_16(
 
     const int i = get_global_id(0);
     const int j = get_global_id(1);
-    const int i_3 = i<<3;
-    const int j_3 = j<<3;
+    const int i_2 = i<<2;
+    const int j_2 = j<<2;
 
-    ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
-    ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
-    ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
-    ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
-    ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
-    ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
-    ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
-    ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
+    half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
+    half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
+    half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
+    half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
 
-    write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
-    write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
-    write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
-    write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
-    write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
-    write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
-    write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
-    write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
+    write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
+    write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
+    write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
+    write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
 }
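For reference, a quick index check on the new kernel body, written as comments; it confirms the 4x4 tiling and the M % 4 == 0 requirement that the new GGML_ASSERTs in ggml-opencl.cpp enforce:

    // Work-item (i, j) of kernel_transpose_16 (new 4x4 version):
    //   reads : input pixel (4j+t)*cols + i, t = 0..3
    //           = matrix elements (4j+t, 4i+c), half lanes c = 0..3
    //   writes: output pixel (4i+c)*rows + j, c = 0..3
    //           = matrix elements (4i+c, 4j+t) of the transpose
    // Each pixel is now 4 halfs (CL_RGBA + CL_HALF_FLOAT) instead of 8,
    // so M and K only need to be multiples of 4; the old 8x8-tile version
    // implicitly required multiples of 8 and broke on small shapes.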