Alberto Cabrera Pérez committed · Commit 83e6f74 · Parent(s): 2adc060
sycl : variable sg_size support for mmvq kernels (llama/12336)
ggml/src/ggml-sycl/mmvq.cpp CHANGED (+75 −77)
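The "variable sg_size" in the title is the SYCL sub-group size: instead of compiling every mmvq kernel for one hard-coded width, the hunks below key everything off the single WARP_SIZE constant. As background, this is how a host program can discover which sub-group widths a device actually supports, via the standard SYCL 2020 sub_group_sizes device query. This sketch is illustrative only and is not part of the patch:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    sycl::queue q{sycl::default_selector_v};
    // sub_group_sizes lists every sub-group width the device can run,
    // e.g. {8, 16, 32} on many Intel GPUs.
    auto sizes = q.get_device().get_info<sycl::info::device::sub_group_sizes>();
    std::cout << "supported sub-group sizes:";
    for (size_t s : sizes) {
        std::cout << ' ' << s;
    }
    std::cout << '\n';
}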
@@ -3,44 +3,42 @@
 #include <cassert>
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
-static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+                          const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
 
     if (row >= nrows) {
         return;
     }
 
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
-    assert(blocks_per_warp>0);
-
-// partial sum for each thread
+    const int blocks_per_row = ncols / qk;
+    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0
+
+    assert(blocks_per_warp > 0);
+
+    // partial sum for each thread
     float tmp = 0.0f;
 
-    const block_q_t * x = (const block_q_t *) vx;
+    const block_q_t * x = (const block_q_t *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
-         i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row * blocks_per_row + i; // x block index
 
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
 
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
+            const int iqs = elem + vdr * (item_ct1.get_local_id(2) %
+                                          (qi / vdr)); // x block quant index when casting the quants to int
+
+            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        }
     }
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
 
     if (item_ct1.get_local_id(2) == 0) {
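The functional core of the hunk above is the blocks_per_warp computation: with a variable WARP_SIZE, the old floor division vdr * WARP_SIZE / qi can become zero whenever vdr * WARP_SIZE < qi (a narrow sub-group against a format with many quant ints per block), which would turn the block loop's stride into zero; the ceiling form keeps the stride at least 1, and the new inner elem loop lets a narrow sub-group sweep a block's quant-index slots in WARP_SIZE-sized passes. A minimal sketch of the arithmetic (plain C++ with illustrative qi/vdr values; the real constants come from the quant format headers):

#include <cstdio>

// Illustrative per-format constants: qi quant-ints per block,
// vdr ints consumed per dot-product step.
constexpr int qi = 32, vdr = 2;

int main() {
    for (int warp_size : {8, 16, 32}) {
        int floor_div = vdr * warp_size / qi;            // old form: 0 when warp_size = 8
        int ceil_div  = (vdr * warp_size + qi - 1) / qi; // new form: never 0
        std::printf("warp_size=%2d  floor=%d  ceil=%d\n", warp_size, floor_div, ceil_div);
    }
}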
@@ -62,7 +60,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
 
     // partial sum for each thread
@@ -87,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -111,7 +109,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -135,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -159,7 +157,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -183,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -207,7 +205,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -231,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -255,7 +253,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -279,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -303,7 +301,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -327,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -351,7 +349,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -375,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -399,7 +397,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -423,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -448,7 +446,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -472,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
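All of these reduction hunks are the same edit: the XOR-butterfly that folds the per-lane partial sums now starts from WARP_SIZE / 2 instead of the old hard-coded constant, so the shuffle masks match the sub-group width the kernel is compiled for. After log2(sub-group size) permute steps every lane holds the full sum and lane 0 writes it back. A standalone sketch of the pattern in plain SYCL 2020, where sycl::permute_group_by_xor is the standard counterpart of dpct's permute_sub_group_by_xor; it assumes the device supports a sub-group size of 32:

#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    constexpr int sg_size = 32; // assumed supported by the device
    float * out = sycl::malloc_shared<float>(1, q);
    q.parallel_for(
        sycl::nd_range<1>(sg_size, sg_size),
        [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(sg_size)]] {
            auto sg = it.get_sub_group();
            float tmp = 1.0f; // each lane contributes 1
            // XOR-butterfly: masks sg_size/2, sg_size/4, ..., 1
            for (int mask = sg_size / 2; mask > 0; mask >>= 1) {
                tmp += sycl::permute_group_by_xor(sg, tmp, mask);
            }
            if (sg.get_local_id()[0] == 0) {
                out[0] = tmp; // 32.0f: the sum over all lanes
            }
        }).wait();
    sycl::free(out, q);
}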
@@ -489,7 +487,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -497,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -513,7 +511,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -521,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -537,7 +535,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -545,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -561,7 +559,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -569,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -585,7 +583,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -593,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -609,7 +607,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -617,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -633,7 +631,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -641,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -657,7 +655,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -665,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -681,7 +679,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -689,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -705,7 +703,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -713,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
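Every launcher above builds the same geometry: block_num_y work-groups of shape (1, GGML_SYCL_MMV_Y, WARP_SIZE), one GGML_SYCL_MMV_Y-row slice per group, with the sub-group width pinned by the reqd_sub_group_size attribute so the kernel's reduction masks match the hardware width. A stripped-down sketch of that launch shape, using the standard sycl:: spelling of the intel:: attribute in the patch and an illustrative stand-in value for GGML_SYCL_MMV_Y:

#include <sycl/sycl.hpp>

constexpr int MMV_Y = 1;      // stand-in for GGML_SYCL_MMV_Y
constexpr int WARP_SIZE = 32; // stand-in for ggml-sycl's WARP_SIZE

static void launch(sycl::queue & q, int nrows) {
    const int block_num_y = (nrows + MMV_Y - 1) / MMV_Y; // ceil(nrows / MMV_Y)
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, MMV_Y, WARP_SIZE);
    q.submit([&](sycl::handler & cgh) {
        cgh.parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1)
                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                    // Row index: group along dim 2, row slice along dim 1,
                    // exactly as mul_mat_vec_q computes it.
                    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                                    item_ct1.get_local_id(1);
                    (void) row; // the real kernels dispatch on `row` here
                });
    });
}

int main() {
    sycl::queue q;
    launch(q, 4096);
    q.wait();
}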
@@ -730,13 +728,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -751,13 +749,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler & cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -772,14 +770,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -794,14 +792,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -816,14 +814,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -838,14 +836,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -860,13 +858,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -881,14 +879,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_NL == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -903,14 +901,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
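Taken together, the patch replaces the hard-coded sub-group width with the configurable WARP_SIZE constant in every kernel body, launch geometry, and reqd_sub_group_size attribute, and generalizes mul_mat_vec_q's indexing so a sub-group narrower than a block's quant-index range still makes progress. Since reqd_sub_group_size takes a compile-time constant, the width is presumably still fixed per build target rather than selected at runtime.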