Alberto Cabrera Pérez committed
Commit 83e6f74 · 1 Parent(s): 2adc060

sycl : variable sg_size support for mmvq kernels (llama/12336)

Files changed (1)
  1. ggml/src/ggml-sycl/mmvq.cpp +75 -77
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -3,44 +3,42 @@
 #include <cassert>
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
-static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+                          const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
 
     if (row >= nrows) {
         return;
     }
 
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
-    assert(blocks_per_warp>0);
-
-    // partial sum for each thread
+    const int blocks_per_row = ncols / qk;
+    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; // Ensuring blocks_per_warp > 0
+
+    assert(blocks_per_warp > 0);
+
+    // partial sum for each thread
     float tmp = 0.0f;
 
     const block_q_t * x = (const block_q_t *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
-         i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row * blocks_per_row + i; // x block index
 
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
 
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
+            const int iqs = elem + vdr * (item_ct1.get_local_id(2) %
+                                          (qi / vdr)); // x block quant index when casting the quants to int
+
+            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        }
     }
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
 
     if (item_ct1.get_local_id(2) == 0) {
@@ -62,7 +60,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
 
     // partial sum for each thread
@@ -87,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -111,7 +109,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -135,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -159,7 +157,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -183,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -207,7 +205,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -231,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -255,7 +253,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -279,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -303,7 +301,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -327,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -351,7 +349,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -375,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -399,7 +397,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -423,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -448,7 +446,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
     }
 
     const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
     assert(blocks_per_warp>0);
     // partial sum for each thread
     float tmp = 0.0f;
@@ -472,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -489,7 +487,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -497,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                             VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -513,7 +511,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -521,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                             VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -537,7 +535,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -545,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                             VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -561,7 +559,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -569,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                             VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -585,7 +583,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -593,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                             VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -609,7 +607,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -617,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                             VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -633,7 +631,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -641,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                             VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -657,7 +655,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -665,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                             VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -681,7 +679,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -689,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                             VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -705,7 +703,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
@@ -713,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                             VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@@ -730,13 +728,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -751,13 +749,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler & cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -772,14 +770,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -794,14 +792,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -816,14 +814,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -838,14 +836,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -860,13 +858,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -881,14 +879,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK4_NL == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
@@ -903,14 +901,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
 
         stream->submit([&](sycl::handler &cgh) {
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
                     });
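
The core of the change is in mul_mat_vec_q above: blocks_per_warp is now a ceiling division against WARP_SIZE, and the quant positions of a block are walked by an inner elem loop that strides by the sub-group size, so the kernel no longer assumes the sub-group spans qi / vdr work-items. A minimal standalone sketch of that indexing follows (plain C++, with hypothetical example values qi = 32 and vdr = 1 chosen only to keep the check simple; it is not part of mmvq.cpp):

// Standalone illustration only: mimics the reworked iqs indexing in
// mul_mat_vec_q and checks that every quant index of one block is still
// visited when the sub-group is smaller than qi / vdr.
#include <cassert>
#include <set>

static std::set<int> visited_iqs(int qi, int vdr, int sub_group_size) {
    std::set<int> seen;
    for (int lane = 0; lane < sub_group_size; ++lane) {              // one pass per work-item in the sub-group
        for (int elem = 0; elem < qi / vdr; elem += sub_group_size) {
            seen.insert(elem + vdr * (lane % (qi / vdr)));           // the iqs handed to vec_dot_q_sycl
        }
    }
    return seen;
}

int main() {
    // hypothetical example: qi = 32 quant positions per block, vdr = 1,
    // checked across a few candidate sub-group sizes
    for (int sg : {8, 16, 32}) {
        assert((int) visited_iqs(32, 1, sg).size() == 32);           // all 32 positions covered
    }
    return 0;
}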