Chenguang Li noemotiovon hipudding committed
Commit ffa5f14 · 1 Parent(s): b00a8a9

get_rows and dup optimization (llama/12671)

* [CANN]get_rows and dup optimization.

Co-authored-by: hipudding <huafengchun@gmail.com>
Signed-off-by: noemotiovon <noemotiovon@gmail.com>

* [CANN]GET_ROWS and CPY/DUP optimization

Co-authored-by: hipudding <huafengchun@gmail.com>
Signed-off-by: noemotiovon <noemotiovon@gmail.com>

* [CANN]code style adjustment

Signed-off-by: noemotiovon <noemotiovon@gmail.com>

* [CANN]code style adjustment

Signed-off-by: noemotiovon <noemotiovon@gmail.com>

* [CANN]code style adjustment

Signed-off-by: noemotiovon <noemotiovon@gmail.com>

* [CANN]code style adjustment

Signed-off-by: noemotiovon <noemotiovon@gmail.com>

---------

Signed-off-by: noemotiovon <noemotiovon@gmail.com>
Co-authored-by: noemotiovon <noemotiovon@gmail.com>
Co-authored-by: hipudding <huafengchun@gmail.com>
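
For context, both optimizations drop the hand-written AscendC kernels in favor of stock ACLNN operators: CPY/DUP now goes through aclnnCast (plus a device-to-device aclrtMemcpyAsync for contiguous copies), and GET_ROWS goes through aclnnEmbedding, with Q8_0 weights dequantized by a broadcast multiply first. Every ACLNN operator is driven by the same two-phase pattern: query the workspace size, allocate scratch memory if needed, then launch on the stream. The sketch below illustrates that pattern outside of ggml for aclnnCast only; it assumes a CANN toolkit is installed, that acl_src and acl_dst are already-created aclTensor handles over device buffers, and that the aclnn_cast.h header name matches your toolkit layout (error-code checking is omitted for brevity).

// Sketch only: the generic "get workspace size, then execute" ACLNN call
// pattern that this change relies on, shown here for aclnnCast.
#include <acl/acl.h>
#include <aclnnop/aclnn_cast.h>

// Casts acl_src into acl_dst (interpreted as dst_type) on the given stream.
// Assumes ACL is initialized and both tensors describe device memory.
static void cast_on_stream(aclTensor* acl_src, aclTensor* acl_dst,
                           aclDataType dst_type, aclrtStream stream) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;

    // Phase 1: ask the operator how much scratch memory it needs.
    aclnnCastGetWorkspaceSize(acl_src, dst_type, acl_dst,
                              &workspaceSize, &executor);

    // Phase 2: allocate the workspace (ggml uses its pool allocator here)
    // and enqueue the operator on the stream.
    void* workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
    }
    aclnnCast(workspaceAddr, workspaceSize, executor, stream);

    // Free the scratch buffer once the stream has drained.
    aclrtSynchronizeStream(stream);
    if (workspaceAddr != nullptr) {
        aclrtFree(workspaceAddr);
    }
}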

ggml/src/ggml-cann/CMakeLists.txt CHANGED
@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
         ${CANN_INSTALL_DIR}/acllib/include
     )
 
-    add_subdirectory(kernels)
     list(APPEND CANN_LIBRARIES
         ascendcl
         nnopbase
         opapi
         acl_op_compiler
-        ascendc_kernels
     )
 
     file(GLOB GGML_SOURCES_CANN "*.cpp")
ggml/src/ggml-cann/aclnn_ops.cpp CHANGED
@@ -30,6 +30,7 @@
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_embedding.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
@@ -58,7 +59,6 @@
 #include <vector>
 
 #include "ggml-impl.h"
-#include "kernels/ascendc_kernels.h"
 
 #define GGML_COMMON_DECL_C
 
@@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclDestroyIntArray(repeats));
 }
 
+/**
+ * @brief Casts the elements of a tensor to a specified data type using the CANN backend.
+ *
+ * @details This function performs a type conversion on the elements of the input tensor `acl_src`
+ *          and stores the results in the destination tensor `acl_dst`. The conversion type is
+ *          determined based on the `dst` tensor's data type.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be cast.
+ * @param acl_dst The destination tensor that will store the cast elements.
+ * @param dst The ggml tensor specifying the target data type.
+ */
+static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst, ggml_tensor* dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
+                                        ggml_cann_type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
     GGML_ASSERT(ggml_can_repeat(src, dst));
@@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 }
 
 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+    ggml_tensor* src0 = dst->src[0];
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-    ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src->extra = src_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-
-    if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
-        ggml_are_same_shape(src, dst)) {
-        cann_copy(ctx, acl_src, acl_dst);
-        ACL_CHECK(aclDestroyTensor(acl_src));
-        ACL_CHECK(aclDestroyTensor(acl_dst));
-        return;
-    }
-    // TODO: simplify
-    if (src->type == GGML_TYPE_F16) {
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f16_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-
-                    aclrtlaunch_ascendc_dup_by_rows_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else if (src->type == GGML_TYPE_F32) {
-        // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
-        //   && nb0 == type_size)
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f32_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
+    if (ggml_are_same_shape(src0, dst)) {
+        if (dst->type == src0->type) {
+            cann_copy(ctx, acl_src, acl_dst);
+        } else {
+            aclnn_cast(ctx, acl_src, acl_dst, dst);
         }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
+    } else {
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+            if (dst->type == src0->type) {
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src0->data, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
                 return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
             } else {
-                // TODO: dst not contiguous
-                GGML_ABORT("fatal error");
-            }
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(),
+                    ggml_nelements(dst) * ggml_type_size(dst->type));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = ggml_type_size(dst->type);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                    GGML_MAX_DIMS);
+
+                aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src_trans_buffer, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                ACL_CHECK(aclDestroyTensor(src_trans_tensor));
                 return;
             }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
+        } else if (ggml_is_contiguous(dst)) {
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = ggml_type_size(dst->type);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
             }
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else {
-        if (ggml_are_same_shape(src, dst)) {
-            cann_copy(ctx, acl_src, acl_dst);
-            ACL_CHECK(aclDestroyTensor(acl_src));
-            ACL_CHECK(aclDestroyTensor(acl_dst));
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                GGML_MAX_DIMS);
+
+            aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+
+            size_t cpy_size = ggml_nbytes(dst);
+            ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
+                                       cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                       ctx.stream()));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
             return;
+        } else {
+            GGML_ABORT("Unsupported: dst is not contiguous.");
         }
-        GGML_ABORT("fatal error");
     }
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 
 #ifdef __cplusplus
@@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
 }
 
-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+/**
+ * @brief Performs the embedding operation on a 4D tensor using the CANN backend.
+ *
+ * This function extracts slices from the source tensor (`src_buffer`),
+ * index tensor (`index`), and destination tensor (`dst`), and performs an
+ * embedding operation on them. The embedding operation is applied by iterating
+ * over the last two dimensions of the source tensor, creating the necessary
+ * tensors for the source, index, and output, and executing the embedding operation.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer holding the data for the source tensor.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param index The index tensor used in the embedding operation.
+ * @param dst The destination tensor where the result will be stored.
+ */
+static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
+                               int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
+                               ggml_tensor* dst) {
+    for (int64_t i = 0; i < src_ne[3]; i++) {
+        for (int64_t j = 0; j < src_ne[2]; j++) {
+            // src
+            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
+            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
+            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
+                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_src_ne, acl_src_nb, 2);
+
+            // index
+            int64_t acl_index_ne[1] = {index->ne[0]};
+            size_t acl_index_nb[1] = {index->nb[0]};
+            aclTensor* acl_index = ggml_cann_create_tensor(
+                (char*)index->data + i * index->nb[2] + j * index->nb[1],
+                ggml_cann_type_mapping(index->type), ggml_element_size(index),
+                acl_index_ne, acl_index_nb, 1);
+
+            // out
+            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
+            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
+            aclTensor* acl_out = ggml_cann_create_tensor(
+                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_out_ne, acl_out_nb, 2);
+
+            uint64_t workspaceSize = 0;
+            aclOpExecutor* executor;
+            void* workspaceAddr = nullptr;
+
+            ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
+                acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
+
+            if (workspaceSize > 0) {
+                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
+                                                         workspaceSize);
+                workspaceAddr = workspace_allocator.get();
+            }
+
+            ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+            ACL_CHECK(aclDestroyTensor(acl_src_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_index));
+            ACL_CHECK(aclDestroyTensor(acl_out));
+        }
+    }
+}
 
-    ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src0->extra = src0_extra_allocator.get();
-    src1->extra = src1_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];  // src
+    ggml_tensor* src1 = dst->src[1];  // index
 
     switch (src0->type) {
         case GGML_TYPE_F32: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f32 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
-                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
-            }
-#endif
-            aclrtlaunch_ascendc_get_row_f32(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
+                               dst);
             break;
         }
         case GGML_TYPE_F16: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f16 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len =
-                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
-                    ggml_type_size(
-                        GGML_TYPE_F32); // out is also f32, even input is f16
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
             }
-#endif
-            aclrtlaunch_ascendc_get_row_f16(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
+                src0->ne, src_trans_nb, GGML_MAX_DIMS);
+            aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
+            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
+                               src_trans_nb, src1, dst);
+            ACL_CHECK(aclDestroyTensor(acl_src0));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
             break;
         }
-        case GGML_TYPE_Q4_0:
-            aclrtlaunch_ascendc_get_row_q4_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
-            break;
-        case GGML_TYPE_Q8_0:
-            aclrtlaunch_ascendc_get_row_q8_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+        case GGML_TYPE_Q8_0: {
+            // add 1 dim for bcast mul.
+            size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
+                dequant_nb[GGML_MAX_DIMS + 1];
+            int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
+                *dequant_ne;
+            int64_t scale_offset = 0;
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            weight_ne[0] = QK8_0;
+            weight_ne[1] = src0->ne[0] / QK8_0;
+            weight_nb[0] = sizeof(int8_t);
+            weight_nb[1] = weight_nb[0] * weight_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                weight_ne[i] = src0->ne[i - 1];
+                weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,1]
+            scale_ne[0] = 1;
+            scale_ne[1] = src0->ne[0] / QK8_0;
+            scale_nb[0] = sizeof(uint16_t);
+            scale_nb[1] = scale_nb[0] * scale_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                scale_ne[i] = src0->ne[i - 1];
+                scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            dequant_ne = weight_ne;
+            dequant_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+            }
+
+            scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+            ggml_cann_pool_alloc dequant_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+
+            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
+                GGML_MAX_DIMS + 1);
+            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
+                GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+            aclTensor* dequant_tensor = ggml_cann_create_tensor(
+                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
+                dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+
+            aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
+            dequant_nb[0] = sizeof(float_t);
+            dequant_ne = src0->ne;
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+            }
+
+            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
+                               dequant_ne, dequant_nb, src1, dst);
+
+            ACL_CHECK(aclDestroyTensor(dequant_tensor));
             break;
+        }
         default:
-            GGML_ABORT("fatal error");
+            GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
             break;
     }
 }
@@ -2797,8 +2812,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
     ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
         acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-        nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
-        &workspaceSize, &executor));
+        nullptr, nullptr, nullptr, antiquantGroupSize,
+        acl_output_tensor, &workspaceSize, &executor));
    if (workspaceAddr == nullptr) {
        workspaceAddr = workspace_allocator.alloc(workspaceSize);
    }
ggml/src/ggml-cann/ggml-cann.cpp CHANGED
@@ -1704,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q8_0:
                     return true;
                 default:
@@ -1712,16 +1711,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         } break;
         case GGML_OP_CPY: {
-            switch (op->type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q4_0:
-                    return true;
-                default:
-                    return false;
+            ggml_tensor *src = op->src[0];
+            if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
+                (src->type != GGML_TYPE_F32 &&
+                 src->type != GGML_TYPE_F16)) {
+                // only support F32 and F16.
+                return false;
             }
-        }
+
+            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
+                // unsupported: dst is not contiguous.
+                return false;
+            }
+
+            return true;
+        } break;
         case GGML_OP_CONT: {
             // TODO: support GGML_TYPE_BF16
             switch (op->src[0]->type) {
@@ -1762,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
             return true;
         }
+        case GGML_OP_DUP:
         case GGML_OP_IM2COL:
         case GGML_OP_CONCAT:
-        case GGML_OP_DUP:
         case GGML_OP_REPEAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE: