get_rows and dup optimization (llama/12671)
* [CANN]get_rows and dup optimization.
Co-authored-by: hipudding <huafengchun@gmail.com>
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
* [CANN]GET_ROWS and CPY/DUP optimization
Co-authored-by: hipudding <huafengchun@gmail.com>
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
* [CANN]code style adjustment
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
* [CANN]code style adjustment
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
* [CANN]code style adjustment
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
* [CANN]code style adjustment
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
---------
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
Co-authored-by: noemotiovon <noemotiovon@gmail.com>
Co-authored-by: hipudding <huafengchun@gmail.com>
- ggml/src/ggml-cann/CMakeLists.txt +0 -2
- ggml/src/ggml-cann/aclnn_ops.cpp +241 -226
- ggml/src/ggml-cann/ggml-cann.cpp +15 -11
ggml/src/ggml-cann/CMakeLists.txt

@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
         ${CANN_INSTALL_DIR}/acllib/include
     )
 
-    add_subdirectory(kernels)
     list(APPEND CANN_LIBRARIES
         ascendcl
         nnopbase
         opapi
         acl_op_compiler
-        ascendc_kernels
     )
 
     file(GLOB GGML_SOURCES_CANN "*.cpp")
ggml/src/ggml-cann/aclnn_ops.cpp

@@ -30,6 +30,7 @@
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_embedding.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>

@@ -58,7 +59,6 @@
 #include <vector>
 
 #include "ggml-impl.h"
-#include "kernels/ascendc_kernels.h"
 
 #define GGML_COMMON_DECL_C
 
@@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ACL_CHECK(aclDestroyIntArray(repeats));
 }
 
+/**
+ * @brief Casts the elements of a tensor to a specified data type using the CANN backend.
+ *
+ * @details This function performs a type conversion on the elements of the input tensor `acl_src`
+ *          and stores the results in the destination tensor `acl_dst`. The conversion type is
+ *          determined based on the `dst` tensor's data type.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be cast.
+ * @param acl_dst The destination tensor that will store the casted elements.
+ * @param dst The ggml tensor specifying the target data type.
+ */
+static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst, ggml_tensor* dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
+                                        ggml_cann_type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
     GGML_ASSERT(ggml_can_repeat(src, dst));
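Note: every aclnn-based helper in this file, `aclnn_cast` included, follows the same two-phase CANN calling convention: plan the operator and query its workspace size, allocate scratch from the backend pool, then enqueue the operator on the context's stream. Below is a minimal sketch of that convention, not part of the commit; `run_aclnn_op`, `GetSizeFn`, and `ExecFn` are illustrative names, not CANN API.

// Generic shape of an aclnn operator launch (sketch, assumptions flagged).
// get_size binds the operator's tensors and forwards to the matching
// aclnnXxxGetWorkspaceSize; exec is the corresponding aclnnXxx entry point.
template <typename GetSizeFn, typename ExecFn>
static void run_aclnn_op(ggml_backend_cann_context& ctx,
                         GetSizeFn get_size, ExecFn exec) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    void* workspaceAddr = nullptr;

    // Phase 1: build the executor and learn the scratch requirement.
    ACL_CHECK(get_size(&workspaceSize, &executor));

    // Phase 2: grab scratch from the pool and enqueue on the stream.
    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
    if (workspaceSize > 0) {
        workspaceAddr = workspace_allocator.alloc(workspaceSize);
    }
    ACL_CHECK(exec(workspaceAddr, workspaceSize, executor, ctx.stream()));
}

// Usage against aclnnCast, mirroring aclnn_cast above:
//   run_aclnn_op(ctx,
//       [&](uint64_t* ws, aclOpExecutor** ex) {
//           return aclnnCastGetWorkspaceSize(
//               acl_src, ggml_cann_type_mapping(dst->type), acl_dst, ws, ex);
//       },
//       aclnnCast);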
@@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 }
 
 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    ...
-    ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-
-    if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
-        ggml_are_same_shape(src, dst)) {
-        cann_copy(ctx, acl_src, acl_dst);
-        ACL_CHECK(aclDestroyTensor(acl_src));
-        ACL_CHECK(aclDestroyTensor(acl_dst));
-        return;
-    }
-    // TODO: simplify
-    if (src->type == GGML_TYPE_F16) {
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f16_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-
-                    aclrtlaunch_ascendc_dup_by_rows_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else if (src->type == GGML_TYPE_F32) {
-        // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
-        //           && nb0 == type_size)
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f32_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        ...
-            ACL_CHECK(
-            ...
-            return;
-        }
-        if (ggml_is_contiguous(dst)) {
-            const size_t src_type_size = ggml_type_size(src->type);
-            if (src->nb[0] == src_type_size) {
-                // src0 is contigous on first dimension, copy by rows
-                int64_t rows_num = ggml_nrows(src);
-                aclrtlaunch_ascendc_dup_by_rows_fp32(
-                    rows_num, ctx.stream(), src->data, dst->data,
-                    ((ggml_tensor*)src->extra)->ne,
-                    ((ggml_tensor*)src->extra)->nb,
-                    ((ggml_tensor*)dst->extra)->ne,
-                    ((ggml_tensor*)dst->extra)->nb);
-                return;
-            }
-            GGML_ABORT("fatal error");
-        } else {
-            ...
-            return;
-        }
-        ...
-                ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
-            return;
-        }
-        GGML_ABORT("fatal error");
-        ...
-        return;
-    }
-    GGML_ABORT("fatal error");
-}
+    ggml_tensor* src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    if (ggml_are_same_shape(src0, dst)) {
+        if (dst->type == src0->type) {
+            cann_copy(ctx, acl_src, acl_dst);
+        } else {
+            aclnn_cast(ctx, acl_src, acl_dst, dst);
+        }
+    } else {
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+            if (dst->type == src0->type) {
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src0->data, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                return;
+            } else {
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(),
+                    ggml_nelements(dst) * ggml_type_size(dst->type));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = ggml_type_size(dst->type);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                    GGML_MAX_DIMS);
+
+                aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src_trans_buffer, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+                return;
+            }
+        } else if (ggml_is_contiguous(dst)) {
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = ggml_type_size(dst->type);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+            }
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                GGML_MAX_DIMS);
+
+            aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+
+            size_t cpy_size = ggml_nbytes(dst);
+            ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
+                                       cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                       ctx.stream()));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+            return;
+        } else {
+            GGML_ABORT("Unsupport dst is not tontiguous.");
+        }
+    }
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
 
 #ifdef __cplusplus
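Note: taken together, the rewrite replaces the per-type AscendC kernel launches with a small decision tree over shape, type, and contiguity. A sketch restating the dispatch of the hunk above; the enum and helper name are illustrative, not part of the commit.

// Decision tree of the new CPY/DUP path, mirroring the hunk above.
enum class dup_path {
    ELEMWISE_COPY,     // same shape, same type: cann_copy handles strides
    ELEMWISE_CAST,     // same shape, different type: aclnn_cast handles strides
    FLAT_MEMCPY,       // pure reshape: bytes identical, one D2D memcpy
    CAST_THEN_MEMCPY,  // cast into contiguous scratch, then memcpy into dst
    UNSUPPORTED        // non-contiguous dst aborts
};

static dup_path classify_dup(const ggml_tensor* src, const ggml_tensor* dst) {
    if (ggml_are_same_shape(src, dst)) {
        return dst->type == src->type ? dup_path::ELEMWISE_COPY
                                      : dup_path::ELEMWISE_CAST;
    }
    if (ggml_is_contiguous(src) && ggml_is_contiguous(dst) &&
        dst->type == src->type) {
        return dup_path::FLAT_MEMCPY;
    }
    if (ggml_is_contiguous(dst)) {
        return dup_path::CAST_THEN_MEMCPY;
    }
    return dup_path::UNSUPPORTED;
}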
@@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
 }
 
+/**
+ * @brief Performs embedding operation on a 4D tensor using the CANN backend.
+ *
+ * This function extracts slices from the source tensor (`src_buffer`),
+ * index tensor (`index`), and destination tensor (`dst`), and performs an
+ * embedding operation on them. The embedding operation is applied by iterating
+ * over the last two dimensions of the source tensor, creating the necessary
+ * tensors for the source, index, and output, and executing the embedding operation.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer holding the data for the source tensor.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param index The index tensor used in the embedding operation.
+ * @param dst The destination tensor where the result will be stored.
+ */
+static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
+                               int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
+                               ggml_tensor* dst) {
+    for (int64_t i = 0; i < src_ne[3]; i++) {
+        for (int64_t j = 0; j < src_ne[2]; j++) {
+            // src
+            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
+            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
+            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
+                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_src_ne, acl_src_nb, 2);
+
+            // index
+            int64_t acl_index_ne[1] = {index->ne[0]};
+            size_t acl_index_nb[1] = {index->nb[0]};
+            aclTensor* acl_index = ggml_cann_create_tensor(
+                (char*)index->data + i * index->nb[2] + j * index->nb[1],
+                ggml_cann_type_mapping(index->type), ggml_element_size(index),
+                acl_index_ne, acl_index_nb, 1);
+
+            // out
+            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
+            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
+            aclTensor* acl_out = ggml_cann_create_tensor(
+                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_out_ne, acl_out_nb, 2);
+
+            uint64_t workspaceSize = 0;
+            aclOpExecutor* executor;
+            void* workspaceAddr = nullptr;
+
+            ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
+                acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
+
+            if (workspaceSize > 0) {
+                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
+                                                         workspaceSize);
+                workspaceAddr = workspace_allocator.get();
+            }
+
+            ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+            ACL_CHECK(aclDestroyTensor(acl_src_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_index));
+            ACL_CHECK(aclDestroyTensor(acl_out));
+        }
+    }
+}
 
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];  // src
+    ggml_tensor* src1 = dst->src[1];  // index
-    ...
-    src0->extra = src0_extra_allocator.get();
-    src1->extra = src1_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
 
     switch (src0->type) {
         case GGML_TYPE_F32: {
-            ...
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
-                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
-            }
-#endif
-            aclrtlaunch_ascendc_get_row_f32(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
+                               dst);
             break;
         }
         case GGML_TYPE_F16: {
-            ...
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
             }
-            ...
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+            }
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
+                src0->ne, src_trans_nb, GGML_MAX_DIMS);
+            aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
+            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
+                               src_trans_nb, src1, dst);
+            ACL_CHECK(aclDestroyTensor(acl_src0));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
             break;
         }
-        case ...
-            ...
+        case GGML_TYPE_Q8_0: {
+            // add 1 dim for bcast mul.
+            size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
+                dequant_nb[GGML_MAX_DIMS + 1];
+            int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
+                *dequant_ne;
+            int64_t scale_offset = 0;
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            weight_ne[0] = QK8_0;
+            weight_ne[1] = src0->ne[0] / QK8_0;
+            weight_nb[0] = sizeof(int8_t);
+            weight_nb[1] = weight_nb[0] * weight_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                weight_ne[i] = src0->ne[i - 1];
+                weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,1]
+            scale_ne[0] = 1;
+            scale_ne[1] = src0->ne[0] / QK8_0;
+            scale_nb[0] = sizeof(uint16_t);
+            scale_nb[1] = scale_nb[0] * scale_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                scale_ne[i] = src0->ne[i - 1];
+                scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            dequant_ne = weight_ne;
+            dequant_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+            }
+
+            scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+            ggml_cann_pool_alloc dequant_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+
+            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
+                GGML_MAX_DIMS + 1);
+            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
+                GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+            aclTensor* dequant_tensor = ggml_cann_create_tensor(
+                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
+                dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+
+            aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
+            dequant_nb[0] = sizeof(float_t);
+            dequant_ne = src0->ne;
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+            }
+
+            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
+                               dequant_ne, dequant_nb, src1, dst);
+
+            ACL_CHECK(aclDestroyTensor(dequant_tensor));
             break;
+        }
         default:
-            GGML_ABORT("...
+            GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
             break;
     }
 }
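Note: in the Q8_0 branch, dequantization is expressed as a broadcast multiply: the quantized payload is viewed as int8 weights of shape [..., ne0/QK8_0, QK8_0] and fp16 scales of shape [..., ne0/QK8_0, 1] stored at scale_offset behind the weights, and aclnn_mul broadcasts each scale across its block. Below is a CPU reference of the same arithmetic for a single row, assuming that split weights-then-scales layout; dequant_row_q8_0_split is an illustrative name, not from the commit.

#include <cstdint>
#include "ggml.h"  // ggml_fp16_t, ggml_fp16_to_fp32

#ifndef QK8_0
#define QK8_0 32   // Q8_0 block size, as defined in ggml-common.h
#endif

// Reference for the broadcast-multiply dequantization used above: each block
// of QK8_0 int8 values shares one fp16 scale, so out[b][k] = d[b] * qs[b][k].
static void dequant_row_q8_0_split(const int8_t* qs,      // n int8 weights
                                   const ggml_fp16_t* d,  // n / QK8_0 scales
                                   float* out, int64_t n) {
    const int64_t nblocks = n / QK8_0;
    for (int64_t b = 0; b < nblocks; b++) {
        const float scale = ggml_fp16_to_fp32(d[b]);  // scale view: [nblocks, 1]
        for (int64_t k = 0; k < QK8_0; k++) {         // weight view: [nblocks, QK8_0]
            out[b * QK8_0 + k] = scale * (float)qs[b * QK8_0 + k];
        }
    }
}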
@@ -2797,8 +2812,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
     ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
         acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-        nullptr, nullptr, nullptr, antiquantGroupSize,
-        &workspaceSize, &executor));
+        nullptr, nullptr, nullptr, antiquantGroupSize,
+        acl_output_tensor, &workspaceSize, &executor));
     if (workspaceAddr == nullptr) {
         workspaceAddr = workspace_allocator.alloc(workspaceSize);
     }
ggml/src/ggml-cann/ggml-cann.cpp

@@ -1704,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q8_0:
                     return true;
                 default:
@@ -1712,16 +1711,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         } break;
         case GGML_OP_CPY: {
-            ...
-                default:
-                    return false;
-            }
-            ...
+            ggml_tensor *src = op->src[0];
+            if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
+                (src->type != GGML_TYPE_F32 &&
+                 src->type != GGML_TYPE_F16)) {
+                // only support F32 and F16.
+                return false;
+            }
+
+            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
+                // unsupport dst is not contiguous.
+                return false;
+            }
+
+            return true;
+        } break;
         case GGML_OP_CONT: {
             // TODO: support GGML_TYPE_BF16
             switch (op->src[0]->type) {
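Note: the new CPY predicate reduces to two checks: both endpoint types must be F32 or F16, and a shape-changing copy must write into a contiguous destination. A standalone restatement of the same rule follows; the helper name is illustrative, not from the commit.

// Same acceptance rule as the GGML_OP_CPY case above, as a free function.
static bool cann_supports_cpy(const ggml_tensor* op /* dst of the copy */) {
    const ggml_tensor* src = op->src[0];
    const bool types_ok =
        (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
        (src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
    if (!types_ok) {
        return false;  // only F32 and F16 are handled
    }
    // A copy that changes shape must target a contiguous dst buffer.
    return ggml_are_same_shape(op, src) || ggml_is_contiguous(op);
}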
@@ -1762,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
             return true;
         }
+        case GGML_OP_DUP:
         case GGML_OP_IM2COL:
         case GGML_OP_CONCAT:
-        case GGML_OP_DUP:
         case GGML_OP_REPEAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE: