Spaces:
Sleeping
Sleeping
Chenguang Li
committed on
Commit
·
d8d5b0b
1
Parent(s):
bf678f0
CANN: update aclnnGroupedMatmulV2 to aclnnGroupedMatmulV3 (llama/14411)
Browse files
* [CANN]update to aclnnGroupedMatmulV2
Signed-off-by: noemotiovon <[email protected]>
* Support MUL_MAT_ID on 310p
Signed-off-by: noemotiovon <[email protected]>
* fix editorconfig
Signed-off-by: noemotiovon <[email protected]>
---------
Signed-off-by: noemotiovon <[email protected]>
ggml/src/ggml-cann/aclnn_ops.cpp
CHANGED
|
@@ -65,7 +65,7 @@
|
|
| 65 |
#include <aclnnop/aclnn_eq_tensor.h>
|
| 66 |
#include <aclnnop/aclnn_gt_scalar.h>
|
| 67 |
#include <aclnnop/aclnn_pow.h>
|
| 68 |
-
#include <aclnnop/
|
| 69 |
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
| 70 |
#include <float.h>
|
| 71 |
|
|
@@ -2654,6 +2654,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
|
| 2654 |
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
|
| 2655 |
}
|
| 2656 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2657 |
std::vector<aclTensor*> src0_tensor_vec;
|
| 2658 |
std::vector<aclTensor*> src1_tensor_vec;
|
| 2659 |
std::vector<aclTensor*> dst_tensor_vec;
|
|
@@ -2701,9 +2762,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
|
| 2701 |
}
|
| 2702 |
|
| 2703 |
size_t GROUP_SIZE = 128;
|
| 2704 |
-
//
|
| 2705 |
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
| 2706 |
-
// split and call
|
| 2707 |
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
| 2708 |
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
| 2709 |
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
|
@@ -2713,7 +2774,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
|
| 2713 |
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
|
| 2714 |
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
|
| 2715 |
|
| 2716 |
-
GGML_CANN_CALL_ACLNN_OP(ctx,
|
| 2717 |
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
|
| 2718 |
|
| 2719 |
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
|
|
|
|
| 65 |
#include <aclnnop/aclnn_eq_tensor.h>
|
| 66 |
#include <aclnnop/aclnn_gt_scalar.h>
|
| 67 |
#include <aclnnop/aclnn_pow.h>
|
| 68 |
+
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
| 69 |
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
| 70 |
#include <float.h>
|
| 71 |
|
|
|
|
| 2654 |
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
|
| 2655 |
}
|
| 2656 |
|
| 2657 |
+
#ifdef ASCEND_310P
|
| 2658 |
+
ggml_tensor src0_row = *src0;
|
| 2659 |
+
ggml_tensor src1_row = *src1;
|
| 2660 |
+
ggml_tensor dst_row = *dst;
|
| 2661 |
+
|
| 2662 |
+
if (src0->type == GGML_TYPE_F16) {
|
| 2663 |
+
src0_row.type = GGML_TYPE_F32;
|
| 2664 |
+
}
|
| 2665 |
+
|
| 2666 |
+
// src0_row [D, M, 1, 1] weight without permute
|
| 2667 |
+
src0_row.ne[2] = 1;
|
| 2668 |
+
src0_row.ne[3] = 1;
|
| 2669 |
+
src0_row.nb[0] = ori_src0_nb[0];
|
| 2670 |
+
src0_row.nb[1] = ori_src0_nb[1];
|
| 2671 |
+
src0_row.nb[2] = ori_src0_nb[1];
|
| 2672 |
+
src0_row.nb[3] = ori_src0_nb[1];
|
| 2673 |
+
|
| 2674 |
+
// src1_row [D, 1, 1, 1] -> input
|
| 2675 |
+
src1_row.ne[1] = 1;
|
| 2676 |
+
src1_row.ne[2] = 1;
|
| 2677 |
+
src1_row.ne[3] = 1;
|
| 2678 |
+
src1_row.nb[2] = nb11;
|
| 2679 |
+
src1_row.nb[3] = nb11;
|
| 2680 |
+
|
| 2681 |
+
// dst_row [M, 1, 1, 1] -> out
|
| 2682 |
+
dst_row.ne[1] = 1;
|
| 2683 |
+
dst_row.ne[2] = 1;
|
| 2684 |
+
dst_row.ne[3] = 1;
|
| 2685 |
+
dst_row.nb[2] = nb1;
|
| 2686 |
+
dst_row.nb[3] = nb1;
|
| 2687 |
+
|
| 2688 |
+
//create weight for one row
|
| 2689 |
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
| 2690 |
+
for (int64_t id = 0; id < n_ids; id++) {
|
| 2691 |
+
// expert index
|
| 2692 |
+
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
| 2693 |
+
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
| 2694 |
+
|
| 2695 |
+
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
| 2696 |
+
int64_t i11 = (ne11 == 1 ? 0 : id);
|
| 2697 |
+
int64_t i12 = iid1;
|
| 2698 |
+
|
| 2699 |
+
int64_t i1 = id;
|
| 2700 |
+
int64_t i2 = i12;
|
| 2701 |
+
|
| 2702 |
+
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
| 2703 |
+
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
| 2704 |
+
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
| 2705 |
+
|
| 2706 |
+
src0_row.data = src0_tmp_ptr;
|
| 2707 |
+
src1_row.data = src1_tmp_ptr;
|
| 2708 |
+
dst_row.data = dst_tmp_ptr;
|
| 2709 |
+
dst_row.src[0] = &src0_row;
|
| 2710 |
+
dst_row.src[1] = &src1_row;
|
| 2711 |
+
|
| 2712 |
+
ggml_cann_mul_mat(ctx, &dst_row);
|
| 2713 |
+
}
|
| 2714 |
+
}
|
| 2715 |
+
return;
|
| 2716 |
+
#endif
|
| 2717 |
+
|
| 2718 |
std::vector<aclTensor*> src0_tensor_vec;
|
| 2719 |
std::vector<aclTensor*> src1_tensor_vec;
|
| 2720 |
std::vector<aclTensor*> dst_tensor_vec;
|
|
|
|
| 2762 |
}
|
| 2763 |
|
| 2764 |
size_t GROUP_SIZE = 128;
|
| 2765 |
+
// GroupedMatmulV3 requires tensor_list.size <= 128
|
| 2766 |
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
| 2767 |
+
// split and call GroupedMatmulV3
|
| 2768 |
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
| 2769 |
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
| 2770 |
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
|
|
|
| 2774 |
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
|
| 2775 |
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
|
| 2776 |
|
| 2777 |
+
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
|
| 2778 |
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
|
| 2779 |
|
| 2780 |
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
|