move BLAS to a separate backend (llama/6210)
* move BLAS to a separate backend
* rename GGML_USE_OPENBLAS to GGML_USE_BLAS
* alloc : reuse same buffer when the same buffer type is used multiple times
* set number of threads automatically for openblas and blis
* sched : print assignments when GGML_SCHED_DEBUG env variable is set
* sched : allow ops with weights on an incompatible buffer type
This will cause the weight to be copied to a backend that supports the
op, which is very costly. The weight should have been stored in a buffer
of a backend that can run the op, but llama.cpp cannot do this
automatically at the moment.
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml-alloc.c +77 -21
- ggml-backend-impl.h +20 -8
- ggml-backend.c +178 -64
- ggml-backend.h +3 -3
- ggml-cuda.cu +24 -20
- ggml-kompute.cpp +7 -6
- ggml-metal.m +8 -7
- ggml-rpc.cpp +11 -10
- ggml-sycl.cpp +10 -18
- ggml-vulkan.cpp +13 -13
- ggml.c +22 -183
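The incompatible-buffer-type behaviour described in the commit message is driven by the new ggml_backend_supports_buft() API added in this change (declared in ggml-backend.h, implemented per backend via the supports_buft callback). Below is a minimal, hedged sketch of how calling code could use it to tell whether a weight can be used in place or will have to be copied; the helper name and the weight tensor are illustrative, not part of the commit.

    #include "ggml.h"
    #include "ggml-backend.h"

    // Sketch only: returns true if `backend` can read `weight` directly from the
    // buffer it is already allocated in, false if the scheduler would have to copy it.
    static bool weight_usable_in_place(ggml_backend_t backend, const struct ggml_tensor * weight) {
        if (weight->buffer == NULL) {
            return false; // not allocated yet
        }
        // new in this commit: the question is asked of the backend
        // (ggml_backend_supports_buft) instead of the buffer type
        // (the removed ggml_backend_buft_supports_backend)
        return ggml_backend_supports_buft(backend, weight->buffer->buft);
    }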
ggml-alloc.c
CHANGED

@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
+            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
+                ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
 
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                 AT_PRINTF("view_src %s: %d children, %d views\n",
                     view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    ggml_gallocr_free_node(galloc, view_src);
                 }
             }
             else if (p_hn->allocated) {
-                ggml_gallocr_free_node(galloc, parent, buffer_id);
+                ggml_gallocr_free_node(galloc, parent);
            }
        }
        AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
         }
     }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
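A small usage sketch of the allocator reuse introduced above, assuming two allocator slots that happen to share one host buffer type (the CPU buffer type stands in for any such pair, for example CPU and the new BLAS backend); the wrapper function is illustrative only.

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static void example_shared_gallocr(void) {
        // two slots that use the same buffer type...
        ggml_backend_buffer_type_t bufts[2] = {
            ggml_backend_cpu_buffer_type(),
            ggml_backend_cpu_buffer_type(),
        };
        // ...now share a single dynamic allocator and, once a graph is reserved, a single buffer
        ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);

        // ggml_gallocr_get_buffer_size() reports the shared buffer only for the
        // first id that uses it, so summing over all ids does not double count
        size_t total = ggml_gallocr_get_buffer_size(galloc, 0)
                     + ggml_gallocr_get_buffer_size(galloc, 1);
        (void) total;

        ggml_gallocr_free(galloc);
    }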
ggml-backend-impl.h
CHANGED

@@ -17,13 +17,15 @@ extern "C" {
 
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
     };
 
@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // check if the backend …
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // check if the backend …
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
         void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
    };
 
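For an out-of-tree backend, the interface change amounts to dropping supports_backend from the buffer-type vtable and supplying supports_buft (and optionally graph_plan_update) in ggml_backend_i instead. A hedged sketch for a hypothetical backend whose buffers are host-accessible; ggml_backend_mybackend_buffer_type() is an assumed helper, not part of ggml.

    // sketch only: accept our own buffer type as well as any host buffer,
    // mirroring what the CPU backend does with ggml_backend_buft_is_host()
    GGML_CALL static bool ggml_backend_mybackend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        return buft == ggml_backend_mybackend_buffer_type() || ggml_backend_buft_is_host(buft);

        GGML_UNUSED(backend);
    }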
ggml-backend.c
CHANGED
|
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
|
|
| 44 |
return ggml_nbytes(tensor);
|
| 45 |
}
|
| 46 |
|
| 47 |
-
bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 48 |
-
return buft->iface.supports_backend(buft, backend);
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
| 52 |
if (buft->iface.is_host) {
|
| 53 |
return buft->iface.is_host(buft);
|
|
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
|
|
| 286 |
return backend->iface.supports_op(backend, op);
|
| 287 |
}
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 290 |
if (backend->iface.offload_op != NULL) {
|
| 291 |
return backend->iface.offload_op(backend, op);
|
|
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
|
|
| 639 |
GGML_UNUSED(buft);
|
| 640 |
}
|
| 641 |
|
| 642 |
-
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 643 |
-
return ggml_backend_is_cpu(backend);
|
| 644 |
-
|
| 645 |
-
GGML_UNUSED(buft);
|
| 646 |
-
}
|
| 647 |
-
|
| 648 |
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 649 |
return true;
|
| 650 |
|
|
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
| 659 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 660 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 661 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 662 |
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
| 663 |
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 664 |
},
|
| 665 |
/* .context = */ NULL,
|
|
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|
| 715 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 716 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 717 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 718 |
-
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
| 719 |
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 720 |
},
|
| 721 |
/* .context = */ NULL,
|
|
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
|
| 836 |
GGML_UNUSED(backend);
|
| 837 |
}
|
| 838 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
static struct ggml_backend_i cpu_backend_i = {
|
| 840 |
/* .get_name = */ ggml_backend_cpu_name,
|
| 841 |
/* .free = */ ggml_backend_cpu_free,
|
|
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
| 846 |
/* .synchronize = */ NULL,
|
| 847 |
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
| 848 |
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
|
|
| 849 |
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
| 850 |
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
| 851 |
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
|
|
|
| 852 |
/* .offload_op = */ NULL,
|
| 853 |
/* .event_new = */ NULL,
|
| 854 |
/* .event_free = */ NULL,
|
|
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
|
|
| 1055 |
int * node_backend_ids; // [graph_size]
|
| 1056 |
int * leaf_backend_ids; // [graph_size]
|
| 1057 |
|
|
|
|
|
|
|
|
|
|
| 1058 |
// copy of the graph with modified inputs
|
| 1059 |
struct ggml_cgraph * graph;
|
| 1060 |
|
|
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
|
|
| 1075 |
ggml_backend_sched_eval_callback callback_eval;
|
| 1076 |
void * callback_eval_user_data;
|
| 1077 |
|
|
|
|
|
|
|
| 1078 |
// align context_buffer to GGML_MEM_ALIGN
|
| 1079 |
#ifdef _MSC_VER
|
| 1080 |
__declspec(align(GGML_MEM_ALIGN))
|
|
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
|
| 1097 |
return -1;
|
| 1098 |
}
|
| 1099 |
|
| 1100 |
-
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
|
| 1101 |
ggml_backend_buffer_t buffer = tensor->buffer;
|
| 1102 |
if (buffer == NULL) {
|
| 1103 |
return -1;
|
| 1104 |
}
|
| 1105 |
|
| 1106 |
-
// find highest prio backend that supports the buffer type
|
| 1107 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 1108 |
-
if (
|
|
|
|
| 1109 |
return i;
|
| 1110 |
}
|
| 1111 |
}
|
| 1112 |
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
|
|
|
| 1116 |
|
| 1117 |
return -1;
|
| 1118 |
}
|
|
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
| 1131 |
// TODO: use supports_op to check if the backend supports the op
|
| 1132 |
|
| 1133 |
// assign pre-allocated nodes to their backend
|
| 1134 |
-
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
| 1135 |
if (cur_backend_id != -1) {
|
| 1136 |
SET_CAUSE(tensor, "1.dst");
|
| 1137 |
return cur_backend_id;
|
|
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
| 1139 |
|
| 1140 |
// view_src
|
| 1141 |
if (tensor->view_src != NULL) {
|
| 1142 |
-
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
| 1143 |
if (cur_backend_id != -1) {
|
| 1144 |
SET_CAUSE(tensor, "1.vsrc");
|
| 1145 |
return cur_backend_id;
|
|
@@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
| 1161 |
continue;
|
| 1162 |
}
|
| 1163 |
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1164 |
-
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
|
| 1165 |
// check if a backend with higher prio wants to offload the op
|
| 1166 |
if (src_backend_id == sched->n_backends - 1) {
|
| 1167 |
for (int b = 0; b < src_backend_id; b++) {
|
|
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|
| 1223 |
}
|
| 1224 |
}
|
| 1225 |
|
| 1226 |
-
|
| 1227 |
-
|
| 1228 |
-
|
| 1229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1230 |
|
| 1231 |
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 1232 |
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1280 |
}
|
| 1281 |
}
|
| 1282 |
}
|
| 1283 |
-
#ifdef DEBUG_PASS1
|
| 1284 |
-
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1285 |
-
#endif
|
| 1286 |
|
| 1287 |
// pass 2: expand current backend assignments
|
| 1288 |
// assign the same backend to adjacent nodes
|
| 1289 |
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
| 1290 |
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
| 1291 |
-
|
| 1292 |
-
|
| 1293 |
-
// pass 2.2 expand gpu down
|
| 1294 |
{
|
| 1295 |
int cur_backend_id = -1;
|
| 1296 |
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1306 |
} else {
|
| 1307 |
cur_backend_id = *node_backend_id;
|
| 1308 |
}
|
| 1309 |
-
} else {
|
| 1310 |
-
|
| 1311 |
-
SET_CAUSE(node, "2.2");
|
| 1312 |
}
|
| 1313 |
}
|
| 1314 |
}
|
| 1315 |
-
//
|
| 1316 |
{
|
| 1317 |
int cur_backend_id = -1;
|
| 1318 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1328 |
} else {
|
| 1329 |
cur_backend_id = *node_backend_id;
|
| 1330 |
}
|
| 1331 |
-
} else {
|
| 1332 |
-
|
| 1333 |
-
SET_CAUSE(node, "2.1");
|
| 1334 |
}
|
| 1335 |
}
|
| 1336 |
}
|
| 1337 |
-
//
|
| 1338 |
{
|
| 1339 |
int cur_backend_id = -1;
|
| 1340 |
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1345 |
int * node_backend_id = &tensor_backend_id(node);
|
| 1346 |
if (*node_backend_id != -1) {
|
| 1347 |
cur_backend_id = *node_backend_id;
|
| 1348 |
-
} else {
|
| 1349 |
-
|
| 1350 |
-
SET_CAUSE(node, "2.4");
|
| 1351 |
}
|
| 1352 |
}
|
| 1353 |
}
|
| 1354 |
-
//
|
| 1355 |
{
|
| 1356 |
int cur_backend_id = -1;
|
| 1357 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1362 |
int * node_backend_id = &tensor_backend_id(node);
|
| 1363 |
if (*node_backend_id != -1) {
|
| 1364 |
cur_backend_id = *node_backend_id;
|
| 1365 |
-
} else {
|
| 1366 |
-
|
| 1367 |
-
SET_CAUSE(node, "2.3");
|
| 1368 |
}
|
| 1369 |
}
|
| 1370 |
}
|
| 1371 |
|
| 1372 |
-
|
| 1373 |
-
|
| 1374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1375 |
|
| 1376 |
-
// pass
|
| 1377 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1378 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1379 |
int * cur_backend_id = &tensor_backend_id(node);
|
| 1380 |
if (node->view_src != NULL && *cur_backend_id == -1) {
|
| 1381 |
*cur_backend_id = tensor_backend_id(node->view_src);
|
| 1382 |
-
SET_CAUSE(node, "
|
| 1383 |
}
|
| 1384 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1385 |
struct ggml_tensor * src = node->src[j];
|
|
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1391 |
if (src->view_src != NULL) {
|
| 1392 |
// views are always on the same backend as the source
|
| 1393 |
*src_backend_id = tensor_backend_id(src->view_src);
|
| 1394 |
-
SET_CAUSE(src, "
|
| 1395 |
} else {
|
| 1396 |
*src_backend_id = *cur_backend_id;
|
| 1397 |
-
SET_CAUSE(src, "
|
| 1398 |
}
|
| 1399 |
}
|
| 1400 |
}
|
| 1401 |
}
|
| 1402 |
-
#ifdef DEBUG_PASS3
|
| 1403 |
-
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
| 1404 |
-
#endif
|
| 1405 |
|
| 1406 |
// pass 4: split graph, find tensors that need to be copied
|
| 1407 |
{
|
|
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1448 |
}
|
| 1449 |
}
|
| 1450 |
// check if the split has too many inputs
|
|
|
|
| 1451 |
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
| 1452 |
const size_t id = hash_id(src);
|
| 1453 |
int src_backend_id = sched->tensor_backend_id[id];
|
| 1454 |
-
|
|
|
|
| 1455 |
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
| 1456 |
need_new_split = true;
|
| 1457 |
break;
|
|
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1486 |
const int src_backend_id = tensor_backend_id(src);
|
| 1487 |
assert(src_backend_id != -1); // all inputs should be assigned by now
|
| 1488 |
|
| 1489 |
-
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1)
|
| 1490 |
size_t id = hash_id(src);
|
| 1491 |
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
| 1492 |
ggml_backend_t backend = sched->backends[src_backend_id];
|
|
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1511 |
}
|
| 1512 |
}
|
| 1513 |
|
| 1514 |
-
|
|
|
|
| 1515 |
// create a copy of the input in the split's backend
|
| 1516 |
const size_t id = hash_id(src);
|
| 1517 |
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
|
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1537 |
split->i_end = graph->n_nodes;
|
| 1538 |
sched->n_splits = i_split + 1;
|
| 1539 |
}
|
| 1540 |
-
|
| 1541 |
-
|
| 1542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1543 |
|
| 1544 |
// create copies of the graph for each split
|
| 1545 |
// TODO: avoid this copy
|
|
@@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|
| 1613 |
}
|
| 1614 |
|
| 1615 |
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1616 |
// allocate graph
|
| 1617 |
-
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
| 1618 |
// the re-allocation may cause the split inputs to be moved to a different address
|
| 1619 |
ggml_backend_sched_synchronize(sched);
|
| 1620 |
#ifndef NDEBUG
|
|
@@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
| 1727 |
|
| 1728 |
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
| 1729 |
|
|
|
|
|
|
|
| 1730 |
// initialize hash table
|
| 1731 |
sched->hash_set = ggml_hash_set_new(graph_size);
|
| 1732 |
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
|
@@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
| 1735 |
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
| 1736 |
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
| 1737 |
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
|
|
|
|
|
|
| 1738 |
|
| 1739 |
sched->n_backends = n_backends;
|
| 1740 |
|
|
@@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
| 1747 |
for (int b = 0; b < n_backends; b++) {
|
| 1748 |
sched->backends[b] = backends[b];
|
| 1749 |
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
| 1750 |
-
GGML_ASSERT(
|
| 1751 |
if (sched->n_copies > 1) {
|
| 1752 |
for (int c = 0; c < sched->n_copies; c++) {
|
| 1753 |
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
|
@@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
| 1779 |
free(sched->tensor_copies);
|
| 1780 |
free(sched->node_backend_ids);
|
| 1781 |
free(sched->leaf_backend_ids);
|
|
|
|
|
|
|
| 1782 |
free(sched);
|
| 1783 |
}
|
| 1784 |
|
|
@@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
|
|
| 1875 |
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
| 1876 |
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1877 |
tensor_backend_id(node) = backend_index;
|
|
|
|
| 1878 |
}
|
| 1879 |
|
| 1880 |
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
|
|
|
| 44 |
return ggml_nbytes(tensor);
|
| 45 |
}
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
| 48 |
if (buft->iface.is_host) {
|
| 49 |
return buft->iface.is_host(buft);
|
|
|
|
| 282 |
return backend->iface.supports_op(backend, op);
|
| 283 |
}
|
| 284 |
|
| 285 |
+
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 286 |
+
return backend->iface.supports_buft(backend, buft);
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 290 |
if (backend->iface.offload_op != NULL) {
|
| 291 |
return backend->iface.offload_op(backend, op);
|
|
|
|
| 639 |
GGML_UNUSED(buft);
|
| 640 |
}
|
| 641 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 643 |
return true;
|
| 644 |
|
|
|
|
| 653 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 654 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 655 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
|
|
| 656 |
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 657 |
},
|
| 658 |
/* .context = */ NULL,
|
|
|
|
| 708 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 709 |
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 710 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
|
|
| 711 |
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 712 |
},
|
| 713 |
/* .context = */ NULL,
|
|
|
|
| 828 |
GGML_UNUSED(backend);
|
| 829 |
}
|
| 830 |
|
| 831 |
+
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 832 |
+
return ggml_backend_buft_is_host(buft);
|
| 833 |
+
|
| 834 |
+
GGML_UNUSED(backend);
|
| 835 |
+
}
|
| 836 |
+
|
| 837 |
static struct ggml_backend_i cpu_backend_i = {
|
| 838 |
/* .get_name = */ ggml_backend_cpu_name,
|
| 839 |
/* .free = */ ggml_backend_cpu_free,
|
|
|
|
| 844 |
/* .synchronize = */ NULL,
|
| 845 |
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
| 846 |
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
| 847 |
+
/* .graph_plan_update = */ NULL,
|
| 848 |
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
| 849 |
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
| 850 |
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
| 851 |
+
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
|
| 852 |
/* .offload_op = */ NULL,
|
| 853 |
/* .event_new = */ NULL,
|
| 854 |
/* .event_free = */ NULL,
|
|
|
|
| 1055 |
int * node_backend_ids; // [graph_size]
|
| 1056 |
int * leaf_backend_ids; // [graph_size]
|
| 1057 |
|
| 1058 |
+
int * prev_node_backend_ids; // [graph_size]
|
| 1059 |
+
int * prev_leaf_backend_ids; // [graph_size]
|
| 1060 |
+
|
| 1061 |
// copy of the graph with modified inputs
|
| 1062 |
struct ggml_cgraph * graph;
|
| 1063 |
|
|
|
|
| 1078 |
ggml_backend_sched_eval_callback callback_eval;
|
| 1079 |
void * callback_eval_user_data;
|
| 1080 |
|
| 1081 |
+
bool debug;
|
| 1082 |
+
|
| 1083 |
// align context_buffer to GGML_MEM_ALIGN
|
| 1084 |
#ifdef _MSC_VER
|
| 1085 |
__declspec(align(GGML_MEM_ALIGN))
|
|
|
|
| 1102 |
return -1;
|
| 1103 |
}
|
| 1104 |
|
| 1105 |
+
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
| 1106 |
ggml_backend_buffer_t buffer = tensor->buffer;
|
| 1107 |
if (buffer == NULL) {
|
| 1108 |
return -1;
|
| 1109 |
}
|
| 1110 |
|
| 1111 |
+
// find highest prio backend that supports the buffer type and the op
|
| 1112 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 1113 |
+
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
| 1114 |
+
ggml_backend_supports_op(sched->backends[i], op)) {
|
| 1115 |
return i;
|
| 1116 |
}
|
| 1117 |
}
|
| 1118 |
|
| 1119 |
+
#ifndef NDEBUG
|
| 1120 |
+
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
| 1121 |
+
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
| 1122 |
+
#endif
|
| 1123 |
|
| 1124 |
return -1;
|
| 1125 |
}
|
|
|
|
| 1138 |
// TODO: use supports_op to check if the backend supports the op
|
| 1139 |
|
| 1140 |
// assign pre-allocated nodes to their backend
|
| 1141 |
+
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
| 1142 |
if (cur_backend_id != -1) {
|
| 1143 |
SET_CAUSE(tensor, "1.dst");
|
| 1144 |
return cur_backend_id;
|
|
|
|
| 1146 |
|
| 1147 |
// view_src
|
| 1148 |
if (tensor->view_src != NULL) {
|
| 1149 |
+
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
| 1150 |
if (cur_backend_id != -1) {
|
| 1151 |
SET_CAUSE(tensor, "1.vsrc");
|
| 1152 |
return cur_backend_id;
|
|
|
|
| 1168 |
continue;
|
| 1169 |
}
|
| 1170 |
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1171 |
+
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
| 1172 |
// check if a backend with higher prio wants to offload the op
|
| 1173 |
if (src_backend_id == sched->n_backends - 1) {
|
| 1174 |
for (int b = 0; b < src_backend_id; b++) {
|
|
|
|
| 1230 |
}
|
| 1231 |
}
|
| 1232 |
|
| 1233 |
+
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
|
| 1234 |
+
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
| 1235 |
+
ggml_backend_buffer_type_t buft = NULL;
|
| 1236 |
+
|
| 1237 |
+
if (buf) {
|
| 1238 |
+
// the tensor is already allocated
|
| 1239 |
+
buft = buf->buft;
|
| 1240 |
+
} else {
|
| 1241 |
+
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
| 1242 |
+
int tensor_backend_id = tensor_backend_id(t);
|
| 1243 |
+
if (tensor_backend_id == -1 && t->view_src) {
|
| 1244 |
+
tensor_backend_id = tensor_backend_id(t->view_src);
|
| 1245 |
+
}
|
| 1246 |
+
if (tensor_backend_id != -1) {
|
| 1247 |
+
buft = sched->bufts[tensor_backend_id];
|
| 1248 |
+
}
|
| 1249 |
+
}
|
| 1250 |
+
|
| 1251 |
+
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
| 1252 |
+
}
|
| 1253 |
+
|
| 1254 |
+
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
| 1255 |
+
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
| 1256 |
+
*node_backend_id = cur_backend_id;
|
| 1257 |
+
SET_CAUSE(node, "2.sup");
|
| 1258 |
+
}
|
| 1259 |
+
}
|
| 1260 |
|
| 1261 |
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 1262 |
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
|
|
|
| 1310 |
}
|
| 1311 |
}
|
| 1312 |
}
|
|
|
|
|
|
|
|
|
|
| 1313 |
|
| 1314 |
// pass 2: expand current backend assignments
|
| 1315 |
// assign the same backend to adjacent nodes
|
| 1316 |
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
| 1317 |
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
| 1318 |
+
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
| 1319 |
+
// expand gpu down
|
|
|
|
| 1320 |
{
|
| 1321 |
int cur_backend_id = -1;
|
| 1322 |
for (int i = 0; i < graph->n_nodes; i++) {
|
|
|
|
| 1332 |
} else {
|
| 1333 |
cur_backend_id = *node_backend_id;
|
| 1334 |
}
|
| 1335 |
+
} else if (cur_backend_id != -1) {
|
| 1336 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
|
|
| 1337 |
}
|
| 1338 |
}
|
| 1339 |
}
|
| 1340 |
+
// expand gpu up
|
| 1341 |
{
|
| 1342 |
int cur_backend_id = -1;
|
| 1343 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
|
|
| 1353 |
} else {
|
| 1354 |
cur_backend_id = *node_backend_id;
|
| 1355 |
}
|
| 1356 |
+
} else if (cur_backend_id != -1) {
|
| 1357 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
|
|
| 1358 |
}
|
| 1359 |
}
|
| 1360 |
}
|
| 1361 |
+
// expand rest down
|
| 1362 |
{
|
| 1363 |
int cur_backend_id = -1;
|
| 1364 |
for (int i = 0; i < graph->n_nodes; i++) {
|
|
|
|
| 1369 |
int * node_backend_id = &tensor_backend_id(node);
|
| 1370 |
if (*node_backend_id != -1) {
|
| 1371 |
cur_backend_id = *node_backend_id;
|
| 1372 |
+
} else if (cur_backend_id != -1) {
|
| 1373 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
|
|
| 1374 |
}
|
| 1375 |
}
|
| 1376 |
}
|
| 1377 |
+
// expand rest up
|
| 1378 |
{
|
| 1379 |
int cur_backend_id = -1;
|
| 1380 |
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
|
|
|
| 1385 |
int * node_backend_id = &tensor_backend_id(node);
|
| 1386 |
if (*node_backend_id != -1) {
|
| 1387 |
cur_backend_id = *node_backend_id;
|
| 1388 |
+
} else if (cur_backend_id != -1) {
|
| 1389 |
+
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
|
|
|
| 1390 |
}
|
| 1391 |
}
|
| 1392 |
}
|
| 1393 |
|
| 1394 |
+
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
| 1395 |
+
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
| 1396 |
+
// however, we also need to verify that the sources are in compatible buffer types
|
| 1397 |
+
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
| 1398 |
+
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
| 1399 |
+
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
| 1400 |
+
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
| 1401 |
+
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
| 1402 |
+
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1403 |
+
struct ggml_tensor * node = graph->nodes[i];
|
| 1404 |
+
if (ggml_is_view_op(node->op)) {
|
| 1405 |
+
continue;
|
| 1406 |
+
}
|
| 1407 |
+
int * node_backend_id = &tensor_backend_id(node);
|
| 1408 |
+
if (*node_backend_id == -1) {
|
| 1409 |
+
// unassigned node: find the backend with the most supported inputs
|
| 1410 |
+
int n_supported_best = -1;
|
| 1411 |
+
for (int b = 0; b < sched->n_backends; b++) {
|
| 1412 |
+
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
| 1413 |
+
int n_supported = 0;
|
| 1414 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1415 |
+
struct ggml_tensor * src = node->src[j];
|
| 1416 |
+
if (src == NULL) {
|
| 1417 |
+
continue;
|
| 1418 |
+
}
|
| 1419 |
+
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
| 1420 |
+
n_supported++;
|
| 1421 |
+
}
|
| 1422 |
+
}
|
| 1423 |
+
if (n_supported > n_supported_best) {
|
| 1424 |
+
n_supported_best = n_supported;
|
| 1425 |
+
*node_backend_id = b;
|
| 1426 |
+
SET_CAUSE(node, "3.best");
|
| 1427 |
+
}
|
| 1428 |
+
}
|
| 1429 |
+
}
|
| 1430 |
+
} else {
|
| 1431 |
+
// assigned node: upgrade to higher prio backend if possible
|
| 1432 |
+
for (int b = 0; b < *node_backend_id; b++) {
|
| 1433 |
+
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
| 1434 |
+
bool supported = true;
|
| 1435 |
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1436 |
+
struct ggml_tensor * src = node->src[j];
|
| 1437 |
+
if (src == NULL) {
|
| 1438 |
+
continue;
|
| 1439 |
+
}
|
| 1440 |
+
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
| 1441 |
+
supported = false;
|
| 1442 |
+
break;
|
| 1443 |
+
}
|
| 1444 |
+
}
|
| 1445 |
+
if (supported) {
|
| 1446 |
+
*node_backend_id = b;
|
| 1447 |
+
SET_CAUSE(node, "3.upg");
|
| 1448 |
+
break;
|
| 1449 |
+
}
|
| 1450 |
+
}
|
| 1451 |
+
}
|
| 1452 |
+
}
|
| 1453 |
+
}
|
| 1454 |
|
| 1455 |
+
// pass 4: assign backends to remaining src from dst and view_src
|
| 1456 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1457 |
struct ggml_tensor * node = graph->nodes[i];
|
| 1458 |
int * cur_backend_id = &tensor_backend_id(node);
|
| 1459 |
if (node->view_src != NULL && *cur_backend_id == -1) {
|
| 1460 |
*cur_backend_id = tensor_backend_id(node->view_src);
|
| 1461 |
+
SET_CAUSE(node, "4.vsrc");
|
| 1462 |
}
|
| 1463 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1464 |
struct ggml_tensor * src = node->src[j];
|
|
|
|
| 1470 |
if (src->view_src != NULL) {
|
| 1471 |
// views are always on the same backend as the source
|
| 1472 |
*src_backend_id = tensor_backend_id(src->view_src);
|
| 1473 |
+
SET_CAUSE(src, "4.vsrc");
|
| 1474 |
} else {
|
| 1475 |
*src_backend_id = *cur_backend_id;
|
| 1476 |
+
SET_CAUSE(src, "4.cur");
|
| 1477 |
}
|
| 1478 |
}
|
| 1479 |
}
|
| 1480 |
}
|
|
|
|
|
|
|
|
|
|
| 1481 |
|
| 1482 |
// pass 4: split graph, find tensors that need to be copied
|
| 1483 |
{
|
|
|
|
| 1524 |
}
|
| 1525 |
}
|
| 1526 |
// check if the split has too many inputs
|
| 1527 |
+
// FIXME: count the number of inputs instead of only checking when full
|
| 1528 |
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
| 1529 |
const size_t id = hash_id(src);
|
| 1530 |
int src_backend_id = sched->tensor_backend_id[id];
|
| 1531 |
+
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
| 1532 |
+
                    if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                        //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                        need_new_split = true;
                        break;
...
             const int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
 
+            if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                 size_t id = hash_id(src);
                 if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                     ggml_backend_t backend = sched->backends[src_backend_id];
...
                 }
             }
 
+            bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+            if (src_backend_id != cur_backend_id && !supported) {
                 // create a copy of the input in the split's backend
                 const size_t id = hash_id(src);
                 if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
...
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
+
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
+
+    // swap node_backend_ids and leaf_backend_ids and prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }
 
     // create copies of the graph for each split
     // TODO: avoid this copy
...
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph->n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph->n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
...
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set          = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
...
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
...
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
...
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }
 
...
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
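Side note (not part of the diff): the scheduler debug output added above is gated by the GGML_SCHED_DEBUG environment variable, which is read once inside ggml_backend_sched_new. A minimal sketch of turning it on from application code, assuming the ggml_backend_sched_new(backends, bufts, n_backends, graph_size, parallel) signature used at this revision; the helper name create_sched_with_debug is illustrative:

    // set the variable before the scheduler is created, because sched->debug
    // is captured from getenv() inside ggml_backend_sched_new
    #include <stdlib.h>
    #include "ggml-backend.h"

    static ggml_backend_sched_t create_sched_with_debug(ggml_backend_t * backends, int n_backends, size_t graph_size) {
        setenv("GGML_SCHED_DEBUG", "1", 1); // POSIX; use _putenv_s on Windows
        return ggml_backend_sched_new(backends, NULL, n_backends, graph_size, false);
    }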
ggml-backend.h
CHANGED
@@ -23,7 +23,6 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
 
     // buffer
@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
+    GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
 
     //
     // CPU backend
@@ -119,7 +119,7 @@ extern "C" {
 
     GGML_API size_t ggml_backend_reg_get_count(void);
     GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is
+    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char * ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
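A hedged usage sketch of the new ggml_backend_supports_buft declaration above, mirroring the assertion that ggml_backend_sched_new now performs; the helper check_default_bufts and its error message are illustrative, not part of the change:

    #include <stdio.h>
    #include "ggml-backend.h"

    // report backends that cannot use their default buffer type
    static void check_default_bufts(ggml_backend_t * backends, int n_backends) {
        for (int b = 0; b < n_backends; b++) {
            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backends[b]);
            if (!ggml_backend_supports_buft(backends[b], buft)) {
                fprintf(stderr, "%s cannot use buffer type %s\n",
                        ggml_backend_name(backends[b]), ggml_backend_buft_name(buft));
            }
        }
    }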
ggml-cuda.cu
CHANGED
@@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
     return ctx->name.c_str();
 }
 
+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
@@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
 
@@ -863,6 +855,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
@@ -906,12 +902,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
@@ -924,7 +914,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
@@ -1024,7 +1013,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
         /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -2879,6 +2867,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cuda_split(buft)) {
+        return true;
+    }
+
+    if (ggml_backend_buft_is_cuda(buft)) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+        return buft_ctx->device == cuda_ctx->device;
+    }
+
+    return false;
+}
+
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
@@ -2951,9 +2953,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .synchronize         = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create   = */ NULL,
     /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_update   = */ NULL,
     /* .graph_plan_compute  = */ NULL,
     /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
     /* .supports_op         = */ ggml_backend_cuda_supports_op,
+    /* .supports_buft       = */ ggml_backend_cuda_supports_buft,
     /* .offload_op          = */ ggml_backend_cuda_offload_op,
     /* .event_new           = */ ggml_backend_cuda_event_new,
     /* .event_free          = */ ggml_backend_cuda_event_free,
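The ggml_backend_buft_is_cuda/_cuda_split helpers above recognize a buffer type by comparing its iface.get_name function pointer rather than calling it and comparing strings. A standalone sketch of that pattern with made-up types (not the real ggml structs):

    #include <stdbool.h>
    #include <stdio.h>

    typedef const char * (*get_name_fn)(void * buft);

    struct example_buft_iface { get_name_fn get_name; };
    struct example_buft       { struct example_buft_iface iface; void * context; };

    static const char * example_cuda_buft_name(void * buft) { (void) buft; return "CUDA0"; }
    static const char * example_cpu_buft_name (void * buft) { (void) buft; return "CPU";   }

    // a buffer type belongs to the CUDA backend iff it uses the CUDA name function;
    // comparing the function pointer avoids any string comparison
    static bool example_buft_is_cuda(const struct example_buft * b) {
        return b->iface.get_name == example_cuda_buft_name;
    }

    int main(void) {
        struct example_buft cuda = { { example_cuda_buft_name }, NULL };
        struct example_buft cpu  = { { example_cpu_buft_name  }, NULL };
        printf("cuda: %d, cpu: %d\n", example_buft_is_cuda(&cuda), example_buft_is_cuda(&cpu)); // prints 1, 0
        return 0;
    }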
ggml-kompute.cpp
CHANGED
@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }
 
-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
 
@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
     return ggml_vk_supports_op(op);
 }
 
+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(backend);
+    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name    = */ ggml_backend_kompute_name,
     /* .free        = */ ggml_backend_kompute_free,
@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .synchronize         = */ NULL,
     /* .graph_plan_create   = */ NULL,
     /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_update   = */ NULL,
     /* .graph_plan_compute  = */ NULL,
     /* .graph_compute       = */ ggml_backend_kompute_graph_compute,
     /* .supports_op         = */ ggml_backend_kompute_supports_op,
+    /* .supports_buft       = */ ggml_backend_kompute_supports_buft,
     /* .offload_op          = */ NULL,
     /* .event_new           = */ NULL,
     /* .event_free          = */ NULL,
ggml-metal.m
CHANGED
@@ -3044,12 +3044,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
@@ -3064,7 +3058,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-        /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
         /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
@@ -3179,6 +3172,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
     return ggml_metal_supports_op(metal_ctx, op);
 }
 
+GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+    UNUSED(backend);
+}
+
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name    = */ ggml_backend_metal_name,
     /* .free        = */ ggml_backend_metal_free,
@@ -3189,9 +3188,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .synchronize         = */ NULL,
     /* .graph_plan_create   = */ NULL,
     /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_update   = */ NULL,
     /* .graph_plan_compute  = */ NULL,
     /* .graph_compute       = */ ggml_backend_metal_graph_compute,
     /* .supports_op         = */ ggml_backend_metal_supports_op,
+    /* .supports_buft       = */ ggml_backend_metal_supports_buft,
     /* .offload_op          = */ NULL,
     /* .event_new           = */ NULL,
     /* .event_free          = */ NULL,
ggml-rpc.cpp
CHANGED
@@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
     return ggml_nbytes(tensor);
 }
 
-GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_rpc(backend)) {
-        return false;
-    }
-    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
-    return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_rpc_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_rpc_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_rpc_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_rpc_get_max_size,
     /* .get_alloc_size   = */ ggml_backend_rpc_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
 
@@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
     return false;
 }
 
+GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    return buft_ctx->endpoint == rpc_ctx->endpoint;
+}
+
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name    = */ ggml_backend_rpc_name,
     /* .free        = */ ggml_backend_rpc_free,
@@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .synchronize         = */ ggml_backend_rpc_synchronize,
     /* .graph_plan_create   = */ NULL,
     /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_update   = */ NULL,
     /* .graph_plan_compute  = */ NULL,
     /* .graph_compute       = */ ggml_backend_rpc_graph_compute,
     /* .supports_op         = */ ggml_backend_rpc_supports_op,
+    /* .supports_buft       = */ ggml_backend_rpc_supports_buft,
     /* .offload_op          = */ NULL,
     /* .event_new           = */ NULL,
     /* .event_free          = */ NULL,
ggml-sycl.cpp
CHANGED
@@ -16575,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_sycl(backend)) {
-        return false;
-    }
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return buft_ctx->device == sycl_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_sycl_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_sycl_buffer_type_get_max_size,
     /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
     /* .is_host          = */ nullptr,
 };
 
@@ -16942,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_sycl(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
@@ -16960,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
     /* .get_alignment    = */ ggml_backend_sycl_split_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_sycl_split_buffer_type_is_host,
 };
 
@@ -17046,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
         /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
        /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -17311,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    return buft_ctx->device == sycl_ctx->device;
+}
 
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name = */ ggml_backend_sycl_name,
@@ -17322,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .synchronize         = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create   = */ NULL,
     /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_update   = */ NULL,
     /* .graph_plan_compute  = */ NULL,
     /* .graph_compute       = */ ggml_backend_sycl_graph_compute,
     /* .supports_op         = */ ggml_backend_sycl_supports_op,
+    /* .supports_buft       = */ ggml_backend_sycl_supports_buft,
     /* .offload_op          = */ ggml_backend_sycl_offload_op,
     /* .event_new           = */ NULL,
     /* .event_free          = */ NULL,
ggml-vulkan.cpp
CHANGED
@@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size   = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
 
@@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment    = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name    = */ ggml_backend_vk_name,
@@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize         = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create   = */ NULL,
     /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_update   = */ NULL,
     /* .graph_plan_compute  = */ NULL,
     /* .graph_compute       = */ ggml_backend_vk_graph_compute,
     /* .supports_op         = */ ggml_backend_vk_supports_op,
+    /* .supports_buft       = */ ggml_backend_vk_supports_buft,
     /* .offload_op          = */ ggml_backend_vk_offload_op,
     /* .event_new           = */ NULL,
     /* .event_free          = */ NULL,
ggml.c
CHANGED
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
 #endif
 
 // floating point type used to accumulate sums
@@ -12179,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    // all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
@@ -12349,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane      = ne01*ne00;
-        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne1, ne01, ne10,
-                        1.0f, y, ne10,
-                        x, ne00,
-                        0.0f, d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
 #if GGML_USE_LLAMAFILE
     const bool src1_cont = ggml_is_contiguous(src1);
 
@@ -12796,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
     if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
         if (ith != 0) {
             return;
         }
@@ -12820,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst:  (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
@@ -12993,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -13391,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
@@ -13434,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
@@ -13477,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        ggml_bf16_to_fp32_row(
+        assert(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
 }
@@ -13520,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -18893,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
+        case GGML_OP_CONT:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
@@ -18977,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
-        case GGML_OP_CONT:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -19137,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
             sched_yield();
         }
 
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
+        *node_n = atomic_load(&state->shared->node_n);
+        if (*node_n != last_node_n) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19148,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = * task_phase;
+    const int last_task_phase = *task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
        }
 
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
+        *task_phase = atomic_load(&state->shared->node_task);
+        if (*task_phase != last_task_phase) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19356,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                 {
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory for fully dequantized matrix from src0
-                            // take into account that src0 can be broadcasted into src1[2,3]
-                            cur = ggml_type_size(GGML_TYPE_F32)
-                                * node->src[0]->ne[0]*node->src[0]->ne[1]
-                                * node->src[1]->ne[2]*node->src[1]->ne[3];
-                        }
-                    } else
-#endif
                     if (node->src[1]->type != vec_dot_type) {
                         cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                     }
@@ -22664,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
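With the BLAS paths removed from ggml.c, ggml_cpu_has_blas() now keys off GGML_USE_BLAS (and the GPU backends) rather than GGML_USE_OPENBLAS. A small runtime capability check, sketched against only the public ggml.h API:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // returns 1 when the build defines GGML_USE_BLAS, GGML_USE_CUDA,
        // GGML_USE_VULKAN or GGML_USE_SYCL, per the #if rewritten above
        printf("BLAS-capable build: %s\n", ggml_cpu_has_blas() ? "yes" : "no");
        return 0;
    }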