slaren and ggerganov committed
Commit c773aa9 · 1 Parent(s): efbb7be

move BLAS to a separate backend (llama/6210)


* move BLAS to a separate backend

* rename GGML_USE_OPENBLAS to GGML_USE_BLAS

* alloc : reuse same buffer when the same buffer type is used multiple times

* set number of threads automatically for openblas and blis

* sched : print assignments when GGML_SCHED_DEBUG env variable is set

* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the
op, which is very costly. The weight should have been stored in a buffer
of a backend that can run the op, but llama.cpp cannot do this
automatically at the moment.
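
As a rough illustration only (not part of this commit): with the ggml_backend_supports_buft and ggml_backend_supports_op APIs introduced below, a caller could check up front whether any available backend can run an op directly on the buffer type that holds a weight, i.e. whether the costly copy described above would be triggered. The helper name and the loop over candidate backends are hypothetical.

#include "ggml-backend.h"

// Hypothetical helper, not part of this change: returns true if at least one of the
// given backends can both use weights allocated in buffer type `buft` and run `op`.
// If this returns false, the scheduler will have to copy the weight to another
// backend at compute time, which is the costly case described in the commit message.
static bool weight_buft_is_usable(ggml_backend_t * backends, int n_backends,
                                  ggml_backend_buffer_type_t buft,
                                  const struct ggml_tensor * op) {
    for (int i = 0; i < n_backends; i++) {
        if (ggml_backend_supports_buft(backends[i], buft) &&
            ggml_backend_supports_op(backends[i], op)) {
            return true;
        }
    }
    return false;
}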

---------

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (11)
  1. ggml-alloc.c +77 -21
  2. ggml-backend-impl.h +20 -8
  3. ggml-backend.c +178 -64
  4. ggml-backend.h +3 -3
  5. ggml-cuda.cu +24 -20
  6. ggml-kompute.cpp +7 -6
  7. ggml-metal.m +8 -7
  8. ggml-rpc.cpp +11 -10
  9. ggml-sycl.cpp +10 -18
  10. ggml-vulkan.cpp +13 -13
  11. ggml.c +22 -183
ggml-alloc.c CHANGED
@@ -339,6 +339,7 @@ struct hash_node {
339
  };
340
 
341
  struct tensor_alloc {
 
342
  size_t offset;
343
  size_t size_max; // 0 = pre-allocated, unused, or view
344
  };
@@ -349,7 +350,6 @@ struct leaf_alloc {
349
  };
350
 
351
  struct node_alloc {
352
- int buffer_id;
353
  struct tensor_alloc dst;
354
  struct tensor_alloc src[GGML_MAX_SRC];
355
  };
@@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
386
  for (int i = 0; i < n_bufs; i++) {
387
  galloc->bufts[i] = bufts[i];
388
  galloc->buffers[i] = NULL;
389
- size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
390
- galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
391
  }
392
  galloc->n_buffers = n_bufs;
393
 
@@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
405
 
406
  for (int i = 0; i < galloc->n_buffers; i++) {
407
  if (galloc->buffers != NULL) {
408
- ggml_backend_buffer_free(galloc->buffers[i]);
409
  }
410
  if (galloc->buf_tallocs != NULL) {
411
- ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
412
  }
413
  }
414
 
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
511
  }
512
  }
513
 
514
- static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
515
  // graph outputs are never freed
516
  if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
517
  AT_PRINTF("not freeing output %s\n", node->name);
518
  return;
519
  }
520
 
521
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
522
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
523
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
524
  size_t offset = hn->offset;
525
  size_t size = ggml_backend_buft_get_alloc_size(buft, node);
526
  ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
527
  hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
626
  AT_PRINTF("view_src %s: %d children, %d views\n",
627
  view_src->name, view_src_hn->n_children, view_src_hn->n_views);
628
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
629
- ggml_gallocr_free_node(galloc, view_src, buffer_id);
630
  }
631
  }
632
  else if (p_hn->allocated) {
633
- ggml_gallocr_free_node(galloc, parent, buffer_id);
634
  }
635
  }
636
  AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
674
  for (int i = 0; i < graph->n_nodes; i++) {
675
  struct ggml_tensor * node = graph->nodes[i];
676
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
677
- node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
678
  if (node->view_src || node->data) {
 
679
  node_alloc->dst.offset = SIZE_MAX;
680
  node_alloc->dst.size_max = 0;
681
  } else {
682
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
683
- node_alloc->dst.offset = hn->offset;
684
- node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
 
685
  }
686
  for (int j = 0; j < GGML_MAX_SRC; j++) {
687
  struct ggml_tensor * src = node->src[j];
688
  if (!src || src->view_src || src->data) {
 
689
  node_alloc->src[j].offset = SIZE_MAX;
690
  node_alloc->src[j].size_max = 0;
691
  } else {
692
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
 
693
  node_alloc->src[j].offset = hn->offset;
694
  node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
695
  }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
706
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
707
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
708
  if (leaf->view_src || leaf->data) {
 
709
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
710
  galloc->leaf_allocs[i].leaf.size_max = 0;
711
  } else {
 
712
  galloc->leaf_allocs[i].leaf.offset = hn->offset;
713
  galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
714
  }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
716
 
717
  // reallocate buffers if needed
718
  for (int i = 0; i < galloc->n_buffers; i++) {
719
  size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
720
  size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
721
 
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
724
  #ifndef NDEBUG
725
  fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
726
  #endif
 
727
  ggml_backend_buffer_free(galloc->buffers[i]);
728
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
729
  if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
740
  return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
741
  }
742
 
743
- static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
 
744
  assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
745
 
746
  if (tensor->view_src != NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
768
  }
769
  }
770
 
771
- static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
772
- ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
773
  size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
774
  return talloc->size_max >= node_size;
775
  }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
793
  struct ggml_tensor * node = graph->nodes[i];
794
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
795
 
796
- if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
797
  #ifndef NDEBUG
798
  fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
799
  #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
805
  if (src == NULL) {
806
  continue;
807
  }
808
- if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
809
  #ifndef NDEBUG
810
  fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
811
  #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
846
  for (int i = 0; i < graph->n_leafs; i++) {
847
  struct ggml_tensor * leaf = graph->leafs[i];
848
  struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
849
- ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
850
  }
851
  // nodes
852
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
857
  if (src == NULL) {
858
  continue;
859
  }
860
- ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
861
  }
862
- ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
863
  }
864
 
865
  return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
871
  if (galloc->buffers[buffer_id] == NULL) {
872
  return 0;
873
  }
874
  return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
875
  }
876
 
 
339
  };
340
 
341
  struct tensor_alloc {
342
+ int buffer_id;
343
  size_t offset;
344
  size_t size_max; // 0 = pre-allocated, unused, or view
345
  };
 
350
  };
351
 
352
  struct node_alloc {
 
353
  struct tensor_alloc dst;
354
  struct tensor_alloc src[GGML_MAX_SRC];
355
  };
 
386
  for (int i = 0; i < n_bufs; i++) {
387
  galloc->bufts[i] = bufts[i];
388
  galloc->buffers[i] = NULL;
389
+
390
+ // check if the same buffer type is used multiple times and reuse the same allocator
391
+ for (int j = 0; j < i; j++) {
392
+ if (bufts[i] == bufts[j]) {
393
+ galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
394
+ break;
395
+ }
396
+ }
397
+
398
+ if (galloc->buf_tallocs[i] == NULL) {
399
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
400
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
401
+ }
402
  }
403
  galloc->n_buffers = n_bufs;
404
 
 
416
 
417
  for (int i = 0; i < galloc->n_buffers; i++) {
418
  if (galloc->buffers != NULL) {
419
+ // skip if already freed
420
+ bool freed = false;
421
+ for (int j = 0; j < i; j++) {
422
+ if (galloc->buffers[j] == galloc->buffers[i]) {
423
+ freed = true;
424
+ break;
425
+ }
426
+ }
427
+ if (!freed) {
428
+ ggml_backend_buffer_free(galloc->buffers[i]);
429
+ }
430
  }
431
  if (galloc->buf_tallocs != NULL) {
432
+ // skip if already freed
433
+ bool freed = false;
434
+ for (int j = 0; j < i; j++) {
435
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
436
+ freed = true;
437
+ break;
438
+ }
439
+ }
440
+ if (!freed) {
441
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
442
+ }
443
  }
444
  }
445
 
 
542
  }
543
  }
544
 
545
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
546
  // graph outputs are never freed
547
  if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
548
  AT_PRINTF("not freeing output %s\n", node->name);
549
  return;
550
  }
551
 
552
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
553
  size_t offset = hn->offset;
554
+ int buffer_id = hn->buffer_id;
555
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
556
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
557
  size_t size = ggml_backend_buft_get_alloc_size(buft, node);
558
  ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
559
  hn->allocated = false;
 
658
  AT_PRINTF("view_src %s: %d children, %d views\n",
659
  view_src->name, view_src_hn->n_children, view_src_hn->n_views);
660
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
661
+ ggml_gallocr_free_node(galloc, view_src);
662
  }
663
  }
664
  else if (p_hn->allocated) {
665
+ ggml_gallocr_free_node(galloc, parent);
666
  }
667
  }
668
  AT_PRINTF("\n");
 
706
  for (int i = 0; i < graph->n_nodes; i++) {
707
  struct ggml_tensor * node = graph->nodes[i];
708
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
709
  if (node->view_src || node->data) {
710
+ node_alloc->dst.buffer_id = -1;
711
  node_alloc->dst.offset = SIZE_MAX;
712
  node_alloc->dst.size_max = 0;
713
  } else {
714
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
715
+ node_alloc->dst.buffer_id = hn->buffer_id;
716
+ node_alloc->dst.offset = hn->offset;
717
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
718
  }
719
  for (int j = 0; j < GGML_MAX_SRC; j++) {
720
  struct ggml_tensor * src = node->src[j];
721
  if (!src || src->view_src || src->data) {
722
+ node_alloc->src[j].buffer_id = -1;
723
  node_alloc->src[j].offset = SIZE_MAX;
724
  node_alloc->src[j].size_max = 0;
725
  } else {
726
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
727
+ node_alloc->src[j].buffer_id = hn->buffer_id;
728
  node_alloc->src[j].offset = hn->offset;
729
  node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
730
  }
 
741
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
742
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
743
  if (leaf->view_src || leaf->data) {
744
+ galloc->leaf_allocs[i].leaf.buffer_id = -1;
745
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
746
  galloc->leaf_allocs[i].leaf.size_max = 0;
747
  } else {
748
+ galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
749
  galloc->leaf_allocs[i].leaf.offset = hn->offset;
750
  galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
751
  }
 
753
 
754
  // reallocate buffers if needed
755
  for (int i = 0; i < galloc->n_buffers; i++) {
756
+ // if the buffer type is used multiple times, we reuse the same buffer
757
+ for (int j = 0; j < i; j++) {
758
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
759
+ galloc->buffers[i] = galloc->buffers[j];
760
+ break;
761
+ }
762
+ }
763
+
764
  size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
765
  size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
766
 
 
769
  #ifndef NDEBUG
770
  fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
771
  #endif
772
+
773
  ggml_backend_buffer_free(galloc->buffers[i]);
774
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
775
  if (galloc->buffers[i] == NULL) {
 
786
  return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
787
  }
788
 
789
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
790
+ int buffer_id = tensor_alloc->buffer_id;
791
  assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
792
 
793
  if (tensor->view_src != NULL) {
 
815
  }
816
  }
817
 
818
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
819
+ ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
820
  size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
821
  return talloc->size_max >= node_size;
822
  }
 
840
  struct ggml_tensor * node = graph->nodes[i];
841
  struct node_alloc * node_alloc = &galloc->node_allocs[i];
842
 
843
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
844
  #ifndef NDEBUG
845
  fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
846
  #endif
 
852
  if (src == NULL) {
853
  continue;
854
  }
855
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
856
  #ifndef NDEBUG
857
  fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
858
  #endif
 
893
  for (int i = 0; i < graph->n_leafs; i++) {
894
  struct ggml_tensor * leaf = graph->leafs[i];
895
  struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
896
+ ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
897
  }
898
  // nodes
899
  for (int i = 0; i < graph->n_nodes; i++) {
 
904
  if (src == NULL) {
905
  continue;
906
  }
907
+ ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
908
  }
909
+ ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
910
  }
911
 
912
  return true;
 
918
  if (galloc->buffers[buffer_id] == NULL) {
919
  return 0;
920
  }
921
+
922
+ for (int i = 0; i < buffer_id; i++) {
923
+ if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
924
+ // this buffer is the same as a previous one due to the same buffer type being used multiple times
925
+ // only return the buffer size the first time it appears to avoid double counting
926
+ return 0;
927
+ }
928
+ }
929
+
930
  return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
931
  }
932
 
ggml-backend-impl.h CHANGED
@@ -17,13 +17,15 @@ extern "C" {
17
 
18
  struct ggml_backend_buffer_type_i {
19
  const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
 
20
  ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
21
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
22
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
23
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
24
- bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
25
  // check if tensor data is in host memory
26
- // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
27
  bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
28
  };
29
 
@@ -92,27 +94,37 @@ extern "C" {
92
  void (*GGML_CALL synchronize)(ggml_backend_t backend);
93
 
94
  // compute graph with a plan (not used currently)
 
95
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
96
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
97
 
98
- // compute graph with a plan
99
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
  // compute graph without a plan (async)
101
  enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
102
 
103
- // check if the backend supports an operation
104
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
105
 
106
  // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
107
  // these should be expensive operations with large batch sizes that may benefit from running on this backend
108
  // even if the weight has to be copied from the CPU temporarily
109
  bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
 
111
  // (optional) event synchronization
 
112
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
113
  void (*GGML_CALL event_free) (ggml_backend_event_t event);
 
114
  void (*GGML_CALL event_record) (ggml_backend_event_t event);
 
115
  void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
 
116
  void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
117
  };
118
 
 
17
 
18
  struct ggml_backend_buffer_type_i {
19
  const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
20
+ // allocate a buffer of this type
21
  ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
22
+ // tensor alignment
23
+ size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
24
+ // max buffer size that can be allocated
25
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
26
+ // data size needed to allocate the tensor, including padding
27
+ size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
28
  // check if tensor data is in host memory
 
29
  bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
30
  };
31
 
 
94
  void (*GGML_CALL synchronize)(ggml_backend_t backend);
95
 
96
  // compute graph with a plan (not used currently)
97
+ // create a new plan for a graph
98
  ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
99
  void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100
+ // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
101
+ void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
102
+ // compute the graph with the plan
103
+ enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
104
 
105
  // compute graph without a plan (async)
106
  enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
107
 
108
+ // check if the backend can compute an operation
109
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
110
 
111
+ // check if the backend can use tensors allocated in a buffer type
112
+ bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
113
+
114
  // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
115
  // these should be expensive operations with large batch sizes that may benefit from running on this backend
116
  // even if the weight has to be copied from the CPU temporarily
117
  bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
118
 
119
  // (optional) event synchronization
120
+ // create a new event that can record events on this backend instance
121
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
122
  void (*GGML_CALL event_free) (ggml_backend_event_t event);
123
+ // record an event on the backend instance that created it
124
  void (*GGML_CALL event_record) (ggml_backend_event_t event);
125
+ // wait for an event on a different backend instance
126
  void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
127
+ // block until an event is recorded
128
  void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
129
  };
130
 
ggml-backend.c CHANGED
@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
44
  return ggml_nbytes(tensor);
45
  }
46
 
47
- bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
48
- return buft->iface.supports_backend(buft, backend);
49
- }
50
-
51
  bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
52
  if (buft->iface.is_host) {
53
  return buft->iface.is_host(buft);
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
286
  return backend->iface.supports_op(backend, op);
287
  }
288
 
289
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
290
  if (backend->iface.offload_op != NULL) {
291
  return backend->iface.offload_op(backend, op);
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
639
  GGML_UNUSED(buft);
640
  }
641
 
642
- GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
643
- return ggml_backend_is_cpu(backend);
644
-
645
- GGML_UNUSED(buft);
646
- }
647
-
648
  GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
649
  return true;
650
 
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
659
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
660
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
661
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
662
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
663
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
664
  },
665
  /* .context = */ NULL,
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
715
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
716
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
717
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
718
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
719
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
720
  },
721
  /* .context = */ NULL,
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
836
  GGML_UNUSED(backend);
837
  }
838
 
839
  static struct ggml_backend_i cpu_backend_i = {
840
  /* .get_name = */ ggml_backend_cpu_name,
841
  /* .free = */ ggml_backend_cpu_free,
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
846
  /* .synchronize = */ NULL,
847
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
848
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
 
849
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
850
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
851
  /* .supports_op = */ ggml_backend_cpu_supports_op,
 
852
  /* .offload_op = */ NULL,
853
  /* .event_new = */ NULL,
854
  /* .event_free = */ NULL,
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
1055
  int * node_backend_ids; // [graph_size]
1056
  int * leaf_backend_ids; // [graph_size]
1057
 
1058
  // copy of the graph with modified inputs
1059
  struct ggml_cgraph * graph;
1060
 
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
1075
  ggml_backend_sched_eval_callback callback_eval;
1076
  void * callback_eval_user_data;
1077
 
1078
  // align context_buffer to GGML_MEM_ALIGN
1079
  #ifdef _MSC_VER
1080
  __declspec(align(GGML_MEM_ALIGN))
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
1097
  return -1;
1098
  }
1099
 
1100
- static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
1101
  ggml_backend_buffer_t buffer = tensor->buffer;
1102
  if (buffer == NULL) {
1103
  return -1;
1104
  }
1105
 
1106
- // find highest prio backend that supports the buffer type
1107
  for (int i = 0; i < sched->n_backends; i++) {
1108
- if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
 
1109
  return i;
1110
  }
1111
  }
1112
 
1113
- fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
1114
- __func__, ggml_backend_buffer_name(buffer), tensor->name);
1115
- GGML_ASSERT(false);
 
1116
 
1117
  return -1;
1118
  }
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1131
  // TODO: use supports_op to check if the backend supports the op
1132
 
1133
  // assign pre-allocated nodes to their backend
1134
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
1135
  if (cur_backend_id != -1) {
1136
  SET_CAUSE(tensor, "1.dst");
1137
  return cur_backend_id;
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1139
 
1140
  // view_src
1141
  if (tensor->view_src != NULL) {
1142
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
1143
  if (cur_backend_id != -1) {
1144
  SET_CAUSE(tensor, "1.vsrc");
1145
  return cur_backend_id;
@@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1161
  continue;
1162
  }
1163
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1164
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
1165
  // check if a backend with higher prio wants to offload the op
1166
  if (src_backend_id == sched->n_backends - 1) {
1167
  for (int b = 0; b < src_backend_id; b++) {
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
1223
  }
1224
  }
1225
 
1226
- //#define DEBUG_PASS1
1227
- //#define DEBUG_PASS2
1228
- //#define DEBUG_PASS3
1229
- //#define DEBUG_PASS4
1230
 
1231
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1232
  static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1280
  }
1281
  }
1282
  }
1283
- #ifdef DEBUG_PASS1
1284
- fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1285
- #endif
1286
 
1287
  // pass 2: expand current backend assignments
1288
  // assign the same backend to adjacent nodes
1289
  // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1290
  // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1291
-
1292
-
1293
- // pass 2.2 expand gpu down
1294
  {
1295
  int cur_backend_id = -1;
1296
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1306
  } else {
1307
  cur_backend_id = *node_backend_id;
1308
  }
1309
- } else {
1310
- *node_backend_id = cur_backend_id;
1311
- SET_CAUSE(node, "2.2");
1312
  }
1313
  }
1314
  }
1315
- // pass 2.1 expand gpu up
1316
  {
1317
  int cur_backend_id = -1;
1318
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1328
  } else {
1329
  cur_backend_id = *node_backend_id;
1330
  }
1331
- } else {
1332
- *node_backend_id = cur_backend_id;
1333
- SET_CAUSE(node, "2.1");
1334
  }
1335
  }
1336
  }
1337
- // pass 2.4 expand rest down
1338
  {
1339
  int cur_backend_id = -1;
1340
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1345
  int * node_backend_id = &tensor_backend_id(node);
1346
  if (*node_backend_id != -1) {
1347
  cur_backend_id = *node_backend_id;
1348
- } else {
1349
- *node_backend_id = cur_backend_id;
1350
- SET_CAUSE(node, "2.4");
1351
  }
1352
  }
1353
  }
1354
- // pass 2.3 expand rest up
1355
  {
1356
  int cur_backend_id = -1;
1357
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1362
  int * node_backend_id = &tensor_backend_id(node);
1363
  if (*node_backend_id != -1) {
1364
  cur_backend_id = *node_backend_id;
1365
- } else {
1366
- *node_backend_id = cur_backend_id;
1367
- SET_CAUSE(node, "2.3");
1368
  }
1369
  }
1370
  }
1371
 
1372
- #ifdef DEBUG_PASS2
1373
- fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1374
- #endif
1375
 
1376
- // pass 3: assign backends to remaining src from dst and view_src
1377
  for (int i = 0; i < graph->n_nodes; i++) {
1378
  struct ggml_tensor * node = graph->nodes[i];
1379
  int * cur_backend_id = &tensor_backend_id(node);
1380
  if (node->view_src != NULL && *cur_backend_id == -1) {
1381
  *cur_backend_id = tensor_backend_id(node->view_src);
1382
- SET_CAUSE(node, "3.vsrc");
1383
  }
1384
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1385
  struct ggml_tensor * src = node->src[j];
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1391
  if (src->view_src != NULL) {
1392
  // views are always on the same backend as the source
1393
  *src_backend_id = tensor_backend_id(src->view_src);
1394
- SET_CAUSE(src, "3.vsrc");
1395
  } else {
1396
  *src_backend_id = *cur_backend_id;
1397
- SET_CAUSE(src, "3.cur");
1398
  }
1399
  }
1400
  }
1401
  }
1402
- #ifdef DEBUG_PASS3
1403
- fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1404
- #endif
1405
 
1406
  // pass 4: split graph, find tensors that need to be copied
1407
  {
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1448
  }
1449
  }
1450
  // check if the split has too many inputs
 
1451
  if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1452
  const size_t id = hash_id(src);
1453
  int src_backend_id = sched->tensor_backend_id[id];
1454
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
 
1455
  //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1456
  need_new_split = true;
1457
  break;
@@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1486
  const int src_backend_id = tensor_backend_id(src);
1487
  assert(src_backend_id != -1); // all inputs should be assigned by now
1488
 
1489
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1490
  size_t id = hash_id(src);
1491
  if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1492
  ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1511
  }
1512
  }
1513
 
1514
- if (src_backend_id != node_backend_id) {
 
1515
  // create a copy of the input in the split's backend
1516
  const size_t id = hash_id(src);
1517
  if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1537
  split->i_end = graph->n_nodes;
1538
  sched->n_splits = i_split + 1;
1539
  }
1540
- #ifdef DEBUG_PASS4
1541
- fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
1542
- #endif
1543
 
1544
  // create copies of the graph for each split
1545
  // TODO: avoid this copy
@@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1613
  }
1614
 
1615
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1616
  // allocate graph
1617
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1618
  // the re-allocation may cause the split inputs to be moved to a different address
1619
  ggml_backend_sched_synchronize(sched);
1620
  #ifndef NDEBUG
@@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
1727
 
1728
  struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1729
1730
  // initialize hash table
1731
  sched->hash_set = ggml_hash_set_new(graph_size);
1732
  sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
@@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
1735
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1736
  sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1737
  sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
 
1739
  sched->n_backends = n_backends;
1740
 
@@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
1747
  for (int b = 0; b < n_backends; b++) {
1748
  sched->backends[b] = backends[b];
1749
  sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
1750
- GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
1751
  if (sched->n_copies > 1) {
1752
  for (int c = 0; c < sched->n_copies; c++) {
1753
  sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1779
  free(sched->tensor_copies);
1780
  free(sched->node_backend_ids);
1781
  free(sched->leaf_backend_ids);
 
  free(sched);
1783
  }
1784
 
@@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
1875
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
1876
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1877
  tensor_backend_id(node) = backend_index;
 
1878
  }
1879
 
1880
  ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
 
44
  return ggml_nbytes(tensor);
45
  }
46
 
 
47
  bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
48
  if (buft->iface.is_host) {
49
  return buft->iface.is_host(buft);
 
282
  return backend->iface.supports_op(backend, op);
283
  }
284
 
285
+ bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
286
+ return backend->iface.supports_buft(backend, buft);
287
+ }
288
+
289
  bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
290
  if (backend->iface.offload_op != NULL) {
291
  return backend->iface.offload_op(backend, op);
 
639
  GGML_UNUSED(buft);
640
  }
641
 
  GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
643
  return true;
644
 
 
653
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
654
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
655
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
656
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
657
  },
658
  /* .context = */ NULL,
 
708
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
709
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
710
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
711
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
712
  },
713
  /* .context = */ NULL,
 
828
  GGML_UNUSED(backend);
829
  }
830
 
831
+ GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
832
+ return ggml_backend_buft_is_host(buft);
833
+
834
+ GGML_UNUSED(backend);
835
+ }
836
+
837
  static struct ggml_backend_i cpu_backend_i = {
838
  /* .get_name = */ ggml_backend_cpu_name,
839
  /* .free = */ ggml_backend_cpu_free,
 
844
  /* .synchronize = */ NULL,
845
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
846
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
847
+ /* .graph_plan_update = */ NULL,
848
  /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
849
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
850
  /* .supports_op = */ ggml_backend_cpu_supports_op,
851
+ /* .supports_buft = */ ggml_backend_cpu_supports_buft,
852
  /* .offload_op = */ NULL,
853
  /* .event_new = */ NULL,
854
  /* .event_free = */ NULL,
 
1055
  int * node_backend_ids; // [graph_size]
1056
  int * leaf_backend_ids; // [graph_size]
1057
 
1058
+ int * prev_node_backend_ids; // [graph_size]
1059
+ int * prev_leaf_backend_ids; // [graph_size]
1060
+
1061
  // copy of the graph with modified inputs
1062
  struct ggml_cgraph * graph;
1063
 
 
1078
  ggml_backend_sched_eval_callback callback_eval;
1079
  void * callback_eval_user_data;
1080
 
1081
+ bool debug;
1082
+
1083
  // align context_buffer to GGML_MEM_ALIGN
1084
  #ifdef _MSC_VER
1085
  __declspec(align(GGML_MEM_ALIGN))
 
1102
  return -1;
1103
  }
1104
 
1105
+ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
1106
  ggml_backend_buffer_t buffer = tensor->buffer;
1107
  if (buffer == NULL) {
1108
  return -1;
1109
  }
1110
 
1111
+ // find highest prio backend that supports the buffer type and the op
1112
  for (int i = 0; i < sched->n_backends; i++) {
1113
+ if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1114
+ ggml_backend_supports_op(sched->backends[i], op)) {
1115
  return i;
1116
  }
1117
  }
1118
 
1119
+ #ifndef NDEBUG
1120
+ fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1121
+ __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
1122
+ #endif
1123
 
1124
  return -1;
1125
  }
 
1138
  // TODO: use supports_op to check if the backend supports the op
1139
 
1140
  // assign pre-allocated nodes to their backend
1141
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1142
  if (cur_backend_id != -1) {
1143
  SET_CAUSE(tensor, "1.dst");
1144
  return cur_backend_id;
 
1146
 
1147
  // view_src
1148
  if (tensor->view_src != NULL) {
1149
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1150
  if (cur_backend_id != -1) {
1151
  SET_CAUSE(tensor, "1.vsrc");
1152
  return cur_backend_id;
 
1168
  continue;
1169
  }
1170
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1171
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1172
  // check if a backend with higher prio wants to offload the op
1173
  if (src_backend_id == sched->n_backends - 1) {
1174
  for (int b = 0; b < src_backend_id; b++) {
 
1230
  }
1231
  }
1232
 
1233
+ static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
1234
+ ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1235
+ ggml_backend_buffer_type_t buft = NULL;
1236
+
1237
+ if (buf) {
1238
+ // the tensor is already allocated
1239
+ buft = buf->buft;
1240
+ } else {
1241
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
1242
+ int tensor_backend_id = tensor_backend_id(t);
1243
+ if (tensor_backend_id == -1 && t->view_src) {
1244
+ tensor_backend_id = tensor_backend_id(t->view_src);
1245
+ }
1246
+ if (tensor_backend_id != -1) {
1247
+ buft = sched->bufts[tensor_backend_id];
1248
+ }
1249
+ }
1250
+
1251
+ return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
1252
+ }
1253
+
1254
+ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1255
+ if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1256
+ *node_backend_id = cur_backend_id;
1257
+ SET_CAUSE(node, "2.sup");
1258
+ }
1259
+ }
1260
 
1261
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1262
  static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
 
1310
  }
1311
  }
1312
  }
1313
 
1314
  // pass 2: expand current backend assignments
1315
  // assign the same backend to adjacent nodes
1316
  // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1317
  // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1318
+ // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1319
+ // expand gpu down
 
1320
  {
1321
  int cur_backend_id = -1;
1322
  for (int i = 0; i < graph->n_nodes; i++) {
 
1332
  } else {
1333
  cur_backend_id = *node_backend_id;
1334
  }
1335
+ } else if (cur_backend_id != -1) {
1336
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
 
1337
  }
1338
  }
1339
  }
1340
+ // expand gpu up
1341
  {
1342
  int cur_backend_id = -1;
1343
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
 
1353
  } else {
1354
  cur_backend_id = *node_backend_id;
1355
  }
1356
+ } else if (cur_backend_id != -1) {
1357
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
 
1358
  }
1359
  }
1360
  }
1361
+ // expand rest down
1362
  {
1363
  int cur_backend_id = -1;
1364
  for (int i = 0; i < graph->n_nodes; i++) {
 
1369
  int * node_backend_id = &tensor_backend_id(node);
1370
  if (*node_backend_id != -1) {
1371
  cur_backend_id = *node_backend_id;
1372
+ } else if (cur_backend_id != -1) {
1373
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
 
1374
  }
1375
  }
1376
  }
1377
+ // expand rest up
1378
  {
1379
  int cur_backend_id = -1;
1380
  for (int i = graph->n_nodes - 1; i >= 0; i--) {
 
1385
  int * node_backend_id = &tensor_backend_id(node);
1386
  if (*node_backend_id != -1) {
1387
  cur_backend_id = *node_backend_id;
1388
+ } else if (cur_backend_id != -1) {
1389
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
 
1390
  }
1391
  }
1392
  }
1393
 
1394
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1395
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1396
+ // however, we also need to verify that the sources are in compatible buffer types
1397
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1398
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1399
+ // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1400
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1401
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1402
+ for (int i = 0; i < graph->n_nodes; i++) {
1403
+ struct ggml_tensor * node = graph->nodes[i];
1404
+ if (ggml_is_view_op(node->op)) {
1405
+ continue;
1406
+ }
1407
+ int * node_backend_id = &tensor_backend_id(node);
1408
+ if (*node_backend_id == -1) {
1409
+ // unassigned node: find the backend with the most supported inputs
1410
+ int n_supported_best = -1;
1411
+ for (int b = 0; b < sched->n_backends; b++) {
1412
+ if (ggml_backend_supports_op(sched->backends[b], node)) {
1413
+ int n_supported = 0;
1414
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1415
+ struct ggml_tensor * src = node->src[j];
1416
+ if (src == NULL) {
1417
+ continue;
1418
+ }
1419
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
1420
+ n_supported++;
1421
+ }
1422
+ }
1423
+ if (n_supported > n_supported_best) {
1424
+ n_supported_best = n_supported;
1425
+ *node_backend_id = b;
1426
+ SET_CAUSE(node, "3.best");
1427
+ }
1428
+ }
1429
+ }
1430
+ } else {
1431
+ // assigned node: upgrade to higher prio backend if possible
1432
+ for (int b = 0; b < *node_backend_id; b++) {
1433
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
1434
+ bool supported = true;
1435
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1436
+ struct ggml_tensor * src = node->src[j];
1437
+ if (src == NULL) {
1438
+ continue;
1439
+ }
1440
+ if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
1441
+ supported = false;
1442
+ break;
1443
+ }
1444
+ }
1445
+ if (supported) {
1446
+ *node_backend_id = b;
1447
+ SET_CAUSE(node, "3.upg");
1448
+ break;
1449
+ }
1450
+ }
1451
+ }
1452
+ }
1453
+ }
1454
 
1455
+ // pass 4: assign backends to remaining src from dst and view_src
1456
  for (int i = 0; i < graph->n_nodes; i++) {
1457
  struct ggml_tensor * node = graph->nodes[i];
1458
  int * cur_backend_id = &tensor_backend_id(node);
1459
  if (node->view_src != NULL && *cur_backend_id == -1) {
1460
  *cur_backend_id = tensor_backend_id(node->view_src);
1461
+ SET_CAUSE(node, "4.vsrc");
1462
  }
1463
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1464
  struct ggml_tensor * src = node->src[j];
 
1470
  if (src->view_src != NULL) {
1471
  // views are always on the same backend as the source
1472
  *src_backend_id = tensor_backend_id(src->view_src);
1473
+ SET_CAUSE(src, "4.vsrc");
1474
  } else {
1475
  *src_backend_id = *cur_backend_id;
1476
+ SET_CAUSE(src, "4.cur");
1477
  }
1478
  }
1479
  }
1480
  }
1481
 
1482
  // pass 4: split graph, find tensors that need to be copied
1483
  {
 
1524
  }
1525
  }
1526
  // check if the split has too many inputs
1527
+ // FIXME: count the number of inputs instead of only checking when full
1528
  if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1529
  const size_t id = hash_id(src);
1530
  int src_backend_id = sched->tensor_backend_id[id];
1531
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1532
+ if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
1533
  //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1534
  need_new_split = true;
1535
  break;
 
1564
  const int src_backend_id = tensor_backend_id(src);
1565
  assert(src_backend_id != -1); // all inputs should be assigned by now
1566
 
1567
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1568
  size_t id = hash_id(src);
1569
  if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1570
  ggml_backend_t backend = sched->backends[src_backend_id];
 
1589
  }
1590
  }
1591
 
1592
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1593
+ if (src_backend_id != cur_backend_id && !supported) {
1594
  // create a copy of the input in the split's backend
1595
  const size_t id = hash_id(src);
1596
  if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
 
1616
  split->i_end = graph->n_nodes;
1617
  sched->n_splits = i_split + 1;
1618
  }
1619
+
1620
+ if (sched->debug) {
1621
+ ggml_backend_sched_print_assignments(sched, graph);
1622
+ }
1623
+
1624
+ // swap node_backend_ids and leaf_backend_ids and prevs
1625
+ {
1626
+ int * tmp = sched->node_backend_ids;
1627
+ sched->node_backend_ids = sched->prev_node_backend_ids;
1628
+ sched->prev_node_backend_ids = tmp;
1629
+
1630
+ tmp = sched->leaf_backend_ids;
1631
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1632
+ sched->prev_leaf_backend_ids = tmp;
1633
+ }
1634
 
1635
  // create copies of the graph for each split
1636
  // TODO: avoid this copy
 
1704
  }
1705
 
1706
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1707
+ bool backend_ids_changed = false;
1708
+ for (int i = 0; i < sched->graph->n_nodes; i++) {
1709
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
1710
+ backend_ids_changed = true;
1711
+ break;
1712
+ }
1713
+ }
1714
+ if (!backend_ids_changed) {
1715
+ for (int i = 0; i < sched->graph->n_leafs; i++) {
1716
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
1717
+ backend_ids_changed = true;
1718
+ break;
1719
+ }
1720
+ }
1721
+ }
1722
+
1723
  // allocate graph
1724
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1725
  // the re-allocation may cause the split inputs to be moved to a different address
1726
  ggml_backend_sched_synchronize(sched);
1727
  #ifndef NDEBUG
 
1834
 
1835
  struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1836
 
1837
+ sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
1838
+
1839
  // initialize hash table
1840
  sched->hash_set = ggml_hash_set_new(graph_size);
1841
  sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
 
1844
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1845
  sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1846
  sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1847
+ sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1848
+ sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1849
 
1850
  sched->n_backends = n_backends;
1851
 
 
1858
  for (int b = 0; b < n_backends; b++) {
1859
  sched->backends[b] = backends[b];
1860
  sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
1861
+ GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1862
  if (sched->n_copies > 1) {
1863
  for (int c = 0; c < sched->n_copies; c++) {
1864
  sched->events[b][c] = ggml_backend_event_new(backends[b]);
 
1890
  free(sched->tensor_copies);
1891
  free(sched->node_backend_ids);
1892
  free(sched->leaf_backend_ids);
1893
+ free(sched->prev_node_backend_ids);
1894
+ free(sched->prev_leaf_backend_ids);
1895
  free(sched);
1896
  }
1897
 
 
1988
  int backend_index = ggml_backend_sched_backend_id(sched, backend);
1989
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1990
  tensor_backend_id(node) = backend_index;
1991
+ SET_CAUSE(node, "usr");
1992
  }
1993
 
1994
  ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
ggml-backend.h CHANGED
@@ -23,7 +23,6 @@ extern "C" {
23
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
24
  GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
25
  GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
26
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
27
  GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
28
 
29
  // buffer
@@ -74,6 +73,7 @@ extern "C" {
74
  GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
75
  GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
76
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
77
  GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
78
 
79
  // tensor copy between different backends
@@ -90,7 +90,7 @@ extern "C" {
90
  GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
91
  GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
92
  GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
93
- GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
94
 
95
  //
96
  // CPU backend
@@ -119,7 +119,7 @@ extern "C" {
119
 
120
  GGML_API size_t ggml_backend_reg_get_count(void);
121
  GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
122
- GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
123
  GGML_API const char * ggml_backend_reg_get_name(size_t i);
124
  GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
125
  GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
 
23
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
24
  GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
25
  GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
 
26
  GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
27
 
28
  // buffer
 
73
  GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
74
  GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
75
  GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
76
+ GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
77
  GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
78
 
79
  // tensor copy between different backends
 
90
  GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
91
  GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
92
  GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
93
+ GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
94
 
95
  //
96
  // CPU backend
 
119
 
120
  GGML_API size_t ggml_backend_reg_get_count(void);
121
  GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
122
+ GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
123
  GGML_API const char * ggml_backend_reg_get_name(size_t i);
124
  GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
125
  GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
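The header changes above drop the per-buffer-type supports_backend hook and instead ask the backend itself via the new ggml_backend_supports_buft(). The commit message notes that a weight stored in a buffer type the assigned backend cannot use has to be copied at compute time, which is very costly. A hedged sketch of how a loader could use the new query to avoid that situation; the helper and its arguments are illustrative and not part of the diff.

    #include "ggml-backend.h"

    // Pick a buffer type for a weight so that the backend expected to run the
    // op can read it in place (no per-graph copy). Illustrative helper only.
    static ggml_backend_buffer_type_t pick_weight_buft(ggml_backend_t backend,
                                                       ggml_backend_buffer_type_t preferred) {
        if (ggml_backend_supports_buft(backend, preferred)) {
            return preferred;
        }
        // fall back to the backend's own default buffer type
        return ggml_backend_get_default_buffer_type(backend);
    }
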
ggml-cuda.cu CHANGED
@@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
543
  return ctx->name.c_str();
544
  }
545
 
 
 
 
 
546
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
547
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
548
 
@@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
585
  GGML_UNUSED(buft);
586
  }
587
 
588
- GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
589
- if (!ggml_backend_is_cuda(backend)) {
590
- return false;
591
- }
592
-
593
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
594
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
595
-
596
- return buft_ctx->device == cuda_ctx->device;
597
- }
598
-
599
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
600
  /* .get_name = */ ggml_backend_cuda_buffer_type_name,
601
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
602
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
603
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
604
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
605
- /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
606
  /* .is_host = */ NULL,
607
  };
608
 
@@ -863,6 +855,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
863
  GGML_UNUSED(buft);
864
  }
865
 
 
 
 
 
866
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
867
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
868
  // instead, we allocate them for each tensor separately in init_tensor
@@ -906,12 +902,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
906
  return total_size;
907
  }
908
 
909
- GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
910
- return ggml_backend_is_cuda(backend);
911
-
912
- GGML_UNUSED(buft);
913
- }
914
-
915
  GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
916
  return false;
917
 
@@ -924,7 +914,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
924
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
925
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
926
  /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
927
- /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
928
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
929
  };
930
 
@@ -1024,7 +1013,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
1024
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1025
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1026
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
1027
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
1028
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1029
  },
1030
  /* .context = */ nullptr,
@@ -2879,6 +2867,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
2879
  GGML_UNUSED(backend);
2880
  }
2881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2882
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
2883
  const int min_batch_size = 32;
2884
 
@@ -2951,9 +2953,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
2951
  /* .synchronize = */ ggml_backend_cuda_synchronize,
2952
  /* .graph_plan_create = */ NULL,
2953
  /* .graph_plan_free = */ NULL,
 
2954
  /* .graph_plan_compute = */ NULL,
2955
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2956
  /* .supports_op = */ ggml_backend_cuda_supports_op,
 
2957
  /* .offload_op = */ ggml_backend_cuda_offload_op,
2958
  /* .event_new = */ ggml_backend_cuda_event_new,
2959
  /* .event_free = */ ggml_backend_cuda_event_free,
 
543
  return ctx->name.c_str();
544
  }
545
 
546
+ static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
547
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
548
+ }
549
+
550
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
551
  ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
552
 
 
589
  GGML_UNUSED(buft);
590
  }
591
 
 
 
 
 
 
 
 
 
 
 
 
592
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
593
  /* .get_name = */ ggml_backend_cuda_buffer_type_name,
594
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
595
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
596
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
597
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
 
598
  /* .is_host = */ NULL,
599
  };
600
 
 
855
  GGML_UNUSED(buft);
856
  }
857
 
858
+ static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
859
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
860
+ }
861
+
862
  GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
863
  // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
864
  // instead, we allocate them for each tensor separately in init_tensor
 
902
  return total_size;
903
  }
904
 
 
 
 
 
 
 
905
  GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
906
  return false;
907
 
 
914
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
915
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
916
  /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
 
917
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
918
  };
919
 
 
1013
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
1014
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
1015
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 
1016
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
1017
  },
1018
  /* .context = */ nullptr,
 
2867
  GGML_UNUSED(backend);
2868
  }
2869
 
2870
+ GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
2871
+ if (ggml_backend_buft_is_cuda_split(buft)) {
2872
+ return true;
2873
+ }
2874
+
2875
+ if (ggml_backend_buft_is_cuda(buft)) {
2876
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2877
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
2878
+ return buft_ctx->device == cuda_ctx->device;
2879
+ }
2880
+
2881
+ return false;
2882
+ }
2883
+
2884
  GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
2885
  const int min_batch_size = 32;
2886
 
 
2953
  /* .synchronize = */ ggml_backend_cuda_synchronize,
2954
  /* .graph_plan_create = */ NULL,
2955
  /* .graph_plan_free = */ NULL,
2956
+ /* .graph_plan_update = */ NULL,
2957
  /* .graph_plan_compute = */ NULL,
2958
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
2959
  /* .supports_op = */ ggml_backend_cuda_supports_op,
2960
+ /* .supports_buft = */ ggml_backend_cuda_supports_buft,
2961
  /* .offload_op = */ ggml_backend_cuda_offload_op,
2962
  /* .event_new = */ ggml_backend_cuda_event_new,
2963
  /* .event_free = */ ggml_backend_cuda_event_free,
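The CUDA hunks above show the pattern every backend in this change follows: a buffer type is recognized by comparing the get_name function pointer in its interface (no string comparison), and multi-device backends then also require the device recorded in the two contexts to match. A generic sketch of that shape for a hypothetical "foo" backend; all foo_* names and context layouts below are made up for illustration and rely on the internal definitions from ggml-backend-impl.h.

    #include "ggml-backend-impl.h"   // internal header: exposes buft->iface and the context pointers

    // hypothetical contexts, stand-ins for e.g. ggml_backend_cuda_buffer_type_context
    struct foo_buft_context    { int device; };
    struct foo_backend_context { int device; };

    static const char * ggml_backend_foo_buffer_type_name(ggml_backend_buffer_type_t buft) {
        (void) buft;
        return "foo";
    }

    static bool ggml_backend_buft_is_foo(ggml_backend_buffer_type_t buft) {
        // identity check: same function pointer means same buffer type implementation
        return buft->iface.get_name == ggml_backend_foo_buffer_type_name;
    }

    static bool ggml_backend_foo_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        if (!ggml_backend_buft_is_foo(buft)) {
            return false;
        }
        struct foo_buft_context    * buft_ctx    = (struct foo_buft_context    *) buft->context;
        struct foo_backend_context * backend_ctx = (struct foo_backend_context *) backend->context;
        return buft_ctx->device == backend_ctx->device;   // same device, so the buffer is usable in place
    }
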
ggml-kompute.cpp CHANGED
@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
1902
  return ctx->max_alloc;
1903
  }
1904
 
1905
- static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
1906
- GGML_UNUSED(buft);
1907
- return ggml_backend_is_kompute(backend);
1908
- }
1909
-
1910
  static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
1911
  /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
1912
  /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
1913
  /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
1914
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
1915
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1916
- /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
1917
  /* .is_host = */ NULL,
1918
  };
1919
 
@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
1973
  return ggml_vk_supports_op(op);
1974
  }
1975
 
 
 
 
 
 
1976
  static struct ggml_backend_i kompute_backend_i = {
1977
  /* .get_name = */ ggml_backend_kompute_name,
1978
  /* .free = */ ggml_backend_kompute_free,
@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
1983
  /* .synchronize = */ NULL,
1984
  /* .graph_plan_create = */ NULL,
1985
  /* .graph_plan_free = */ NULL,
 
1986
  /* .graph_plan_compute = */ NULL,
1987
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1988
  /* .supports_op = */ ggml_backend_kompute_supports_op,
 
1989
  /* .offload_op = */ NULL,
1990
  /* .event_new = */ NULL,
1991
  /* .event_free = */ NULL,
 
1902
  return ctx->max_alloc;
1903
  }
1904
 
 
 
 
 
 
1905
  static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
1906
  /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
1907
  /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
1908
  /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
1909
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
1910
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
1911
  /* .is_host = */ NULL,
1912
  };
1913
 
 
1967
  return ggml_vk_supports_op(op);
1968
  }
1969
 
1970
+ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
1971
+ GGML_UNUSED(backend);
1972
+ return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
1973
+ }
1974
+
1975
  static struct ggml_backend_i kompute_backend_i = {
1976
  /* .get_name = */ ggml_backend_kompute_name,
1977
  /* .free = */ ggml_backend_kompute_free,
 
1982
  /* .synchronize = */ NULL,
1983
  /* .graph_plan_create = */ NULL,
1984
  /* .graph_plan_free = */ NULL,
1985
+ /* .graph_plan_update = */ NULL,
1986
  /* .graph_plan_compute = */ NULL,
1987
  /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1988
  /* .supports_op = */ ggml_backend_kompute_supports_op,
1989
+ /* .supports_buft = */ ggml_backend_kompute_supports_buft,
1990
  /* .offload_op = */ NULL,
1991
  /* .event_new = */ NULL,
1992
  /* .event_free = */ NULL,
ggml-metal.m CHANGED
@@ -3044,12 +3044,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
3044
  UNUSED(buft);
3045
  }
3046
 
3047
- GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
3048
- return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
3049
-
3050
- UNUSED(buft);
3051
- }
3052
-
3053
  GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
3054
  return true;
3055
 
@@ -3064,7 +3058,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
3064
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
3065
  /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
3066
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
3067
- /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
3068
  /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
3069
  },
3070
  /* .context = */ NULL,
@@ -3179,6 +3172,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
3179
  return ggml_metal_supports_op(metal_ctx, op);
3180
  }
3181
 
 
 
 
 
 
 
3182
  static struct ggml_backend_i ggml_backend_metal_i = {
3183
  /* .get_name = */ ggml_backend_metal_name,
3184
  /* .free = */ ggml_backend_metal_free,
@@ -3189,9 +3188,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
3189
  /* .synchronize = */ NULL,
3190
  /* .graph_plan_create = */ NULL,
3191
  /* .graph_plan_free = */ NULL,
 
3192
  /* .graph_plan_compute = */ NULL,
3193
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
3194
  /* .supports_op = */ ggml_backend_metal_supports_op,
 
3195
  /* .offload_op = */ NULL,
3196
  /* .event_new = */ NULL,
3197
  /* .event_free = */ NULL,
 
3044
  UNUSED(buft);
3045
  }
3046
 
 
 
 
 
 
 
3047
  GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
3048
  return true;
3049
 
 
3058
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
3059
  /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
3060
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
3061
  /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
3062
  },
3063
  /* .context = */ NULL,
 
3172
  return ggml_metal_supports_op(metal_ctx, op);
3173
  }
3174
 
3175
+ GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
3176
+ return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
3177
+
3178
+ UNUSED(backend);
3179
+ }
3180
+
3181
  static struct ggml_backend_i ggml_backend_metal_i = {
3182
  /* .get_name = */ ggml_backend_metal_name,
3183
  /* .free = */ ggml_backend_metal_free,
 
3188
  /* .synchronize = */ NULL,
3189
  /* .graph_plan_create = */ NULL,
3190
  /* .graph_plan_free = */ NULL,
3191
+ /* .graph_plan_update = */ NULL,
3192
  /* .graph_plan_compute = */ NULL,
3193
  /* .graph_compute = */ ggml_backend_metal_graph_compute,
3194
  /* .supports_op = */ ggml_backend_metal_supports_op,
3195
+ /* .supports_buft = */ ggml_backend_metal_supports_buft,
3196
  /* .offload_op = */ NULL,
3197
  /* .event_new = */ NULL,
3198
  /* .event_free = */ NULL,
ggml-rpc.cpp CHANGED
@@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
540
  return ggml_nbytes(tensor);
541
  }
542
 
543
- GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
544
- if (!ggml_backend_is_rpc(backend)) {
545
- return false;
546
- }
547
- ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
548
- ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
549
- return buft_ctx->endpoint == rpc_ctx->endpoint;
550
- }
551
-
552
  static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
553
  /* .get_name = */ ggml_backend_rpc_buffer_type_name,
554
  /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
555
  /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
556
  /* .get_max_size = */ ggml_backend_rpc_get_max_size,
557
  /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
558
- /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
559
  /* .is_host = */ NULL,
560
  };
561
 
@@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
638
  return false;
639
  }
640
 
 
 
 
 
 
 
 
 
 
641
  static ggml_backend_i ggml_backend_rpc_interface = {
642
  /* .get_name = */ ggml_backend_rpc_name,
643
  /* .free = */ ggml_backend_rpc_free,
@@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
648
  /* .synchronize = */ ggml_backend_rpc_synchronize,
649
  /* .graph_plan_create = */ NULL,
650
  /* .graph_plan_free = */ NULL,
 
651
  /* .graph_plan_compute = */ NULL,
652
  /* .graph_compute = */ ggml_backend_rpc_graph_compute,
653
  /* .supports_op = */ ggml_backend_rpc_supports_op,
 
654
  /* .offload_op = */ NULL,
655
  /* .event_new = */ NULL,
656
  /* .event_free = */ NULL,
 
540
  return ggml_nbytes(tensor);
541
  }
542
 
 
 
 
 
 
 
 
 
 
543
  static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
544
  /* .get_name = */ ggml_backend_rpc_buffer_type_name,
545
  /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
546
  /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
547
  /* .get_max_size = */ ggml_backend_rpc_get_max_size,
548
  /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
 
549
  /* .is_host = */ NULL,
550
  };
551
 
 
628
  return false;
629
  }
630
 
631
+ GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
632
+ if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
633
+ return false;
634
+ }
635
+ ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
636
+ ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
637
+ return buft_ctx->endpoint == rpc_ctx->endpoint;
638
+ }
639
+
640
  static ggml_backend_i ggml_backend_rpc_interface = {
641
  /* .get_name = */ ggml_backend_rpc_name,
642
  /* .free = */ ggml_backend_rpc_free,
 
647
  /* .synchronize = */ ggml_backend_rpc_synchronize,
648
  /* .graph_plan_create = */ NULL,
649
  /* .graph_plan_free = */ NULL,
650
+ /* .graph_plan_update = */ NULL,
651
  /* .graph_plan_compute = */ NULL,
652
  /* .graph_compute = */ ggml_backend_rpc_graph_compute,
653
  /* .supports_op = */ ggml_backend_rpc_supports_op,
654
+ /* .supports_buft = */ ggml_backend_rpc_supports_buft,
655
  /* .offload_op = */ NULL,
656
  /* .event_new = */ NULL,
657
  /* .event_free = */ NULL,
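For the RPC backend the same question becomes "same endpoint?": a buffer allocated against one server is only directly usable by a backend connected to that server. A usage sketch under the assumption that ggml-rpc.h declares ggml_backend_rpc_init() and ggml_backend_rpc_buffer_type() taking an endpoint string (those declarations are not part of this diff); the addresses are placeholders.

    #include "ggml-backend.h"
    #include "ggml-rpc.h"   // assumed to declare the two RPC calls used below

    static void rpc_buft_example(void) {
        ggml_backend_t rpc_a = ggml_backend_rpc_init("192.168.0.10:50052");   // placeholder endpoints
        ggml_backend_t rpc_b = ggml_backend_rpc_init("192.168.0.11:50052");

        ggml_backend_buffer_type_t buft_a = ggml_backend_rpc_buffer_type("192.168.0.10:50052");

        bool same  = ggml_backend_supports_buft(rpc_a, buft_a);   // expected: true, endpoints match
        bool other = ggml_backend_supports_buft(rpc_b, buft_a);   // expected: false, different server
        (void) same; (void) other;

        ggml_backend_free(rpc_a);
        ggml_backend_free(rpc_b);
    }
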
ggml-sycl.cpp CHANGED
@@ -16575,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
16575
  UNUSED(buft);
16576
  }
16577
 
16578
- GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
16579
- if (!ggml_backend_is_sycl(backend)) {
16580
- return false;
16581
- }
16582
- ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
16583
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
16584
- return buft_ctx->device == sycl_ctx->device;
16585
- }
16586
-
16587
  static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
16588
  /* .get_name = */ ggml_backend_sycl_buffer_type_name,
16589
  /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
16590
  /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
16591
  /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
16592
  /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
16593
- /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
16594
  /* .is_host = */ nullptr,
16595
  };
16596
 
@@ -16942,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
16942
  return total_size;
16943
  }
16944
 
16945
- GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
16946
- return ggml_backend_is_sycl(backend);
16947
-
16948
- UNUSED(buft);
16949
- }
16950
-
16951
  GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
16952
  return false;
16953
 
@@ -16960,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
16960
  /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
16961
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
16962
  /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
16963
- /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
16964
  /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
16965
  };
16966
 
@@ -17046,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
17046
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
17047
  /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
17048
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
17049
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
17050
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
17051
  },
17052
  /* .context = */ nullptr,
@@ -17311,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
17311
  GGML_UNUSED(backend);
17312
  }
17313
 
 
 
 
 
 
 
 
 
17314
 
17315
  static ggml_backend_i ggml_backend_sycl_interface = {
17316
  /* .get_name = */ ggml_backend_sycl_name,
@@ -17322,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
17322
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17323
  /* .graph_plan_create = */ NULL,
17324
  /* .graph_plan_free = */ NULL,
 
17325
  /* .graph_plan_compute = */ NULL,
17326
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17327
  /* .supports_op = */ ggml_backend_sycl_supports_op,
 
17328
  /* .offload_op = */ ggml_backend_sycl_offload_op,
17329
  /* .event_new = */ NULL,
17330
  /* .event_free = */ NULL,
 
16575
  UNUSED(buft);
16576
  }
16577
 
 
 
 
 
 
 
 
 
 
16578
  static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
16579
  /* .get_name = */ ggml_backend_sycl_buffer_type_name,
16580
  /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
16581
  /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
16582
  /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
16583
  /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
 
16584
  /* .is_host = */ nullptr,
16585
  };
16586
 
 
16932
  return total_size;
16933
  }
16934
 
 
 
 
 
 
 
16935
  GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
16936
  return false;
16937
 
 
16944
  /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
16945
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
16946
  /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
 
16947
  /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
16948
  };
16949
 
 
17029
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
17030
  /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
17031
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 
17032
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
17033
  },
17034
  /* .context = */ nullptr,
 
17293
  GGML_UNUSED(backend);
17294
  }
17295
 
17296
+ GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
17297
+ if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
17298
+ return false;
17299
+ }
17300
+ ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
17301
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
17302
+ return buft_ctx->device == sycl_ctx->device;
17303
+ }
17304
 
17305
  static ggml_backend_i ggml_backend_sycl_interface = {
17306
  /* .get_name = */ ggml_backend_sycl_name,
 
17312
  /* .synchronize = */ ggml_backend_sycl_synchronize,
17313
  /* .graph_plan_create = */ NULL,
17314
  /* .graph_plan_free = */ NULL,
17315
+ /* .graph_plan_update = */ NULL,
17316
  /* .graph_plan_compute = */ NULL,
17317
  /* .graph_compute = */ ggml_backend_sycl_graph_compute,
17318
  /* .supports_op = */ ggml_backend_sycl_supports_op,
17319
+ /* .supports_buft = */ ggml_backend_sycl_supports_buft,
17320
  /* .offload_op = */ ggml_backend_sycl_offload_op,
17321
  /* .event_new = */ NULL,
17322
  /* .event_free = */ NULL,
ggml-vulkan.cpp CHANGED
@@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
6142
  UNUSED(buft);
6143
  }
6144
 
6145
- GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
6146
- if (!ggml_backend_is_vk(backend)) {
6147
- return false;
6148
- }
6149
-
6150
- ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
6151
- ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6152
-
6153
- return buft_ctx->ctx->idx == ctx->idx;
6154
- }
6155
-
6156
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
6157
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
6158
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
6159
  /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
6160
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
6161
  /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
6162
- /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
6163
  /* .is_host = */ NULL,
6164
  };
6165
 
@@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
6235
  /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
6236
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
6237
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
6238
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
6239
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
6240
  },
6241
  /* .context = */ nullptr,
@@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
6551
  UNUSED(backend);
6552
  }
6553
 
 
 
 
 
 
 
 
 
 
 
 
6554
  // TODO: enable async and synchronize
6555
  static ggml_backend_i ggml_backend_vk_interface = {
6556
  /* .get_name = */ ggml_backend_vk_name,
@@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
6562
  /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
6563
  /* .graph_plan_create = */ NULL,
6564
  /* .graph_plan_free = */ NULL,
 
6565
  /* .graph_plan_compute = */ NULL,
6566
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
6567
  /* .supports_op = */ ggml_backend_vk_supports_op,
 
6568
  /* .offload_op = */ ggml_backend_vk_offload_op,
6569
  /* .event_new = */ NULL,
6570
  /* .event_free = */ NULL,
 
6142
  UNUSED(buft);
6143
  }
6144
 
 
 
 
 
 
 
 
 
 
 
 
6145
  static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
6146
  /* .get_name = */ ggml_backend_vk_buffer_type_name,
6147
  /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
6148
  /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
6149
  /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
6150
  /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
 
6151
  /* .is_host = */ NULL,
6152
  };
6153
 
 
6223
  /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
6224
  /* .get_max_size = */ NULL, // defaults to SIZE_MAX
6225
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 
6226
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
6227
  },
6228
  /* .context = */ nullptr,
 
6538
  UNUSED(backend);
6539
  }
6540
 
6541
+ GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
6542
+ if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
6543
+ return false;
6544
+ }
6545
+
6546
+ ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
6547
+ ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6548
+
6549
+ return buft_ctx->ctx->idx == ctx->idx;
6550
+ }
6551
+
6552
  // TODO: enable async and synchronize
6553
  static ggml_backend_i ggml_backend_vk_interface = {
6554
  /* .get_name = */ ggml_backend_vk_name,
 
6560
  /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
6561
  /* .graph_plan_create = */ NULL,
6562
  /* .graph_plan_free = */ NULL,
6563
+ /* .graph_plan_update = */ NULL,
6564
  /* .graph_plan_compute = */ NULL,
6565
  /* .graph_compute = */ ggml_backend_vk_graph_compute,
6566
  /* .supports_op = */ ggml_backend_vk_supports_op,
6567
+ /* .supports_buft = */ ggml_backend_vk_supports_buft,
6568
  /* .offload_op = */ ggml_backend_vk_offload_op,
6569
  /* .event_new = */ NULL,
6570
  /* .event_free = */ NULL,
ggml.c CHANGED
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
297
 
298
  #if defined(GGML_USE_ACCELERATE)
299
  #include <Accelerate/Accelerate.h>
300
- #elif defined(GGML_USE_OPENBLAS)
301
- #if defined(GGML_BLAS_USE_MKL)
302
- #include <mkl.h>
303
- #else
304
- #include <cblas.h>
305
- #endif
306
  #endif
307
 
308
  // floating point type used to accumulate sums
@@ -12179,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
12179
 
12180
  // ggml_compute_forward_mul_mat
12181
 
12182
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12183
- // helper function to determine if it is better to use BLAS or not
12184
- // for large matrices, BLAS is faster
12185
- static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
12186
- const struct ggml_tensor * src0 = dst->src[0];
12187
- const struct ggml_tensor * src1 = dst->src[1];
12188
-
12189
- //const int64_t ne00 = src0->ne[0];
12190
- //const int64_t ne01 = src0->ne[1];
12191
-
12192
- const int64_t ne10 = src1->ne[0];
12193
-
12194
- const int64_t ne0 = dst->ne[0];
12195
- const int64_t ne1 = dst->ne[1];
12196
-
12197
- // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
12198
- // all the experts for each batch element and the processing would become incredibly slow
12199
- // TODO: find the optimal values for these
12200
- if (dst->op != GGML_OP_MUL_MAT_ID &&
12201
- ggml_is_contiguous(src0) &&
12202
- ggml_is_contiguous(src1) &&
12203
- //src0->type == GGML_TYPE_F32 &&
12204
- src1->type == GGML_TYPE_F32 &&
12205
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
12206
-
12207
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
12208
- return true;
12209
- }
12210
-
12211
- return false;
12212
- }
12213
- #endif
12214
-
12215
  static void ggml_compute_forward_mul_mat_one_chunk(
12216
  const struct ggml_compute_params * params,
12217
  struct ggml_tensor * dst,
@@ -12349,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
12349
  // nb01 >= nb00 - src0 is not transposed
12350
  // compute by src0 rows
12351
 
12352
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12353
- if (ggml_compute_forward_mul_mat_use_blas(dst)) {
12354
- const int64_t ne_plane = ne01*ne00;
12355
- const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
12356
- UNUSED(desired_wsize);
12357
-
12358
- if (params->type == GGML_TASK_TYPE_INIT) {
12359
- if (type != GGML_TYPE_F32) {
12360
- assert(params->wsize >= desired_wsize);
12361
- // parallelize by src0 rows
12362
- for (int64_t i13 = 0; i13 < ne13; i13++) {
12363
- for (int64_t i12 = 0; i12 < ne12; i12++) {
12364
- // broadcast src0 into src1 across 2nd,3rd dimension
12365
- const int64_t i03 = i13/r3;
12366
- const int64_t i02 = i12/r2;
12367
-
12368
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
12369
- float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
12370
- ggml_to_float_t const to_float = type_traits[type].to_float;
12371
-
12372
- for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
12373
- to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
12374
- }
12375
- }
12376
- }
12377
- }
12378
- return;
12379
- }
12380
-
12381
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
12382
- return;
12383
- }
12384
-
12385
- // perform sgemm, parallelization controlled by blas lib
12386
- if (ith != 0) {
12387
- return;
12388
- }
12389
-
12390
- //const int64_t tgemm0 = ggml_perf_time_us();
12391
- for (int64_t i13 = 0; i13 < ne13; i13++) {
12392
- for (int64_t i12 = 0; i12 < ne12; i12++) {
12393
- const int64_t i03 = i13/r3;
12394
- const int64_t i02 = i12/r2;
12395
-
12396
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
12397
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
12398
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
12399
-
12400
- if (type != GGML_TYPE_F32) {
12401
- x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
12402
- }
12403
-
12404
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
12405
- ne1, ne01, ne10,
12406
- 1.0f, y, ne10,
12407
- x, ne00,
12408
- 0.0f, d, ne01);
12409
- }
12410
- }
12411
- //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
12412
-
12413
- //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
12414
-
12415
- return;
12416
- }
12417
- #endif
12418
-
12419
  #if GGML_USE_LLAMAFILE
12420
  const bool src1_cont = ggml_is_contiguous(src1);
12421
 
@@ -12796,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
12796
  // nb01 >= nb00 - src0 is not transposed
12797
  // compute by src0 rows
12798
 
12799
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12800
- bool use_blas = ggml_is_matrix(src0) &&
12801
- ggml_is_matrix(src1) &&
12802
- ggml_is_contiguous(src0) &&
12803
- (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
12804
- #endif
12805
-
12806
  if (params->type == GGML_TASK_TYPE_INIT) {
12807
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
12808
- if (use_blas) {
12809
- return;
12810
- }
12811
- #endif
12812
  if (ith != 0) {
12813
  return;
12814
  }
@@ -12820,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
12820
  return;
12821
  }
12822
 
12823
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12824
- if (use_blas) {
12825
- if (params->ith != 0) { // All threads other than the first do no work.
12826
- return;
12827
- }
12828
- // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
12829
- // src0: (k,n)
12830
- // src1: (k,m)
12831
- // dst: (m,n)
12832
- //
12833
- // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
12834
- // Also expressed as (major,minor)
12835
- // a: (m,k): so src1 transposed
12836
- // b: (k,n): so src0
12837
- // c: (m,n)
12838
- //
12839
- // However, if ggml_is_transposed(src1) is true, then
12840
- // src1->data already contains a transposed version, so sgemm mustn't
12841
- // transpose it further.
12842
-
12843
- int n = src0->ne[0];
12844
- int k = src0->ne[1];
12845
- int m = src1->ne[0];
12846
-
12847
- int transposeA, lda;
12848
-
12849
- if (!ggml_is_transposed(src1)) {
12850
- transposeA = CblasTrans;
12851
- lda = m;
12852
- } else {
12853
- transposeA = CblasNoTrans;
12854
- lda = k;
12855
- }
12856
-
12857
- float * a = (float *) ((char *) src1->data);
12858
- float * b = (float *) ((char *) src0->data);
12859
- float * c = (float *) ((char *) dst->data);
12860
-
12861
- cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
12862
-
12863
- return;
12864
- }
12865
- #endif
12866
-
12867
  // dst[:,:,:,:] = 0
12868
  // for i2,i3:
12869
  // for i1:
@@ -12993,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
12993
  // nb01 >= nb00 - src0 is not transposed
12994
  // compute by src0 rows
12995
 
12996
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
12997
-
12998
  if (params->type == GGML_TASK_TYPE_INIT) {
12999
  if (ith != 0) {
13000
  return;
@@ -13391,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
13391
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13392
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13393
 
 
 
13394
  dequantize_row_q(
13395
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13396
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13434,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
13434
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13435
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13436
 
 
 
13437
  ggml_fp16_to_fp32_row(
13438
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13439
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13477,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
13477
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13478
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13479
 
13480
- ggml_bf16_to_fp32_row(
 
 
13481
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13482
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
13483
  }
@@ -13520,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
13520
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13521
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13522
 
 
 
13523
  ggml_vec_cpy_f32(nc,
13524
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
13525
  (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -18893,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
18893
  switch (node->op) {
18894
  case GGML_OP_CPY:
18895
  case GGML_OP_DUP:
 
18896
  case GGML_OP_ADD:
18897
  case GGML_OP_ADD1:
18898
  case GGML_OP_ACC:
@@ -18977,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
18977
  } break;
18978
  case GGML_OP_SCALE:
18979
  case GGML_OP_SET:
18980
- case GGML_OP_CONT:
18981
  case GGML_OP_RESHAPE:
18982
  case GGML_OP_VIEW:
18983
  case GGML_OP_PERMUTE:
@@ -19137,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19137
  sched_yield();
19138
  }
19139
 
19140
- * node_n = atomic_load(&state->shared->node_n);
19141
- if (* node_n != last_node_n) break;
 
 
 
19142
  #if defined(__SSE3__)
19143
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19144
  _mm_pause();
@@ -19148,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19148
 
19149
  static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
19150
  // wait for other threads to finish
19151
- const int last_task_phase = * task_phase;
19152
 
19153
  while (true) {
19154
  if (do_yield) {
19155
  sched_yield();
19156
  }
19157
 
19158
- * task_phase = atomic_load(&state->shared->node_task);
19159
- if (* task_phase != last_task_phase) break;
 
 
 
19160
  #if defined(__SSE3__)
19161
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19162
  _mm_pause();
@@ -19356,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
19356
  {
19357
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
19358
 
19359
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
19360
- if (ggml_compute_forward_mul_mat_use_blas(node)) {
19361
- if (node->src[0]->type != GGML_TYPE_F32) {
19362
- // here we need memory for fully dequantized matrix from src0
19363
- // take into account that src0 can be broadcasted into src1[2,3]
19364
- cur = ggml_type_size(GGML_TYPE_F32)
19365
- * node->src[0]->ne[0]*node->src[0]->ne[1]
19366
- * node->src[1]->ne[2]*node->src[1]->ne[3];
19367
- }
19368
- } else
19369
- #endif
19370
  if (node->src[1]->type != vec_dot_type) {
19371
  cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
19372
  }
@@ -22664,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
22664
  }
22665
 
22666
  int ggml_cpu_has_blas(void) {
22667
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
22668
  return 1;
22669
  #else
22670
  return 0;
 
297
 
298
  #if defined(GGML_USE_ACCELERATE)
299
  #include <Accelerate/Accelerate.h>
 
 
 
 
 
 
300
  #endif
301
 
302
  // floating point type used to accumulate sums
 
12173
 
12174
  // ggml_compute_forward_mul_mat
12175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12176
  static void ggml_compute_forward_mul_mat_one_chunk(
12177
  const struct ggml_compute_params * params,
12178
  struct ggml_tensor * dst,
 
12310
  // nb01 >= nb00 - src0 is not transposed
12311
  // compute by src0 rows
12312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12313
  #if GGML_USE_LLAMAFILE
12314
  const bool src1_cont = ggml_is_contiguous(src1);
12315
 
 
12690
  // nb01 >= nb00 - src0 is not transposed
12691
  // compute by src0 rows
12692
 
 
 
 
 
 
 
 
12693
  if (params->type == GGML_TASK_TYPE_INIT) {
 
 
 
 
 
12694
  if (ith != 0) {
12695
  return;
12696
  }
 
12702
  return;
12703
  }
12704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12705
  // dst[:,:,:,:] = 0
12706
  // for i2,i3:
12707
  // for i1:
 
12831
  // nb01 >= nb00 - src0 is not transposed
12832
  // compute by src0 rows
12833
 
 
 
12834
  if (params->type == GGML_TASK_TYPE_INIT) {
12835
  if (ith != 0) {
12836
  return;
 
13227
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13228
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13229
 
13230
+ assert(i01 >= 0 && i01 < ne01);
13231
+
13232
  dequantize_row_q(
13233
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13234
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
 
13272
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13273
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13274
 
13275
+ assert(i01 >= 0 && i01 < ne01);
13276
+
13277
  ggml_fp16_to_fp32_row(
13278
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13279
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
 
13317
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13318
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13319
 
13320
+ assert(i01 >= 0 && i01 < ne01);
13321
+
13322
+ ggml_bf16_to_fp32_row(
13323
  (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
13324
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
13325
  }
 
13362
  const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
13363
  const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
13364
 
13365
+ assert(i01 >= 0 && i01 < ne01);
13366
+
13367
  ggml_vec_cpy_f32(nc,
13368
  (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
13369
  (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
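The asserts added to the four get_rows variants all guard the same thing: the row index i01 is read from src1 (for example a token id) before it is used to offset into src0, and an out-of-range value would otherwise read past the end of the source matrix. A standalone illustration of the guard with made-up names and sizes:

    #include <assert.h>
    #include <stdint.h>

    // dst <- row i01 of a ne01 x ne00 F32 matrix; i01 comes from external data,
    // so it is validated before indexing, exactly like the asserts above.
    static void get_row_f32(const float * src0, int64_t ne00, int64_t ne01,
                            int32_t i01, float * dst) {
        assert(i01 >= 0 && (int64_t) i01 < ne01);
        for (int64_t j = 0; j < ne00; ++j) {
            dst[j] = src0[(int64_t) i01*ne00 + j];
        }
    }
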
 
18737
  switch (node->op) {
18738
  case GGML_OP_CPY:
18739
  case GGML_OP_DUP:
18740
+ case GGML_OP_CONT:
18741
  case GGML_OP_ADD:
18742
  case GGML_OP_ADD1:
18743
  case GGML_OP_ACC:
 
18822
  } break;
18823
  case GGML_OP_SCALE:
18824
  case GGML_OP_SET:
 
18825
  case GGML_OP_RESHAPE:
18826
  case GGML_OP_VIEW:
18827
  case GGML_OP_PERMUTE:
 
18981
  sched_yield();
18982
  }
18983
 
18984
+ *node_n = atomic_load(&state->shared->node_n);
18985
+ if (*node_n != last_node_n) {
18986
+ break;
18987
+ }
18988
+
18989
  #if defined(__SSE3__)
18990
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
18991
  _mm_pause();
 
18995
 
18996
  static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
18997
  // wait for other threads to finish
18998
+ const int last_task_phase = *task_phase;
18999
 
19000
  while (true) {
19001
  if (do_yield) {
19002
  sched_yield();
19003
  }
19004
 
19005
+ *task_phase = atomic_load(&state->shared->node_task);
19006
+ if (*task_phase != last_task_phase) {
19007
+ break;
19008
+ }
19009
+
19010
  #if defined(__SSE3__)
19011
  // Tell the processor we're spinning. It's a processor hint for spinlocks.
19012
  _mm_pause();
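Both sync helpers now follow the same shape: reload the shared counter with atomic_load inside the loop, break as soon as it differs from the value captured before spinning, and issue _mm_pause() on x86 to ease pressure on the core while busy-waiting. A minimal standalone version of that spin-wait, using plain C11 atomics rather than the ggml wrappers:

    #include <stdatomic.h>
    #include <stdbool.h>
    #if defined(__SSE3__)
    #include <immintrin.h>   // _mm_pause
    #endif

    // Block until *shared changes from last_value, then return the new value.
    static int spin_wait_for_change(atomic_int * shared, int last_value) {
        while (true) {
            int v = atomic_load(shared);
            if (v != last_value) {
                return v;
            }
    #if defined(__SSE3__)
            _mm_pause();   // hint to the core that this is a spin loop
    #endif
        }
    }
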
 
19206
  {
19207
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
19208
 
 
 
 
 
 
 
 
 
 
 
 
19209
  if (node->src[1]->type != vec_dot_type) {
19210
  cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
19211
  }
 
22503
  }
22504
 
22505
  int ggml_cpu_has_blas(void) {
22506
+ #if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
22507
  return 1;
22508
  #else
22509
  return 0;