Commit f33490f (unverified) by ggerganov · Parent: 98b68e8

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -1903,6 +1903,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos   /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
@@ -2180,7 +2202,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2196,6 +2222,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
             case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
             case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2558,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2567,6 +2596,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2801,6 +2832,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3117,7 +3149,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
@@ -4772,7 +4812,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4896,7 +4935,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4995,9 +5033,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5209,9 +5245,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5304,7 +5338,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5400,7 +5433,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5727,7 +5759,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5951,7 +5982,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa  == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8926,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8938,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8963,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
@@ -9014,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9022,6 +9057,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS;  break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -9070,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
@@ -10146,9 +10183,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10881,7 +10930,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
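
Note on the new KV-cache helper synced above: llama_kv_cache_seq_div integer-divides the cached positions of one sequence in [p0, p1) and marks the cache as shifted, so a RoPEd cache gets corrected on the next graph build. The sketch below is only an illustration of how a caller might drive it for grouped ("self-extend" style) context compression; the compress_prefix wrapper and its n_past/grp_size parameters are assumptions, not part of this commit.

    // Illustrative caller-side sketch (assumed usage, not from this commit):
    // divide the positions of the first n_past cells of sequence 0 by grp_size.
    static void compress_prefix(struct llama_context * ctx, llama_pos n_past, int grp_size) {
        if (grp_size <= 1) {
            return; // llama_kv_cache_seq_div is also a no-op for d == 1
        }
        // positions in [0, n_past) of sequence 0 are divided by grp_size;
        // for a RoPEd cache the K data is updated through the usual shift path
        llama_kv_cache_seq_div(ctx, 0, 0, n_past, grp_size);
    }
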
examples/talk-llama/llama.h CHANGED
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S  = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M  = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K    = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS  = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -484,6 +487,17 @@ extern "C" {
                        llama_pos   p1,
                        llama_pos   delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);
+
     //
     // State / sessions
     //
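
For reference, the three new file types declared above plug into the existing quantization entry point. A minimal usage sketch follows; it assumes the pre-existing llama_model_quantize / llama_model_quantize_default_params API from llama.h, and the helper name and file-name parameters are placeholders rather than part of this commit.

    #include "llama.h"

    // Hypothetical helper: quantize a GGUF model to the new 2.3125 bpw IQ2_XS format.
    // llama_model_quantize returns 0 on success.
    static int quantize_to_iq2_xs(const char * fname_inp, const char * fname_out) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; // or LLAMA_FTYPE_MOSTLY_IQ2_XXS / LLAMA_FTYPE_MOSTLY_Q2_K_S
        return (int) llama_model_quantize(fname_inp, fname_out, &params);
    }
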