talk-llama : sync llama.cpp
Changed files:
- examples/talk-llama/llama.cpp (+553 -105)
- examples/talk-llama/llama.h (+1 -0)

examples/talk-llama/llama.cpp CHANGED
@@ -196,6 +196,7 @@ enum llm_arch {
@@ -220,6 +221,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -261,6 +263,7 @@ enum llm_kv {
@@ -273,6 +276,7 @@ enum llm_kv {
@@ -316,6 +320,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -328,6 +333,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -355,6 +361,7 @@ struct LLM_KV {
@@ -536,6 +543,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
@@ -1440,6 +1464,11 @@ static llama_state g_state;
@@ -1481,6 +1510,7 @@ struct llama_hparams {
@@ -1493,6 +1523,8 @@ struct llama_hparams {
@@ -1720,6 +1752,7 @@ struct llama_model {

@@ -1839,8 +1872,6 @@ struct llama_context {
     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
-
-    // allocator for the input tensors
-    ggml_tallocr * alloc = nullptr;

     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;

@@ -1850,6 +1881,7 @@ struct llama_context {
@@ -2829,6 +2861,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
@@ -3000,6 +3033,26 @@ static void llm_load_hparams(
@@ -3204,6 +3257,16 @@ static void llm_load_vocab(
@@ -3232,6 +3295,8 @@ static void llm_load_vocab(
@@ -3569,6 +3634,7 @@ static bool llm_load_tensors(
@@ -3783,11 +3849,50 @@ static bool llm_load_tensors(
@@ -4739,6 +4844,7 @@ struct llm_build_context {
@@ -4782,6 +4888,7 @@ struct llm_build_context {
@@ -5625,6 +5732,100 @@ struct llm_build_context {

@@ -6996,12 +7197,10 @@ struct llm_build_context {
-    // check if we should build the worst-case graph (for memory measurement)
-    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);

@@ -7022,67 +7221,6 @@ static struct ggml_cgraph * llama_build_graph(
-    //
-    // set input data
-    //
-
-    if (!ggml_tallocr_is_measure(lctx.alloc)) {
-        if (batch.token) {
-            const int64_t n_tokens = batch.n_tokens;
-
-            ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
-        }
-
-        if (batch.embd) {
-            const int64_t n_embd   = llm.n_embd;
-            const int64_t n_tokens = batch.n_tokens;
-
-            ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
-        }
-
-        if (batch.pos) {
-            const int64_t n_tokens = batch.n_tokens;
-
-            ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
-        }
-
-        {
-            const int64_t n_kv     = llm.n_kv;
-            const int64_t n_tokens = batch.n_tokens;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
-            float * data = (float *) lctx.inp_KQ_mask->data;
-
-            for (int h = 0; h < 1; ++h) {
-                for (int j = 0; j < n_tokens; ++j) {
-                    const llama_pos    pos    = batch.pos[j];
-                    const llama_seq_id seq_id = batch.seq_id[j][0];
-
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
-                            f = -INFINITY;
-                        } else {
-                            f = 0;
-                        }
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
-                    }
-                }
-            }
-        }
-
-        if (llm.do_rope_shift) {
-            const int64_t n_ctx = llm.n_ctx;
-
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
-            int32_t * data = (int32_t *) lctx.inp_K_shift->data;
-
-            for (int i = 0; i < n_ctx; ++i) {
-                data[i] = lctx.kv_self.cells[i].delta;
-            }
-        }
-    }

@@ -7110,6 +7248,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -7167,6 +7309,83 @@ static struct ggml_cgraph * llama_build_graph(

@@ -7265,17 +7484,22 @@ static int llama_decode_internal(
-    ggml_cgraph * gf = llama_build_graph(lctx, batch);
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    // the embeddings could be the second to last tensor, or the third to last tensor

@@ -7305,6 +7529,9 @@ static int llama_decode_internal(

@@ -7344,7 +7571,7 @@ static int llama_decode_internal(
-    {

@@ -7388,9 +7615,11 @@ static int llama_decode_internal(
@@ -7454,6 +7683,9 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -7466,6 +7698,7 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {

@@ -7936,12 +8169,212 @@ private:
-struct fragment_buffer_variant{

@@ -7971,8 +8404,7 @@ struct fragment_buffer_variant{
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
-{

@@ -8090,10 +8522,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
@@ -8113,19 +8543,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
@@ -8133,9 +8559,23 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
@@ -10799,7 +11239,7 @@ struct llama_context * llama_new_context_with_model(
@@ -10810,12 +11250,14 @@ struct llama_context * llama_new_context_with_model(

@@ -10841,23 +11283,27 @@ struct llama_context * llama_new_context_with_model(
-    ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
-    ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

@@ -11746,6 +12192,7 @@ static std::string llama_decode_text(const std::string & text) {
@@ -11869,6 +12316,7 @@ const char * llama_print_system_info(void) {
     LLM_ARCH_STARCODER,
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
+    LLM_ARCH_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,

     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN, "qwen" },

     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_CAUSAL,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
+    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
     LLM_KV_TOKENIZER_SCORES,
     LLM_KV_TOKENIZER_MERGES,
     LLM_KV_TOKENIZER_BOS_ID,

     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
     { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
     { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
     { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },

 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
+    LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,

         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_BERT,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+        { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+        { LLM_TENSOR_POS_EMBD, "position_embd" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+    },
+},
 {
     LLM_ARCH_BLOOM,
     {
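The BERT entry above maps each logical tensor to a GGUF name template; the "%d" is filled with the layer index and a "weight" or "bias" suffix is appended when the tensor is created (see the tn(...) calls in the tensor-loading hunk further down). A minimal sketch of that expansion, using a hypothetical helper rather than llama.cpp's own LLM_TN machinery:

    #include <cstdio>
    #include <string>

    // Illustration only: "blk.%d.attn_q" with layer 0 and suffix "weight"
    // expands to "blk.0.attn_q.weight".
    static std::string blk_tensor_name(const char * tmpl, int layer, const char * suffix) {
        char buf[128];
        std::snprintf(buf, sizeof(buf), tmpl, layer);
        return std::string(buf) + "." + suffix;
    }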
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_17M,
+    MODEL_22M,
+    MODEL_33M,
+    MODEL_109M,
+    MODEL_335M,
     MODEL_0_5B,
     MODEL_1B,
     MODEL_2B,

     uint32_t n_ff;
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
+    uint32_t n_vocab_type = 0; // for BERT-style token types

     float f_norm_eps;
     float f_norm_rms_eps;

     float f_clamp_kqv;
     float f_max_alibi_bias;

+    bool causal_attn = true;
+
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;

     llama_vocab vocab;

     struct ggml_tensor * tok_embd;
+    struct ggml_tensor * type_embd;
     struct ggml_tensor * pos_embd;
     struct ggml_tensor * tok_norm;
     struct ggml_tensor * tok_norm_b;

     // memory buffers used to evaluate the model
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;

     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;

     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
+    struct ggml_tensor * inp_sum;     // F32 [1, n_batch]

 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;

     switch (type) {
         case LLAMA_VOCAB_TYPE_SPM: return "SPM";
         case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
         default: return "unknown";
     }
 }

             default: model.type = e_model::MODEL_UNKNOWN;
         }
     } break;
+    case LLM_ARCH_BERT:
+        {
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+            ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+            ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+
+            switch (hparams.n_layer) {
+                case 3:
+                    model.type = e_model::MODEL_17M; break; // bge-micro
+                case 6:
+                    model.type = e_model::MODEL_22M; break; // MiniLM-L6
+                case 12:
+                    switch (hparams.n_embd) {
+                        case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
+                        case 768: model.type = e_model::MODEL_109M; break; // bge-base
+                    } break;
+                case 24:
+                    model.type = e_model::MODEL_335M; break; // bge-large
+            }
+        } break;
     case LLM_ARCH_BLOOM:
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

             vocab.special_unk_id = -1;
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
+        } else if (tokenizer_name == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            vocab.special_bos_id = 101;
+            vocab.special_eos_id = 102;
+            vocab.special_unk_id = 100;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.add_space_prefix = false;
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+        vocab.linefeed_id = vocab.special_pad_id;
     } else {
         const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
         GGML_ASSERT(!ids.empty() && "model vocab missing newline token");

     const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
     const int64_t n_embd_gqa = n_embd_v_gqa;
     const int64_t n_vocab = hparams.n_vocab;
+    const int64_t n_vocab_type = hparams.n_vocab_type;
     const int64_t n_ff = hparams.n_ff;

     GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

                     layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
                 }
             } break;
+        case LLM_ARCH_BERT:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
+                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
+                model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

                 // output
                 {

     const int32_t n_orig_ctx;

     const bool do_rope_shift;
+    const bool causal_attn;

     const llm_build_cb & cb;

         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
         do_rope_shift (worst_case || kv_self.has_shift),
+        causal_attn (hparams.causal_attn),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
         return gf;
     }

+    struct ggml_cgraph * build_bert() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        // get input vectors with right size
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        struct ggml_tensor * inp_sum = ggml_view_1d(ctx0, lctx.inp_sum, n_tokens, 0);
+
+        // construct input embeddings (token, type, position)
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        // token types are hardcoded to zero ("Sentence A")
+        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+        inpL = ggml_add(ctx0, inpL, type_row0);
+        inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+        cb(inpL, "inp_embd", -1);
+
+        // embed layer norm
+        inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+        cb(inpL, "inp_norm", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1); // [n_kv, n_tokens]
+
+        // iterate layers
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * cur = inpL;
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                // seems like we just need to do this for Q?
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // re-add the layer input
+            cur = ggml_add(ctx0, cur, inpL);
+
+            // attention layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
+
+            struct ggml_tensor * ffn_inp = cur;
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                    NULL, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+            cb(cur, "ffn_out", il);
+
+            // attentions bypass the intermediate layer
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            // output layer norm
+            cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        // final output
+        cur = inpL;
+
+        // pooling
+        cur = ggml_mul_mat(ctx0, inp_sum, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
+        cb(cur, "result_embed", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_bloom() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
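In build_bert() above, lctx.inp_sum is filled with 1/n_tokens for every position (see llama_set_inputs() later in this diff), so the final ggml_mul_mat(inp_sum, transpose(cur)) step is a mean over the per-token hidden states. A self-contained sketch of the same pooling outside of ggml, for illustration only:

    #include <vector>

    // Mean-pool per-token embeddings [n_tokens][n_embd] into one vector [n_embd],
    // equivalent to multiplying by a row vector whose entries are all 1/n_tokens.
    static std::vector<float> mean_pool(const std::vector<std::vector<float>> & tokens) {
        std::vector<float> out(tokens.empty() ? 0 : tokens[0].size(), 0.0f);
        for (const auto & t : tokens) {
            for (size_t i = 0; i < out.size(); ++i) {
                out[i] += t[i];
            }
        }
        for (float & v : out) {
            v /= float(tokens.size());
        }
        return out;
    }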
 static struct ggml_cgraph * llama_build_graph(
      llama_context & lctx,
+     const llama_batch & batch,
+     bool worst_case) {
     const auto & model = lctx.model;

     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {

     struct llm_build_context llm(lctx, batch, cb, worst_case);

     llm.init();

     switch (model.arch) {

             {
                 result = llm.build_refact();
             } break;
+        case LLM_ARCH_BERT:
+            {
+                result = llm.build_bert();
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 result = llm.build_bloom();
     return result;
 }

+static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
+    //
+    // set input data
+    //
+
+    const auto & hparams = lctx.model.hparams;
+    const auto & cparams = lctx.cparams;
+    const auto & kv_self = lctx.kv_self;
+
+    if (batch.token) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+    }
+
+    if (batch.embd) {
+        const int64_t n_embd = hparams.n_embd;
+        const int64_t n_tokens = batch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+    }
+
+    if (batch.pos) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+    }
+
+    {
+        const int64_t n_kv = kv_self.n;
+        const int64_t n_tokens = batch.n_tokens;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+
+        float * data = (float *) lctx.inp_KQ_mask->data;
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos pos = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    float f;
+                    if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
+                        f = -INFINITY;
+                    } else {
+                        f = 0;
+                    }
+                    data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                }
+            }
+        }
+    }
+
+    {
+        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
+        float * data = (float *) lctx.inp_sum->data;
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            data[i] = 1.0f/float(batch.n_tokens);
+        }
+    }
+
+    if (kv_self.has_shift) {
+        const int64_t n_ctx = cparams.n_ctx;
+
+        assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
+
+        int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+
+        for (int i = 0; i < n_ctx; ++i) {
+            data[i] = lctx.kv_self.cells[i].delta;
+        }
+    }
+}
+
 // decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
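The inp_KQ_mask filled in llama_set_inputs() above stores 0.0f where a KV cell is visible to a query token and -INFINITY where it is not (the cell belongs to another sequence, or sits at a later position). A toy reproduction of that fill for a single sequence, assuming the KV cells simply hold positions 0..n_kv-1 (illustration only, not code from this commit):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_tokens = 3, n_kv = 3;
        float data[n_tokens * n_kv];
        for (int j = 0; j < n_tokens; ++j) {      // query token j at position j
            for (int i = 0; i < n_kv; ++i) {      // KV cell i at position i
                data[j*n_kv + i] = (i > j) ? -INFINITY : 0.0f;
            }
        }
        // prints a lower-triangular pattern: row j attends to cells 0..j
        for (int j = 0; j < n_tokens; ++j) {
            for (int i = 0; i < n_kv; ++i) {
                std::printf("%6.0f ", data[j*n_kv + i]);
            }
            std::printf("\n");
        }
        return 0;
    }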
     ggml_backend_sched_reset(lctx.sched);
     ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);

+    ggml_cgraph * gf = llama_build_graph(lctx, batch, false);

     // the output is always the last tensor in the graph
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+    if (strcmp(res->name, "result_output") == 0) {
+        // the embeddings could be the second to last tensor, or the third to last tensor
+        if (strcmp(embeddings->name, "result_norm") != 0) {
+            embeddings = gf->nodes[gf->n_nodes - 3];
+            GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+        }
+    } else if (strcmp(res->name, "result_embed") == 0) {
+        embeddings = res;
+        res = nullptr;
+    } else {
+        GGML_ASSERT(false);
     }

     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
     }
+
+    llama_set_inputs(lctx, batch);
+
     ggml_backend_sched_graph_compute(lctx.sched, gf);

     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));

     // extract logits
     // TODO: do not compute and extract logits if only embeddings are needed
     //       need to update the graphs to skip "result_output"
+    if (res) {
         auto & logits_out = lctx.logits;

 #ifndef NDEBUG

     if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;

+        const int64_t embed_pos = res ? n_embd * (n_tokens-1) : 0;
+
         embedding_out.resize(n_embd);
         ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
+        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embed_pos*sizeof(float), n_embd*sizeof(float));
         ggml_backend_synchronize(embeddings_backend);
     }

             GGML_ASSERT(false);
             return unicode_to_bytes_bpe(token_data.text);
         }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
         default:
             GGML_ASSERT(false);
     }

         const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
         return vocab.token_to_id.at(buf);
     }
+    case LLAMA_VOCAB_TYPE_WPM:
     case LLAMA_VOCAB_TYPE_BPE: {
         return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
     }
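With the new result_embed path above, a BERT-style graph ends in the pooled embedding rather than logits, and llama_decode_internal() copies that vector into lctx.embedding. A sketch of how a caller might read it back afterwards, assuming the context was created with embeddings enabled and that this build exposes the llama_get_embeddings() accessor (both are assumptions, not shown in this diff):

    #include "llama.h"
    #include <cstdio>

    // Print the pooled embedding produced by the last llama_decode() call.
    static void print_embedding(struct llama_context * ctx, int n_embd) {
        const float * emb = llama_get_embeddings(ctx); // assumed accessor into lctx.embedding
        if (emb == NULL) {
            return;
        }
        for (int i = 0; i < n_embd; ++i) {
            std::printf("%f ", emb[i]);
        }
        std::printf("\n");
    }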
     llm_bigram_bpe::queue work_queue;
 };

+struct llm_tokenizer_wpm {
+    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        auto * token_map = &vocab.token_to_id;
+
+        // normalize and split by whitespace
+        std::vector<std::string> words = preprocess(text);
+
+        // bos token prepended already
+
+        // find the longest tokens that form the words
+        for (const std::string &word : words) {
+            // skip empty words
+            if (word.size() == 0) {
+                continue;
+            }
+
+            // prepend phantom space
+            std::string word1 = "\xe2\x96\x81" + word;
+            int n = word1.size();
+
+            // we're at the start of a new word
+            int i = 0;
+            bool match_any = false;
+
+            // move through character position in word
+            while (i < n) {
+                // loop through possible match length
+                bool match = false;
+                for (int j = n; j > i; j--) {
+                    auto it = token_map->find(word1.substr(i, j - i));
+                    if (it != token_map->end()) {
+                        output.push_back(it->second);
+                        match = true;
+                        match_any = true;
+                        i = j;
+                        break;
+                    }
+                }
+
+                // must be an unknown character
+                if (!match) {
+                    i++;
+                }
+            }
+
+            // we didn't find any matches for this word
+            if (!match_any) {
+                output.push_back(vocab.special_unk_id);
+            }
+        }
+
+        // append eos token
+        output.push_back(vocab.special_eos_id);
+    }
+
+    std::vector<std::string> preprocess(const std::string & text) {
+        std::string ori_str = normalize(text);
+        uint64_t ori_size = ori_str.size();
+
+        // single punct / single symbol / single digit
+        // baseline: add whitespace on the left and right of punct and chinese characters
+        std::vector<std::string> words;
+        std::string new_str = "";
+        uint64_t i = 0;
+        while (i < ori_size) {
+            int utf_char_len = utf8_len(ori_str[i]);
+            if ((utf_char_len == 1) && ispunct(ori_str[i])) {
+                new_str += " ";
+                new_str += ori_str[i];
+                new_str += " ";
+                i += 1;
+            }
+            else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
+                new_str += " ";
+                new_str += ori_str.substr(i, 3);
+                new_str += " ";
+                i += 3;
+            }
+            else {
+                new_str += ori_str[i];
+                i += 1;
+            }
+        }
+
+        // split by whitespace
+        uint64_t l = 0;
+        uint64_t r = 0;
+        while (r < new_str.size()) {
+            // if is whitespace
+            if (isspace(new_str[r])) {
+                if (r > l) words.push_back(new_str.substr(l, (r - l)));
+                l = r + 1;
+                r = l;
+            }
+            else {
+                r += 1;
+            }
+        }
+        if (r > l) {
+            words.push_back(new_str.substr(l, (r - l)));
+        }
+        return words;
+    }
+
+    std::string normalize(const std::string & text) {
+        // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
+        std::string text2 = strip_accents(text);
+        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
+            char c = text2[i];
+            if (c >= 'A' && c <= 'Z') {
+                text2[i] = c - 'A' + 'a';
+            }
+        }
+        return text2;
+    }
+
+    bool is_chinese_char(const std::string & str) {
+        int len = str.length();
+        unsigned int codepoint = 0;
+        int num_bytes = 0;
+        int i = 0;
+        unsigned char ch = static_cast<unsigned char>(str[i]);
+        if (ch <= 0x7f) {
+            codepoint = ch;
+            num_bytes = 1;
+        } else if ((ch >> 5) == 0x06) {
+            codepoint = ch & 0x1f;
+            num_bytes = 2;
+        } else if ((ch >> 4) == 0x0e) {
+            codepoint = ch & 0x0f;
+            num_bytes = 3;
+        } else if ((ch >> 3) == 0x1e) {
+            codepoint = ch & 0x07;
+            num_bytes = 4;
+        }
+        for (int j = 1; j < num_bytes; ++j) {
+            if (i + j >= len) {
+                return false; // incomplete UTF-8 character
+            }
+            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
+            if ((next_ch >> 6) != 0x02) {
+                return false; // invalid trailing byte
+            }
+            codepoint = (codepoint << 6) | (next_ch & 0x3f);
+        }
+        if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
+            (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
+            (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
+            (codepoint >= 0x2A700 && codepoint <= 0x2B73F) ||
+            (codepoint >= 0x2B740 && codepoint <= 0x2B81F) ||
+            (codepoint >= 0x2B920 && codepoint <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+            (codepoint >= 0xF900 && codepoint <= 0xFAFF) ||
+            (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) ||
+            (codepoint >= 0x3000 && codepoint <= 0x303F) ||
+            (codepoint >= 0xFF00 && codepoint <= 0xFFEF)) {
+            return true; // NOLINT
+        }
+        return false;
+    }
+
+    std::string strip_accents(const std::string & input_string) {
+        std::string resultString;
+        std::map<std::string, char> accent_map = {
+            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
+            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
+            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
+            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
+            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
+            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
+            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
+            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
+            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
+        };
+
+        for (size_t i = 0; i < input_string.length();) {
+            int len = utf8_len(input_string[i]);
+            std::string curChar = input_string.substr(i, len);
+            auto iter = accent_map.find(curChar);
+            if (iter != accent_map.end()) {
+                resultString += iter->second;
+            } else {
+                resultString += curChar;
+            }
+            i += len;
+        }
+
+        return resultString;
+    }
+
+    static size_t utf8_len(char src) {
+        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+        return lookup[highbits];
+    }
+
+    const llama_vocab & vocab;
+};
+
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
     FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
     FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
 } FRAGMENT_BUFFER_VARIANT_TYPE;

+struct fragment_buffer_variant {
     fragment_buffer_variant(llama_vocab::id _token)
     :
         type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
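The tokenize() loop in llm_tokenizer_wpm above is a greedy longest-match scan: for each whitespace-split word it repeatedly takes the longest substring found in the vocabulary, and if a word yields no match at all it falls back to the unknown token. A self-contained sketch of that matching step over a plain std::set vocabulary (hypothetical helper, not the struct above):

    #include <set>
    #include <string>
    #include <vector>

    // Greedy longest-match segmentation of a single word, mirroring the loop in
    // llm_tokenizer_wpm::tokenize(); unmatched characters are skipped one byte at a time.
    static std::vector<std::string> wordpiece_split(const std::string & word,
                                                    const std::set<std::string> & vocab) {
        std::vector<std::string> pieces;
        size_t i = 0;
        while (i < word.size()) {
            bool match = false;
            for (size_t j = word.size(); j > i; --j) {   // try the longest span first
                if (vocab.count(word.substr(i, j - i))) {
                    pieces.push_back(word.substr(i, j - i));
                    i = j;
                    match = true;
                    break;
                }
            }
            if (!match) {
                i += 1;
            }
        }
        return pieces;
    }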
     // #define PRETOKENIZERDEBUG

+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
     for (const auto & st: vocab.special_tokens_cache) {
         const auto & special_token = st.first;

     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer

                         // TODO: It's likely possible to get rid of this string copy entirely

                         llm_tokenizer_spm tokenizer(vocab);
                         llama_escape_whitespace(raw_text);
                         tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

 #ifdef PRETOKENIZERDEBUG
 #endif
                         llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
                     }
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            {
+                for (const auto & fragment: fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_wpm tokenizer(vocab);
+                        tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
                     }
                 }
     // graph inputs
     {
         ggml_init_params init_params = {
+            /* .mem_size   */ ggml_tensor_overhead()*7,
             /* .mem_buffer */ nullptr,
             /* .no_alloc   */ true,
         };

         ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
         ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
         ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+        ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, 1, cparams.n_batch);

         ggml_set_name(ctx->inp_tokens, "inp_tokens");
         ggml_set_name(ctx->inp_embd, "inp_embd");
         ggml_set_name(ctx->inp_pos, "inp_pos");
         ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
         ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
+        ggml_set_name(ctx->inp_sum, "inp_sum");

         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));

     ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

     ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);

     // build worst-case graph
     int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
     int n_past = cparams.n_ctx - n_tokens;
     llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+    ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

     // initialize scheduler with the worst-case graph
+    if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
+        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+        llama_free(ctx);
+        return nullptr;
+    }

+    for (size_t i = 0; i < ctx->backends.size(); i++) {
+        ggml_backend_t backend = ctx->backends[i];
+        ggml_backend_buffer_type_t buft = backend_buft[i];
+        size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
         LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                ggml_backend_buft_name(buft),
+                size / 1024.0 / 1024.0);
     }

     // note: the number of splits during measure is higher than during inference due to the kv shift

 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_SPM: {
             // NOTE: we accept all unsupported token types,
             // suppressing them like CONTROL tokens.

     s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";

     return s.c_str();
 }
examples/talk-llama/llama.h
CHANGED
@@ -61,6 +61,7 @@ extern "C" {
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
     };

     enum llama_token_type {