talk-llama : sync llama.cpp
Changed files:
- examples/talk-llama/llama-sampling.cpp +35 -90
- examples/talk-llama/llama-vocab.cpp +2 -1
- examples/talk-llama/llama.cpp +485 -33
- examples/talk-llama/llama.h +15 -15
- examples/talk-llama/unicode.cpp +51 -51
- examples/talk-llama/unicode.h +9 -10
examples/talk-llama/llama-sampling.cpp
CHANGED

```diff
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool penalize_nl;
-    const bool ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx->prev.push_back(token);
-}
+    ctx->token_count[token]++;
 
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-    if (ctx->ignore_eos) {
-        assert(ctx->special_eos_id >= 0);
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
+        }
+    }
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-            cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
-        }
-    }
+    ctx->prev.push_back(token);
 
-    if ((ctx->penalty_last_n == 0) ||
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
+    }
+
+    assert(ctx->token_count == tmp);
+#endif
+}
 
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-        // optimistically check if the candidates are not yet sorted/shuffled/truncated
-        if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-            nl_found = true;
-            nl_idx = ctx->linefeed_id;
-            nl_logit = cur_p->data[ctx->linefeed_id].logit;
-        } else {
-            // else, search for the linefeed token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->linefeed_id) {
-                    nl_found = true;
-                    nl_idx = i;
-                    nl_logit = cur_p->data[i].logit;
-                    break;
-                }
-            }
-        }
-    }
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
+    }
 
-    // Create a frequency map to count occurrences of each token in last_tokens
-    // TODO: optimize this by maintaining the token count in the sampler context
-    using llama_token_cnt = std::unordered_map<llama_token, int>;
-    llama_token_cnt token_count;
-
-    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-        token_count[ctx->prev.rat(i)]++;
-    }
-
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
             continue;
        }
 
         const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
         if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-        ctx->n_vocab,
-        ctx->special_eos_id,
-        ctx->linefeed_id,
         ctx->penalty_last_n,
         ctx->penalty_repeat,
         ctx->penalty_freq,
-        ctx->penalty_present,
-        ctx->penalize_nl,
-        ctx->ignore_eos);
+        ctx->penalty_present);
 
     // copy the state
     {
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size(), str_len = str.size();
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
```
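For context on the sampling refactor above: the old `llama_sampler_penalties_apply` rebuilt a token-frequency map from the `prev` ring buffer on every call, while the new code maintains `token_count` incrementally in the accept step, so each accepted token costs O(1) map work. A minimal self-contained sketch of the same sliding-window bookkeeping (the names and the `std::deque` stand-in are illustrative, not the llama.cpp API):

```cpp
#include <cassert>
#include <cstdio>
#include <deque>
#include <unordered_map>

// Sliding-window token counter in the spirit of the new llama_sampler_penalties:
// accept() admits a token, evicts the oldest one once the window is full, and
// keeps the frequency map consistent with the window contents at all times.
struct token_window {
    size_t capacity;
    std::deque<int> prev;                     // stands in for ring_buffer<llama_token>
    std::unordered_map<int, int> token_count;

    void accept(int token) {
        if (capacity == 0) {
            return; // mirrors the early-out on an empty ring buffer
        }
        token_count[token]++;
        if (prev.size() >= capacity) {        // window full: evict the oldest token
            const int old = prev.front();
            prev.pop_front();
            if (--token_count[old] == 0) {
                token_count.erase(old);
            }
        }
        prev.push_back(token);
    }
};

int main() {
    token_window w { /*capacity=*/3 };
    for (int t : {5, 5, 7, 5, 9}) {           // window ends as {7, 5, 9}
        w.accept(t);
    }
    assert(w.token_count.at(5) == 1 && w.token_count.at(7) == 1 && w.token_count.at(9) == 1);
    printf("distinct tokens in window: %zu\n", w.token_count.size());
    return 0;
}
```

The apply step then only has to look each candidate up in the map: the repeat penalty multiplies or divides the logit depending on its sign (as the comment in the diff explains), and the frequency/presence penalties subtract an amount that grows with the stored count.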
examples/talk-llama/llama-vocab.cpp
CHANGED

```diff
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) { // finish previous word if any
```
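A note on the MINERVA addition above: the `regex_exprs` it selects are GPT-2-style pre-tokenizer split patterns, which llama.cpp applies through its own `unicode_regex_split()` because `std::regex` has no `\p{L}`/`\p{N}` Unicode property classes. A rough ASCII-only approximation of how such a pattern carves text into pre-tokens — the character classes are stand-ins, so treat this as illustrative rather than the real tokenizer:

```cpp
#include <iostream>
#include <regex>
#include <string>

// ASCII approximation of the GPT-2-style pre-tokenizer split selected above:
// [a-zA-Z] and [0-9] stand in for \p{L} and \p{N}, which std::regex cannot
// express; llama.cpp uses its own unicode_regex_split() for the real thing.
int main() {
    const std::regex re("'s|'t|'re|'ve|'m|'ll|'d| ?[a-zA-Z]+| ?[0-9]+| ?[^\\sa-zA-Z0-9]+|\\s+");
    const std::string text = "Minerva's tokenizer splits this, doesn't it?";

    for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
        std::cout << "[" << it->str() << "]";
    }
    std::cout << "\n"; // [Minerva]['s][ tokenizer][ splits][ this][,][ doesn]['t][ it][?]
    return 0;
}
```

The `unicode_cpt_flags` to `unicode_cpt_flags_from_cpt` rename in the WPM path comes from the unicode.cpp/unicode.h changes included in the same sync.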
examples/talk-llama/llama.cpp
CHANGED

```diff
@@ -163,6 +163,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
+    LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
@@ -183,6 +184,7 @@ enum llm_arch {
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
@@ -217,6 +219,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
@@ -237,6 +240,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_OLMOE, "olmoe" },
     { LLM_ARCH_OPENELM, "openelm" },
     { LLM_ARCH_ARCTIC, "arctic" },
+    { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_BITNET, "bitnet" },
@@ -308,6 +312,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -424,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -898,6 +904,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_QWEN2MOE,
         {
@@ -1288,6 +1311,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DEEPSEEK2,
         {
@@ -1562,6 +1612,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
     LLM_CHAT_TEMPLATE_GEMMA,
@@ -1579,6 +1630,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_EXAONE_3,
     LLM_CHAT_TEMPLATE_RWKV_WORLD,
     LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -1593,6 +1645,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
     { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
@@ -1610,6 +1663,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
     { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+    { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -1794,7 +1848,7 @@ private:
         DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                       NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
         if (!bufLen) {
-            ret = format("Win32 error code: %
+            ret = format("Win32 error code: %lx", error_code);
         } else {
             ret = lpMsgBuf;
             LocalFree(lpMsgBuf);
@@ -2132,7 +2186,7 @@ struct llama_mmap {
         HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
         // may fail on pre-Windows 8 systems
-        pPrefetchVirtualMemory =
+        pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
 
         if (pPrefetchVirtualMemory) {
             // advise the kernel to preload the mapped memory
@@ -2474,11 +2528,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
     float    rope_attn_factor = 1.0f;
     float    rope_freq_base_train;
     float    rope_freq_scale_train;
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul;
+    int      rope_sections[4];
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -2535,6 +2590,9 @@ struct llama_hparams {
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
         if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
+        if (std::equal(std::begin(this->rope_sections),
+                       std::end(this->rope_sections),
+                       std::begin(other.rope_sections))) return true;
 
         if (this->ssm_d_conv != other.ssm_d_conv) return true;
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3378,6 +3436,11 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;
 
+    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    // number of position id each token get, 1 for each token in most cases.
+    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+    int n_pos_per_token = 1;
+
     // output of the encoder part of the encoder-decoder models
     std::vector<float> embd_enc;
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -4578,9 +4641,6 @@ struct llama_model_loader {
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
-            case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5344,9 +5404,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
 
         default: return "unknown, may not work";
     }
@@ -5753,6 +5810,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                std::array<int, 4> section_dims;
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+                std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+            }
+            // fall through
         case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6063,6 +6127,19 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 bool is_lite = (hparams.n_layer == 27);
@@ -6398,6 +6475,11 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "falcon") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+        } else if (
+                tokenizer_pre == "falcon3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            vocab.tokenizer_ignore_merges = true;
+            vocab.tokenizer_add_bos = true;
         } else if (
                 tokenizer_pre == "mpt") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
@@ -6409,6 +6491,7 @@ static void llm_load_vocab(
                 tokenizer_pre == "phi-2" ||
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "gigachat" ||
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
@@ -6479,6 +6562,9 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
             vocab.tokenizer_add_bos = true;
             vocab.tokenizer_clean_spaces = false;
+        } else if (
+                tokenizer_pre == "minerva-7b") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -7057,6 +7143,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
+    if (model.arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+    }
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -8170,6 +8263,7 @@ static bool llm_load_tensors(
                 }
             } break;
         case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2VL:
            {
                 model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -8830,6 +8924,55 @@ static bool llm_load_tensors(
                     layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+
+                const int64_t n_ff_exp        = hparams.n_ff_exp;
+                const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    if (i < (int) hparams.n_layer_dense_lead) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    } else {
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0");
+                        }
+
+                        // MoE branch
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+
+                        // Shared expert branch
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
+                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                    }
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 const bool is_lite = (hparams.n_layer == 27);
```
```diff
@@ -12559,6 +12702,124 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_qwen2vl() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
+        cb(lctx.inp_pos, "inp_pos", -1);
+        ggml_set_input(lctx.inp_pos);
+        struct ggml_tensor * inp_pos = lctx.inp_pos;
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[4];
+        std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
```
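A note on the M-RoPE graph above: `build_qwen2vl` allocates `n_tokens * 4` position slots and hands `sections[4]` to `ggml_rope_multi`, so each token carries one position id per section stream (e.g. time/height/width) instead of a single scalar position. A rough self-contained sketch of the section lookup idea — the function name and the example split are assumptions for illustration, not the ggml implementation:

```cpp
#include <cstdio>

// Illustrative M-RoPE section mapping: the rotary dimension pairs are
// partitioned into sections, and each section takes its rotation angle from a
// different position stream, laid out as pos[stream * n_tokens + token].
int section_of(int dim_pair, const int sections[4]) {
    int acc = 0;
    for (int s = 0; s < 4; ++s) {
        acc += sections[s];
        if (dim_pair < acc) {
            return s;
        }
    }
    return 3; // out-of-range pairs fall into the last section
}

int main() {
    const int sections[4] = {16, 24, 24, 0}; // assumed split of 64 rotary pairs
    const int n_tokens    = 2;
    const int pos[3 * 2]  = {0, 1,  5, 5,  7, 8}; // time/height/width ids per token

    for (int d : {0, 20, 50}) {
        const int s = section_of(d, sections);
        printf("dim pair %2d -> section %d, token 0 rotates by pos %d\n",
               d, s, pos[s * n_tokens + 0]);
    }
    return 0;
}
```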
```diff
@@ -15066,6 +15327,161 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_deepseek() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, false,
+                            false, hparams.expert_weights_scale,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
```
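A note on the DeepSeek graph above: the first `n_layer_dense_lead` layers keep a dense FFN, and every later layer adds a routed-experts branch to an always-on shared expert (`cur = moe_out + ffn_shexp`). A toy scalar sketch of that combine — softmax router weights, top-k selection, a weight scale, and the shared-expert add; whether the weights are renormalized or scaled in the real path is controlled by the flags passed to `llm_build_moe_ffn`, so treat this as illustrative only:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Toy scalar version of a routed-MoE + shared-expert combine for one token.
int main() {
    const std::vector<float> router_logits = {1.0f, 3.0f, 2.0f, 0.5f};  // 4 experts
    const std::vector<float> expert_out    = {0.2f, -0.1f, 0.4f, 0.3f}; // per-expert FFN outputs
    const float shared_expert_out    = 0.25f; // always-on shared expert
    const int   n_expert_used        = 2;
    const float expert_weights_scale = 1.5f;  // stands in for hparams.expert_weights_scale

    // softmax over all router logits
    std::vector<float> w(router_logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < w.size(); ++i) sum += w[i] = std::exp(router_logits[i]);
    for (float & x : w) x /= sum;

    // pick the top n_expert_used experts by router weight
    std::vector<size_t> idx = {0, 1, 2, 3};
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](size_t a, size_t b) { return w[a] > w[b]; });

    // weighted sum of the selected experts plus the shared expert
    float out = shared_expert_out;
    for (int k = 0; k < n_expert_used; ++k) {
        out += expert_weights_scale * w[idx[k]] * expert_out[idx[k]];
    }
    printf("combined output: %f\n", out);
    return 0;
}
```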
```diff
@@ -16660,6 +17076,11 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen2();
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                lctx.n_pos_per_token = 4;
+                result = llm.build_qwen2vl();
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 result = llm.build_qwen2moe();
@@ -16748,6 +17169,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 result = llm.build_deepseek2();
@@ -16878,8 +17303,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
 
     if (ubatch.pos && lctx.inp_pos) {
         const int64_t n_tokens = ubatch.n_tokens;
-
-        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+        auto n_pos = lctx.n_pos_per_token;
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
```
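The `llama_set_inputs` change above scales the position upload by `lctx.n_pos_per_token`, which the QWEN2VL case sets to 4 before building its graph. A trivial sketch of why the byte count has to change (numbers are made up):

```cpp
#include <cstdint>
#include <cstdio>

// The inp_pos tensor holds n_pos_per_token consecutive position streams
// (a single stream for most models, four slots for the m-rope layout), so
// the ggml_backend_tensor_set() copy must scale with that multiplier.
int main() {
    const size_t n_tokens        = 512;
    const size_t n_pos_per_token = 4;               // 1 for most models, 4 in build_qwen2vl
    const size_t elem_size       = sizeof(int32_t); // positions are GGML_TYPE_I32

    printf("single-stream copy: %zu bytes\n", n_tokens * elem_size);
    printf("m-rope copy:        %zu bytes\n", n_tokens * n_pos_per_token * elem_size);
    return 0;
}
```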
```diff
@@ -18364,10 +18789,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         new_type = GGML_TYPE_IQ3_S;
     }
-    else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-             new_type == GGML_TYPE_Q4_0_8_8) {
-        new_type = GGML_TYPE_Q4_0;
-    }
     else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
         new_type = GGML_TYPE_Q4_K;
     }
@@ -18690,9 +19111,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -19031,14 +19449,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.data();
         }
 
-        int chunk_size_multiplier = 1;
-        if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-            if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-            else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-            if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-            else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-        }
-
         LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
@@ -19051,8 +19461,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const int64_t nrows = tensor->ne[1];
 
         static const int64_t min_chunk_size = 32 * 512;
-        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                                   chunk_size_multiplier;
+        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
         const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
         const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -19995,6 +20404,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
@@ -20028,6 +20438,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM3:
             return LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
@@ -21596,7 +22009,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
             throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
         }
     } else if ((size_t) i >= ctx->output_ids.size()) {
-        throw std::runtime_error(format("out of range [0, %
+        throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
     } else {
         j = ctx->output_ids[i];
     }
@@ -21813,6 +22226,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -21857,6 +22272,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_RWKV_WORLD;
     } else if (tmpl_contains("<|start_of_role|>")) {
         return LLM_CHAT_TEMPLATE_GRANITE;
+    } else if (tmpl_contains("message['role'] + additional_special_tokens[0]")) {
+        return LLM_CHAT_TEMPLATE_GIGACHAT;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -21963,6 +22380,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
@@ -22180,6 +22606,32 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
     } else {
         // template not supported
         return -1;
```
| 163 |
LLM_ARCH_QWEN,
|
| 164 |
LLM_ARCH_QWEN2,
|
| 165 |
LLM_ARCH_QWEN2MOE,
|
| 166 |
+
LLM_ARCH_QWEN2VL,
|
| 167 |
LLM_ARCH_PHI2,
|
| 168 |
LLM_ARCH_PHI3,
|
| 169 |
LLM_ARCH_PLAMO,
|
|
|
|
| 184 |
LLM_ARCH_OLMOE,
|
| 185 |
LLM_ARCH_OPENELM,
|
| 186 |
LLM_ARCH_ARCTIC,
|
| 187 |
+
LLM_ARCH_DEEPSEEK,
|
| 188 |
LLM_ARCH_DEEPSEEK2,
|
| 189 |
LLM_ARCH_CHATGLM,
|
| 190 |
LLM_ARCH_BITNET,
|
|
|
|
| 219 |
{ LLM_ARCH_QWEN, "qwen" },
|
| 220 |
{ LLM_ARCH_QWEN2, "qwen2" },
|
| 221 |
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
|
| 222 |
+
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
|
| 223 |
{ LLM_ARCH_PHI2, "phi2" },
|
| 224 |
{ LLM_ARCH_PHI3, "phi3" },
|
| 225 |
{ LLM_ARCH_PLAMO, "plamo" },
|
|
|
|
| 240 |
{ LLM_ARCH_OLMOE, "olmoe" },
|
| 241 |
{ LLM_ARCH_OPENELM, "openelm" },
|
| 242 |
{ LLM_ARCH_ARCTIC, "arctic" },
|
| 243 |
+
{ LLM_ARCH_DEEPSEEK, "deepseek" },
|
| 244 |
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
|
| 245 |
{ LLM_ARCH_CHATGLM, "chatglm" },
|
| 246 |
{ LLM_ARCH_BITNET, "bitnet" },
|
|
|
|
| 312 |
LLM_KV_ATTENTION_SCALE,
|
| 313 |
|
| 314 |
LLM_KV_ROPE_DIMENSION_COUNT,
|
| 315 |
+
LLM_KV_ROPE_DIMENSION_SECTIONS,
|
| 316 |
LLM_KV_ROPE_FREQ_BASE,
|
| 317 |
LLM_KV_ROPE_SCALE_LINEAR,
|
| 318 |
LLM_KV_ROPE_SCALING_TYPE,
|
|
|
|
| 429 |
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
| 430 |
|
| 431 |
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
| 432 |
+
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
| 433 |
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
| 434 |
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
| 435 |
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
|
|
|
|
| 904 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 905 |
},
|
| 906 |
},
|
| 907 |
+
{
|
| 908 |
+
LLM_ARCH_QWEN2VL,
|
| 909 |
+
{
|
| 910 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 911 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 912 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 913 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 914 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 915 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 916 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 917 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 918 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 919 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 920 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 921 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 922 |
+
},
|
| 923 |
+
},
|
| 924 |
{
|
| 925 |
LLM_ARCH_QWEN2MOE,
|
| 926 |
{
|
|
|
|
| 1311 |
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
| 1312 |
},
|
| 1313 |
},
|
| 1314 |
+
{
|
| 1315 |
+
LLM_ARCH_DEEPSEEK,
|
| 1316 |
+
{
|
| 1317 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
| 1318 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
| 1319 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
| 1320 |
+
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
| 1321 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
| 1322 |
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
| 1323 |
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
| 1324 |
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
| 1325 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
| 1326 |
+
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
| 1327 |
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
| 1328 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
| 1329 |
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
| 1330 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
| 1331 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
| 1332 |
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
| 1333 |
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
| 1334 |
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
| 1335 |
+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
| 1336 |
+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
| 1337 |
+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
| 1338 |
+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
| 1339 |
+
},
|
| 1340 |
+
},
|
| 1341 |
{
|
| 1342 |
LLM_ARCH_DEEPSEEK2,
|
| 1343 |
{
|
|
|
|
| 1612 |
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
|
| 1613 |
LLM_CHAT_TEMPLATE_MISTRAL_V7,
|
| 1614 |
LLM_CHAT_TEMPLATE_PHI_3,
|
| 1615 |
+
LLM_CHAT_TEMPLATE_FALCON_3,
|
| 1616 |
LLM_CHAT_TEMPLATE_ZEPHYR,
|
| 1617 |
LLM_CHAT_TEMPLATE_MONARCH,
|
| 1618 |
LLM_CHAT_TEMPLATE_GEMMA,
|
|
|
|
| 1630 |
LLM_CHAT_TEMPLATE_EXAONE_3,
|
| 1631 |
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
| 1632 |
LLM_CHAT_TEMPLATE_GRANITE,
|
| 1633 |
+
LLM_CHAT_TEMPLATE_GIGACHAT,
|
| 1634 |
LLM_CHAT_TEMPLATE_UNKNOWN,
|
| 1635 |
};
|
| 1636 |
|
|
|
|
| 1645 |
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
|
| 1646 |
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
|
| 1647 |
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
|
| 1648 |
+
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
|
| 1649 |
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
|
| 1650 |
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
|
| 1651 |
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
|
|
|
|
| 1663 |
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
| 1664 |
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
| 1665 |
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
| 1666 |
+
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
| 1667 |
};
|
| 1668 |
|
| 1669 |
static llm_arch llm_arch_from_string(const std::string & name) {
|
|
|
|
| 1848 |
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
| 1849 |
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
|
| 1850 |
if (!bufLen) {
|
| 1851 |
+
ret = format("Win32 error code: %lx", error_code);
|
| 1852 |
} else {
|
| 1853 |
ret = lpMsgBuf;
|
| 1854 |
LocalFree(lpMsgBuf);
|
|
|
|
| 2186 |
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
| 2187 |
|
| 2188 |
// may fail on pre-Windows 8 systems
|
| 2189 |
+
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
|
| 2190 |
|
| 2191 |
if (pPrefetchVirtualMemory) {
|
| 2192 |
// advise the kernel to preload the mapped memory
|
|
|
|
| 2528 |
uint32_t time_decay_extra_dim = 0;
|
| 2529 |
uint32_t wkv_head_size = 0;
|
| 2530 |
|
| 2531 |
+
float rope_attn_factor = 1.0f;
|
| 2532 |
+
float rope_freq_base_train;
|
| 2533 |
+
float rope_freq_scale_train;
|
| 2534 |
+
uint32_t n_ctx_orig_yarn;
|
| 2535 |
+
float rope_yarn_log_mul;
|
| 2536 |
+
int rope_sections[4];
|
| 2537 |
|
| 2538 |
// for State Space Models
|
| 2539 |
uint32_t ssm_d_conv = 0;
|
|
|
|
| 2590 |
|
| 2591 |
if (this->rope_finetuned != other.rope_finetuned) return true;
|
| 2592 |
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
|
| 2593 |
+
if (std::equal(std::begin(this->rope_sections),
|
| 2594 |
+
std::end(this->rope_sections),
|
| 2595 |
+
std::begin(other.rope_sections))) return true;
|
| 2596 |
|
| 2597 |
if (this->ssm_d_conv != other.ssm_d_conv) return true;
|
| 2598 |
if (this->ssm_d_inner != other.ssm_d_inner) return true;
|
|
|
|
| 3436 |
// whether we are computing encoder output or decoder output
|
| 3437 |
bool is_encoding = false;
|
| 3438 |
|
| 3439 |
+
// TODO: find a better way to accommodate mutli-dimension position encoding methods
|
| 3440 |
+
// number of position id each token get, 1 for each token in most cases.
|
| 3441 |
+
// when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
|
| 3442 |
+
int n_pos_per_token = 1;
|
| 3443 |
+
|
| 3444 |
// output of the encoder part of the encoder-decoder models
|
| 3445 |
std::vector<float> embd_enc;
|
| 3446 |
std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
|
|
|
| 4641 |
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
| 4642 |
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
| 4643 |
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
|
|
|
|
|
|
|
|
|
| 4644 |
default:
|
| 4645 |
{
|
| 4646 |
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
|
|
| 5404 |
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
| 5405 |
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
| 5406 |
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
|
|
|
|
|
|
|
|
|
| 5407 |
|
| 5408 |
default: return "unknown, may not work";
|
| 5409 |
}
|
|
|
|
| 5810 |
default: model.type = e_model::MODEL_UNKNOWN;
|
| 5811 |
}
|
| 5812 |
} break;
|
| 5813 |
+
case LLM_ARCH_QWEN2VL:
|
| 5814 |
+
{
|
| 5815 |
+
std::array<int, 4> section_dims;
|
| 5816 |
+
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
|
| 5817 |
+
std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
|
| 5818 |
+
}
|
| 5819 |
+
// fall through
|
| 5820 |
case LLM_ARCH_QWEN2:
|
| 5821 |
{
|
| 5822 |
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
| 6127 |
model.type = e_model::MODEL_UNKNOWN;
|
| 6128 |
}
|
| 6129 |
} break;
|
| 6130 |
+
case LLM_ARCH_DEEPSEEK:
|
| 6131 |
+
{
|
| 6132 |
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
| 6133 |
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
| 6134 |
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
| 6135 |
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
| 6136 |
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
| 6137 |
+
|
| 6138 |
+
switch (hparams.n_layer) {
|
| 6139 |
+
case 28: model.type = e_model::MODEL_20B; break;
|
| 6140 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
| 6141 |
+
}
|
| 6142 |
+
} break;
|
| 6143 |
case LLM_ARCH_DEEPSEEK2:
|
| 6144 |
{
|
| 6145 |
bool is_lite = (hparams.n_layer == 27);
|
|
|
|
| 6475 |
} else if (
|
| 6476 |
tokenizer_pre == "falcon") {
|
| 6477 |
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
| 6478 |
+
} else if (
|
| 6479 |
+
tokenizer_pre == "falcon3") {
|
| 6480 |
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
| 6481 |
+
vocab.tokenizer_ignore_merges = true;
|
| 6482 |
+
vocab.tokenizer_add_bos = true;
|
| 6483 |
} else if (
|
| 6484 |
tokenizer_pre == "mpt") {
|
| 6485 |
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
|
|
|
|
| 6491 |
tokenizer_pre == "phi-2" ||
|
| 6492 |
tokenizer_pre == "jina-es" ||
|
| 6493 |
tokenizer_pre == "jina-de" ||
|
| 6494 |
+
tokenizer_pre == "gigachat" ||
|
| 6495 |
tokenizer_pre == "jina-v1-en" ||
|
| 6496 |
tokenizer_pre == "jina-v2-es" ||
|
| 6497 |
tokenizer_pre == "jina-v2-de" ||
|
|
|
|
| 6562 |
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
|
| 6563 |
vocab.tokenizer_add_bos = true;
|
| 6564 |
vocab.tokenizer_clean_spaces = false;
|
| 6565 |
+
} else if (
|
| 6566 |
+
tokenizer_pre == "minerva-7b") {
|
| 6567 |
+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
|
| 6568 |
} else {
|
| 6569 |
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
| 6570 |
}
|
|
|
|
| 7143 |
|
| 7144 |
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
|
| 7145 |
|
| 7146 |
+
if (model.arch == LLM_ARCH_DEEPSEEK) {
|
| 7147 |
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
| 7148 |
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
| 7149 |
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
| 7150 |
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
| 7151 |
+
}
|
| 7152 |
+
|
| 7153 |
if (model.arch == LLM_ARCH_DEEPSEEK2) {
|
| 7154 |
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
| 7155 |
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
|
|
|
| 8263 |
}
|
| 8264 |
} break;
|
| 8265 |
case LLM_ARCH_QWEN2:
|
| 8266 |
+
case LLM_ARCH_QWEN2VL:
|
| 8267 |
{
|
| 8268 |
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
| 8269 |
|
|
|
|
| 8924 |
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
| 8925 |
}
|
| 8926 |
} break;
|
| 8927 |
+
case LLM_ARCH_DEEPSEEK:
|
| 8928 |
+
{
|
| 8929 |
+
|
| 8930 |
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
| 8931 |
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
| 8932 |
+
|
| 8933 |
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
| 8934 |
+
|
| 8935 |
+
// output
|
| 8936 |
+
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
| 8937 |
+
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
| 8938 |
+
|
| 8939 |
+
for (int i = 0; i < n_layer; ++i) {
|
| 8940 |
+
auto & layer = model.layers[i];
|
| 8941 |
+
|
| 8942 |
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
| 8943 |
+
|
| 8944 |
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
| 8945 |
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
| 8946 |
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
| 8947 |
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
| 8948 |
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
| 8949 |
+
|
| 8950 |
+
if (i < (int) hparams.n_layer_dense_lead) {
|
| 8951 |
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
| 8952 |
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
| 8953 |
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
| 8954 |
+
} else {
|
| 8955 |
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
| 8956 |
+
|
| 8957 |
+
if (n_expert == 0) {
|
| 8958 |
+
throw std::runtime_error("n_expert must be > 0");
|
| 8959 |
+
}
|
| 8960 |
+
if (n_expert_used == 0) {
|
| 8961 |
+
throw std::runtime_error("n_expert_used must be > 0");
|
| 8962 |
+
}
|
| 8963 |
+
|
| 8964 |
+
// MoE branch
|
| 8965 |
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
| 8966 |
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
| 8967 |
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
| 8968 |
+
|
| 8969 |
+
// Shared expert branch
|
| 8970 |
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
| 8971 |
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
| 8972 |
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
| 8973 |
+
}
|
| 8974 |
+
}
|
| 8975 |
+
} break;
|
| 8976 |
case LLM_ARCH_DEEPSEEK2:
|
| 8977 |
{
|
| 8978 |
const bool is_lite = (hparams.n_layer == 27);
|
|
|
|
| 12702 |
return gf;
|
| 12703 |
}
|
| 12704 |
|
| 12705 |
+
struct ggml_cgraph * build_qwen2vl() {
|
| 12706 |
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 12707 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 12708 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 12709 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 12710 |
+
|
| 12711 |
+
struct ggml_tensor * cur;
|
| 12712 |
+
struct ggml_tensor * inpL;
|
| 12713 |
+
|
| 12714 |
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
| 12715 |
+
|
| 12716 |
+
// inp_pos - contains the positions
|
| 12717 |
+
lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
|
| 12718 |
+
cb(lctx.inp_pos, "inp_pos", -1);
|
| 12719 |
+
ggml_set_input(lctx.inp_pos);
|
| 12720 |
+
struct ggml_tensor * inp_pos = lctx.inp_pos;
|
| 12721 |
+
|
| 12722 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 12723 |
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
| 12724 |
+
int sections[4];
|
| 12725 |
+
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
| 12726 |
+
|
| 12727 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 12728 |
+
struct ggml_tensor * inpSA = inpL;
|
| 12729 |
+
|
| 12730 |
+
// norm
|
| 12731 |
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
| 12732 |
+
model.layers[il].attn_norm, NULL,
|
| 12733 |
+
LLM_NORM_RMS, cb, il);
|
| 12734 |
+
cb(cur, "attn_norm", il);
|
| 12735 |
+
|
| 12736 |
+
// self-attention
|
| 12737 |
+
{
|
| 12738 |
+
// compute Q and K and RoPE them
|
| 12739 |
+
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
| 12740 |
+
cb(Qcur, "Qcur", il);
|
| 12741 |
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
| 12742 |
+
cb(Qcur, "Qcur", il);
|
| 12743 |
+
|
| 12744 |
+
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
| 12745 |
+
cb(Kcur, "Kcur", il);
|
| 12746 |
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
| 12747 |
+
cb(Kcur, "Kcur", il);
|
| 12748 |
+
|
| 12749 |
+
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
| 12750 |
+
cb(Vcur, "Vcur", il);
|
| 12751 |
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
| 12752 |
+
cb(Vcur, "Vcur", il);
|
| 12753 |
+
|
| 12754 |
+
Qcur = ggml_rope_multi(
|
| 12755 |
+
ctx0,
|
| 12756 |
+
ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
| 12757 |
+
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 12758 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 12759 |
+
);
|
| 12760 |
+
cb(Qcur, "Qcur", il);
|
| 12761 |
+
|
| 12762 |
+
Kcur = ggml_rope_multi(
|
| 12763 |
+
ctx0,
|
| 12764 |
+
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
| 12765 |
+
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
| 12766 |
+
ext_factor, attn_factor, beta_fast, beta_slow
|
| 12767 |
+
);
|
| 12768 |
+
cb(Kcur, "Kcur", il);
|
| 12769 |
+
|
| 12770 |
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
| 12771 |
+
model.layers[il].wo, model.layers[il].bo,
|
| 12772 |
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
| 12773 |
+
}
|
| 12774 |
+
|
| 12775 |
+
if (il == n_layer - 1) {
|
| 12776 |
+
// skip computing output for unused tokens
|
| 12777 |
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
| 12778 |
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
| 12779 |
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
| 12780 |
+
}
|
| 12781 |
+
|
| 12782 |
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
| 12783 |
+
cb(ffn_inp, "ffn_inp", il);
|
| 12784 |
+
|
| 12785 |
+
// feed-forward network
|
| 12786 |
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
| 12787 |
+
model.layers[il].ffn_norm, NULL,
|
| 12788 |
+
LLM_NORM_RMS, cb, il);
|
| 12789 |
+
cb(cur, "ffn_norm", il);
|
| 12790 |
+
|
| 12791 |
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
| 12792 |
+
model.layers[il].ffn_up, NULL, NULL,
|
| 12793 |
+
model.layers[il].ffn_gate, NULL, NULL,
|
| 12794 |
+
model.layers[il].ffn_down, NULL, NULL,
|
| 12795 |
+
NULL,
|
| 12796 |
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
| 12797 |
+
cb(cur, "ffn_out", il);
|
| 12798 |
+
|
| 12799 |
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
| 12800 |
+
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
| 12801 |
+
cb(cur, "l_out", il);
|
| 12802 |
+
|
| 12803 |
+
// input for next layer
|
| 12804 |
+
inpL = cur;
|
| 12805 |
+
}
|
| 12806 |
+
|
| 12807 |
+
cur = inpL;
|
| 12808 |
+
|
| 12809 |
+
cur = llm_build_norm(ctx0, cur, hparams,
|
| 12810 |
+
model.output_norm, NULL,
|
| 12811 |
+
LLM_NORM_RMS, cb, -1);
|
| 12812 |
+
cb(cur, "result_norm", -1);
|
| 12813 |
+
|
| 12814 |
+
// lm_head
|
| 12815 |
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
| 12816 |
+
cb(cur, "result_output", -1);
|
| 12817 |
+
|
| 12818 |
+
ggml_build_forward_expand(gf, cur);
|
| 12819 |
+
|
| 12820 |
+
return gf;
|
| 12821 |
+
}
|
| 12822 |
+
|
| 12823 |
struct ggml_cgraph * build_qwen2moe() {
|
| 12824 |
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 12825 |
|
|
|
|
| 15327 |
return gf;
|
| 15328 |
}
|
| 15329 |
|
| 15330 |
+
struct ggml_cgraph * build_deepseek() {
|
| 15331 |
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
| 15332 |
+
|
| 15333 |
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
| 15334 |
+
int32_t n_tokens = this->n_tokens;
|
| 15335 |
+
|
| 15336 |
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
| 15337 |
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
| 15338 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
| 15339 |
+
|
| 15340 |
+
struct ggml_tensor * cur;
|
| 15341 |
+
struct ggml_tensor * inpL;
|
| 15342 |
+
|
| 15343 |
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
| 15344 |
+
|
| 15345 |
+
// inp_pos - contains the positions
|
| 15346 |
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
| 15347 |
+
|
| 15348 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
| 15349 |
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
| 15350 |
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
| 15351 |
+
for (int il = 0; il < n_layer; ++il) {
|
| 15352 |
+
struct ggml_tensor * inpSA = inpL;
|
| 15353 |
+
|
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * moe_out =
+                    llm_build_moe_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, false,
+                        false, hparams.expert_weights_scale,
+                        cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_deepseek2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
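The FFN routing in the new build_deepseek() graph above follows the DeepSeek MoE layout: the first hparams.n_layer_dense_lead layers use a plain SwiGLU FFN, and every later layer adds a routed mixture-of-experts output to an always-active shared expert (cur = ggml_add(ctx0, moe_out, ffn_shexp)). The following standalone sketch illustrates the routing idea on scalars; all names are illustrative, and it deliberately ignores the gate normalization and expert_weights_scale handling done inside llm_build_moe_ffn:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Toy top-k mixture-of-experts with a shared expert. Each "expert" is just
    // a scalar gain here; in the real graph it is a full SwiGLU FFN over a tensor.
    static float moe_with_shared_expert(float x,
                                        const std::vector<float> & router_score,
                                        const std::vector<float> & expert_gain,
                                        float shared_gain, size_t n_expert_used) {
        // pick the n_expert_used experts with the highest router score
        std::vector<size_t> idx(router_score.size());
        for (size_t i = 0; i < idx.size(); ++i) idx[i] = i;
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
            [&](size_t a, size_t b) { return router_score[a] > router_score[b]; });

        // weighted sum of the selected experts' outputs
        float moe_out = 0.0f;
        for (size_t k = 0; k < n_expert_used; ++k) {
            moe_out += router_score[idx[k]] * expert_gain[idx[k]] * x;
        }

        // the shared expert runs unconditionally and is added on top
        return moe_out + shared_gain * x;
    }

    int main() {
        const float y = moe_with_shared_expert(1.0f, {0.6f, 0.3f, 0.1f},
                                               {2.0f, 1.0f, 4.0f}, 0.5f, 2);
        std::printf("%.2f\n", y); // 0.6*2 + 0.3*1 + 0.5 = 2.00
    }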
@@ ... @@
             {
                 result = llm.build_qwen2();
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                lctx.n_pos_per_token = 4;
+                result = llm.build_qwen2vl();
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 result = llm.build_qwen2moe();
@@ ... @@
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 result = llm.build_deepseek2();
@@ ... @@
 
     if (ubatch.pos && lctx.inp_pos) {
         const int64_t n_tokens = ubatch.n_tokens;
+        auto n_pos = lctx.n_pos_per_token;
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
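Qwen2-VL is the reason for the new n_pos_per_token plumbing: with M-RoPE (LLAMA_ROPE_TYPE_MROPE, added further below) each token carries several position components rather than one, and lctx.n_pos_per_token = 4 makes the inp_pos upload above copy n_tokens*n_pos position values. A small sizing illustration (values are made up; llama_pos is a 32-bit integer in llama.h):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_tokens        = 512; // tokens in the micro-batch
        const int64_t n_pos_per_token = 4;   // QWEN2VL / M-RoPE; 1 for most models
        const size_t  elem            = sizeof(int32_t); // llama_pos

        // mirrors n_tokens*n_pos*ggml_element_size(lctx.inp_pos) above
        std::printf("%lld position values, %zu bytes\n",
                    (long long)(n_tokens * n_pos_per_token),
                    (size_t)(n_tokens * n_pos_per_token) * elem);
    }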
@@ ... @@
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         new_type = GGML_TYPE_IQ3_S;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
         new_type = GGML_TYPE_Q4_K;
     }
@@ ... @@
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ ... @@
             f32_data = (float *) f32_conv_buf.data();
         }
 
         LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
@@ ... @@
         const int64_t nrows = tensor->ne[1];
 
         static const int64_t min_chunk_size = 32 * 512;
+        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
         const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
         const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
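The new chunk_size line rounds the quantization work unit up to a whole number of rows: a row with at least min_chunk_size (32*512 = 16384) elements forms its own chunk, while shorter rows are grouped ceil(min_chunk_size / n_per_row) at a time. Worked examples (row lengths are illustrative):

    #include <cstdint>
    #include <cstdio>

    int main() {
        static const int64_t min_chunk_size = 32 * 512; // 16384
        const int64_t rows[] = {4096, 5000, 32768};
        for (int64_t n_per_row : rows) {
            const int64_t chunk_size = (n_per_row >= min_chunk_size
                ? n_per_row
                : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
            // 4096 -> 16384 (4 rows), 5000 -> 20000 (4 rows), 32768 -> 32768 (1 row)
            std::printf("n_per_row=%6lld  chunk_size=%6lld\n",
                        (long long) n_per_row, (long long) chunk_size);
        }
    }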
@@ ... @@
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
@@ ... @@
         case LLM_ARCH_MINICPM3:
             return LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
@@ ... @@
             throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
         }
     } else if ((size_t) i >= ctx->output_ids.size()) {
+        throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
     } else {
         j = ctx->output_ids[i];
     }
@@ ... @@
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ ... @@
         return LLM_CHAT_TEMPLATE_RWKV_WORLD;
     } else if (tmpl_contains("<|start_of_role|>")) {
         return LLM_CHAT_TEMPLATE_GRANITE;
+    } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
+        return LLM_CHAT_TEMPLATE_GIGACHAT;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ ... @@
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
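Tracing the new Falcon 3 branch for a system + user chat with add_ass == true yields the following prompt (message contents are placeholders):

    <|system|>
    You are a helpful assistant.
    <|user|>
    Hello!
    <|assistant|>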
@@ ... @@
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                   << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
     } else {
         // template not supported
         return -1;
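The GigaChat branch, by contrast, uses sentinel separator tokens instead of newline-delimited role tags. For the same system + user chat with add_ass == true it emits one continuous string (wrapped here for readability; contents are placeholders):

    <s>You are a helpful assistant.<|message_sep|>user<|role_sep|>Hello!<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>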
examples/talk-llama/llama.h
CHANGED
@@ -104,12 +104,15 @@ extern "C" {
     LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
     LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
     LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
 };
 
 enum llama_rope_type {
-    LLAMA_ROPE_TYPE_NONE = -1,
-    LLAMA_ROPE_TYPE_NORM = 0,
-    LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+    LLAMA_ROPE_TYPE_NONE   = -1,
+    LLAMA_ROPE_TYPE_NORM   = 0,
+    LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
+    LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+    LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
 };
 
 enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file

@@ -171,9 +174,9 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+    //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+    //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+    //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
     LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
 

@@ -455,6 +458,7 @@ extern "C" {
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
     // - GGUF array values are not supported by these functions
 
     // Get metadata value as a string by key name

@@ -1135,16 +1139,12 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);
 
+    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-            int32_t     n_vocab,          // llama_n_vocab()
-            llama_token special_eos_id,   // llama_token_eos()
-            llama_token linefeed_id,      // llama_token_nl()
-            int32_t     penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
-            float       penalty_repeat,   // 1.0 = disabled
-            float       penalty_freq,     // 0.0 = disabled
-            float       penalty_present,  // 0.0 = disabled
-            bool        penalize_nl,      // consider newlines as a repeatable token
-            bool        ignore_eos);      // ignore the end-of-sequence token
+            int32_t penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
+            float   penalty_repeat,   // 1.0 = disabled
+            float   penalty_freq,     // 0.0 = disabled
+            float   penalty_present); // 0.0 = disabled
 
     /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
     LLAMA_API struct llama_sampler * llama_sampler_init_dry(
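With the vocabulary, EOS, and newline arguments gone from llama_sampler_init_penalties, wiring it into a sampler chain reduces to the four penalty parameters. A minimal sketch using the public sampler-chain API from this header (parameter values are illustrative defaults, not prescribed by the change):

    #include "llama.h"

    // Build a chain that applies repetition penalties, then samples greedily.
    static struct llama_sampler * make_sampler(void) {
        struct llama_sampler * chain =
            llama_sampler_chain_init(llama_sampler_chain_default_params());

        llama_sampler_chain_add(chain, llama_sampler_init_penalties(
            /*penalty_last_n  =*/ 64,     // penalize the last 64 tokens
            /*penalty_repeat  =*/ 1.1f,   // 1.0 = disabled
            /*penalty_freq    =*/ 0.0f,   // 0.0 = disabled
            /*penalty_present =*/ 0.0f)); // 0.0 = disabled
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());

        return chain;
    }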
examples/talk-llama/unicode.cpp
CHANGED
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     throw std::invalid_argument("failed to convert utf8 to codepoint");
 }
 
-//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
 //    std::vector<uint16_t> result;
-//    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-//        result.emplace_back(cp);
+//    if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
+//        result.emplace_back(cpt);
 //        return result;
 //    }
-//    if (0x10000 <= cp && cp <= 0x10ffff) {
-//        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-//        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+//    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+//        result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
+//        result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
 //        return result;
 //    }
 //    throw std::invalid_argument("failed to convert codepoint to utf16");

@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 //    return result;
 //}
 
-static std::vector<codepoint_flags> unicode_cpt_flags_array() {
-    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
+static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
+    std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
 
     assert (unicode_ranges_flags.begin()[0].first == 0);
     assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);

@@ -253,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
         return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
     };
 
-    auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+    auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
     };
 
     size_t _prev_end = offset_ini;

@@ -371,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
         return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
     };
 
-    auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+    auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
     };
 
     size_t _prev_end = offset_ini;

@@ -572,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
 // interface
 //
 
-std::string unicode_cpt_to_utf8(uint32_t cp) {
+std::string unicode_cpt_to_utf8(uint32_t cpt) {
     std::string result;
 
-    if (/* 0x00 <= cp && */ cp <= 0x7f) {
-        result.push_back(cp);
+    if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
+        result.push_back(cpt);
         return result;
     }
-    if (0x80 <= cp && cp <= 0x7ff) {
-        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x80 <= cpt && cpt <= 0x7ff) {
+        result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
-    if (0x800 <= cp && cp <= 0xffff) {
-        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x800 <= cpt && cpt <= 0xffff) {
+        result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
        return result;
     }
-    if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.push_back(0xf0 | ((cp >> 18) & 0x07));
-        result.push_back(0x80 | ((cp >> 12) & 0x3f));
-        result.push_back(0x80 | ((cp >> 6) & 0x3f));
-        result.push_back(0x80 | (cp & 0x3f));
+    if (0x10000 <= cpt && cpt <= 0x10ffff) {
+        result.push_back(0xf0 | ((cpt >> 18) & 0x07));
+        result.push_back(0x80 | ((cpt >> 12) & 0x3f));
+        result.push_back(0x80 | ((cpt >> 6) & 0x3f));
+        result.push_back(0x80 | (cpt & 0x3f));
         return result;
     }
 

@@ -624,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     return result;
 }
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
     static const auto cpt_flags = unicode_cpt_flags_array();
-    return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
+    return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
 }
 
-codepoint_flags unicode_cpt_flags(const std::string & utf8) {
-    static const codepoint_flags undef(codepoint_flags::UNDEFINED);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
+    static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
     if (utf8.empty()) {
         return undef; // undefined
     }
     size_t offset = 0;
-    return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
+    return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
 }
 
 std::string unicode_byte_to_utf8(uint8_t byte) {

@@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
     return map.at(utf8);
 }
 
-uint32_t unicode_tolower(uint32_t cp) {
+uint32_t unicode_tolower(uint32_t cpt) {
     // binary search
-    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
                                [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
                                    return pair.first < value;
                                });
-    if (it != unicode_map_lowercase.end() && it->first == cp) {
+    if (it != unicode_map_lowercase.end() && it->first == cpt) {
         return it->second;
     }
-    return cp; // Return the original code point if no lowercase mapping is found
+    return cpt; // Return the original code point if no lowercase mapping is found
 }
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
     // unicode categories
     static const std::map<std::string, int> k_ucat_enum = {
-        { "\\p{N}", codepoint_flags::NUMBER },
-        { "\\p{L}", codepoint_flags::LETTER },
-        { "\\p{P}", codepoint_flags::PUNCTUATION },
+        { "\\p{N}", unicode_cpt_flags::NUMBER },
+        { "\\p{L}", unicode_cpt_flags::LETTER },
+        { "\\p{P}", unicode_cpt_flags::PUNCTUATION },
     };
 
     static const std::map<int, int> k_ucat_cpt = {
-        { codepoint_flags::NUMBER,      0xD1 },
-        { codepoint_flags::LETTER,      0xD2 },
-        { codepoint_flags::PUNCTUATION, 0xD3 },
+        { unicode_cpt_flags::NUMBER,      0xD1 },
+        { unicode_cpt_flags::LETTER,      0xD2 },
+        { unicode_cpt_flags::PUNCTUATION, 0xD3 },
     };
 
     static const std::map<int, std::string> k_ucat_map = {
-        { codepoint_flags::NUMBER,      "\x30-\x39" }, // 0-9
-        { codepoint_flags::LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
-        { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+        { unicode_cpt_flags::NUMBER,      "\x30-\x39" }, // 0-9
+        { unicode_cpt_flags::LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
     };
 
     // compute collapsed codepoints only if needed by at least one regex
     bool need_collapse = false;
-    for (auto & regex_expr : regex_exprs) {
+    for (const auto & regex_expr : regex_exprs) {
         // search for unicode categories
         for (const auto & ucat : k_ucat_enum) {
             if (std::string::npos != regex_expr.find(ucat.first)) {

@@ -709,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             continue;
         }
 
-        const auto flags = unicode_cpt_flags(cpts[i]);
+        const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
 
         if (flags.is_whitespace) {
             //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does.

@@ -725,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
 
     std::vector<size_t> bpe_offsets = { cpts.size() };
 
-    for (auto & regex_expr : regex_exprs) {
+    for (const auto & regex_expr : regex_exprs) {
         // first, see if we have an efficient custom regex implementation
         auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);

@@ -739,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
         // with the corresponding collapsed representation
         bool use_collapsed = false;
-        for (auto & ucat : k_ucat_enum) {
+        for (const auto & ucat : k_ucat_enum) {
             if (std::string::npos != regex_expr.find(ucat.first)) {
                 use_collapsed = true;
                 break;

@@ -805,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
         std::wstring wtext(cpts.begin(), cpts.end());
         for (size_t i = 0; i < wtext.size(); ++i) {
-            if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
+            if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
                 wtext[i] = 0x0B;
             }
         }
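A quick sanity check on the branches of unicode_cpt_to_utf8 (standard UTF-8 behavior, not specific to this codebase): U+20AC, the euro sign, falls into the 0x800-0xffff branch and encodes to three bytes.

    #include <cstdio>
    #include <string>

    #include "unicode.h" // assumes this directory's unicode.cpp is linked in

    int main() {
        const std::string s = unicode_cpt_to_utf8(0x20AC); // U+20AC EURO SIGN
        for (unsigned char c : s) {
            std::printf("%02X ", c); // prints: E2 82 AC
        }
        std::printf("\n");
    }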
examples/talk-llama/unicode.h
CHANGED
@@ -4,9 +4,7 @@
 #include <string>
 #include <vector>
 
-// TODO: prefix all symbols with "llama_"
-
-struct codepoint_flags {
+struct unicode_cpt_flags {
     enum {
         UNDEFINED = 0x0001,
         NUMBER = 0x0002, // regex: \p{N}

@@ -35,7 +33,7 @@ struct codepoint_flags {
     uint16_t is_nfd : 1;
 
     // decode from uint16
-    inline codepoint_flags(const uint16_t flags = 0) {
+    inline unicode_cpt_flags(const uint16_t flags = 0) {
         *reinterpret_cast<uint16_t*>(this) = flags;
     }

@@ -50,18 +48,19 @@ struct codepoint_flags {
 
 size_t unicode_len_utf8(char src);
 
-std::string unicode_cpt_to_utf8(uint32_t cp);
-uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+std::string unicode_cpt_to_utf8 (uint32_t cpt);
+uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
 
-codepoint_flags unicode_cpt_flags(const uint32_t cp);
-codepoint_flags unicode_cpt_flags(const std::string & utf8);
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
 
 std::string unicode_byte_to_utf8(uint8_t byte);
-uint8_t unicode_utf8_to_byte(const std::string & utf8);
+uint8_t     unicode_utf8_to_byte(const std::string & utf8);
 
-uint32_t unicode_tolower(uint32_t cp);
+uint32_t unicode_tolower(uint32_t cpt);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
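To tie the renamed interface together, a small usage sketch (the is_whitespace bit-field is part of the struct above; the expected values in comments follow from the standard Unicode case mapping):

    #include <cstdint>
    #include <cstdio>

    #include "unicode.h"

    int main() {
        const uint32_t cpt = unicode_tolower(0x0041); // 'A' -> 0x61 ('a')

        const unicode_cpt_flags flags = unicode_cpt_flags_from_cpt(cpt);
        std::printf("utf8: %s, whitespace: %d\n",
                    unicode_cpt_to_utf8(cpt).c_str(), (int) flags.is_whitespace);
        // expected output: "utf8: a, whitespace: 0"
    }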