ggerganov committed
Commit 1493d0c (unverified)
1 Parent(s): 4ce2d25

talk-llama : update to latest llama.cpp

examples/talk-llama/README.md CHANGED
@@ -19,7 +19,7 @@ brew install sdl2
19
  make talk-llama
20
 
21
  # Run it
22
- ./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
23
  ```
24
 
25
  - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
@@ -36,7 +36,7 @@ This feature is especially helpful for maintaining context in long conversations
36
  Example usage:
37
 
38
  ```bash
39
- ./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
40
  ```
41
 
42
  ## TTS
 
19
  make talk-llama
20
 
21
  # Run it
22
+ ./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
23
  ```
24
 
25
  - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
 
36
  Example usage:
37
 
38
  ```bash
39
+ ./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
40
  ```
41
 
42
  ## TTS
examples/talk-llama/llama-util.h DELETED
@@ -1,474 +0,0 @@
1
- // Internal header to be included only by llama.cpp.
2
- // Contains wrappers around OS interfaces.
3
-
4
- #ifndef LLAMA_UTIL_H
5
- #define LLAMA_UTIL_H
6
-
7
- #include <cstdio>
8
- #include <cstdint>
9
- #include <cerrno>
10
- #include <cstring>
11
- #include <cstdarg>
12
- #include <cstdlib>
13
- #include <climits>
14
-
15
- #include <string>
16
- #include <vector>
17
- #include <stdexcept>
18
-
19
- #ifdef __has_include
20
- #if __has_include(<unistd.h>)
21
- #include <unistd.h>
22
- #if defined(_POSIX_MAPPED_FILES)
23
- #include <sys/mman.h>
24
- #endif
25
- #if defined(_POSIX_MEMLOCK_RANGE)
26
- #include <sys/resource.h>
27
- #endif
28
- #endif
29
- #endif
30
-
31
- #if defined(_WIN32)
32
- #define WIN32_LEAN_AND_MEAN
33
- #ifndef NOMINMAX
34
- #define NOMINMAX
35
- #endif
36
- #include <windows.h>
37
- #include <io.h>
38
- #include <stdio.h> // for _fseeki64
39
- #endif
40
-
41
- #define LLAMA_ASSERT(x) \
42
- do { \
43
- if (!(x)) { \
44
- fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
45
- abort(); \
46
- } \
47
- } while (0)
48
-
49
- #ifdef __GNUC__
50
- #ifdef __MINGW32__
51
- __attribute__((format(gnu_printf, 1, 2)))
52
- #else
53
- __attribute__((format(printf, 1, 2)))
54
- #endif
55
- #endif
56
- static std::string format(const char * fmt, ...) {
57
- va_list ap, ap2;
58
- va_start(ap, fmt);
59
- va_copy(ap2, ap);
60
- int size = vsnprintf(NULL, 0, fmt, ap);
61
- LLAMA_ASSERT(size >= 0 && size < INT_MAX);
62
- std::vector<char> buf(size + 1);
63
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
64
- LLAMA_ASSERT(size2 == size);
65
- va_end(ap2);
66
- va_end(ap);
67
- return std::string(buf.data(), size);
68
- }
69
-
70
- struct llama_file {
71
- // use FILE * so we don't have to re-open the file to mmap
72
- FILE * fp;
73
- size_t size;
74
-
75
- llama_file(const char * fname, const char * mode) {
76
- fp = std::fopen(fname, mode);
77
- if (fp == NULL) {
78
- throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
79
- }
80
- seek(0, SEEK_END);
81
- size = tell();
82
- seek(0, SEEK_SET);
83
- }
84
-
85
- size_t tell() const {
86
- #ifdef _WIN32
87
- __int64 ret = _ftelli64(fp);
88
- #else
89
- long ret = std::ftell(fp);
90
- #endif
91
- LLAMA_ASSERT(ret != -1); // this really shouldn't fail
92
- return (size_t) ret;
93
- }
94
-
95
- void seek(size_t offset, int whence) {
96
- #ifdef _WIN32
97
- int ret = _fseeki64(fp, (__int64) offset, whence);
98
- #else
99
- int ret = std::fseek(fp, (long) offset, whence);
100
- #endif
101
- LLAMA_ASSERT(ret == 0); // same
102
- }
103
-
104
- void read_raw(void * ptr, size_t len) const {
105
- if (len == 0) {
106
- return;
107
- }
108
- errno = 0;
109
- std::size_t ret = std::fread(ptr, len, 1, fp);
110
- if (ferror(fp)) {
111
- throw std::runtime_error(format("read error: %s", strerror(errno)));
112
- }
113
- if (ret != 1) {
114
- throw std::runtime_error(std::string("unexpectedly reached end of file"));
115
- }
116
- }
117
-
118
- std::uint32_t read_u32() {
119
- std::uint32_t ret;
120
- read_raw(&ret, sizeof(ret));
121
- return ret;
122
- }
123
-
124
- std::string read_string(std::uint32_t len) {
125
- std::vector<char> chars(len);
126
- read_raw(chars.data(), len);
127
- return std::string(chars.data(), len);
128
- }
129
-
130
- void write_raw(const void * ptr, size_t len) const {
131
- if (len == 0) {
132
- return;
133
- }
134
- errno = 0;
135
- size_t ret = std::fwrite(ptr, len, 1, fp);
136
- if (ret != 1) {
137
- throw std::runtime_error(format("write error: %s", strerror(errno)));
138
- }
139
- }
140
-
141
- void write_u32(std::uint32_t val) {
142
- write_raw(&val, sizeof(val));
143
- }
144
-
145
- ~llama_file() {
146
- if (fp) {
147
- std::fclose(fp);
148
- }
149
- }
150
- };
151
-
152
- #if defined(_WIN32)
153
- static std::string llama_format_win_err(DWORD err) {
154
- LPSTR buf;
155
- size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
156
- NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
157
- if (!size) {
158
- return "FormatMessageA failed";
159
- }
160
- std::string ret(buf, size);
161
- LocalFree(buf);
162
- return ret;
163
- }
164
- #endif
165
-
166
- struct llama_mmap {
167
- void * addr;
168
- size_t size;
169
-
170
- llama_mmap(const llama_mmap &) = delete;
171
-
172
- #ifdef _POSIX_MAPPED_FILES
173
- static constexpr bool SUPPORTED = true;
174
-
175
- llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
176
- size = file->size;
177
- int fd = fileno(file->fp);
178
- int flags = MAP_SHARED;
179
- #ifdef __linux__
180
- flags |= MAP_POPULATE;
181
- #endif
182
- addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
183
- if (addr == MAP_FAILED) {
184
- throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
185
- }
186
-
187
- if (prefetch > 0) {
188
- // Advise the kernel to preload the mapped memory
189
- if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
190
- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
191
- strerror(errno));
192
- }
193
- }
194
- }
195
-
196
- ~llama_mmap() {
197
- munmap(addr, size);
198
- }
199
- #elif defined(_WIN32)
200
- static constexpr bool SUPPORTED = true;
201
-
202
- llama_mmap(struct llama_file * file, bool prefetch = true) {
203
- size = file->size;
204
-
205
- HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
206
-
207
- HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
208
- DWORD error = GetLastError();
209
-
210
- if (hMapping == NULL) {
211
- throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
212
- }
213
-
214
- addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
215
- error = GetLastError();
216
- CloseHandle(hMapping);
217
-
218
- if (addr == NULL) {
219
- throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
220
- }
221
-
222
- #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
223
- if (prefetch) {
224
- // Advise the kernel to preload the mapped memory
225
- WIN32_MEMORY_RANGE_ENTRY range;
226
- range.VirtualAddress = addr;
227
- range.NumberOfBytes = (SIZE_T)size;
228
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
229
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
230
- llama_format_win_err(GetLastError()).c_str());
231
- }
232
- }
233
- #else
234
- #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
235
- #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
236
- }
237
-
238
- ~llama_mmap() {
239
- if (!UnmapViewOfFile(addr)) {
240
- fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
241
- llama_format_win_err(GetLastError()).c_str());
242
- }
243
- }
244
- #else
245
- static constexpr bool SUPPORTED = false;
246
-
247
- llama_mmap(struct llama_file *, bool prefetch = true) {
248
- (void)prefetch;
249
- throw std::runtime_error(std::string("mmap not supported"));
250
- }
251
- #endif
252
- };
253
-
254
- // Represents some region of memory being locked using mlock or VirtualLock;
255
- // will automatically unlock on destruction.
256
- struct llama_mlock {
257
- void * addr = NULL;
258
- size_t size = 0;
259
- bool failed_already = false;
260
-
261
- llama_mlock() {}
262
- llama_mlock(const llama_mlock &) = delete;
263
-
264
- ~llama_mlock() {
265
- if (size) {
266
- raw_unlock(addr, size);
267
- }
268
- }
269
-
270
- void init(void * ptr) {
271
- LLAMA_ASSERT(addr == NULL && size == 0);
272
- addr = ptr;
273
- }
274
-
275
- void grow_to(size_t target_size) {
276
- LLAMA_ASSERT(addr);
277
- if (failed_already) {
278
- return;
279
- }
280
- size_t granularity = lock_granularity();
281
- target_size = (target_size + granularity - 1) & ~(granularity - 1);
282
- if (target_size > size) {
283
- if (raw_lock((uint8_t *) addr + size, target_size - size)) {
284
- size = target_size;
285
- } else {
286
- failed_already = true;
287
- }
288
- }
289
- }
290
-
291
- #ifdef _POSIX_MEMLOCK_RANGE
292
- static constexpr bool SUPPORTED = true;
293
-
294
- size_t lock_granularity() {
295
- return (size_t) sysconf(_SC_PAGESIZE);
296
- }
297
-
298
- #ifdef __APPLE__
299
- #define MLOCK_SUGGESTION \
300
- "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
301
- "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
302
- #else
303
- #define MLOCK_SUGGESTION \
304
- "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
305
- #endif
306
-
307
- bool raw_lock(const void * addr, size_t size) {
308
- if (!mlock(addr, size)) {
309
- return true;
310
- } else {
311
- char* errmsg = std::strerror(errno);
312
- bool suggest = (errno == ENOMEM);
313
-
314
- // Check if the resource limit is fine after all
315
- struct rlimit lock_limit;
316
- if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
317
- suggest = false;
318
- if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
319
- suggest = false;
320
-
321
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
322
- size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
323
- return false;
324
- }
325
- }
326
-
327
- #undef MLOCK_SUGGESTION
328
-
329
- void raw_unlock(void * addr, size_t size) {
330
- if (munlock(addr, size)) {
331
- fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
332
- }
333
- }
334
- #elif defined(_WIN32)
335
- static constexpr bool SUPPORTED = true;
336
-
337
- size_t lock_granularity() {
338
- SYSTEM_INFO si;
339
- GetSystemInfo(&si);
340
- return (size_t) si.dwPageSize;
341
- }
342
-
343
- bool raw_lock(void * ptr, size_t len) {
344
- for (int tries = 1; ; tries++) {
345
- if (VirtualLock(ptr, len)) {
346
- return true;
347
- }
348
- if (tries == 2) {
349
- fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
350
- len, size, llama_format_win_err(GetLastError()).c_str());
351
- return false;
352
- }
353
-
354
- // It failed but this was only the first try; increase the working
355
- // set size and try again.
356
- SIZE_T min_ws_size, max_ws_size;
357
- if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
358
- fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
359
- llama_format_win_err(GetLastError()).c_str());
360
- return false;
361
- }
362
- // Per MSDN: "The maximum number of pages that a process can lock
363
- // is equal to the number of pages in its minimum working set minus
364
- // a small overhead."
365
- // Hopefully a megabyte is enough overhead:
366
- size_t increment = len + 1048576;
367
- // The minimum must be <= the maximum, so we need to increase both:
368
- min_ws_size += increment;
369
- max_ws_size += increment;
370
- if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
371
- fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
372
- llama_format_win_err(GetLastError()).c_str());
373
- return false;
374
- }
375
- }
376
- }
377
-
378
- void raw_unlock(void * ptr, size_t len) {
379
- if (!VirtualUnlock(ptr, len)) {
380
- fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
381
- llama_format_win_err(GetLastError()).c_str());
382
- }
383
- }
384
- #else
385
- static constexpr bool SUPPORTED = false;
386
-
387
- size_t lock_granularity() {
388
- return (size_t) 65536;
389
- }
390
-
391
- bool raw_lock(const void * addr, size_t len) {
392
- fprintf(stderr, "warning: mlock not supported on this system\n");
393
- return false;
394
- }
395
-
396
- void raw_unlock(const void * addr, size_t len) {}
397
- #endif
398
- };
399
-
400
- // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
401
- struct llama_buffer {
402
- uint8_t * addr = NULL;
403
- size_t size = 0;
404
-
405
- llama_buffer() = default;
406
-
407
- void resize(size_t len) {
408
- delete[] addr;
409
- addr = new uint8_t[len];
410
- size = len;
411
- }
412
-
413
- ~llama_buffer() {
414
- delete[] addr;
415
- }
416
-
417
- // disable copy and move
418
- llama_buffer(const llama_buffer&) = delete;
419
- llama_buffer(llama_buffer&&) = delete;
420
- llama_buffer& operator=(const llama_buffer&) = delete;
421
- llama_buffer& operator=(llama_buffer&&) = delete;
422
- };
423
-
424
- #ifdef GGML_USE_CUBLAS
425
- #include "ggml-cuda.h"
426
- struct llama_ctx_buffer {
427
- uint8_t * addr = NULL;
428
- bool is_cuda;
429
- size_t size = 0;
430
-
431
- llama_ctx_buffer() = default;
432
-
433
- void resize(size_t size) {
434
- free();
435
-
436
- addr = (uint8_t *) ggml_cuda_host_malloc(size);
437
- if (addr) {
438
- is_cuda = true;
439
- }
440
- else {
441
- // fall back to pageable memory
442
- addr = new uint8_t[size];
443
- is_cuda = false;
444
- }
445
- this->size = size;
446
- }
447
-
448
- void free() {
449
- if (addr) {
450
- if (is_cuda) {
451
- ggml_cuda_host_free(addr);
452
- }
453
- else {
454
- delete[] addr;
455
- }
456
- }
457
- addr = NULL;
458
- }
459
-
460
- ~llama_ctx_buffer() {
461
- free();
462
- }
463
-
464
- // disable copy and move
465
- llama_ctx_buffer(const llama_ctx_buffer&) = delete;
466
- llama_ctx_buffer(llama_ctx_buffer&&) = delete;
467
- llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
468
- llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
469
- };
470
- #else
471
- typedef llama_buffer llama_ctx_buffer;
472
- #endif
473
-
474
- #endif
examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -1,8 +1,16 @@
1
  #ifndef LLAMA_H
2
  #define LLAMA_H
3
 
 
 
 
 
 
 
 
4
  #include <stddef.h>
5
  #include <stdint.h>
 
6
  #include <stdbool.h>
7
 
8
  #ifdef LLAMA_SHARED
@@ -19,17 +27,25 @@
19
  # define LLAMA_API
20
  #endif
21
 
22
- #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
23
- #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
24
- #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
25
- #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
26
- #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 
 
 
27
 
28
- #define LLAMA_FILE_VERSION 3
29
- #define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
30
- #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
31
- #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
32
- #define LLAMA_SESSION_VERSION 1
 
 
 
 
33
 
34
  #ifdef __cplusplus
35
  extern "C" {
@@ -41,10 +57,57 @@ extern "C" {
41
  // TODO: show sample usage
42
  //
43
 
 
44
  struct llama_context;
45
 
46
  typedef int llama_token;
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  typedef struct llama_token_data {
49
  llama_token id; // token id
50
  float logit; // log-odds of the token
@@ -60,67 +123,152 @@ extern "C" {
60
  typedef void (*llama_progress_callback)(float progress, void *ctx);
61
 
62
  struct llama_context_params {
63
- int n_ctx; // text context
64
- int n_gpu_layers; // number of layers to store in VRAM
65
- int seed; // RNG seed, -1 for random
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
 
 
 
67
  bool f16_kv; // use fp16 for KV cache
68
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
69
  bool vocab_only; // only load the vocabulary, no weights
70
  bool use_mmap; // use mmap if possible
71
  bool use_mlock; // force system to keep model in RAM
72
  bool embedding; // embedding mode only
73
-
74
- // called with a progress value between 0 and 1, pass NULL to disable
75
- llama_progress_callback progress_callback;
76
- // context pointer passed to the progress callback
77
- void * progress_callback_user_data;
78
  };
79
 
80
- // model file types
81
- enum llama_ftype {
82
- LLAMA_FTYPE_ALL_F32 = 0,
83
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
84
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
85
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
86
- LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
87
- // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
88
- // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
89
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
90
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
91
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  };
93
 
94
- LLAMA_API struct llama_context_params llama_context_default_params();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- LLAMA_API bool llama_mmap_supported();
97
- LLAMA_API bool llama_mlock_supported();
98
 
99
- // TODO: not great API - very likely to change
100
  // Initialize the llama + ggml backend
 
101
  // Call once at the start of the program
102
- LLAMA_API void llama_init_backend();
103
 
104
- LLAMA_API int64_t llama_time_us();
 
105
 
106
- // Various functions for loading a ggml llama model.
107
- // Allocate (almost) all memory needed for the model.
108
- // Return NULL on failure
109
- LLAMA_API struct llama_context * llama_init_from_file(
110
  const char * path_model,
111
  struct llama_context_params params);
112
 
 
 
 
 
 
 
113
  // Frees all allocated memory
114
  LLAMA_API void llama_free(struct llama_context * ctx);
115
 
116
- // TODO: not great API - very likely to change
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  // Returns 0 on success
118
- // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
119
  LLAMA_API int llama_model_quantize(
120
  const char * fname_inp,
121
  const char * fname_out,
122
- enum llama_ftype ftype,
123
- int nthread);
124
 
125
  // Apply a LoRA adapter to a loaded model
126
  // path_base_model is the path to a higher quality model to use as a base for
@@ -128,17 +276,24 @@ extern "C" {
128
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
129
  // will be applied on top of the previous one
130
  // Returns 0 on success
131
- LLAMA_API int llama_apply_lora_from_file(
132
  struct llama_context * ctx,
133
  const char * path_lora,
134
  const char * path_base_model,
135
- int n_threads);
 
 
 
 
 
 
 
136
 
137
  // Returns the number of tokens in the KV cache
138
  LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
139
 
140
  // Sets the current rng seed.
141
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
142
 
143
  // Returns the maximum size in bytes of the state (rng, logits, embedding
144
  // and kv_cache) - will often be smaller after compacting tokens
@@ -168,21 +323,19 @@ extern "C" {
168
  int n_past,
169
  int n_threads);
170
 
171
- // Convert the provided text into tokens.
172
- // The tokens pointer must be large enough to hold the resulting tokens.
173
- // Returns the number of tokens on success, no more than n_max_tokens
174
- // Returns a negative number on failure - the number of tokens that would have been returned
175
- // TODO: not sure if correct
176
- LLAMA_API int llama_tokenize(
177
  struct llama_context * ctx,
178
- const char * text,
179
- llama_token * tokens,
180
- int n_max_tokens,
181
- bool add_bos);
182
 
183
- LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
184
- LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
185
- LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
 
186
 
187
  // Token logits obtained from the last call to llama_eval()
188
  // The logits for the last token are stored in the last row
@@ -195,15 +348,75 @@ extern "C" {
195
  // shape: [n_embd] (1-dimensional)
196
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
197
 
198
- // Token Id -> String. Uses the vocabulary in the provided context
199
- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
 
 
 
 
 
 
200
 
201
  // Special tokens
202
- LLAMA_API llama_token llama_token_bos();
203
- LLAMA_API llama_token llama_token_eos();
204
- LLAMA_API llama_token llama_token_nl();
 
 
 
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  // Sampling functions
 
207
 
208
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
209
  LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@@ -211,6 +424,16 @@ extern "C" {
211
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
212
  LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
213
 
 
 
 
 
 
 
 
 
 
 
214
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
215
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
216
 
@@ -227,6 +450,9 @@ extern "C" {
227
  LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
228
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
229
 
 
 
 
230
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
231
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
232
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -248,13 +474,60 @@ extern "C" {
248
  /// @details Randomly selects a token from the candidates based on their probabilities.
249
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  // Performance information
 
252
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
253
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);
254
 
255
  // Print system information
256
  LLAMA_API const char * llama_print_system_info(void);
257
 
 
 
 
 
 
 
258
  #ifdef __cplusplus
259
  }
260
  #endif
@@ -264,10 +537,11 @@ extern "C" {
264
 
265
  #include <vector>
266
  #include <string>
 
267
  struct ggml_tensor;
268
 
269
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
270
 
271
- #endif
272
 
273
  #endif // LLAMA_H
 
1
  #ifndef LLAMA_H
2
  #define LLAMA_H
3
 
4
+ #include "ggml.h"
5
+ #ifdef GGML_USE_CUBLAS
6
+ #include "ggml-cuda.h"
7
+ #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
8
+ #else
9
+ #define LLAMA_MAX_DEVICES 1
10
+ #endif // GGML_USE_CUBLAS
11
  #include <stddef.h>
12
  #include <stdint.h>
13
+ #include <stdio.h>
14
  #include <stdbool.h>
15
 
16
  #ifdef LLAMA_SHARED
 
27
  # define LLAMA_API
28
  #endif
29
 
30
+ #ifdef __GNUC__
31
+ # define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
32
+ #elif defined(_MSC_VER)
33
+ # define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
34
+ #else
35
+ # define DEPRECATED(func, hint) func
36
+ #endif
37
+
38
+ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
39
 
40
+ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
41
+
42
+ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
43
+ #define LLAMA_SESSION_VERSION 1
44
+
45
+ #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
46
+ // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
47
+ #define LLAMA_SUPPORTS_GPU_OFFLOAD
48
+ #endif
49
 
50
  #ifdef __cplusplus
51
  extern "C" {
 
57
  // TODO: show sample usage
58
  //
59
 
60
+ struct llama_model;
61
  struct llama_context;
62
 
63
  typedef int llama_token;
64
 
65
+ enum llama_log_level {
66
+ LLAMA_LOG_LEVEL_ERROR = 2,
67
+ LLAMA_LOG_LEVEL_WARN = 3,
68
+ LLAMA_LOG_LEVEL_INFO = 4
69
+ };
70
+
71
+ enum llama_vocab_type {
72
+ LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
73
+ LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
74
+ };
75
+
76
+ enum llama_token_type {
77
+ LLAMA_TOKEN_TYPE_UNDEFINED = 0,
78
+ LLAMA_TOKEN_TYPE_NORMAL = 1,
79
+ LLAMA_TOKEN_TYPE_UNKNOWN = 2,
80
+ LLAMA_TOKEN_TYPE_CONTROL = 3,
81
+ LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
82
+ LLAMA_TOKEN_TYPE_UNUSED = 5,
83
+ LLAMA_TOKEN_TYPE_BYTE = 6,
84
+ };
85
+
86
+ // model file types
87
+ enum llama_ftype {
88
+ LLAMA_FTYPE_ALL_F32 = 0,
89
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
90
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
91
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
92
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
93
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
94
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
95
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
96
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
97
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
98
+ LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
99
+ LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
100
+ LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
101
+ LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
102
+ LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
103
+ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
104
+ LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
105
+ LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
106
+ LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
107
+
108
+ LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
109
+ };
110
+
111
  typedef struct llama_token_data {
112
  llama_token id; // token id
113
  float logit; // log-odds of the token
 
123
  typedef void (*llama_progress_callback)(float progress, void *ctx);
124
 
125
  struct llama_context_params {
126
+ uint32_t seed; // RNG seed, -1 for random
127
+ int32_t n_ctx; // text context
128
+ int32_t n_batch; // prompt processing batch size
129
+ int32_t n_gpu_layers; // number of layers to store in VRAM
130
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
131
+
132
+ const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
133
+
134
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
135
+ float rope_freq_base; // RoPE base frequency
136
+ float rope_freq_scale; // RoPE frequency scaling factor
137
+
138
+ // called with a progress value between 0 and 1, pass NULL to disable
139
+ llama_progress_callback progress_callback;
140
+ // context pointer passed to the progress callback
141
+ void * progress_callback_user_data;
142
 
143
+ // Keep the booleans together to avoid misalignment during copy-by-value.
144
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
145
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
146
  bool f16_kv; // use fp16 for KV cache
147
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
148
  bool vocab_only; // only load the vocabulary, no weights
149
  bool use_mmap; // use mmap if possible
150
  bool use_mlock; // force system to keep model in RAM
151
  bool embedding; // embedding mode only
 
 
 
 
 
152
  };
153
 
154
+ // Signature for logging events
155
+ // Note that text includes the new line character at the end for most events.
156
+ // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
157
+ // if it exists.
158
+ // It might not exist for progress report where '.' is output repeatedly.
159
+ typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
160
+
161
+ // model quantization parameters
162
+ typedef struct llama_model_quantize_params {
163
+ int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
164
+ enum llama_ftype ftype; // quantize to this llama_ftype
165
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
166
+ bool quantize_output_tensor; // quantize output.weight
167
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
168
+ } llama_model_quantize_params;
169
+
170
+ // grammar types
171
+ struct llama_grammar;
172
+
173
+ // grammar element type
174
+ enum llama_gretype {
175
+ // end of rule definition
176
+ LLAMA_GRETYPE_END = 0,
177
+
178
+ // start of alternate definition for rule
179
+ LLAMA_GRETYPE_ALT = 1,
180
+
181
+ // non-terminal element: reference to rule
182
+ LLAMA_GRETYPE_RULE_REF = 2,
183
+
184
+ // terminal element: character (code point)
185
+ LLAMA_GRETYPE_CHAR = 3,
186
+
187
+ // inverse char(s) ([^a], [^a-b] [^abc])
188
+ LLAMA_GRETYPE_CHAR_NOT = 4,
189
+
190
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
191
+ // be an inclusive range ([a-z])
192
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
193
+
194
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
195
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
196
+ LLAMA_GRETYPE_CHAR_ALT = 6,
197
  };
198
 
199
+ typedef struct llama_grammar_element {
200
+ enum llama_gretype type;
201
+ uint32_t value; // Unicode code point or rule ID
202
+ } llama_grammar_element;
203
+
204
+ // performance timing information
205
+ struct llama_timings {
206
+ double t_start_ms;
207
+ double t_end_ms;
208
+ double t_load_ms;
209
+ double t_sample_ms;
210
+ double t_p_eval_ms;
211
+ double t_eval_ms;
212
+
213
+ int32_t n_sample;
214
+ int32_t n_p_eval;
215
+ int32_t n_eval;
216
+ };
217
 
218
+ LLAMA_API struct llama_context_params llama_context_default_params(void);
219
+ LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
220
 
 
221
  // Initialize the llama + ggml backend
222
+ // If numa is true, use NUMA optimizations
223
  // Call once at the start of the program
224
+ LLAMA_API void llama_backend_init(bool numa);
225
 
226
+ // Call once at the end of the program - currently only used for MPI
227
+ LLAMA_API void llama_backend_free(void);
228
 
229
+ LLAMA_API struct llama_model * llama_load_model_from_file(
 
 
 
230
  const char * path_model,
231
  struct llama_context_params params);
232
 
233
+ LLAMA_API void llama_free_model(struct llama_model * model);
234
+
235
+ LLAMA_API struct llama_context * llama_new_context_with_model(
236
+ struct llama_model * model,
237
+ struct llama_context_params params);
238
+
239
  // Frees all allocated memory
240
  LLAMA_API void llama_free(struct llama_context * ctx);
241
 
242
+ LLAMA_API int64_t llama_time_us(void);
243
+
244
+ LLAMA_API int llama_max_devices (void);
245
+ LLAMA_API bool llama_mmap_supported (void);
246
+ LLAMA_API bool llama_mlock_supported(void);
247
+
248
+ LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
249
+ LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
250
+ LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
251
+ LLAMA_API int llama_n_embd (const struct llama_context * ctx);
252
+
253
+ LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
254
+
255
+ LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
256
+ LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
257
+ LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
258
+ LLAMA_API int llama_model_n_embd (const struct llama_model * model);
259
+
260
+ // Get a string describing the model type
261
+ LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
262
+ // Returns the total size of all the tensors in the model in bytes
263
+ LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
264
+ // Returns the total number of parameters in the model
265
+ LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
266
+
267
  // Returns 0 on success
 
268
  LLAMA_API int llama_model_quantize(
269
  const char * fname_inp,
270
  const char * fname_out,
271
+ const llama_model_quantize_params * params);
 
272
 
273
  // Apply a LoRA adapter to a loaded model
274
  // path_base_model is the path to a higher quality model to use as a base for
 
276
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
277
  // will be applied on top of the previous one
278
  // Returns 0 on success
279
+ LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
280
  struct llama_context * ctx,
281
  const char * path_lora,
282
  const char * path_base_model,
283
+ int n_threads),
284
+ "please use llama_model_apply_lora_from_file instead");
285
+
286
+ LLAMA_API int llama_model_apply_lora_from_file(
287
+ const struct llama_model * model,
288
+ const char * path_lora,
289
+ const char * path_base_model,
290
+ int n_threads);
291
 
292
  // Returns the number of tokens in the KV cache
293
  LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
294
 
295
  // Sets the current rng seed.
296
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
297
 
298
  // Returns the maximum size in bytes of the state (rng, logits, embedding
299
  // and kv_cache) - will often be smaller after compacting tokens
 
323
  int n_past,
324
  int n_threads);
325
 
326
+ // Same as llama_eval, but use float matrix input directly.
327
+ LLAMA_API int llama_eval_embd(
 
 
 
 
328
  struct llama_context * ctx,
329
+ const float * embd,
330
+ int n_tokens,
331
+ int n_past,
332
+ int n_threads);
333
 
334
+ // Export a static computation graph for context of 511 and batch size of 1
335
+ // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
336
+ // parameters here to keep things simple
337
+ // IMPORTANT: do not use for anything else other than debugging and testing!
338
+ LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
339
 
340
  // Token logits obtained from the last call to llama_eval()
341
  // The logits for the last token are stored in the last row
 
348
  // shape: [n_embd] (1-dimensional)
349
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
350
 
351
+ //
352
+ // Vocab
353
+ //
354
+
355
+ LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
356
+
357
+ LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
358
+
359
+ LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
360
 
361
  // Special tokens
362
+ LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
363
+ LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
364
+ LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
365
+
366
+ //
367
+ // Tokenization
368
+ //
369
 
370
+ // Convert the provided text into tokens.
371
+ // The tokens pointer must be large enough to hold the resulting tokens.
372
+ // Returns the number of tokens on success, no more than n_max_tokens
373
+ // Returns a negative number on failure - the number of tokens that would have been returned
374
+ LLAMA_API int llama_tokenize(
375
+ struct llama_context * ctx,
376
+ const char * text,
377
+ llama_token * tokens,
378
+ int n_max_tokens,
379
+ bool add_bos);
380
+
381
+ LLAMA_API int llama_tokenize_with_model(
382
+ const struct llama_model * model,
383
+ const char * text,
384
+ llama_token * tokens,
385
+ int n_max_tokens,
386
+ bool add_bos);
387
+
388
+ // Token Id -> Piece.
389
+ // Uses the vocabulary in the provided context.
390
+ // Does not write null terminator to the buffer.
391
+ // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
392
+ LLAMA_API int llama_token_to_piece(
393
+ const struct llama_context * ctx,
394
+ llama_token token,
395
+ char * buf,
396
+ int length);
397
+
398
+ LLAMA_API int llama_token_to_piece_with_model(
399
+ const struct llama_model * model,
400
+ llama_token token,
401
+ char * buf,
402
+ int length);
403
+
404
+ //
405
+ // Grammar
406
+ //
407
+
408
+ LLAMA_API struct llama_grammar * llama_grammar_init(
409
+ const llama_grammar_element ** rules,
410
+ size_t n_rules,
411
+ size_t start_rule_index);
412
+
413
+ LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
414
+
415
+ LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
416
+
417
+ //
418
  // Sampling functions
419
+ //
420
 
421
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
422
  LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
 
424
  /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
425
  LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
426
 
427
+ /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
428
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
429
+ /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
430
+ /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
431
+ LLAMA_API void llama_sample_classifier_free_guidance(
432
+ struct llama_context * ctx,
433
+ llama_token_data_array * candidates,
434
+ struct llama_context * guidance_ctx,
435
+ float scale);
436
+
437
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
438
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
439
 
 
450
  LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
451
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
452
 
453
+ /// @details Apply constraints from grammar
454
+ LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
455
+
456
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
457
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
458
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
 
474
  /// @details Randomly selects a token from the candidates based on their probabilities.
475
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
476
 
477
+ /// @details Accepts the sampled token into the grammar
478
+ LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
479
+
480
+ //
481
+ // Beam search
482
+ //
483
+
484
+ struct llama_beam_view {
485
+ const llama_token * tokens;
486
+ size_t n_tokens;
487
+ float p; // Cumulative beam probability (renormalized relative to all beams)
488
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
489
+ };
490
+
491
+ // Passed to beam_search_callback function.
492
+ // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
493
+ // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
494
+ // These pointers are valid only during the synchronous callback, so should not be saved.
495
+ struct llama_beams_state {
496
+ struct llama_beam_view * beam_views;
497
+ size_t n_beams; // Number of elements in beam_views[].
498
+ size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
499
+ bool last_call; // True iff this is the last callback invocation.
500
+ };
501
+
502
+ // Type of pointer to the beam_search_callback function.
503
+ // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
504
+ // passed back to beam_search_callback. This avoids having to use global variables in the callback.
505
+ typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
506
+
507
+ /// @details Deterministically returns entire sentence constructed by a beam search.
508
+ /// @param ctx Pointer to the llama_context.
509
+ /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
510
+ /// @param callback_data A pointer that is simply passed back to callback.
511
+ /// @param n_beams Number of beams to use.
512
+ /// @param n_past Number of tokens already evaluated.
513
+ /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
514
+ /// @param n_threads Number of threads as passed to llama_eval().
515
+ LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
516
+
517
  // Performance information
518
+ LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
519
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
520
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);
521
 
522
  // Print system information
523
  LLAMA_API const char * llama_print_system_info(void);
524
 
525
+ // Set callback for all future logging events.
526
+ // If this is not called, or NULL is supplied, everything is output on stderr.
527
+ LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
528
+
529
+ LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
530
+
531
  #ifdef __cplusplus
532
  }
533
  #endif
 
537
 
538
  #include <vector>
539
  #include <string>
540
+
541
  struct ggml_tensor;
542
 
543
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
544
 
545
+ #endif // LLAMA_API_INTERNAL
546
 
547
  #endif // LLAMA_H
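
For readers following the header changes above, here is a minimal, illustrative sketch (not part of the commit) of the new split model/context lifecycle declared in `llama.h`. The model path reuses the one from the README above; the `NULL` check assumes the loader reports failure the same way the old `llama_init_from_file` did.

```cpp
// Illustrative only: the new lifecycle from this header, in the order talk-llama uses it.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(true); // true enables NUMA optimizations

    llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx = 2048;
    lparams.seed  = 1;

    // Model and context are now created separately
    llama_model * model = llama_load_model_from_file("models/llama-13b/ggml-model-q4_0.gguf", lparams);
    if (model == NULL) { // assumed to return NULL on failure, like the old llama_init_from_file
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);

    // ... llama_tokenize(), llama_eval(), sampling ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```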
examples/talk-llama/speak CHANGED
File without changes
examples/talk-llama/talk-llama.cpp CHANGED
@@ -25,6 +25,20 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
25
  return res;
26
  }
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  // command-line parameters
29
  struct whisper_params {
30
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -33,14 +47,14 @@ struct whisper_params {
33
  int32_t max_tokens = 32;
34
  int32_t audio_ctx = 0;
35
 
36
- float vad_thold = 0.6f;
37
- float freq_thold = 100.0f;
38
 
39
- bool speed_up = false;
40
- bool translate = false;
41
- bool print_special = false;
42
- bool print_energy = false;
43
- bool no_timestamps = true;
44
  bool verbose_prompt = false;
45
 
46
  std::string person = "Georgi";
@@ -235,7 +249,7 @@ int main(int argc, char ** argv) {
235
 
236
  // llama init
237
 
238
- llama_init_backend();
239
 
240
  auto lparams = llama_context_default_params();
241
 
@@ -244,7 +258,9 @@ int main(int argc, char ** argv) {
244
  lparams.seed = 1;
245
  lparams.f16_kv = true;
246
 
247
- struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
 
 
248
 
249
  // print some info about the processing
250
  {
@@ -267,7 +283,6 @@ int main(int argc, char ** argv) {
267
  fprintf(stderr, "\n");
268
  }
269
 
270
-
271
  // init audio
272
 
273
  audio_async audio(30*1000);
@@ -278,8 +293,6 @@ int main(int argc, char ** argv) {
278
 
279
  audio.resume();
280
 
281
- int n_iter = 0;
282
-
283
  bool is_running = true;
284
  bool force_speak = false;
285
 
@@ -514,7 +527,7 @@ int main(int argc, char ** argv) {
514
  //printf("\n---\n");
515
  //printf("resetting: '");
516
  //for (int i = 0; i < (int) embd.size(); i++) {
517
- // printf("%s", llama_token_to_str(ctx_llama, embd[i]));
518
  //}
519
  //printf("'\n");
520
  //printf("\n---\n");
@@ -582,7 +595,7 @@ int main(int argc, char ** argv) {
582
  auto logits = llama_get_logits(ctx_llama);
583
  auto n_vocab = llama_n_vocab(ctx_llama);
584
 
585
- logits[llama_token_eos()] = 0;
586
 
587
  std::vector<llama_token_data> candidates;
588
  candidates.reserve(n_vocab);
@@ -593,13 +606,13 @@ int main(int argc, char ** argv) {
593
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
594
 
595
  // apply repeat penalty
596
- const float nl_logit = logits[llama_token_nl()];
597
 
598
  llama_sample_repetition_penalty(ctx_llama, &candidates_p,
599
  embd_inp.data() + std::max(0, n_past - repeat_last_n),
600
  repeat_last_n, repeat_penalty);
601
 
602
- logits[llama_token_nl()] = nl_logit;
603
 
604
  if (temp <= 0) {
605
  // Greedy sampling
@@ -613,22 +626,22 @@ int main(int argc, char ** argv) {
613
  }
614
  }
615
 
616
- if (id != llama_token_eos()) {
617
  // add it to the context
618
  embd.push_back(id);
619
 
620
- text_to_speak += llama_token_to_str(ctx_llama, id);
621
 
622
- printf("%s", llama_token_to_str(ctx_llama, id));
623
  }
624
  }
625
 
626
  {
627
  std::string last_output;
628
  for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
629
- last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
630
  }
631
- last_output += llama_token_to_str(ctx_llama, embd[0]);
632
 
633
  for (std::string & antiprompt : antiprompts) {
634
  if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
@@ -655,8 +668,6 @@ int main(int argc, char ** argv) {
655
  }
656
 
657
  audio.clear();
658
-
659
- ++n_iter;
660
  }
661
  }
662
  }
 
25
  return res;
26
  }
27
 
28
+ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
29
+ std::vector<char> result(8, 0);
30
+ const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
31
+ if (n_tokens < 0) {
32
+ result.resize(-n_tokens);
33
+ int check = llama_token_to_piece(ctx, token, result.data(), result.size());
34
+ GGML_ASSERT(check == -n_tokens);
35
+ } else {
36
+ result.resize(n_tokens);
37
+ }
38
+
39
+ return std::string(result.data(), result.size());
40
+ }
41
+
42
  // command-line parameters
43
  struct whisper_params {
44
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
 
47
  int32_t max_tokens = 32;
48
  int32_t audio_ctx = 0;
49
 
50
+ float vad_thold = 0.6f;
51
+ float freq_thold = 100.0f;
52
 
53
+ bool speed_up = false;
54
+ bool translate = false;
55
+ bool print_special = false;
56
+ bool print_energy = false;
57
+ bool no_timestamps = true;
58
  bool verbose_prompt = false;
59
 
60
  std::string person = "Georgi";
 
249
 
250
  // llama init
251
 
252
+ llama_backend_init(true);
253
 
254
  auto lparams = llama_context_default_params();
255
 
 
258
  lparams.seed = 1;
259
  lparams.f16_kv = true;
260
 
261
+ struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lparams);
262
+
263
+ struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lparams);
264
 
265
  // print some info about the processing
266
  {
 
283
  fprintf(stderr, "\n");
284
  }
285
 
 
286
  // init audio
287
 
288
  audio_async audio(30*1000);
 
293
 
294
  audio.resume();
295
 
 
 
296
  bool is_running = true;
297
  bool force_speak = false;
298
 
 
527
  //printf("\n---\n");
528
  //printf("resetting: '");
529
  //for (int i = 0; i < (int) embd.size(); i++) {
530
+ // printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
531
  //}
532
  //printf("'\n");
533
  //printf("\n---\n");
 
595
  auto logits = llama_get_logits(ctx_llama);
596
  auto n_vocab = llama_n_vocab(ctx_llama);
597
 
598
+ logits[llama_token_eos(ctx_llama)] = 0;
599
 
600
  std::vector<llama_token_data> candidates;
601
  candidates.reserve(n_vocab);
 
606
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
607
 
608
  // apply repeat penalty
609
+ const float nl_logit = logits[llama_token_nl(ctx_llama)];
610
 
611
  llama_sample_repetition_penalty(ctx_llama, &candidates_p,
612
  embd_inp.data() + std::max(0, n_past - repeat_last_n),
613
  repeat_last_n, repeat_penalty);
614
 
615
+ logits[llama_token_nl(ctx_llama)] = nl_logit;
616
 
617
  if (temp <= 0) {
618
  // Greedy sampling
 
626
  }
627
  }
628
 
629
+ if (id != llama_token_eos(ctx_llama)) {
630
  // add it to the context
631
  embd.push_back(id);
632
 
633
+ text_to_speak += llama_token_to_piece(ctx_llama, id);
634
 
635
+ printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
636
  }
637
  }
638
 
639
  {
640
  std::string last_output;
641
  for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
642
+ last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
643
  }
644
+ last_output += llama_token_to_piece(ctx_llama, embd[0]);
645
 
646
  for (std::string & antiprompt : antiprompts) {
647
  if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
 
668
  }
669
 
670
  audio.clear();
 
 
671
  }
672
  }
673
  }
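
As a companion to the talk-llama.cpp changes above, a small, hedged sketch of the updated tokenization round trip: `llama_tokenize` still returns a negative count when the buffer is too small, and token ids are turned back into text through `llama_token_to_piece`, the same pattern as the helper added to talk-llama.cpp. The model path is again the README's; everything else follows the header shown earlier, and error handling is omitted for brevity.

```cpp
// Illustrative round trip through the updated tokenizer API (not part of the commit).
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_backend_init(false);

    llama_context_params lparams = llama_context_default_params();

    llama_model   * model = llama_load_model_from_file("models/llama-13b/ggml-model-q4_0.gguf", lparams);
    llama_context * ctx   = llama_new_context_with_model(model, lparams);

    const std::string text = "Hello Georgi";

    // negative return value = buffer too small; its magnitude is the required size
    std::vector<llama_token> tokens(text.size() + 8);
    int n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) {
        tokens.resize(-n);
        n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), true);
    }
    tokens.resize(n);

    // token id -> piece, as in the helper added to talk-llama.cpp
    for (const llama_token id : tokens) {
        std::vector<char> buf(8, 0);
        int len = llama_token_to_piece(ctx, id, buf.data(), (int) buf.size());
        if (len < 0) {
            buf.resize(-len);
            len = llama_token_to_piece(ctx, id, buf.data(), (int) buf.size());
        }
        printf("%d -> '%.*s'\n", id, len, buf.data());
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```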