talk-llama : update to latest llama.cpp
Files changed:

- examples/talk-llama/README.md (+2 -2)
- examples/talk-llama/llama-util.h (+0 -474)
- examples/talk-llama/llama.cpp (+0 -0)
- examples/talk-llama/llama.h (+341 -67)
- examples/talk-llama/speak (+0 -0)
- examples/talk-llama/talk-llama.cpp (+34 -23)
examples/talk-llama/README.md
CHANGED

````diff
@@ -19,7 +19,7 @@ brew install sdl2
 make talk-llama
 
 # Run it
-./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/
+./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
 ```
 
 - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
@@ -36,7 +36,7 @@ This feature is especially helpful for maintaining context in long conversations
 Example usage:
 
 ```bash
-./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/
+./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
 ```
 
 ## TTS
````
examples/talk-llama/llama-util.h
DELETED

The entire header (474 lines) was removed:

```cpp
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.

#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H

#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>

#include <string>
#include <vector>
#include <stdexcept>

#ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
        #endif
    #endif
#endif

#if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h> // for _fseeki64
#endif

#define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    LLAMA_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        LLAMA_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, len, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
            throw std::runtime_error(std::string("unexpectedly reached end of file"));
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, len, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
    LPSTR buf;
    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
    if (!size) {
        return "FormatMessageA failed";
    }
    std::string ret(buf, size);
    LocalFree(buf);
    return ret;
}
#endif

struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
#ifdef __linux__
        flags |= MAP_POPULATE;
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();

        if (hMapping == NULL) {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
        if (prefetch) {
            // Advise the kernel to preload the mapped memory
            WIN32_MEMORY_RANGE_ENTRY range;
            range.VirtualAddress = addr;
            range.NumberOfBytes = (SIZE_T)size;
            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
            }
        }
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    llama_mmap(struct llama_file *, bool prefetch = true) {
        (void)prefetch;
        throw std::runtime_error(std::string("mmap not supported"));
    }
#endif
};

// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;

    llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;

    ~llama_mlock() {
        if (size) {
            raw_unlock(addr, size);
        }
    }

    void init(void * ptr) {
        LLAMA_ASSERT(addr == NULL && size == 0);
        addr = ptr;
    }

    void grow_to(size_t target_size) {
        LLAMA_ASSERT(addr);
        if (failed_already) {
            return;
        }
        size_t granularity = lock_granularity();
        target_size = (target_size + granularity - 1) & ~(granularity - 1);
        if (target_size > size) {
            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
                size = target_size;
            } else {
                failed_already = true;
            }
        }
    }

#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        return (size_t) sysconf(_SC_PAGESIZE);
    }

    #ifdef __APPLE__
        #define MLOCK_SUGGESTION \
            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
            "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
    #else
        #define MLOCK_SUGGESTION \
            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
    #endif

    bool raw_lock(const void * addr, size_t size) {
        if (!mlock(addr, size)) {
            return true;
        } else {
            char* errmsg = std::strerror(errno);
            bool suggest = (errno == ENOMEM);

            // Check if the resource limit is fine after all
            struct rlimit lock_limit;
            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
                suggest = false;
            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
                suggest = false;

            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }

    #undef MLOCK_SUGGESTION

    void raw_unlock(void * addr, size_t size) {
        if (munlock(addr, size)) {
            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
        }
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return (size_t) si.dwPageSize;
    }

    bool raw_lock(void * ptr, size_t len) {
        for (int tries = 1; ; tries++) {
            if (VirtualLock(ptr, len)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                        len, size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

            // It failed but this was only the first try; increase the working
            // set size and try again.
            SIZE_T min_ws_size, max_ws_size;
            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
            // Per MSDN: "The maximum number of pages that a process can lock
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
            size_t increment = len + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
        }
    }

    void raw_unlock(void * ptr, size_t len) {
        if (!VirtualUnlock(ptr, len)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    size_t lock_granularity() {
        return (size_t) 65536;
    }

    bool raw_lock(const void * addr, size_t len) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false;
    }

    void raw_unlock(const void * addr, size_t len) {}
#endif
};

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    llama_buffer() = default;

    void resize(size_t len) {
        delete[] addr;
        addr = new uint8_t[len];
        size = len;
    }

    ~llama_buffer() {
        delete[] addr;
    }

    // disable copy and move
    llama_buffer(const llama_buffer&) = delete;
    llama_buffer(llama_buffer&&) = delete;
    llama_buffer& operator=(const llama_buffer&) = delete;
    llama_buffer& operator=(llama_buffer&&) = delete;
};

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
    uint8_t * addr = NULL;
    bool is_cuda;
    size_t size = 0;

    llama_ctx_buffer() = default;

    void resize(size_t size) {
        free();

        addr = (uint8_t *) ggml_cuda_host_malloc(size);
        if (addr) {
            is_cuda = true;
        }
        else {
            // fall back to pageable memory
            addr = new uint8_t[size];
            is_cuda = false;
        }
        this->size = size;
    }

    void free() {
        if (addr) {
            if (is_cuda) {
                ggml_cuda_host_free(addr);
            }
            else {
                delete[] addr;
            }
        }
        addr = NULL;
    }

    ~llama_ctx_buffer() {
        free();
    }

    // disable copy and move
    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif

#endif
```
examples/talk-llama/llama.cpp
CHANGED

The diff for this file is too large to render. See raw diff.
examples/talk-llama/llama.h
CHANGED

```diff
@@ -1,8 +1,16 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
@@ -19,17 +27,25 @@
 #    define LLAMA_API
 #endif
 
-#
-#define
-#
-#define
-#
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define
-
-#define
-#define
-
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -41,10 +57,57 @@ extern "C" {
     // TODO: show sample usage
     //
 
+    struct llama_model;
     struct llama_context;
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -60,67 +123,152 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-
-
-
+        uint32_t seed;              // RNG seed, -1 for random
+        int32_t  n_ctx;             // text context
+        int32_t  n_batch;           // prompt processing batch size
+        int32_t  n_gpu_layers;      // number of layers to store in VRAM
+        int32_t  main_gpu;          // the GPU that is used for scratch and small tensors
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;       // RoPE base frequency
+        float rope_freq_scale;      // RoPE frequency scaling factor
+
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
 
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
 
-    //
-
-
-
-
-
-
-
-
-
-
-
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+    } llama_model_quantize_params;
+
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
     };
 
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
-
 
-    LLAMA_API
-    LLAMA_API
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void
+    LLAMA_API void llama_backend_init(bool numa);
 
-
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
-
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,
             struct llama_context_params   params);
 
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+                     struct llama_model * model,
+            struct llama_context_params   params);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
+
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
@@ -128,17 +276,24 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
                       const char * path_lora,
                       const char * path_base_model,
-                             int   n_threads)
+                             int   n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                          const char * path_lora,
+                          const char * path_base_model,
+                                 int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx,
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -168,21 +323,19 @@ extern "C" {
                              int   n_past,
                              int   n_threads);
 
-    //
-
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
             struct llama_context * ctx,
-
-
-            int
-
+                     const float * embd,
+                             int   n_tokens,
+                             int   n_past,
+                             int   n_threads);
 
-
-
-
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
@@ -195,15 +348,75 @@ extern "C" {
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    //
-
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+
+    //
+    // Tokenization
+    //
 
+    // Convert the provided text into tokens.
+    // The tokens pointer must be large enough to hold the resulting tokens.
+    // Returns the number of tokens on success, no more than n_max_tokens
+    // Returns a negative number on failure - the number of tokens that would have been returned
+    LLAMA_API int llama_tokenize(
            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
+            const struct llama_context * ctx,
+                           llama_token   token,
+                                  char * buf,
+                                   int   length);
+
+    LLAMA_API int llama_token_to_piece_with_model(
+              const struct llama_model * model,
+                           llama_token   token,
+                                  char * buf,
+                                   int   length);
+
+    //
+    // Grammar
+    //
+
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                    size_t n_rules,
+                                    size_t start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@@ -211,6 +424,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+              struct llama_context * ctx,
+            llama_token_data_array * candidates,
+              struct llama_context * guidance_ctx,
+                             float   scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
@@ -227,6 +450,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -248,13 +474,60 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;              // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool   last_call;            // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
@@ -264,10 +537,11 @@ extern "C" {
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
```
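The update splits model loading from context creation: the old `llama_init_from_file` is replaced by `llama_load_model_from_file` plus `llama_new_context_with_model`, bracketed by `llama_backend_init`/`llama_backend_free`. A minimal sketch of the new lifecycle against the header above; the model path `model.gguf` is a placeholder and error handling is reduced to the essentials:

```cpp
// Sketch of the new model/context lifecycle; "model.gguf" is a placeholder path.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(/*numa =*/ false);

    llama_context_params lparams = llama_context_default_params();

    llama_model * model = llama_load_model_from_file("model.gguf", lparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);

    // Special-token accessors now take the context as an argument.
    const llama_token bos = llama_token_bos(ctx);
    printf("vocab size: %d, bos: %d\n", llama_n_vocab(ctx), bos);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```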
examples/talk-llama/speak
CHANGED

File without changes
examples/talk-llama/talk-llama.cpp
CHANGED

```diff
@@ -25,6 +25,20 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
 // command-line parameters
 struct whisper_params {
     int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -33,14 +47,14 @@ struct whisper_params {
     int32_t max_tokens = 32;
     int32_t audio_ctx  = 0;
 
-    float vad_thold
-    float freq_thold
+    float vad_thold     = 0.6f;
+    float freq_thold    = 100.0f;
 
-    bool speed_up
-    bool translate
-    bool print_special
-    bool print_energy
-    bool no_timestamps
+    bool speed_up       = false;
+    bool translate      = false;
+    bool print_special  = false;
+    bool print_energy   = false;
+    bool no_timestamps  = true;
     bool verbose_prompt = false;
 
     std::string person = "Georgi";
@@ -235,7 +249,7 @@ int main(int argc, char ** argv) {
 
     // llama init
 
-
+    llama_backend_init(true);
 
     auto lparams = llama_context_default_params();
 
@@ -244,7 +258,9 @@ int main(int argc, char ** argv) {
     lparams.seed   = 1;
     lparams.f16_kv = true;
 
-    struct
+    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lparams);
+
+    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lparams);
 
     // print some info about the processing
     {
@@ -267,7 +283,6 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "\n");
     }
 
-
     // init audio
 
     audio_async audio(30*1000);
@@ -278,8 +293,6 @@ int main(int argc, char ** argv) {
 
     audio.resume();
 
-    int n_iter = 0;
-
     bool is_running  = true;
    bool force_speak = false;
 
@@ -514,7 +527,7 @@ int main(int argc, char ** argv) {
                     //printf("\n---\n");
                     //printf("resetting: '");
                     //for (int i = 0; i < (int) embd.size(); i++) {
-                    //    printf("%s",
+                    //    printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
                     //}
                     //printf("'\n");
                     //printf("\n---\n");
@@ -582,7 +595,7 @@ int main(int argc, char ** argv) {
                     auto logits  = llama_get_logits(ctx_llama);
                     auto n_vocab = llama_n_vocab(ctx_llama);
 
-                    logits[llama_token_eos()] = 0;
+                    logits[llama_token_eos(ctx_llama)] = 0;
 
                     std::vector<llama_token_data> candidates;
                     candidates.reserve(n_vocab);
@@ -593,13 +606,13 @@ int main(int argc, char ** argv) {
                     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
                     // apply repeat penalty
-                    const float nl_logit = logits[llama_token_nl()];
+                    const float nl_logit = logits[llama_token_nl(ctx_llama)];
 
                     llama_sample_repetition_penalty(ctx_llama, &candidates_p,
                             embd_inp.data() + std::max(0, n_past - repeat_last_n),
                             repeat_last_n, repeat_penalty);
 
-                    logits[llama_token_nl()] = nl_logit;
+                    logits[llama_token_nl(ctx_llama)] = nl_logit;
 
                     if (temp <= 0) {
                         // Greedy sampling
@@ -613,22 +626,22 @@ int main(int argc, char ** argv) {
                         }
                     }
 
-                    if (id != llama_token_eos()) {
+                    if (id != llama_token_eos(ctx_llama)) {
                         // add it to the context
                         embd.push_back(id);
 
-                        text_to_speak +=
+                        text_to_speak += llama_token_to_piece(ctx_llama, id);
 
-                        printf("%s",
+                        printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
                     }
                 }
 
                 {
                     std::string last_output;
                     for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-                        last_output +=
+                        last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
                     }
-                    last_output +=
+                    last_output += llama_token_to_piece(ctx_llama, embd[0]);
 
                     for (std::string & antiprompt : antiprompts) {
                         if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
@@ -655,8 +668,6 @@ int main(int argc, char ** argv) {
             }
 
             audio.clear();
-
-            ++n_iter;
         }
     }
 }
```
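The new `llama_token_to_piece` wrapper above illustrates the buffer convention used throughout the updated C API: when the output buffer is too small, the call returns a negative number whose magnitude is the required size, so the caller resizes and retries once. The header documents the same convention for `llama_tokenize`; a hedged sketch of that two-pass pattern, assuming a context `ctx_llama` and a `std::string prompt` as in `main()` above:

```cpp
// Two-pass tokenization sketch using the negative-return convention;
// ctx_llama and prompt are assumed to exist as in talk-llama.cpp.
std::vector<llama_token> tokens(8);
int n = llama_tokenize(ctx_llama, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos =*/ true);
if (n < 0) {
    tokens.resize(-n); // the magnitude is the number of tokens that would have been returned
    n = llama_tokenize(ctx_llama, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos =*/ true);
}
tokens.resize(n);
```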