Introduce prompt caching so prompts load instantly
This change also introduces an ephemeral status line in non-verbose mode that displays a load percentage while slow operations are in progress.
jart committed Apr 28, 2023
1 parent bf6459e commit b31ba86
Showing 7 changed files with 330 additions and 100 deletions.
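
To make the caching idea concrete, here is a minimal sketch of how a prompt cache along these lines can work. This is an illustration written against standard C++ headers, not the code from this commit: the .prompt.jtlp file name and the new prompt_path parameter come from the diff below, while the PromptCache struct, the function names, and the on-disk layout are hypothetical (the real format likely differs; the READ32BE macro added to llama.cc suggests big-endian header fields, for instance).

// Illustrative sketch only: persist the result of evaluating a prompt so
// that later runs with the same prompt can skip the slow evaluation step.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct PromptCache {
    std::vector<int32_t> tokens;  // tokens of the prompt that was evaluated
    std::vector<uint8_t> state;   // opaque blob, e.g. the model's KV memory
};

// Save: header (magic + counts), then the token ids, then the state blob.
static bool save_prompt_cache(const std::string &path, const PromptCache &pc) {
    FILE *f = fopen(path.c_str(), "wb");
    if (!f) return false;
    uint32_t magic = 0x6a746c70;  // hypothetical "jtlp" marker
    uint32_t ntok = (uint32_t)pc.tokens.size();
    uint32_t nstate = (uint32_t)pc.state.size();
    bool ok = fwrite(&magic, 4, 1, f) == 1 &&
              fwrite(&ntok, 4, 1, f) == 1 &&
              fwrite(&nstate, 4, 1, f) == 1 &&
              fwrite(pc.tokens.data(), 4, ntok, f) == ntok &&
              fwrite(pc.state.data(), 1, nstate, f) == nstate;
    fclose(f);
    return ok;
}

// Load: returns false on any mismatch so the caller can fall back to
// evaluating the prompt from scratch (and then rewrite the cache).
static bool load_prompt_cache(const std::string &path, PromptCache *pc) {
    FILE *f = fopen(path.c_str(), "rb");
    if (!f) return false;
    uint32_t magic, ntok, nstate;
    bool ok = fread(&magic, 4, 1, f) == 1 && magic == 0x6a746c70 &&
              fread(&ntok, 4, 1, f) == 1 &&
              fread(&nstate, 4, 1, f) == 1;
    if (ok) {
        pc->tokens.resize(ntok);
        pc->state.resize(nstate);
        ok = fread(pc->tokens.data(), 4, ntok, f) == ntok &&
             fread(pc->state.data(), 1, nstate, f) == nstate;
    }
    fclose(f);
    return ok;
}

On startup the program would try load_prompt_cache(params.prompt_path, &pc) and only fall back to the normal slow evaluation (followed by saving a fresh cache) when loading fails or the cached tokens do not match the prompt actually supplied; that mismatch check is what keeps a stale cache from silently corrupting generation. The exact wiring in the real code is not shown in the portion of the diff reproduced here.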
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
 # -*- conf -*-
 
 /o
+/.prompt.jtlp
 
 # TODO: Find some way to have Python write to o/
 __pycache__
2 changes: 2 additions & 0 deletions third_party/ggml/README.cosmo
@@ -16,7 +16,9 @@ ORIGIN
 
 LOCAL CHANGES
 
+- Make it possible for loaded prompts to be cached to disk
 - Introduce -v and --verbose flags
+- Reduce batch size from 512 to 32
 - Don't print stats / diagnostics unless -v is passed
 - Reduce --top_p default from 0.95 to 0.70
 - Change --reverse-prompt to no longer imply --interactive
4 changes: 2 additions & 2 deletions third_party/ggml/common.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │ │
 │ llama.cpp │
5 changes: 3 additions & 2 deletions third_party/ggml/common.h
@@ -23,7 +23,7 @@ struct gpt_params
 int32_t repeat_last_n = 64; // last n tokens to penalize
 int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
 int32_t n_ctx = 512; // context size
-int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+int32_t n_batch = 32; // batch size for prompt processing (must be >=32 to use BLAS)
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 
 // sampling parameters
@@ -34,6 +34,7 @@ struct gpt_params
 
 std::string model = "models/lamma-7B/ggml-model.bin"; // model path
 std::string prompt = "";
+std::string prompt_path = ".prompt.jtlp";
 std::string input_prefix = ""; // string to prefix user inputs with
 std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
@@ -42,7 +43,7 @@
 
 bool memory_f16 = true; // use f16 instead of f32 for memory kv
 bool random_prompt = false; // do not randomize prompt if none provided
-bool use_color = false; // use color to distinguish generations and inputs
+bool use_color = isatty(1) == 1; // use color to distinguish generations and inputs
 bool interactive = false; // interactive mode
 
 bool embedding = false; // get only sentence embedding
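
Two details in the gpt_params changes above are easy to miss. The new default use_color = isatty(1) == 1 enables ANSI color only when file descriptor 1 (standard output) is a terminal, so piped or redirected output stays free of escape codes. The ephemeral status line described in the commit message relies on the same kind of terminal check: draw a percentage, overwrite it in place with a carriage return, and erase it once the slow operation finishes. The snippet below is a generic illustration of that pattern, not the commit's implementation; the choice of stderr and the exact escape sequence are assumptions.

// Generic sketch of an ephemeral progress line (not this commit's code).
#include <cstdio>
#include <unistd.h>  // isatty

// Prints "loading... NN%" and overwrites it on each call; wipes the line
// once the operation completes. Does nothing when output is not a terminal.
static void show_progress(size_t done, size_t total) {
    if (!isatty(2) || total == 0) return;
    fprintf(stderr, "\rloading... %3zu%%", done * 100 / total);
    if (done >= total) fprintf(stderr, "\r\033[K");  // CR, then erase to end of line
    fflush(stderr);
}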
1 change: 1 addition & 0 deletions third_party/ggml/ggml.mk
@@ -72,6 +72,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \
 LIBC_NEXGEN32E \
 LIBC_RUNTIME \
 LIBC_STDIO \
+LIBC_LOG \
 LIBC_STR \
 LIBC_STUBS \
 LIBC_SYSV \
59 changes: 23 additions & 36 deletions third_party/ggml/llama.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │ │
 │ llama.cpp │
@@ -25,53 +25,40 @@
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
│ │
╚─────────────────────────────────────────────────────────────────────────────*/

asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstdio"
#endif

#include "third_party/ggml/llama_util.h"
#include "third_party/ggml/llama.h"

#include "libc/intrin/bits.h"
#include "third_party/ggml/ggml.h"

#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/array"
#include "third_party/libcxx/ctime"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cinttypes"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/ctime"
#include "third_party/libcxx/fstream"
#include "third_party/libcxx/random"
#include "third_party/libcxx/initializer_list"
#include "third_party/libcxx/map"
#include "third_party/libcxx/unordered_map"
#include "third_party/libcxx/queue"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/memory"
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/initializer_list"
#include "third_party/libcxx/thread"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/mutex"
#include "third_party/libcxx/queue"
#include "third_party/libcxx/random"
#include "third_party/libcxx/sstream"
#include "third_party/libcxx/thread"
#include "third_party/libcxx/unordered_map"

asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16

#define READ32BE(s) \
((uint32_t)((const uint8_t *)(s))[0] << 030 | \
(uint32_t)((const uint8_t *)(s))[1] << 020 | \
(uint32_t)((const uint8_t *)(s))[2] << 010 | \
(uint32_t)((const uint8_t *)(s))[3] << 000)

// available llama models
enum e_model {
MODEL_UNKNOWN,
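
The READ32BE macro added above assembles four bytes into a 32-bit value, most significant byte first, regardless of the host's byte order, which is the standard way to read a fixed on-disk header field portably. A plausible but unconfirmed use in this commit is validating a magic number at the front of the .prompt.jtlp cache file. A self-contained illustration (the macro body is copied from the diff; the "jtlp" value is hypothetical):

#include <cstdint>
#include <cstdio>

#define READ32BE(s) \
    ((uint32_t)((const uint8_t *)(s))[0] << 030 | \
     (uint32_t)((const uint8_t *)(s))[1] << 020 | \
     (uint32_t)((const uint8_t *)(s))[2] << 010 | \
     (uint32_t)((const uint8_t *)(s))[3] << 000)

int main() {
    unsigned char hdr[4] = {'j', 't', 'l', 'p'};
    uint32_t magic = READ32BE(hdr);  // 0x6a746c70 on little- and big-endian hosts alike
    printf("0x%08x\n", magic);       // a loader would compare this with its expected constant
    return 0;
}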
