Introduce prompt caching so prompts load instantly
This change also introduces an ephemeral status line in non-verbose mode that displays a load percentage while slow operations are in progress.
jart committed Apr 28, 2023
1 parent bf6459e commit b31ba86
Showing 7 changed files with 330 additions and 100 deletions.
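
To make the caching idea concrete, here is a minimal sketch of how a prompt cache along these lines can work. This is an illustration written against standard C++ headers, not the code from this commit: the .prompt.jtlp file name and the new prompt_path parameter come from the diff below, while the PromptCache struct, the function names, and the on-disk layout are hypothetical (the real format likely differs; the READ32BE macro added to llama.cc suggests big-endian header fields, for instance).

// Illustrative sketch only: persist the result of evaluating a prompt so
// that later runs with the same prompt can skip the slow evaluation step.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct PromptCache {
    std::vector<int32_t> tokens;  // tokens of the prompt that was evaluated
    std::vector<uint8_t> state;   // opaque blob, e.g. the model's KV memory
};

// Save: header (magic + counts), then the token ids, then the state blob.
static bool save_prompt_cache(const std::string &path, const PromptCache &pc) {
    FILE *f = fopen(path.c_str(), "wb");
    if (!f) return false;
    uint32_t magic = 0x6a746c70;  // hypothetical "jtlp" marker
    uint32_t ntok = (uint32_t)pc.tokens.size();
    uint32_t nstate = (uint32_t)pc.state.size();
    bool ok = fwrite(&magic, 4, 1, f) == 1 &&
              fwrite(&ntok, 4, 1, f) == 1 &&
              fwrite(&nstate, 4, 1, f) == 1 &&
              fwrite(pc.tokens.data(), 4, ntok, f) == ntok &&
              fwrite(pc.state.data(), 1, nstate, f) == nstate;
    fclose(f);
    return ok;
}

// Load: returns false on any mismatch so the caller can fall back to
// evaluating the prompt from scratch (and then rewrite the cache).
static bool load_prompt_cache(const std::string &path, PromptCache *pc) {
    FILE *f = fopen(path.c_str(), "rb");
    if (!f) return false;
    uint32_t magic, ntok, nstate;
    bool ok = fread(&magic, 4, 1, f) == 1 && magic == 0x6a746c70 &&
              fread(&ntok, 4, 1, f) == 1 &&
              fread(&nstate, 4, 1, f) == 1;
    if (ok) {
        pc->tokens.resize(ntok);
        pc->state.resize(nstate);
        ok = fread(pc->tokens.data(), 4, ntok, f) == ntok &&
             fread(pc->state.data(), 1, nstate, f) == nstate;
    }
    fclose(f);
    return ok;
}

On startup the program would try load_prompt_cache(params.prompt_path, &pc) and only fall back to the normal slow evaluation (followed by saving a fresh cache) when loading fails or the cached tokens do not match the prompt actually supplied; that mismatch check is what keeps a stale cache from silently corrupting generation. The exact wiring in the real code is not shown in the portion of the diff reproduced here.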
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
 # -*- conf -*-
 
 /o
+/.prompt.jtlp
 
 # TODO: Find some way to have Python write to o/
 __pycache__
2 changes: 2 additions & 0 deletions third_party/ggml/README.cosmo
@@ -16,7 +16,9 @@ ORIGIN
 
 LOCAL CHANGES
 
+- Make it possible for loaded prompts to be cached to disk
 - Introduce -v and --verbose flags
+- Reduce batch size from 512 to 32
 - Don't print stats / diagnostics unless -v is passed
 - Reduce --top_p default from 0.95 to 0.70
 - Change --reverse-prompt to no longer imply --interactive
4 changes: 2 additions & 2 deletions third_party/ggml/common.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │ │
 │ llama.cpp │
5 changes: 3 additions & 2 deletions third_party/ggml/common.h
@@ -23,7 +23,7 @@ struct gpt_params
 int32_t repeat_last_n = 64; // last n tokens to penalize
 int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
 int32_t n_ctx = 512; // context size
-int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+int32_t n_batch = 32; // batch size for prompt processing (must be >=32 to use BLAS)
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 
 // sampling parameters
@@ -34,6 +34,7 @@ struct gpt_params
 
 std::string model = "models/lamma-7B/ggml-model.bin"; // model path
 std::string prompt = "";
+std::string prompt_path = ".prompt.jtlp";
 std::string input_prefix = ""; // string to prefix user inputs with
 std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
@@ -42,7 +43,7 @@
 
 bool memory_f16 = true; // use f16 instead of f32 for memory kv
 bool random_prompt = false; // do not randomize prompt if none provided
-bool use_color = false; // use color to distinguish generations and inputs
+bool use_color = isatty(1) == 1; // use color to distinguish generations and inputs
 bool interactive = false; // interactive mode
 
 bool embedding = false; // get only sentence embedding
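
Two details in the gpt_params changes above are easy to miss. The new default use_color = isatty(1) == 1 enables ANSI color only when file descriptor 1 (standard output) is a terminal, so piped or redirected output stays free of escape codes. The ephemeral status line described in the commit message relies on the same kind of terminal check: draw a percentage, overwrite it in place with a carriage return, and erase it once the slow operation finishes. The snippet below is a generic illustration of that pattern, not the commit's implementation; the choice of stderr and the exact escape sequence are assumptions.

// Generic sketch of an ephemeral progress line (not this commit's code).
#include <cstdio>
#include <unistd.h>  // isatty

// Prints "loading... NN%" and overwrites it on each call; wipes the line
// once the operation completes. Does nothing when output is not a terminal.
static void show_progress(size_t done, size_t total) {
    if (!isatty(2) || total == 0) return;
    fprintf(stderr, "\rloading... %3zu%%", done * 100 / total);
    if (done >= total) fprintf(stderr, "\r\033[K");  // CR, then erase to end of line
    fflush(stderr);
}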
1 change: 1 addition & 0 deletions third_party/ggml/ggml.mk
@@ -72,6 +72,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \
 LIBC_NEXGEN32E \
 LIBC_RUNTIME \
 LIBC_STDIO \
+LIBC_LOG \
 LIBC_STR \
 LIBC_STUBS \
 LIBC_SYSV \
59 changes: 23 additions & 36 deletions third_party/ggml/llama.cc
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │ │
 │ llama.cpp │
@@ -25,53 +25,40 @@
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
│ │
╚─────────────────────────────────────────────────────────────────────────────*/

asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstdio"
#endif

#include "third_party/ggml/llama_util.h"
#include "third_party/ggml/llama.h"

#include "libc/intrin/bits.h"
#include "third_party/ggml/ggml.h"

#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/array"
#include "third_party/libcxx/ctime"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cinttypes"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/ctime"
#include "third_party/libcxx/fstream"
#include "third_party/libcxx/random"
#include "third_party/libcxx/initializer_list"
#include "third_party/libcxx/map"
#include "third_party/libcxx/unordered_map"
#include "third_party/libcxx/queue"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/memory"
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/initializer_list"
#include "third_party/libcxx/thread"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/mutex"
#include "third_party/libcxx/queue"
#include "third_party/libcxx/random"
#include "third_party/libcxx/sstream"
#include "third_party/libcxx/thread"
#include "third_party/libcxx/unordered_map"

asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16

#define READ32BE(s) \
((uint32_t)((const uint8_t *)(s))[0] << 030 | \
(uint32_t)((const uint8_t *)(s))[1] << 020 | \
(uint32_t)((const uint8_t *)(s))[2] << 010 | \
(uint32_t)((const uint8_t *)(s))[3] << 000)

// available llama models
enum e_model {
MODEL_UNKNOWN,
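
The READ32BE macro added above assembles four bytes into a 32-bit value, most significant byte first, regardless of the host's byte order, which is the standard way to read a fixed on-disk header field portably. A plausible but unconfirmed use in this commit is validating a magic number at the front of the .prompt.jtlp cache file. A self-contained illustration (the macro body is copied from the diff; the "jtlp" value is hypothetical):

#include <cstdint>
#include <cstdio>

#define READ32BE(s) \
    ((uint32_t)((const uint8_t *)(s))[0] << 030 | \
     (uint32_t)((const uint8_t *)(s))[1] << 020 | \
     (uint32_t)((const uint8_t *)(s))[2] << 010 | \
     (uint32_t)((const uint8_t *)(s))[3] << 000)

int main() {
    unsigned char hdr[4] = {'j', 't', 'l', 'p'};
    uint32_t magic = READ32BE(hdr);  // 0x6a746c70 on little- and big-endian hosts alike
    printf("0x%08x\n", magic);       // a loader would compare this with its expected constant
    return 0;
}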
