# llamacpp-compose-gpu.yml
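# Docker Compose stack wiring the Cheshire Cat core to a Qdrant vector store
# and a GPU-enabled llama-cpp-python server.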
version: '3.7'
services:
  cheshire-cat-core:
    build:
      context: ./core
    container_name: cheshire_cat_core
    depends_on:
      - cheshire-cat-vector-memory
    environment:
      - PYTHONUNBUFFERED=1
      - WATCHFILES_FORCE_POLLING=true
      - CORE_HOST=${CORE_HOST:-localhost}
      - CORE_PORT=${CORE_PORT:-1865}
      - QDRANT_HOST=${QDRANT_HOST:-cheshire_cat_vector_memory}
      - QDRANT_PORT=${QDRANT_PORT:-6333}
      - CORE_USE_SECURE_PROTOCOLS=${CORE_USE_SECURE_PROTOCOLS:-}
      - API_KEY=${API_KEY:-}
      - LOG_LEVEL=${LOG_LEVEL:-WARNING}
      - DEBUG=${DEBUG:-true}
      - SAVE_MEMORY_SNAPSHOTS=${SAVE_MEMORY_SNAPSHOTS:-false}
    ports:
      - ${CORE_PORT:-1865}:80
    volumes:
      - ./core:/app
    command:
      - python
      - "-m"
      - "cat.main"
    restart: unless-stopped
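  # Usage sketch (an assumption, not part of this file: it presumes the file
  # sits in the project root next to ./core, with a populated .env):
  #   docker compose -f llamacpp-compose-gpu.yml up -d
  #   curl http://localhost:1865   # core API on CORE_PORT (default 1865)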
  cheshire-cat-vector-memory:
    image: qdrant/qdrant:v1.1.1
    container_name: cheshire_cat_vector_memory
    expose:
      - 6333
    volumes:
      - ./long_term_memory/vector:/qdrant/storage
    restart: unless-stopped
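  # Qdrant is only `expose`d to the compose network, not published on the
  # host; its collections persist on the host under ./long_term_memory/vector.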
  llama-cpp-server:
    image: ghcr.io/abetlen/llama-cpp-python:latest
    ports:
      # Host and container sides both follow LLAMA_PORT (default 8000) so the
      # mapping stays consistent with the PORT the server binds below.
      - ${LLAMA_PORT:-8000}:${LLAMA_PORT:-8000}
    environment:
      - HOST=${LLAMA_HOST:-0.0.0.0}
      - PORT=${LLAMA_PORT:-8000}
      - MODELS_FOLDER=${MODELS_FOLDER:-models/}
      - MODEL=/models/${MODEL}
      - N_CTX=${N_CTX:-2048}
      - N_GPU_LAYERS=${N_GPU_LAYERS:-0}
      - MAIN_GPU=${MAIN_GPU:-0}
      - N_THREADS=${N_THREADS:-4}
      - LAST_N_TOKENS_SIZE=${LAST_N_TOKENS_SIZE:-64}
      - CHAT_FORMAT=${CHAT_FORMAT:-llama-2}
      - VERBOSE=${VERBOSE:-True}
      - INTERRUPT_REQUESTS=${INTERRUPT_REQUESTS:-True}
    volumes:
      # Default added so the mount does not fail when MODELS_FOLDER is unset.
      - ${MODELS_FOLDER:-./models}:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
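  # GPU access assumes the NVIDIA Container Toolkit is installed on the host;
  # without it the `driver: nvidia` device reservation above fails at startup.
  #
  # Illustrative .env sketch (values are assumptions; MODEL must name a model
  # file present inside MODELS_FOLDER):
  #   MODELS_FOLDER=./models
  #   MODEL=llama-2-7b-chat.Q4_K_M.gguf
  #   N_GPU_LAYERS=35
  #   LLAMA_PORT=8000
  #
  # Smoke test once the server is up (llama-cpp-python exposes an
  # OpenAI-compatible API):
  #   curl http://localhost:8000/v1/models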