# llamacpp-compose-gpu.yml
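# Docker Compose stack wiring the Cheshire Cat core to a Qdrant vector store
# and a GPU-enabled llama-cpp-python server.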
version: '3.7'
services:
  cheshire-cat-core:
    build:
      context: ./core
    container_name: cheshire_cat_core
    depends_on:
      - cheshire-cat-vector-memory
    environment:
      - PYTHONUNBUFFERED=1
      - WATCHFILES_FORCE_POLLING=true
      - CORE_HOST=${CORE_HOST:-localhost}
      - CORE_PORT=${CORE_PORT:-1865}
      - QDRANT_HOST=${QDRANT_HOST:-cheshire_cat_vector_memory}
      - QDRANT_PORT=${QDRANT_PORT:-6333}
      - CORE_USE_SECURE_PROTOCOLS=${CORE_USE_SECURE_PROTOCOLS:-}
      - API_KEY=${API_KEY:-}
      - LOG_LEVEL=${LOG_LEVEL:-WARNING}
      - DEBUG=${DEBUG:-true}
      - SAVE_MEMORY_SNAPSHOTS=${SAVE_MEMORY_SNAPSHOTS:-false}
    ports:
      - ${CORE_PORT:-1865}:80
    volumes:
      - ./core:/app
    command:
      - python
      - "-m"
      - "cat.main"
    restart: unless-stopped
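  # Usage sketch (an assumption, not part of this file: it presumes the file
  # sits in the project root next to ./core, with a populated .env):
  #   docker compose -f llamacpp-compose-gpu.yml up -d
  #   curl http://localhost:1865   # core API on CORE_PORT (default 1865)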
  cheshire-cat-vector-memory:
    image: qdrant/qdrant:v1.1.1
    container_name: cheshire_cat_vector_memory
    expose:
      - 6333
    volumes:
      - ./long_term_memory/vector:/qdrant/storage
    restart: unless-stopped
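  # Qdrant is only `expose`d to the compose network, not published on the
  # host; its collections persist on the host under ./long_term_memory/vector.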
  llama-cpp-server:
    image: ghcr.io/abetlen/llama-cpp-python:latest
    ports:
      # Host and container sides both follow LLAMA_PORT (default 8000) so the
      # mapping stays consistent with the PORT the server binds below.
      - ${LLAMA_PORT:-8000}:${LLAMA_PORT:-8000}
    environment:
      - HOST=${LLAMA_HOST:-0.0.0.0}
      - PORT=${LLAMA_PORT:-8000}
      - MODELS_FOLDER=${MODELS_FOLDER:-models/}
      - MODEL=/models/${MODEL}
      - N_CTX=${N_CTX:-2048}
      - N_GPU_LAYERS=${N_GPU_LAYERS:-0}
      - MAIN_GPU=${MAIN_GPU:-0}
      - N_THREADS=${N_THREADS:-4}
      - LAST_N_TOKENS_SIZE=${LAST_N_TOKENS_SIZE:-64}
      - CHAT_FORMAT=${CHAT_FORMAT:-llama-2}
      - VERBOSE=${VERBOSE:-True}
      - INTERRUPT_REQUESTS=${INTERRUPT_REQUESTS:-True}
    volumes:
      # Default added so the mount does not fail when MODELS_FOLDER is unset.
      - ${MODELS_FOLDER:-./models}:/models
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
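  # GPU access assumes the NVIDIA Container Toolkit is installed on the host;
  # without it the `driver: nvidia` device reservation above fails at startup.
  #
  # Illustrative .env sketch (values are assumptions; MODEL must name a model
  # file present inside MODELS_FOLDER):
  #   MODELS_FOLDER=./models
  #   MODEL=llama-2-7b-chat.Q4_K_M.gguf
  #   N_GPU_LAYERS=35
  #   LLAMA_PORT=8000
  #
  # Smoke test once the server is up (llama-cpp-python exposes an
  # OpenAI-compatible API):
  #   curl http://localhost:8000/v1/models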