Add Semantic Caching

svilupp · Jul 10, 2024 · 28a5de8 · 28a5de8 · svilupp · Jul 10, 2024
2 parents 30605f7 + 898daf6
commit 28a5de8
Show file tree

Hide file tree

Showing 11 changed files with 155 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+## [0.4.0]
+
+### Added
+- Added a launcher function `launch` to make it easier to launch the app.
+- Semantic caching is now enabled by default (powered by SemanticCaches.jl). You can disable it by setting `cached=false` in the `launch()` function.
+
 ## [0.3.0]
 
 ### Added

diff --git a/Project.toml b/Project.toml
@@ -1,21 +1,25 @@
 name = "ProToPortal"
 uuid = "f9496bd6-a3bb-4afc-927d-7268532ebfa9"
 authors = ["J S <[email protected]> and contributors"]
-version = "0.3.0"
+version = "0.4.0"
 
 [deps]
 Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 GenieFramework = "a59fdf5c-6bf0-4f5d-949c-a137c9e2f353"
 GenieSession = "03cc5b98-4f21-4eb6-99f2-22eced81f962"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
+SemanticCaches = "03ba8f0e-aaaa-4626-a19b-56297996781b"
 
 [compat]
 Aqua = "0.7"
 Dates = "<0.0.1, 1"
 GenieFramework = "2.1"
 GenieSession = "1"
-PromptingTools = "0.33"
+HTTP = "1"
+PromptingTools = "0.37.1"
+SemanticCaches = "0.2"
 Test = "<0.0.1, 1"
 julia = "1.10"
 

diff --git a/README.md b/README.md
@@ -32,11 +32,15 @@ using Pkg; Pkg.activate("."); Pkg.instantiate(".")
 ```julia
 # as a quick hack if you don't have your environment variables set up, run the below line with your OpenAI key
 # ENV["OPENAI_API_KEY"] = "<your_openai_api_key>"
-include("main.jl")
+ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"  # required for caching
+using ProToPortal
+launch(; cached = true)
 ```
 
 Then head to your browser and go to [http://127.0.0.1:8000](http://127.0.0.1:8000) to see the app.
 
+It now caches similar LLM requests by default (disable with `cached=false` in the `launch()` function).
+
 For the purists: simply run `julia --project -t auto main.jl` in your terminal (once installed)!
 
 How to start? Type `Say hi!` in the question box on the Chat tab and click Submit (or press CTRL+ENTER).

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -54,8 +54,11 @@ It's the first Julia-focused GUI (evaluate Julia code, fix it, critique it - or
 Clone ProToPortal, instantiate it, enable your desired settings, and streamline your LLM interactions right away:
 
 ```julia
-using Pkg; Pkg.activate("."); Pkg.instantiate(".")
-include("main.jl")
+# as a quick hack if you don't have your environment variables set up, run the below line with your OpenAI key
+# ENV["OPENAI_API_KEY"] = "<your_openai_api_key>"
+ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"  # required for caching
+using ProToPortal
+launch(; cached = true)
 ```
 
 Then head to your browser and go to [http://127.0.0.1:8000](http://127.0.0.1:8000) to see the app.

diff --git a/docs/src/introduction.md b/docs/src/introduction.md
@@ -27,11 +27,15 @@ using Pkg; Pkg.activate("."); Pkg.instantiate(".")
 ```julia
 # as a quick hack if you don't have your environment variables set up, run the below line with your OpenAI key
 # ENV["OPENAI_API_KEY"] = "<your_openai_api_key>"
-include("main.jl")
+ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"  # required for caching
+using ProToPortal
+launch(; cached = true)
 ```
 
 Then head to your browser and go to [http://127.0.0.1:8000](http://127.0.0.1:8000) to see the app.
 
+It now caches similar LLM requests by default (disable with `cached=false` in the `launch()` function).
+
 For the purists: simply run `julia --project -t auto main.jl` in your terminal (once installed)!
 
 How to start? Type `Say hi!` in the question box on the Chat tab and click Submit (or press CTRL+ENTER).

diff --git a/docs/src/videos/screen-capture-code-fixing.gif b/docs/src/videos/screen-capture-code-fixing.gif
diff --git a/docs/src/videos/screen-capture-plain.webm b/docs/src/videos/screen-capture-plain.webm
diff --git a/main.jl b/main.jl
@@ -1,9 +1,6 @@
 using Pkg
 Pkg.activate(".")
-using GenieFramework
-ENV["GENIE_HOST"] = "127.0.0.1"
-ENV["PORT"] = "8000"
-## ENV["GENIE_ENV"] = "prod"
-include("app.jl") # hack for hot-reloading when fixing things
-Genie.loadapp();
-up(async = true);
+## Required to support semantic caching; set before `using ProToPortal` (presumably so data downloads are accepted without an interactive prompt -- TODO confirm against DataDeps.jl)
+ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
+using ProToPortal
+ProToPortal.launch(8000, "0.0.0.0"; async = false, cached = true, cache_verbose = true) # blocking launch (async = false) on port 8000 with verbose cache logs; NOTE(review): host "0.0.0.0" differs from the "127.0.0.1" default of `launch` -- confirm this is intended
diff --git a/src/CacheLayer.jl b/src/CacheLayer.jl
@@ -0,0 +1,81 @@
+## Define the new caching mechanism as a layer for HTTP
+## See documentation [here](https://juliaweb.github.io/HTTP.jl/stable/client/#Quick-Examples)
+"""
+    CacheLayer
+
+A module providing caching of LLM requests for ProToPortal.
+
+It caches `POST` requests whose target contains one of these URL paths:
+- `v1/chat/completions` (OpenAI chat API)
+- `v1/messages` (Anthropic chat API)
+- `v1/embeddings` (OpenAI API)
+- `v1/rerank` (Cohere API)
+
+Every other request is passed to the next layer untouched (its body is never parsed).
+
+Inputs longer than 5000 characters are looked up in `HASH_CACHE` (a `HashCache`),
+shorter ones in `SEM_CACHE` (a `SemanticCache`).
+
+# How to use
+You can use the layer directly
+`CacheLayer.get(req)`
+
+You can push the layer globally in all HTTP.jl requests
+`HTTP.pushlayer!(CacheLayer.cache_layer)`
+
+You can remove the layer later
+`HTTP.poplayer!()`
+
+"""
+module CacheLayer
+
+using SemanticCaches, HTTP
+using PromptingTools: JSON3
+
+const SEM_CACHE = SemanticCache()
+const HASH_CACHE = HashCache()
+
+"""
+    endpoint_kind(target)
+
+Classify the request `target` as `:chat`, `:embeddings` or `:rerank`.
+Return `nothing` when the target matches none of the cached URL paths.
+"""
+function endpoint_kind(target)
+    ## chat/completions is for OpenAI, v1/messages is for Anthropic
+    if occursin("v1/chat/completions", target) || occursin("v1/messages", target)
+        return :chat
+    elseif occursin("v1/embeddings", target)
+        return :embeddings
+    elseif occursin("v1/rerank", target)
+        return :rerank
+    end
+    return nothing
+end
+
+"""
+    cache_key_and_input(kind::Symbol, body)
+
+Build the `(cache_key, input)` pair used for the cache lookup from the parsed JSON `body`
+of a request to an endpoint of the given `kind` (as returned by `endpoint_kind`).
+
+The `cache_key` partitions the cache (by model and, where relevant, temperature / `top_n`),
+while `input` is the text that is compared against previously cached requests.
+"""
+function cache_key_and_input(kind::Symbol, body)
+    if kind === :chat
+        temperature_str = haskey(body, :temperature) ? body[:temperature] : "-"
+        cache_key = string("chat-", body[:model], "-", temperature_str)
+        ## NOTE(review): only `messages` contribute to `input`; a system prompt sent
+        ## outside of `messages` would be ignored here -- confirm whether that matters
+        input = join([m["content"] for m in body[:messages]], " ")
+    elseif kind === :embeddings
+        cache_key = string("emb-", body[:model])
+        raw_input = body[:input]
+        ## A single string must be used as-is; `join` over a string would iterate
+        ## its characters and insert a space between each of them
+        input = raw_input isa AbstractString ? String(raw_input) : join(raw_input, " ")
+    else
+        ## Rerank endpoint; `top_n` may be absent, so do not index it blindly
+        top_n_str = haskey(body, :top_n) ? body[:top_n] : "-"
+        cache_key = string("rerank-", body[:model], "-", top_n_str)
+        input = join([body[:query], body[:documents]...], " ")
+    end
+    return cache_key, input
+end
+
+"""
+    cache_layer(handler)
+
+HTTP.jl client layer that answers supported LLM requests from a cache when possible.
+
+Returns a function `(req; kw...)` that:
+- forwards non-`POST` requests, requests to unknown targets and requests with an empty
+  body straight to `handler`,
+- otherwise looks the request up in the cache and returns the cached output, or calls
+  `handler`, stores its response in the cache and returns it.
+
+Verbose logging is controlled by `ENV["CACHES_VERBOSE"]` (treated as `"true"` when unset).
+"""
+function cache_layer(handler)
+    return function (req; kw...)
+        ## `Base.get` is spelled out because `HTTP.@client` below defines `get` in this module
+        VERBOSE = Base.get(ENV, "CACHES_VERBOSE", "true") == "true"
+        ## Only POST requests are candidates for caching
+        req.method == "POST" || return handler(req; kw...)
+        ## Check the target BEFORE touching the body, so bodies of unrelated requests
+        ## (which need not be JSON) are never parsed
+        kind = endpoint_kind(req.target)
+        if isnothing(kind)
+            ## Skip, unknown API
+            VERBOSE && @info "Skipping cache for $(req.method) $(req.target)"
+            return handler(req; kw...)
+        end
+        isempty(req.body) && return handler(req; kw...)
+        ## `copy` so that parsing cannot affect the body that is sent downstream
+        body = JSON3.read(copy(req.body))
+        cache_key, input = cache_key_and_input(kind, body)
+
+        ## Check the cache
+        VERBOSE && @info "Check if we can cache this request ($(length(input)) chars)"
+        active_cache = length(input) > 5000 ? HASH_CACHE : SEM_CACHE
+        item = active_cache(cache_key, input; verbose = 2 * VERBOSE) # change verbosity to 0 to disable detailed logs
+        if !isvalid(item)
+            VERBOSE && @info "Cache miss! Pinging the API"
+            # pass the request along to the next layer by calling `cache_layer` arg `handler`
+            # also pass along the trailing keyword args `kw...`
+            resp = handler(req; kw...)
+            item.output = resp
+            # Let's remember it for the next time
+            push!(active_cache, item)
+        end
+        ## Return the calculated or cached result
+        return item.output
+    end
+end
+
+# Create a new client with the cache layer added
+HTTP.@client [cache_layer]
+
+end # module
diff --git a/src/ProToPortal.jl b/src/ProToPortal.jl
@@ -53,6 +53,11 @@ include("llm.jl")
 export meta_prompt_step!
 include("meta_prompting.jl")
 
+include("CacheLayer.jl")
+
+export launch
+include("server.jl")
+
 function __init__()
     ## Load extra templates
     PT.load_templates!(joinpath(@__DIR__, "..", "templates"); remember_path = true) # add our custom ones

diff --git a/src/server.jl b/src/server.jl
@@ -0,0 +1,38 @@
+"""
+    launch(
+        port::Union{Int, String} = get(ENV, "PORT", "8000"),
+        host::String = get(ENV, "GENIE_HOST", "127.0.0.1");
+        async::Bool = true, cached::Bool = true, cache_verbose::Bool = true)
+
+Launch the ProToPortal server; open the address in your browser to use the app.
+
+Defaults to: `http://127.0.0.1:8000`. 
+This is a convenience wrapper around `Genie.up`, to customize the server configuration use `Genie.up()` and `Genie.config`.
+
+# Arguments
+- `port::Union{Int, String} = get(ENV, "PORT", "8000")`: The port to launch the server on. A string is parsed as an integer.
+- `host::String = get(ENV, "GENIE_HOST", "127.0.0.1")`: The host to launch the server on.
+- `async::Bool = true`: Whether to launch the server asynchronously, ie, in the background.
+- `cached::Bool = true`: Whether to use semantic caching of the requests.
+- `cache_verbose::Bool = true`: Whether to print verbose information about the caching process.
+
+# Returns
+Whatever `up(port, host; async)` returns.
+
+# Throws
+- `ArgumentError`: if `port` is a string that cannot be parsed as an integer.
+
+# Notes
+- `ENV["CACHES_VERBOSE"]` is overwritten on every call, even when `cached = false`.
+- With `cached = true` the cache layer is pushed on every call, so launching repeatedly pushes it repeatedly.
+
+If you want to remove the cache layer later, you can use `import HTTP; HTTP.poplayer!()`.
+"""
+function launch(
+        port::Union{Int, String} = get(ENV, "PORT", "8000"),
+        host::String = get(ENV, "GENIE_HOST", "127.0.0.1");
+        async::Bool = true, cached::Bool = true, cache_verbose::Bool = true)
+    ## Loads app.jl in the root directory
+    Genie.loadapp(pkgdir(ProToPortal))
+
+    ## Enables caching
+    ## The cache layer reads this variable on every request to decide how much to log
+    ENV["CACHES_VERBOSE"] = cache_verbose ? "true" : "false"
+    if cached
+        @info "Caching enabled globally (for all requests, see `CacheLayer` module for details). Remove with `HTTP.poplayer!()`"
+        HTTP.pushlayer!(CacheLayer.cache_layer)
+    end
+    ## Convert to INT
+    port_ = port isa Integer ? port : tryparse(Int, port)
+    ## Validate explicitly: `@assert` is meant for internal invariants and may be disabled
+    isnothing(port_) &&
+        throw(ArgumentError("Port must be an integer. Provided: $port"))
+    return up(port_, host; async)
+end