From d879da79b345063d29fb3b91427145b48be70ed7 Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Sun, 7 Jul 2024 21:35:03 +0200
Subject: [PATCH 1/4] make bot --model parameter function correctly

---
 bot.py           | 7 ++++---
 requirements.txt | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/bot.py b/bot.py
index 89f8f19..fd91825 100644
--- a/bot.py
+++ b/bot.py
@@ -118,7 +118,7 @@ def main(
 
     model = Prompt.ask(
         "\n1. What main model do you want to use?",
-        default="Qwen/Qwen2-72B-Instruct",
+        default=model,
     )
     console.print(f"Selected {model}.", style="yellow italic")
     temperature = float(
@@ -199,8 +199,9 @@ def main(
 
         for chunk in output:
             out = chunk.choices[0].delta.content
-            console.print(out, end="")
-            all_output += out
+            if out is not None:
+                console.print(out, end="")
+                all_output += out
         print()
 
         if DEBUG:
diff --git a/requirements.txt b/requirements.txt
index bf9f390..a09808a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ loguru
 datasets
 typer
 rich
+cffi
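The second hunk above guards against streaming chunks whose delta carries no text. A minimal sketch of the same pattern in isolation (not part of the patch; `stream` stands in for the iterator returned by the generate call in `bot.py`, and the chunk shape is assumed to follow the OpenAI streaming format used there):

```python
# Sketch of the guard introduced above: in OpenAI-style streaming responses,
# some chunks (for example the final chunk that only carries finish_reason)
# have delta.content set to None, so it must be skipped before concatenating.
def collect_stream(stream):
    all_output = ""
    for chunk in stream:
        out = chunk.choices[0].delta.content
        if out is None:  # role-only / finish_reason-only chunks carry no text
            continue
        print(out, end="", flush=True)
        all_output += out
    return all_output
```
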
From 3d74f259b21f445c7def92e01a38e0f09780817e Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Sun, 7 Jul 2024 23:00:15 +0200
Subject: [PATCH 2/4] First hacky OpenAI compatible api

---
 api.py           | 176 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 2 files changed, 177 insertions(+)
 create mode 100644 api.py

diff --git a/api.py b/api.py
new file mode 100644
index 0000000..ede7e77
--- /dev/null
+++ b/api.py
@@ -0,0 +1,176 @@
+import typer
+from flask import Flask, request, jsonify, Response, stream_with_context
+import json
+from functools import partial
+import datasets
+from utils import generate_together_stream, generate_with_references, DEBUG
+from loguru import logger
+from datasets.utils.logging import disable_progress_bar
+
+disable_progress_bar()
+
+app = Flask(__name__)
+
+default_reference_models = [
+    "Qwen/Qwen2-72B-Instruct",
+    "Qwen/Qwen1.5-72B-Chat",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "databricks/dbrx-instruct",
+]
+
+def process_fn(item, temperature=0.7, max_tokens=2048):
+    references = item.get("references", [])
+    model = item["model"]
+    messages = item["instruction"]
+
+    output = generate_with_references(
+        model=model,
+        messages=messages,
+        references=references,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    if DEBUG:
+        logger.info(
+            f"model: {model}, instruction: {item['instruction']}, output: {output[:20]}"
+        )
+
+    return {"output": output}
+
+@app.route('/v1/chat/completions', methods=['POST'])
+def chat_completions():
+    data = request.json
+    messages = data.get('messages', [])
+    temperature = data.get('temperature', 0.7)
+    max_tokens = data.get('max_tokens', 512)
+    stream = data.get('stream', False) # Check if the client requested streaming
+
+    # Prepare data for processing
+    data = {
+        "instruction": [messages] * len(default_reference_models),
+        "references": [""] * len(default_reference_models),
+        "model": [m for m in default_reference_models],
+    }
+
+    eval_set = datasets.Dataset.from_dict(data)
+
+    # Process with reference models
+    eval_set = eval_set.map(
+        partial(
+            process_fn,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        ),
+        batched=False,
+        num_proc=len(default_reference_models),
+    )
+    references = [item["output"] for item in eval_set]
+
+    # Generate final output
+    output = generate_with_references(
+        model=default_model,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        messages=messages,
+        references=references,
+        generate_fn=generate_together_stream,
+    )
+
+    # Collect output
+    all_output = ""
+    for chunk in output:
+        out = chunk.choices[0].delta.content
+        if out is not None:
+            # print(out)
+            all_output += out
+
+    # Prepare response
+    print (all_output)
+    response = {
+        "id": "chatcmpl-123", # TODO
+        "object": "chat.completion",
+        "created": 1720384636, # TODO
+        "model": default_model,
+        "usage": {
+            "prompt_tokens": 42, # TODO
+            "completion_tokens": len(all_output.split()), # Rough estimate
+            "total_tokens": 42 + len(all_output.split()), # Rough estimate
+        },
+        "choices": [
+            {
+                "message": {
+                    "role": "assistant",
+                    "content": all_output,
+                },
+                "finish_reason": "stop",
+                "index": 0,
+            }
+        ],
+    }
+
+    if DEBUG:
+        print(json.dumps(response, indent=2))
+
+    def generate():
+        if stream:
+            # Simulate streaming by yielding chunks
+            #chunks = [all_output[i:i+5] for i in range(0, len(all_output), 5)] # Split into 5-character chunks
+            chunks = [all_output]
+            for chunk in chunks:
+                chunk_response = {
+                    "id": "chatcmpl-123",
+                    "object": "chat.completion.chunk",
+                    "created": 1720384636,
+                    "model": default_model,
+                    "choices": [
+                        {
+                            "delta": {
+                                "content": chunk,
+                            },
+                            "index": 0,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk_response)}\n\n"
+
+            # Send the final chunk with finish_reason
+            final_chunk = {
+                "id": "chatcmpl-123",
+                "object": "chat.completion.chunk",
+                "created": 1720384636,
+                "model": default_model,
+                "choices": [
+                    {
+                        "delta": {},
+                        "index": 0,
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            yield f"data: {json.dumps(final_chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+        else:
+            # Non-streaming response
+            yield json.dumps(response)
+
+    if stream:
+        return Response(stream_with_context(generate()), content_type='text/event-stream')
+    else:
+        return jsonify(response)
+
+def main(
+    model: str = "Qwen/Qwen2-72B-Instruct",
+    reference_models: list[str] = default_reference_models,
+    temperature: float = 0.7,
+    max_tokens: int = 512,
+    rounds: int = 1,
+    port: int = 5000,
+):
+    global default_model, default_reference_models
+    default_model = model
+    default_reference_models = reference_models
+    app.run(port=port)
+
+if __name__ == "__main__":
+    typer.run(main)
diff --git a/requirements.txt b/requirements.txt
index a09808a..ee5ca63 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ datasets
 typer
 rich
 cffi
+flask
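For a quick sanity check of the endpoint added in this patch, a minimal client sketch (not part of the patch; it assumes the server is running locally on this patch's default port 5000 and that the `requests` package is installed):

```python
# Minimal sketch of a client for the /v1/chat/completions endpoint added above.
# Assumes the API server from this patch is running on localhost:5000 and that
# the `requests` package is available.
import requests

payload = {
    "messages": [{"role": "user", "content": "Explain mixture-of-agents in one paragraph."}],
    "temperature": 0.7,
    "max_tokens": 256,
    "stream": False,  # set True to receive server-sent-event chunks instead
}

resp = requests.post("http://localhost:5000/v1/chat/completions", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```
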
From 1592cf9f3adc65d68c993a545da9ac1e944c6a3c Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Wed, 10 Jul 2024 12:46:35 +0200
Subject: [PATCH 3/4] Pass max-token etc. correctly

---
 api.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/api.py b/api.py
index ede7e77..f6b6bff 100644
--- a/api.py
+++ b/api.py
@@ -11,12 +11,16 @@
 
 app = Flask(__name__)
 
+default_model = None
 default_reference_models = [
     "Qwen/Qwen2-72B-Instruct",
     "Qwen/Qwen1.5-72B-Chat",
     "mistralai/Mixtral-8x22B-Instruct-v0.1",
     "databricks/dbrx-instruct",
 ]
+_temperature = 0.7
+_max_tokens = 512
+_rounds = 1
 
 def process_fn(item, temperature=0.7, max_tokens=2048):
     references = item.get("references", [])
@@ -27,8 +31,8 @@ def process_fn(item, temperature=0.7, max_tokens=2048):
         model=model,
         messages=messages,
         references=references,
-        temperature=temperature,
-        max_tokens=max_tokens,
+        temperature=_temperature,
+        max_tokens=_max_tokens,
     )
     if DEBUG:
         logger.info(
@@ -41,9 +45,9 @@ def process_fn(item, temperature=0.7, max_tokens=2048):
 def chat_completions():
     data = request.json
     messages = data.get('messages', [])
-    temperature = data.get('temperature', 0.7)
-    max_tokens = data.get('max_tokens', 512)
     stream = data.get('stream', False) # Check if the client requested streaming
+    temperature = data.get('temperature', _temperature)
+    max_tokens = data.get('max_tokens', _max_tokens)
 
     # Prepare data for processing
     data = {
@@ -165,11 +169,14 @@ def main(
     temperature: float = 0.7,
     max_tokens: int = 512,
     rounds: int = 1,
-    port: int = 5000,
+    port: int = 5001,
 ):
-    global default_model, default_reference_models
+    global default_model, default_reference_models, _temperature, _max_tokens, _rounds
     default_model = model
     default_reference_models = reference_models
+    _temperature = temperature
+    _max_tokens = max_tokens
+    _rounds = rounds
     app.run(port=port)
 
 if __name__ == "__main__":
From 6ef8e8aeb729573968fd6274fa76512c1cbcfce9 Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Wed, 10 Jul 2024 16:30:21 +0200
Subject: [PATCH 4/4] Update Readme on usage of api.py

---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index 99b6f15..eb31c71 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,31 @@ You can configure the demo by specifying the following parameters:
 - `--num_proc`: Number of processes to run in parallel for faster execution.
 - `--multi_turn`: Boolean to toggle multi-turn interaction capability.
 
+## OpenAI Compatible API Endpoint
+
+We provide an OpenAI-compatible API endpoint that allows you to interact with the Mixture-of-Agents (MoA) system using a familiar API format. This makes it easy to integrate MoA into existing applications that use OpenAI-style APIs.
+
+To start the API server, use the `api.py` script. It supports the same parameters as `bot.py`, with an additional `--port` parameter to specify the port number for the server.
+
+### Usage
+
+To run the API server, use the following command:
+
+```bash
+python api.py --model <model-name> --reference-models <reference-model-1> --reference-models <reference-model-2> ... --temperature <temperature> --max-tokens <max-tokens> --rounds <rounds> --port <port>
+```
+
+For example:
+
+```bash
+python api.py --model "Qwen/Qwen2-72B-Instruct" --reference-models "Qwen/Qwen2-72B-Instruct" "Qwen/Qwen1.5-72B-Chat" "mistralai/Mixtral-8x22B-Instruct-v0.1" "databricks/dbrx-instruct" --temperature 0.7 --max-tokens 512 --rounds 1 --port 5001
+```
+
+This will start an OpenAI-compatible API server on `http://localhost:5001`. You can then use this endpoint in your applications, just as you would use the OpenAI API.
+
+Note: This is a work-in-progress. It does not include all features of the official OpenAI API, and it doesn't stream responses as they come in. Might be unstable.
+
+
 ## Evaluation
 
 We provide scripts to quickly reproduce some of the results presented in our paper
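To illustrate the "use it as you would use the OpenAI API" point from the README section above, a minimal client sketch (not part of the patch series; it assumes the `openai` Python package v1.x is installed, the server from the example is running on port 5001, and the API key is an arbitrary placeholder, since the server added in these patches does not validate it):

```python
# Minimal sketch: pointing the official OpenAI Python client (v1.x) at the
# local MoA API server described in the README section above. Assumptions:
# server running on localhost:5001, `openai` package installed, API key is
# a placeholder that the local server never checks.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5001/v1", api_key="not-needed")

completion = client.chat.completions.create(
    # The server in this patch series ignores `model` and uses its own
    # --model setting; the client still requires a value here.
    model="Qwen/Qwen2-72B-Instruct",
    messages=[{"role": "user", "content": "Summarize the mixture-of-agents idea."}],
    max_tokens=256,
)
print(completion.choices[0].message.content)
```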