From d879da79b345063d29fb3b91427145b48be70ed7 Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Sun, 7 Jul 2024 21:35:03 +0200
Subject: [PATCH 1/4] make bot --model parameter function correctly

---
 bot.py           | 7 ++++---
 requirements.txt | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/bot.py b/bot.py
index 89f8f19..fd91825 100644
--- a/bot.py
+++ b/bot.py
@@ -118,7 +118,7 @@ def main(
 
     model = Prompt.ask(
         "\n1. What main model do you want to use?",
-        default="Qwen/Qwen2-72B-Instruct",
+        default=model,
     )
     console.print(f"Selected {model}.", style="yellow italic")
     temperature = float(
@@ -199,8 +199,9 @@ def main(
 
         for chunk in output:
             out = chunk.choices[0].delta.content
-            console.print(out, end="")
-            all_output += out
+            if out is not None:
+                console.print(out, end="")
+                all_output += out
         print()
 
         if DEBUG:
diff --git a/requirements.txt b/requirements.txt
index bf9f390..a09808a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ loguru
 datasets
 typer
 rich
+cffi
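The second hunk above guards against streaming chunks whose delta carries no text. A minimal sketch of the same pattern in isolation (not part of the patch; `stream` stands in for the iterator returned by the generate call in `bot.py`, and the chunk shape is assumed to follow the OpenAI streaming format used there):

```python
# Sketch of the guard introduced above: in OpenAI-style streaming responses,
# some chunks (for example the final chunk that only carries finish_reason)
# have delta.content set to None, so it must be skipped before concatenating.
def collect_stream(stream):
    all_output = ""
    for chunk in stream:
        out = chunk.choices[0].delta.content
        if out is None:  # role-only / finish_reason-only chunks carry no text
            continue
        print(out, end="", flush=True)
        all_output += out
    return all_output
```
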
From 3d74f259b21f445c7def92e01a38e0f09780817e Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Sun, 7 Jul 2024 23:00:15 +0200
Subject: [PATCH 2/4] First hacky OpenAI compatible api

---
 api.py           | 176 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 2 files changed, 177 insertions(+)
 create mode 100644 api.py

diff --git a/api.py b/api.py
new file mode 100644
index 0000000..ede7e77
--- /dev/null
+++ b/api.py
@@ -0,0 +1,176 @@
+import typer
+from flask import Flask, request, jsonify, Response, stream_with_context
+import json
+from functools import partial
+import datasets
+from utils import generate_together_stream, generate_with_references, DEBUG
+from loguru import logger
+from datasets.utils.logging import disable_progress_bar
+
+disable_progress_bar()
+
+app = Flask(__name__)
+
+default_reference_models = [
+    "Qwen/Qwen2-72B-Instruct",
+    "Qwen/Qwen1.5-72B-Chat",
+    "mistralai/Mixtral-8x22B-Instruct-v0.1",
+    "databricks/dbrx-instruct",
+]
+
+def process_fn(item, temperature=0.7, max_tokens=2048):
+    references = item.get("references", [])
+    model = item["model"]
+    messages = item["instruction"]
+
+    output = generate_with_references(
+        model=model,
+        messages=messages,
+        references=references,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    if DEBUG:
+        logger.info(
+            f"model: {model}, instruction: {item['instruction']}, output: {output[:20]}"
+        )
+
+    return {"output": output}
+
+@app.route('/v1/chat/completions', methods=['POST'])
+def chat_completions():
+    data = request.json
+    messages = data.get('messages', [])
+    temperature = data.get('temperature', 0.7)
+    max_tokens = data.get('max_tokens', 512)
+    stream = data.get('stream', False) # Check if the client requested streaming
+
+    # Prepare data for processing
+    data = {
+        "instruction": [messages] * len(default_reference_models),
+        "references": [""] * len(default_reference_models),
+        "model": [m for m in default_reference_models],
+    }
+
+    eval_set = datasets.Dataset.from_dict(data)
+
+    # Process with reference models
+    eval_set = eval_set.map(
+        partial(
+            process_fn,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        ),
+        batched=False,
+        num_proc=len(default_reference_models),
+    )
+    references = [item["output"] for item in eval_set]
+
+    # Generate final output
+    output = generate_with_references(
+        model=default_model,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        messages=messages,
+        references=references,
+        generate_fn=generate_together_stream,
+    )
+
+    # Collect output
+    all_output = ""
+    for chunk in output:
+        out = chunk.choices[0].delta.content
+        if out is not None:
+            # print(out)
+            all_output += out
+
+    # Prepare response
+    print (all_output)
+    response = {
+        "id": "chatcmpl-123", # TODO
+        "object": "chat.completion",
+        "created": 1720384636, # TODO
+        "model": default_model,
+        "usage": {
+            "prompt_tokens": 42, # TODO
+            "completion_tokens": len(all_output.split()), # Rough estimate
+            "total_tokens": 42 + len(all_output.split()), # Rough estimate
+        },
+        "choices": [
+            {
+                "message": {
+                    "role": "assistant",
+                    "content": all_output,
+                },
+                "finish_reason": "stop",
+                "index": 0,
+            }
+        ],
+    }
+
+    if DEBUG:
+        print(json.dumps(response, indent=2))
+
+    def generate():
+        if stream:
+            # Simulate streaming by yielding chunks
+            #chunks = [all_output[i:i+5] for i in range(0, len(all_output), 5)] # Split into 5-character chunks
+            chunks = [all_output]
+            for chunk in chunks:
+                chunk_response = {
+                    "id": "chatcmpl-123",
+                    "object": "chat.completion.chunk",
+                    "created": 1720384636,
+                    "model": default_model,
+                    "choices": [
+                        {
+                            "delta": {
+                                "content": chunk,
+                            },
+                            "index": 0,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk_response)}\n\n"
+
+            # Send the final chunk with finish_reason
+            final_chunk = {
+                "id": "chatcmpl-123",
+                "object": "chat.completion.chunk",
+                "created": 1720384636,
+                "model": default_model,
+                "choices": [
+                    {
+                        "delta": {},
+                        "index": 0,
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            yield f"data: {json.dumps(final_chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+        else:
+            # Non-streaming response
+            yield json.dumps(response)
+
+    if stream:
+        return Response(stream_with_context(generate()), content_type='text/event-stream')
+    else:
+        return jsonify(response)
+
+def main(
+    model: str = "Qwen/Qwen2-72B-Instruct",
+    reference_models: list[str] = default_reference_models,
+    temperature: float = 0.7,
+    max_tokens: int = 512,
+    rounds: int = 1,
+    port: int = 5000,
+):
+    global default_model, default_reference_models
+    default_model = model
+    default_reference_models = reference_models
+    app.run(port=port)
+
+if __name__ == "__main__":
+    typer.run(main)
diff --git a/requirements.txt b/requirements.txt
index a09808a..ee5ca63 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ datasets
 typer
 rich
 cffi
+flask
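For a quick sanity check of the endpoint added in this patch, a minimal client sketch (not part of the patch; it assumes the server is running locally on this patch's default port 5000 and that the `requests` package is installed):

```python
# Minimal sketch of a client for the /v1/chat/completions endpoint added above.
# Assumes the API server from this patch is running on localhost:5000 and that
# the `requests` package is available.
import requests

payload = {
    "messages": [{"role": "user", "content": "Explain mixture-of-agents in one paragraph."}],
    "temperature": 0.7,
    "max_tokens": 256,
    "stream": False,  # set True to receive server-sent-event chunks instead
}

resp = requests.post("http://localhost:5000/v1/chat/completions", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```
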
From 1592cf9f3adc65d68c993a545da9ac1e944c6a3c Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Wed, 10 Jul 2024 12:46:35 +0200
Subject: [PATCH 3/4] Pass max-token etc. correctly

---
 api.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/api.py b/api.py
index ede7e77..f6b6bff 100644
--- a/api.py
+++ b/api.py
@@ -11,12 +11,16 @@
 
 app = Flask(__name__)
 
+default_model = None
 default_reference_models = [
     "Qwen/Qwen2-72B-Instruct",
     "Qwen/Qwen1.5-72B-Chat",
     "mistralai/Mixtral-8x22B-Instruct-v0.1",
     "databricks/dbrx-instruct",
 ]
+_temperature = 0.7
+_max_tokens = 512
+_rounds = 1
 
 def process_fn(item, temperature=0.7, max_tokens=2048):
     references = item.get("references", [])
@@ -27,8 +31,8 @@ def process_fn(item, temperature=0.7, max_tokens=2048):
         model=model,
         messages=messages,
         references=references,
-        temperature=temperature,
-        max_tokens=max_tokens,
+        temperature=_temperature,
+        max_tokens=_max_tokens,
     )
     if DEBUG:
         logger.info(
@@ -41,9 +45,9 @@ def process_fn(item, temperature=0.7, max_tokens=2048):
 def chat_completions():
     data = request.json
     messages = data.get('messages', [])
-    temperature = data.get('temperature', 0.7)
-    max_tokens = data.get('max_tokens', 512)
     stream = data.get('stream', False) # Check if the client requested streaming
+    temperature = data.get('temperature', _temperature)
+    max_tokens = data.get('max_tokens', _max_tokens)
 
     # Prepare data for processing
     data = {
@@ -165,11 +169,14 @@ def main(
     temperature: float = 0.7,
     max_tokens: int = 512,
     rounds: int = 1,
-    port: int = 5000,
+    port: int = 5001,
 ):
-    global default_model, default_reference_models
+    global default_model, default_reference_models, _temperature, _max_tokens, _rounds
     default_model = model
     default_reference_models = reference_models
+    _temperature = temperature
+    _max_tokens = max_tokens
+    _rounds = rounds
     app.run(port=port)
 
 if __name__ == "__main__":
From 6ef8e8aeb729573968fd6274fa76512c1cbcfce9 Mon Sep 17 00:00:00 2001
From: Tijs Zwinkels
Date: Wed, 10 Jul 2024 16:30:21 +0200
Subject: [PATCH 4/4] Update Readme on usage of api.py

---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index 99b6f15..eb31c71 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,31 @@ You can configure the demo by specifying the following parameters:
 - `--num_proc`: Number of processes to run in parallel for faster execution.
 - `--multi_turn`: Boolean to toggle multi-turn interaction capability.
 
+## OpenAI Compatible API Endpoint
+
+We provide an OpenAI-compatible API endpoint that allows you to interact with the Mixture-of-Agents (MoA) system using a familiar API format. This makes it easy to integrate MoA into existing applications that use OpenAI-style APIs.
+
+To start the API server, use the `api.py` script. It supports the same parameters as `bot.py`, with an additional `--port` parameter to specify the port number for the server.
+
+### Usage
+
+To run the API server, use the following command:
+
+```bash
+python api.py --model <model-name> --reference-models <reference-model-1> --reference-models <reference-model-2> ... --temperature <temperature> --max-tokens <max-tokens> --rounds <rounds> --port <port>
+```
+
+For example:
+
+```bash
+python api.py --model "Qwen/Qwen2-72B-Instruct" --reference-models "Qwen/Qwen2-72B-Instruct" "Qwen/Qwen1.5-72B-Chat" "mistralai/Mixtral-8x22B-Instruct-v0.1" "databricks/dbrx-instruct" --temperature 0.7 --max-tokens 512 --rounds 1 --port 5001
+```
+
+This will start an OpenAI-compatible API server on `http://localhost:5001`. You can then use this endpoint in your applications, just as you would use the OpenAI API.
+
+Note: This is a work-in-progress. It does not include all features of the official OpenAI API, and it doesn't stream responses as they come in. Might be unstable.
+
+
 ## Evaluation
 
 We provide scripts to quickly reproduce some of the results presented in our paper
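To illustrate the "use it as you would use the OpenAI API" point from the README section above, a minimal client sketch (not part of the patch series; it assumes the `openai` Python package v1.x is installed, the server from the example is running on port 5001, and the API key is an arbitrary placeholder, since the server added in these patches does not validate it):

```python
# Minimal sketch: pointing the official OpenAI Python client (v1.x) at the
# local MoA API server described in the README section above. Assumptions:
# server running on localhost:5001, `openai` package installed, API key is
# a placeholder that the local server never checks.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5001/v1", api_key="not-needed")

completion = client.chat.completions.create(
    # The server in this patch series ignores `model` and uses its own
    # --model setting; the client still requires a value here.
    model="Qwen/Qwen2-72B-Instruct",
    messages=[{"role": "user", "content": "Summarize the mixture-of-agents idea."}],
    max_tokens=256,
)
print(completion.choices[0].message.content)
```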