inference_server.py

import time
import numpy as np
from fastapi import FastAPI, WebSocket
from fastapi.middleware.cors import CORSMiddleware
import base64
import uvicorn
import traceback
import numpy as np
import argparse

import torch as T
import torch.nn.functional as F
import torchaudio

import os
from typing import Optional

from utils import print_colored
from model import get_hertz_dev_config


argparse = argparse.ArgumentParser()

argparse.add_argument('--prompt_path', type=str, default='./prompts/bob_mono.wav', help="""
                      We highly recommend making your own prompt based on a conversation between you and another person.
                      bob_mono.wav seems to work better for two-channel than bob_stereo.wav.
                      """)
args = argparse.parse_args()


device = 'cuda' if T.cuda.is_available() else T.device('cpu')
print_colored(f"Using device: {device}", "grey")

model_config = get_hertz_dev_config(is_split=True)

model = model_config()
model = model.eval().bfloat16().to(device)

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Hyperparams or something.
SAMPLE_RATE = 16000 # Don't change this
TEMPS = (0.8, (0.4, 0.1)) # You can change this, but there's also an endpoint for it.

class AudioProcessor:
    def __init__(self, model, prompt_path):
        self.model = model
        self.prompt_path = prompt_path
        self.initialize_state(prompt_path)

    def initialize_state(self, prompt_path):
        loaded_audio, sr = torchaudio.load(prompt_path)
        
        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
            loaded_audio = resampler(loaded_audio)
            
        if loaded_audio.shape[0] == 1:
            loaded_audio = loaded_audio.repeat(2, 1)
            
        audio_length = loaded_audio.shape[-1] 
        num_chunks = audio_length // 2000
        loaded_audio = loaded_audio[..., :num_chunks * 2000]
            
        self.loaded_audio = loaded_audio.to(device)
            
        with T.autocast(device_type=device, dtype=T.bfloat16), T.inference_mode():
                self.model.init_cache(bsize=1, device=device, dtype=T.bfloat16, length=1024)
                self.next_model_audio = self.model.next_audio_from_audio(self.loaded_audio.unsqueeze(0), temps=TEMPS)
        self.prompt_buffer = None
        self.prompt_position = 0
        self.chunks_until_live = 24
        self.initialize_prompt_buffer()
        print_colored("AudioProcessor state initialized", "green")

    def initialize_prompt_buffer(self):
        self.recorded_audio = self.loaded_audio
        prompt_audio = self.loaded_audio.reshape(1, 2, -1)
        prompt_audio = prompt_audio[:, :, -48000:].cpu().numpy()
        prompt_audio_mono = prompt_audio.mean(axis=1)
        self.prompt_buffer = np.array_split(prompt_audio_mono[0], 24)
        print_colored(f"Initialized prompt buffer with {len(self.prompt_buffer)} chunks", "grey")
    
    async def process_audio(self, audio_data):
        if self.chunks_until_live > 0:
            print_colored(f"Serving from prompt buffer, {self.chunks_until_live} chunks left", "grey")
            chunk = self.prompt_buffer[24 - self.chunks_until_live]
            self.chunks_until_live -= 1
            
            if self.chunks_until_live == 0:
                print_colored("Switching to live processing mode", "green")

            time.sleep(0.05)
            return chunk
        
        audio_tensor = T.from_numpy(audio_data).to(device)
        audio_tensor = audio_tensor.reshape(1, 1, -1)
        audio_tensor = T.cat([audio_tensor, self.next_model_audio], dim=1)
        
        with T.autocast(device_type=device, dtype=T.bfloat16), T.inference_mode():
            curr_model_audio = self.model.next_audio_from_audio(
                audio_tensor, 
                temps=TEMPS
            )
        print(f"Recorded audio shape {self.recorded_audio.shape}, audio tensor shape {audio_tensor.shape}")
        self.recorded_audio = T.cat([self.recorded_audio.cpu(), audio_tensor.squeeze(0).cpu()], dim=-1)

        self.next_model_audio = curr_model_audio

        return curr_model_audio.float().cpu().numpy()

    def cleanup(self):
        print_colored("Cleaning up audio processor...", "blue")
        os.makedirs('audio_recordings', exist_ok=True)
        torchaudio.save(f'audio_recordings/{time.strftime("%d-%H-%M")}.wav', self.recorded_audio.cpu(), SAMPLE_RATE)
        self.model.deinit_cache()
        self.initialize_state(self.prompt_path)
        print_colored("Audio processor cleanup complete", "green")

@app.post("/set_temperature")
async def set_temperature(token_temp: Optional[float] = None, categorical_temp: Optional[float] = None, gaussian_temp: Optional[float] = None):
    try:        
        global TEMPS
        TEMPS = (token_temp, (categorical_temp, gaussian_temp))
        
        print_colored(f"Temperature updated to: {TEMPS}", "green")
        return {"message": f"Temperature updated to: {TEMPS}", "status": "success"}
    except Exception as e:
        print_colored(f"Error setting temperature: {str(e)}", "red")
        return {"message": f"Error setting temperature: {str(e)}", "status": "error"}

@app.websocket("/audio")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    try:
        while True:
            data = await websocket.receive_text()
            audio_data = np.frombuffer(
                base64.b64decode(data.split(",")[1]),
                dtype=np.int16
            )
            audio_data = audio_data.astype(np.float32) / 32767.0
            processed_audio = await audio_processor.process_audio(audio_data)
            processed_audio = (processed_audio * 32767).astype(np.int16)
            
            processed_data = base64.b64encode(processed_audio.tobytes()).decode('utf-8')
            await websocket.send_text(f"data:audio/raw;base64,{processed_data}")
            
    except Exception as e:
            print_colored(f"WebSocket error: {e}", "red")
            print_colored(f"Full traceback:\n{traceback.format_exc()}", "red")
    finally:
        audio_processor.cleanup()
        await websocket.close()


audio_processor = AudioProcessor(model=model, prompt_path=args.prompt_path)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
    print("Server started")