From 270ec4cec08309ef9d810ddafff9d323431a1edd Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Wed, 4 Sep 2024 21:15:23 +0530
Subject: [PATCH] Add pixel_format parameter to ffmpeg functions and fix retro
 pip requirements

- Add a `pixel_format` parameter (default `"rgb24"`) to the
  `ffmpeg_read_input_frames` and `ffmpeg_get_writer_proc` functions
- Update `wav2lip.py` to fix the `max_frames` and `batch_size` calculation
  and ensure `ffproc` is initialized correctly
- Remove the unused lavis model (`retro/lv.py`)
---
 chart/model-values.yaml | 10 ++++---
 ffmpeg_util.py          | 21 +++++++++----
 retro/Dockerfile        | 12 ++++----
 retro/lv.py             | 66 -----------------------------------------
 retro/requirements1.txt |  5 +---
 retro/requirements2.txt | 12 +++++++-
 retro/wav2lip.py        | 36 +++++++++-------------
 7 files changed, 53 insertions(+), 109 deletions(-)
 delete mode 100644 retro/lv.py

diff --git a/chart/model-values.yaml b/chart/model-values.yaml
index d24af90..a77483d 100644
--- a/chart/model-values.yaml
+++ b/chart/model-values.yaml
@@ -284,13 +284,14 @@ deployments:
        thenlper/gte-base

  - name: "retro-sadtalker"
-    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:12"
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
    autoscaling:
      queueLength: 2
      minReplicaCount: 3
      maxReplicaCount: 10
+    limits_gpu: "8Gi"
    limits:
-      memory: "30Gi"
+      memory: "25Gi" # (220 / 80) * 8
    env:
      IMPORTS: |-
        retro.sadtalker
@@ -298,12 +299,13 @@
        SadTalker_V0.0.2_512.safetensors

  - name: "retro-wav2lip-gan"
-    image: *retroImg
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
    autoscaling:
      minReplicaCount: 2
      maxReplicaCount: 10
+    limits_gpu: "10Gi"
    limits:
-      memory: "37Gi"
+      memory: "30Gi" # (220 / 80) * 10
    env:
      IMPORTS: |-
        retro.wav2lip

diff --git a/ffmpeg_util.py b/ffmpeg_util.py
index e63ac28..0d11f98 100644
--- a/ffmpeg_util.py
+++ b/ffmpeg_util.py
@@ -95,13 +95,18 @@ def ffprobe_video(input_path: str) -> VideoMetadata:


def ffmpeg_read_input_frames(
-    *, width: float, height: float, input_path: str, fps: float
+    *,
+    width: float,
+    height: float,
+    input_path: str,
+    fps: float,
+    pixel_format: str = "rgb24",
) -> typing.Iterator[np.ndarray]:
    cmd_args = [
        "ffmpeg", "-hide_banner", "-nostats",
        "-i", input_path,
        "-f", "rawvideo",
-        "-pix_fmt", "rgb24",
+        "-pix_fmt", pixel_format,
        "-s", f"{width}x{height}",
        "-r", str(fps),
        "pipe:1",
@@ -124,19 +129,25 @@


def ffmpeg_get_writer_proc(
-    *, width: int, height: int, output_path: str, fps: float, audio_path: str
+    *,
+    width: int,
+    height: int,
+    output_path: str,
+    fps: float,
+    audio_path: str,
+    pixel_format: str = "rgb24",
) -> subprocess.Popen:
    cmd_args = [
        "ffmpeg", "-hide_banner", "-nostats",
        # "-thread_queue_size", "128",
-        "-pixel_format", "rgb24",
+        "-pixel_format", pixel_format,
        "-f", "rawvideo",
        # "-vcodec", "rawvideo",
        "-s", f"{width}x{height}",
        "-r", str(fps),
        "-i", "pipe:0",  # stdin
        "-i", audio_path,
-        "-map", "0:v", "-map", "1:a",
+        "-map", "0:v", "-map", "1:a", "-shortest",
        # "-c:a", "copy",
        "-c:v", "libx264", "-pix_fmt", "yuv420p",  # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers

diff --git a/retro/Dockerfile b/retro/Dockerfile
index 3bf900b..d9326e9 100644
--- a/retro/Dockerfile
+++ b/retro/Dockerfile
@@ -39,13 +39,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

-COPY retro/*.txt .
-RUN pip install --no-cache-dir -U cython wheel setuptools pip \
-  && pip install --no-cache-dir -r requirements1.txt \
-  && pip install --no-cache-dir -r requirements2.txt \
-  && pip install --no-cache-dir git+https://github.com/elliottzheng/batch-face.git@master
-
-RUN pip install --no-cache-dir safetensors~=0.4.3 facexlib~=0.3.0 yacs~=0.1.8 gfpgan~=1.3.8 imageio==2.19.3 imageio-ffmpeg==0.4.7
+COPY retro/requirements1.txt .
+RUN pip install --no-cache-dir -U cython wheel setuptools pip
+RUN pip install --no-cache-dir -r requirements1.txt
+COPY retro/requirements2.txt .
+RUN pip install --no-cache-dir -r requirements2.txt

# copy sources
COPY . .

diff --git a/retro/lv.py b/retro/lv.py
deleted file mode 100644
index f77bdac..0000000
--- a/retro/lv.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import torch
-from fastapi import APIRouter
-from lavis.models import load_model_and_preprocess
-
-import gooey_gpu
-from api import PipelineInfo, VQAInput, MAX_IMAGE_SIZE, ImageCaptioningInput
-
-app = APIRouter()
-
-
-@app.post("/vqa/")
-@gooey_gpu.endpoint
-def vqa(pipeline: PipelineInfo, inputs: VQAInput):
-    # load model
-    model_id = pipeline.model_id.split("/")
-    model, vis_processors, txt_processors = load_lavis_model(*model_id)
-    # get inputs
-    inputs_kwargs = inputs.dict()
-    image = gooey_gpu.download_images(inputs_kwargs.pop("image"), MAX_IMAGE_SIZE)
-    question = inputs_kwargs.pop("question")
-    # do inference
-    with gooey_gpu.use_models(model):
-        # preprocess the image
-        # vis_processors stores image transforms for "train" and "eval" (validation / testing / inference)
-        image = torch.stack([vis_processors["eval"](im) for im in image]).to(
-            gooey_gpu.DEVICE_ID
-        )
-        question = [txt_processors["eval"](q) for q in question]
-        # generate answerss
-        return model.predict_answers(
-            samples={"image": image, "text_input": question}, **inputs_kwargs
-        )
-        # ['singapore']
-
-
-@app.post("/image-captioning/")
-@gooey_gpu.endpoint
-def image_captioning(pipeline: PipelineInfo, inputs: ImageCaptioningInput):
-    # load model
-    model_id = pipeline.model_id.split("/")
-    model, vis_processors, txt_processors = load_lavis_model(*model_id)
-    # get inputs
-    inputs_kwargs = inputs.dict()
-    image = gooey_gpu.download_images(inputs_kwargs.pop("image"), MAX_IMAGE_SIZE)
-    # do inference
-    with gooey_gpu.use_models(model):
-        # preprocess the image
-        # vis_processors stores image transforms for "train" and "eval" (validation / testing / inference)
-        image = torch.stack([vis_processors["eval"](im) for im in image]).to(
-            gooey_gpu.DEVICE_ID
-        )
-        # generate caption
-        return model.generate(samples={"image": image}, **inputs_kwargs)
-        # ['a large fountain spewing water into the air']
-
-
-_lavis_cache = {}
-
-
-def load_lavis_model(name, model_type):
-    try:
-        ret = _lavis_cache[(name, model_type)]
-    except KeyError:
-        ret = load_model_and_preprocess(name, model_type, is_eval=True)
-        _lavis_cache[(name, model_type)] = ret
-    return ret

diff --git a/retro/requirements1.txt b/retro/requirements1.txt
index eb70d6b..cee1414 100644
--- a/retro/requirements1.txt
+++ b/retro/requirements1.txt
@@ -1,8 +1,5 @@
-## lavis
-salesforce-lavis ~= 1.0.2
-
## nvidia NeMo
-nemo_toolkit[all] ~= 1.20.0
+nemo_toolkit[asr] ~= 1.20.0

## pytorch
--extra-index-url https://download.pytorch.org/whl/cu116

diff --git a/retro/requirements2.txt b/retro/requirements2.txt
index 79b66b4..926f1fe 100644
--- a/retro/requirements2.txt
+++ b/retro/requirements2.txt
@@ -7,7 +7,7 @@ redis ~= 4.5.5

## xformers
xformers==0.0.16
-triton==2.0.0.dev20221202
+triton==2.0.0

## huggingface diffusers
diffusers ~= 0.14.0
@@ -22,9 +22,19 @@ tqdm ~= 4.45.0
numba == 0.48
mediapipe ~= 0.10.1
scipy ~= 1.10.1
+git+https://github.com/elliottzheng/batch-face.git@master

## gfpgan
git+https://github.com/xinntao/Real-ESRGAN.git
git+https://github.com/TencentARC/GFPGAN.git
git+https://github.com/xinntao/BasicSR
git+https://github.com/xinntao/facexlib
+
+## sadtalker
+safetensors~=0.4.3
+facexlib~=0.3.0
+yacs~=0.1.8
+gfpgan~=1.3.8
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+kornia~=0.6.8

diff --git a/retro/wav2lip.py b/retro/wav2lip.py
index eda45b5..050d1fd 100644
--- a/retro/wav2lip.py
+++ b/retro/wav2lip.py
@@ -173,36 +173,28 @@ def main(model, detector, outfile: str, inputs: Wav2LipInputs):
    prev_faces = None
    mel_chunks = get_mel_chunks(inputs.audio, fps)

-    for idx in tqdm(range(0, len(mel_chunks), inputs.batch_size)):
-        if inputs.max_frames and idx >= inputs.max_frames:
-            break
+    max_frames = min(len(mel_chunks), inputs.max_frames or float("inf"))
+    batch_size = min(inputs.batch_size, max_frames)
+    for idx in tqdm(range(0, max_frames, batch_size)):
        if is_static:
-            frame_batch = [frame.copy()] * inputs.batch_size
+            frame_batch = [frame.copy()] * batch_size
        else:
            frame_batch = list(
-                read_n_frames(
-                    input_stream, inputs.face, inputs.batch_size, inputs.out_height
-                )
+                read_n_frames(input_stream, inputs.face, batch_size, inputs.out_height)
            )

        if ffproc is None:
            frame_h, frame_w = frame_batch[0].shape[:-1]
-            gooey_gpu.ffmpeg(
-                # "-thread_queue_size", "128",
-                "-pixel_format", "bgr24",  # to match opencv
-                "-f", "rawvideo",
-                # "-vcodec", "rawvideo",
-                "-s", f"{frame_w}x{frame_h}",
-                "-r", str(fps),
-                "-i", "pipe:0",  # stdin
-                "-i", inputs.audio,
-                # "-vcodec", "libx264",
-                "-pix_fmt", "yuv420p",  # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers
-                # "-preset", "ultrafast",
-                outfile,
-            )  # fmt:skip
+            ffproc = gooey_gpu.ffmpeg_get_writer_proc(
+                width=frame_w,
+                height=frame_h,
+                output_path=outfile,
+                fps=fps,
+                audio_path=inputs.audio,
+                pixel_format="bgr24",
+            )

-        mel_batch = mel_chunks[idx : idx + inputs.batch_size]
+        mel_batch = mel_chunks[idx : idx + batch_size]
        frame_batch = frame_batch[: len(mel_batch)]

        coords_batch, prev_faces = face_detect(
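
A minimal usage sketch of the new `pixel_format` parameter, mirroring how `retro/wav2lip.py` now calls the shared writer: OpenCV decodes frames in BGR channel order, so passing `pixel_format="bgr24"` lets raw frames be piped to ffmpeg without a per-frame color conversion, while the `"rgb24"` default keeps existing callers unchanged. The file paths and frame source below are placeholders, and the writable `stdin` on the returned `subprocess.Popen` is an assumption implied by the helper's `"-i", "pipe:0"` argument (the wav2lip change reaches the same function via `gooey_gpu`):

```python
# Hypothetical sketch -- paths and frames are placeholders, not part of this patch.
import numpy as np

from ffmpeg_util import ffmpeg_get_writer_proc

width, height, fps = 640, 480, 25.0
# Stand-in for BGR uint8 frames, e.g. from cv2 or
# ffmpeg_read_input_frames(..., pixel_format="bgr24").
frames = [np.zeros((height, width, 3), dtype=np.uint8) for _ in range(10)]

ffproc = ffmpeg_get_writer_proc(
    width=width,
    height=height,
    output_path="/tmp/out.mp4",   # placeholder
    fps=fps,
    audio_path="/tmp/audio.wav",  # placeholder
    pixel_format="bgr24",         # match OpenCV's channel order; default stays "rgb24"
)
for frame in frames:
    # rawvideo input: each write must be exactly width * height * 3 bytes
    ffproc.stdin.write(frame.tobytes())
ffproc.stdin.close()
ffproc.wait()  # the new "-shortest" flag trims output to the shorter of video/audio
```

The `"-shortest"` flag added in `ffmpeg_get_writer_proc` matters for this pattern: mel chunks and video frames rarely line up exactly, so without it the muxer would pad the output to the longer input stream.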