Add pixel_format parameter to ffmpeg functions and fix retro pip requirements

- Add `pixel_format` parameter with default "rgb24" to `ffmpeg_read_input_frames` and `ffmpeg_get_writer_proc` functions
- Update `wav2lip.py` to fix `max_frames` and `batch_size` calculation and ensure `ffproc` is initialized correctly
- Remove unused model lavis
GooeyAI · Sep 4, 2024 · 270ec4c · 270ec4c
1 parent 5bd9546
commit 270ec4c
Show file tree

Hide file tree

Showing 7 changed files with 53 additions and 109 deletions.
diff --git a/chart/model-values.yaml b/chart/model-values.yaml
@@ -284,26 +284,28 @@ deployments:
         thenlper/gte-base
 
   - name: "retro-sadtalker"
-    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:12"
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
     autoscaling:
       queueLength: 2
       minReplicaCount: 3
       maxReplicaCount: 10
+    limits_gpu: "8Gi"
     limits:
-      memory: "30Gi"
+      memory: "25Gi" # (220 / 80) * 8
     env:
       IMPORTS: |-
         retro.sadtalker
       SADTALKER_MODEL_IDS: |-
         SadTalker_V0.0.2_512.safetensors
 
   - name: "retro-wav2lip-gan"
-    image: *retroImg
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
     autoscaling:
       minReplicaCount: 2
       maxReplicaCount: 10
+    limits_gpu: "10Gi"
     limits:
-      memory: "37Gi"
+      memory: "30Gi" # (220 / 80) * 10
     env:
       IMPORTS: |-
         retro.wav2lip

diff --git a/ffmpeg_util.py b/ffmpeg_util.py
@@ -95,13 +95,18 @@ def ffprobe_video(input_path: str) -> VideoMetadata:
 
 
 def ffmpeg_read_input_frames(
-    *, width: float, height: float, input_path: str, fps: float
+    *,
+    width: float,
+    height: float,
+    input_path: str,
+    fps: float,
+    pixel_format: str = "rgb24",
 ) -> typing.Iterator[np.ndarray]:
     cmd_args = [
         "ffmpeg", "-hide_banner", "-nostats",
         "-i", input_path,
         "-f", "rawvideo",
-        "-pix_fmt", "rgb24",
+        "-pix_fmt", pixel_format,
         "-s", f"{width}x{height}",
         "-r", str(fps),
         "pipe:1",
@@ -124,19 +129,25 @@ def ffmpeg_read_input_frames(
 
 
 def ffmpeg_get_writer_proc(
-    *, width: int, height: int, output_path: str, fps: float, audio_path: str
+    *,
+    width: int,
+    height: int,
+    output_path: str,
+    fps: float,
+    audio_path: str,
+    pixel_format: str = "rgb24",
 ) -> subprocess.Popen:
     cmd_args = [
         "ffmpeg", "-hide_banner", "-nostats",
         # "-thread_queue_size", "128",
-        "-pixel_format", "rgb24",
+        "-pixel_format", pixel_format,
         "-f", "rawvideo",
         # "-vcodec", "rawvideo",
         "-s", f"{width}x{height}",
         "-r", str(fps),
         "-i", "pipe:0",  # stdin
         "-i", audio_path,
-        "-map", "0:v", "-map", "1:a",
+        "-map", "0:v", "-map", "1:a", "-shortest",
         # "-c:a", "copy",
         "-c:v", "libx264",
         "-pix_fmt", "yuv420p", # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers

diff --git a/retro/Dockerfile b/retro/Dockerfile
@@ -39,13 +39,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libsndfile1 \
 	&& rm -rf /var/lib/apt/lists/*
 
-COPY retro/*.txt .
-RUN pip install --no-cache-dir -U cython wheel setuptools pip \
-  && pip install --no-cache-dir -r requirements1.txt \
-  && pip install --no-cache-dir -r requirements2.txt \
-  && pip install --no-cache-dir git+https://github.com/elliottzheng/batch-face.git@master
-
-RUN pip install --no-cache-dir safetensors~=0.4.3 facexlib~=0.3.0 yacs~=0.1.8 gfpgan~=1.3.8 imageio==2.19.3 imageio-ffmpeg==0.4.7
+COPY retro/requirements1.txt .
+RUN pip install --no-cache-dir -U cython wheel setuptools pip
+RUN pip install --no-cache-dir -r requirements1.txt
+COPY retro/requirements2.txt .
+RUN pip install --no-cache-dir -r requirements2.txt
 
 # copy sources
 COPY . .

diff --git a/retro/lv.py b/retro/lv.py
diff --git a/retro/requirements1.txt b/retro/requirements1.txt
@@ -1,8 +1,5 @@
-## lavis
-salesforce-lavis ~= 1.0.2
-
 ## nvidia NeMo
-nemo_toolkit[all] ~= 1.20.0
+nemo_toolkit[asr] ~= 1.20.0
 
 ## pytorch
 --extra-index-url https://download.pytorch.org/whl/cu116

diff --git a/retro/requirements2.txt b/retro/requirements2.txt
@@ -7,7 +7,7 @@ redis ~= 4.5.5
 
 ## xformers
 xformers==0.0.16
-triton==2.0.0.dev20221202
+triton==2.0.0
 
 ## huggingface diffusers
 diffusers ~= 0.14.0
@@ -22,9 +22,19 @@ tqdm ~= 4.45.0
 numba == 0.48
 mediapipe ~= 0.10.1
 scipy ~= 1.10.1
+git+https://github.com/elliottzheng/batch-face.git@master
 
 ## gfpgan
 git+https://github.com/xinntao/Real-ESRGAN.git
 git+https://github.com/TencentARC/GFPGAN.git
 git+https://github.com/xinntao/BasicSR
 git+https://github.com/xinntao/facexlib
+
+## sadtalker
+safetensors~=0.4.3
+facexlib~=0.3.0
+yacs~=0.1.8
+gfpgan~=1.3.8
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+kornia~=0.6.8
diff --git a/retro/wav2lip.py b/retro/wav2lip.py
@@ -173,36 +173,28 @@ def main(model, detector, outfile: str, inputs: Wav2LipInputs):
     prev_faces = None
 
     mel_chunks = get_mel_chunks(inputs.audio, fps)
-    for idx in tqdm(range(0, len(mel_chunks), inputs.batch_size)):
-        if inputs.max_frames and idx >= inputs.max_frames:
-            break
+    max_frames = min(len(mel_chunks), inputs.max_frames or float("inf"))
+    batch_size = min(inputs.batch_size, max_frames)
+    for idx in tqdm(range(0, max_frames, batch_size)):
         if is_static:
-            frame_batch = [frame.copy()] * inputs.batch_size
+            frame_batch = [frame.copy()] * batch_size
         else:
             frame_batch = list(
-                read_n_frames(
-                    input_stream, inputs.face, inputs.batch_size, inputs.out_height
-                )
+                read_n_frames(input_stream, inputs.face, batch_size, inputs.out_height)
             )
 
         if ffproc is None:
             frame_h, frame_w = frame_batch[0].shape[:-1]
-            gooey_gpu.ffmpeg(
-                # "-thread_queue_size", "128",
-                "-pixel_format", "bgr24", # to match opencv
-                "-f", "rawvideo",
-                # "-vcodec", "rawvideo",
-                "-s", f"{frame_w}x{frame_h}",
-                "-r", str(fps),
-                "-i", "pipe:0", # stdin
-                "-i", inputs.audio,
-                # "-vcodec", "libx264",
-                "-pix_fmt", "yuv420p", # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers
-                # "-preset", "ultrafast",
-                outfile,
-            )  # fmt:skip
+            ffproc = gooey_gpu.ffmpeg_get_writer_proc(
+                width=frame_w,
+                height=frame_h,
+                output_path=outfile,
+                fps=fps,
+                audio_path=inputs.audio,
+                pixel_format="bgr24",
+            )
 
-        mel_batch = mel_chunks[idx : idx + inputs.batch_size]
+        mel_batch = mel_chunks[idx : idx + batch_size]
         frame_batch = frame_batch[: len(mel_batch)]
 
         coords_batch, prev_faces = face_detect(