Skip to content

Commit

Permalink
Add pixel_format parameter to ffmpeg functions and fix retro pip requirements
Browse files Browse the repository at this point in the history

- Add `pixel_format` parameter with default "rgb24" to `ffmpeg_read_input_frames` and `ffmpeg_get_writer_proc` functions
- Update `wav2lip.py` to fix `max_frames` and `batch_size` calculation and ensure `ffproc` is initialized correctly
- Remove the unused `lavis` model
  • Loading branch information
devxpy committed Sep 4, 2024
1 parent 5bd9546 commit 270ec4c
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 109 deletions.
10 changes: 6 additions & 4 deletions chart/model-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -284,26 +284,28 @@ deployments:
thenlper/gte-base
- name: "retro-sadtalker"
image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:12"
image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
autoscaling:
queueLength: 2
minReplicaCount: 3
maxReplicaCount: 10
limits_gpu: "8Gi"
limits:
memory: "30Gi"
memory: "25Gi" # (220 / 80) * 8
env:
IMPORTS: |-
retro.sadtalker
SADTALKER_MODEL_IDS: |-
SadTalker_V0.0.2_512.safetensors
- name: "retro-wav2lip-gan"
image: *retroImg
image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
autoscaling:
minReplicaCount: 2
maxReplicaCount: 10
limits_gpu: "10Gi"
limits:
memory: "37Gi"
memory: "30Gi" # (220 / 80) * 10
env:
IMPORTS: |-
retro.wav2lip
Expand Down
21 changes: 16 additions & 5 deletions ffmpeg_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,18 @@ def ffprobe_video(input_path: str) -> VideoMetadata:


def ffmpeg_read_input_frames(
*, width: float, height: float, input_path: str, fps: float
*,
width: float,
height: float,
input_path: str,
fps: float,
pixel_format: str = "rgb24",
) -> typing.Iterator[np.ndarray]:
cmd_args = [
"ffmpeg", "-hide_banner", "-nostats",
"-i", input_path,
"-f", "rawvideo",
"-pix_fmt", "rgb24",
"-pix_fmt", pixel_format,
"-s", f"{width}x{height}",
"-r", str(fps),
"pipe:1",
Expand All @@ -124,19 +129,25 @@ def ffmpeg_read_input_frames(


def ffmpeg_get_writer_proc(
*, width: int, height: int, output_path: str, fps: float, audio_path: str
*,
width: int,
height: int,
output_path: str,
fps: float,
audio_path: str,
pixel_format: str = "rgb24",
) -> subprocess.Popen:
cmd_args = [
"ffmpeg", "-hide_banner", "-nostats",
# "-thread_queue_size", "128",
"-pixel_format", "rgb24",
"-pixel_format", pixel_format,
"-f", "rawvideo",
# "-vcodec", "rawvideo",
"-s", f"{width}x{height}",
"-r", str(fps),
"-i", "pipe:0", # stdin
"-i", audio_path,
"-map", "0:v", "-map", "1:a",
"-map", "0:v", "-map", "1:a", "-shortest",
# "-c:a", "copy",
"-c:v", "libx264",
"-pix_fmt", "yuv420p", # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers
Expand Down
12 changes: 5 additions & 7 deletions retro/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*

COPY retro/*.txt .
RUN pip install --no-cache-dir -U cython wheel setuptools pip \
&& pip install --no-cache-dir -r requirements1.txt \
&& pip install --no-cache-dir -r requirements2.txt \
&& pip install --no-cache-dir git+https://github.com/elliottzheng/batch-face.git@master

RUN pip install --no-cache-dir safetensors~=0.4.3 facexlib~=0.3.0 yacs~=0.1.8 gfpgan~=1.3.8 imageio==2.19.3 imageio-ffmpeg==0.4.7
COPY retro/requirements1.txt .
RUN pip install --no-cache-dir -U cython wheel setuptools pip
RUN pip install --no-cache-dir -r requirements1.txt
COPY retro/requirements2.txt .
RUN pip install --no-cache-dir -r requirements2.txt

# copy sources
COPY . .
Expand Down
66 changes: 0 additions & 66 deletions retro/lv.py

This file was deleted.

5 changes: 1 addition & 4 deletions retro/requirements1.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
## lavis
salesforce-lavis ~= 1.0.2

## nvidia NeMo
nemo_toolkit[all] ~= 1.20.0
nemo_toolkit[asr] ~= 1.20.0

## pytorch
--extra-index-url https://download.pytorch.org/whl/cu116
Expand Down
12 changes: 11 additions & 1 deletion retro/requirements2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ redis ~= 4.5.5

## xformers
xformers==0.0.16
triton==2.0.0.dev20221202
triton==2.0.0

## huggingface diffusers
diffusers ~= 0.14.0
Expand All @@ -22,9 +22,19 @@ tqdm ~= 4.45.0
numba == 0.48
mediapipe ~= 0.10.1
scipy ~= 1.10.1
git+https://github.com/elliottzheng/batch-face.git@master

## gfpgan
git+https://github.com/xinntao/Real-ESRGAN.git
git+https://github.com/TencentARC/GFPGAN.git
git+https://github.com/xinntao/BasicSR
git+https://github.com/xinntao/facexlib

## sadtalker
safetensors~=0.4.3
facexlib~=0.3.0
yacs~=0.1.8
gfpgan~=1.3.8
imageio==2.19.3
imageio-ffmpeg==0.4.7
kornia~=0.6.8
36 changes: 14 additions & 22 deletions retro/wav2lip.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,36 +173,28 @@ def main(model, detector, outfile: str, inputs: Wav2LipInputs):
prev_faces = None

mel_chunks = get_mel_chunks(inputs.audio, fps)
for idx in tqdm(range(0, len(mel_chunks), inputs.batch_size)):
if inputs.max_frames and idx >= inputs.max_frames:
break
max_frames = min(len(mel_chunks), inputs.max_frames or float("inf"))
batch_size = min(inputs.batch_size, max_frames)
for idx in tqdm(range(0, max_frames, batch_size)):
if is_static:
frame_batch = [frame.copy()] * inputs.batch_size
frame_batch = [frame.copy()] * batch_size
else:
frame_batch = list(
read_n_frames(
input_stream, inputs.face, inputs.batch_size, inputs.out_height
)
read_n_frames(input_stream, inputs.face, batch_size, inputs.out_height)
)

if ffproc is None:
frame_h, frame_w = frame_batch[0].shape[:-1]
gooey_gpu.ffmpeg(
# "-thread_queue_size", "128",
"-pixel_format", "bgr24", # to match opencv
"-f", "rawvideo",
# "-vcodec", "rawvideo",
"-s", f"{frame_w}x{frame_h}",
"-r", str(fps),
"-i", "pipe:0", # stdin
"-i", inputs.audio,
# "-vcodec", "libx264",
"-pix_fmt", "yuv420p", # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers
# "-preset", "ultrafast",
outfile,
) # fmt:skip
ffproc = gooey_gpu.ffmpeg_get_writer_proc(
width=frame_w,
height=frame_h,
output_path=outfile,
fps=fps,
audio_path=inputs.audio,
pixel_format="bgr24",
)

mel_batch = mel_chunks[idx : idx + inputs.batch_size]
mel_batch = mel_chunks[idx : idx + batch_size]
frame_batch = frame_batch[: len(mel_batch)]

coords_batch, prev_faces = face_detect(
Expand Down

0 comments on commit 270ec4c

Please sign in to comment.