From 270ec4cec08309ef9d810ddafff9d323431a1edd Mon Sep 17 00:00:00 2001
From: Dev Aggarwal
Date: Wed, 4 Sep 2024 21:15:23 +0530
Subject: [PATCH] Add pixel_format parameter to ffmpeg functions and fix retro
 pip requirements

- Add a `pixel_format` parameter (default `"rgb24"`) to the
  `ffmpeg_read_input_frames` and `ffmpeg_get_writer_proc` functions
- Update `wav2lip.py` to fix the `max_frames` and `batch_size` calculation
  and ensure `ffproc` is initialized correctly
- Remove the unused lavis model (`retro/lv.py`)
---
 chart/model-values.yaml | 10 ++++---
 ffmpeg_util.py          | 21 +++++++++----
 retro/Dockerfile        | 12 ++++----
 retro/lv.py             | 66 -----------------------------------------
 retro/requirements1.txt |  5 +---
 retro/requirements2.txt | 12 +++++++-
 retro/wav2lip.py        | 36 +++++++++-------------
 7 files changed, 53 insertions(+), 109 deletions(-)
 delete mode 100644 retro/lv.py

diff --git a/chart/model-values.yaml b/chart/model-values.yaml
index d24af90..a77483d 100644
--- a/chart/model-values.yaml
+++ b/chart/model-values.yaml
@@ -284,13 +284,14 @@ deployments:
        thenlper/gte-base

  - name: "retro-sadtalker"
-    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:12"
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
    autoscaling:
      queueLength: 2
      minReplicaCount: 3
      maxReplicaCount: 10
+    limits_gpu: "8Gi"
    limits:
-      memory: "30Gi"
+      memory: "25Gi" # (220 / 80) * 8
    env:
      IMPORTS: |-
        retro.sadtalker
@@ -298,12 +299,13 @@
        SadTalker_V0.0.2_512.safetensors

  - name: "retro-wav2lip-gan"
-    image: *retroImg
+    image: "crgooeyprodwestus1.azurecr.io/gooey-gpu-retro:13"
    autoscaling:
      minReplicaCount: 2
      maxReplicaCount: 10
+    limits_gpu: "10Gi"
    limits:
-      memory: "37Gi"
+      memory: "30Gi" # (220 / 80) * 10
    env:
      IMPORTS: |-
        retro.wav2lip

diff --git a/ffmpeg_util.py b/ffmpeg_util.py
index e63ac28..0d11f98 100644
--- a/ffmpeg_util.py
+++ b/ffmpeg_util.py
@@ -95,13 +95,18 @@ def ffprobe_video(input_path: str) -> VideoMetadata:


def ffmpeg_read_input_frames(
-    *, width: float, height: float, input_path: str, fps: float
+    *,
+    width: float,
+    height: float,
+    input_path: str,
+    fps: float,
+    pixel_format: str = "rgb24",
) -> typing.Iterator[np.ndarray]:
    cmd_args = [
        "ffmpeg", "-hide_banner", "-nostats",
        "-i", input_path,
        "-f", "rawvideo",
-        "-pix_fmt", "rgb24",
+        "-pix_fmt", pixel_format,
        "-s", f"{width}x{height}",
        "-r", str(fps),
        "pipe:1",
@@ -124,19 +129,25 @@


def ffmpeg_get_writer_proc(
-    *, width: int, height: int, output_path: str, fps: float, audio_path: str
+    *,
+    width: int,
+    height: int,
+    output_path: str,
+    fps: float,
+    audio_path: str,
+    pixel_format: str = "rgb24",
) -> subprocess.Popen:
    cmd_args = [
        "ffmpeg", "-hide_banner", "-nostats",
        # "-thread_queue_size", "128",
-        "-pixel_format", "rgb24",
+        "-pixel_format", pixel_format,
        "-f", "rawvideo",
        # "-vcodec", "rawvideo",
        "-s", f"{width}x{height}",
        "-r", str(fps),
        "-i", "pipe:0",  # stdin
        "-i", audio_path,
-        "-map", "0:v", "-map", "1:a",
+        "-map", "0:v", "-map", "1:a", "-shortest",
        # "-c:a", "copy",
        "-c:v", "libx264", "-pix_fmt", "yuv420p",  # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers

diff --git a/retro/Dockerfile b/retro/Dockerfile
index 3bf900b..d9326e9 100644
--- a/retro/Dockerfile
+++ b/retro/Dockerfile
@@ -39,13 +39,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

-COPY retro/*.txt .
-RUN pip install --no-cache-dir -U cython wheel setuptools pip \
-  && pip install --no-cache-dir -r requirements1.txt \
-  && pip install --no-cache-dir -r requirements2.txt \
-  && pip install --no-cache-dir git+https://github.com/elliottzheng/batch-face.git@master
-
-RUN pip install --no-cache-dir safetensors~=0.4.3 facexlib~=0.3.0 yacs~=0.1.8 gfpgan~=1.3.8 imageio==2.19.3 imageio-ffmpeg==0.4.7
+COPY retro/requirements1.txt .
+RUN pip install --no-cache-dir -U cython wheel setuptools pip
+RUN pip install --no-cache-dir -r requirements1.txt
+COPY retro/requirements2.txt .
+RUN pip install --no-cache-dir -r requirements2.txt

# copy sources
COPY . .

diff --git a/retro/lv.py b/retro/lv.py
deleted file mode 100644
index f77bdac..0000000
--- a/retro/lv.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import torch
-from fastapi import APIRouter
-from lavis.models import load_model_and_preprocess
-
-import gooey_gpu
-from api import PipelineInfo, VQAInput, MAX_IMAGE_SIZE, ImageCaptioningInput
-
-app = APIRouter()
-
-
-@app.post("/vqa/")
-@gooey_gpu.endpoint
-def vqa(pipeline: PipelineInfo, inputs: VQAInput):
-    # load model
-    model_id = pipeline.model_id.split("/")
-    model, vis_processors, txt_processors = load_lavis_model(*model_id)
-    # get inputs
-    inputs_kwargs = inputs.dict()
-    image = gooey_gpu.download_images(inputs_kwargs.pop("image"), MAX_IMAGE_SIZE)
-    question = inputs_kwargs.pop("question")
-    # do inference
-    with gooey_gpu.use_models(model):
-        # preprocess the image
-        # vis_processors stores image transforms for "train" and "eval" (validation / testing / inference)
-        image = torch.stack([vis_processors["eval"](im) for im in image]).to(
-            gooey_gpu.DEVICE_ID
-        )
-        question = [txt_processors["eval"](q) for q in question]
-        # generate answerss
-        return model.predict_answers(
-            samples={"image": image, "text_input": question}, **inputs_kwargs
-        )
-        # ['singapore']
-
-
-@app.post("/image-captioning/")
-@gooey_gpu.endpoint
-def image_captioning(pipeline: PipelineInfo, inputs: ImageCaptioningInput):
-    # load model
-    model_id = pipeline.model_id.split("/")
-    model, vis_processors, txt_processors = load_lavis_model(*model_id)
-    # get inputs
-    inputs_kwargs = inputs.dict()
-    image = gooey_gpu.download_images(inputs_kwargs.pop("image"), MAX_IMAGE_SIZE)
-    # do inference
-    with gooey_gpu.use_models(model):
-        # preprocess the image
-        # vis_processors stores image transforms for "train" and "eval" (validation / testing / inference)
-        image = torch.stack([vis_processors["eval"](im) for im in image]).to(
-            gooey_gpu.DEVICE_ID
-        )
-        # generate caption
-        return model.generate(samples={"image": image}, **inputs_kwargs)
-        # ['a large fountain spewing water into the air']
-
-
-_lavis_cache = {}
-
-
-def load_lavis_model(name, model_type):
-    try:
-        ret = _lavis_cache[(name, model_type)]
-    except KeyError:
-        ret = load_model_and_preprocess(name, model_type, is_eval=True)
-        _lavis_cache[(name, model_type)] = ret
-    return ret

diff --git a/retro/requirements1.txt b/retro/requirements1.txt
index eb70d6b..cee1414 100644
--- a/retro/requirements1.txt
+++ b/retro/requirements1.txt
@@ -1,8 +1,5 @@
-## lavis
-salesforce-lavis ~= 1.0.2
-
## nvidia NeMo
-nemo_toolkit[all] ~= 1.20.0
+nemo_toolkit[asr] ~= 1.20.0

## pytorch
--extra-index-url https://download.pytorch.org/whl/cu116

diff --git a/retro/requirements2.txt b/retro/requirements2.txt
index 79b66b4..926f1fe 100644
--- a/retro/requirements2.txt
+++ b/retro/requirements2.txt
@@ -7,7 +7,7 @@ redis ~= 4.5.5

## xformers
xformers==0.0.16
-triton==2.0.0.dev20221202
+triton==2.0.0

## huggingface diffusers
diffusers ~= 0.14.0
@@ -22,9 +22,19 @@ tqdm ~= 4.45.0
numba == 0.48
mediapipe ~= 0.10.1
scipy ~= 1.10.1
+git+https://github.com/elliottzheng/batch-face.git@master

## gfpgan
git+https://github.com/xinntao/Real-ESRGAN.git
git+https://github.com/TencentARC/GFPGAN.git
git+https://github.com/xinntao/BasicSR
git+https://github.com/xinntao/facexlib
+
+## sadtalker
+safetensors~=0.4.3
+facexlib~=0.3.0
+yacs~=0.1.8
+gfpgan~=1.3.8
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+kornia~=0.6.8

diff --git a/retro/wav2lip.py b/retro/wav2lip.py
index eda45b5..050d1fd 100644
--- a/retro/wav2lip.py
+++ b/retro/wav2lip.py
@@ -173,36 +173,28 @@ def main(model, detector, outfile: str, inputs: Wav2LipInputs):
    prev_faces = None
    mel_chunks = get_mel_chunks(inputs.audio, fps)

-    for idx in tqdm(range(0, len(mel_chunks), inputs.batch_size)):
-        if inputs.max_frames and idx >= inputs.max_frames:
-            break
+    max_frames = min(len(mel_chunks), inputs.max_frames or float("inf"))
+    batch_size = min(inputs.batch_size, max_frames)
+    for idx in tqdm(range(0, max_frames, batch_size)):
        if is_static:
-            frame_batch = [frame.copy()] * inputs.batch_size
+            frame_batch = [frame.copy()] * batch_size
        else:
            frame_batch = list(
-                read_n_frames(
-                    input_stream, inputs.face, inputs.batch_size, inputs.out_height
-                )
+                read_n_frames(input_stream, inputs.face, batch_size, inputs.out_height)
            )

        if ffproc is None:
            frame_h, frame_w = frame_batch[0].shape[:-1]
-            gooey_gpu.ffmpeg(
-                # "-thread_queue_size", "128",
-                "-pixel_format", "bgr24",  # to match opencv
-                "-f", "rawvideo",
-                # "-vcodec", "rawvideo",
-                "-s", f"{frame_w}x{frame_h}",
-                "-r", str(fps),
-                "-i", "pipe:0",  # stdin
-                "-i", inputs.audio,
-                # "-vcodec", "libx264",
-                "-pix_fmt", "yuv420p",  # because iphone, see https://trac.ffmpeg.org/wiki/Encode/H.264#Encodingfordumbplayers
-                # "-preset", "ultrafast",
-                outfile,
-            )  # fmt:skip
+            ffproc = gooey_gpu.ffmpeg_get_writer_proc(
+                width=frame_w,
+                height=frame_h,
+                output_path=outfile,
+                fps=fps,
+                audio_path=inputs.audio,
+                pixel_format="bgr24",
+            )

-        mel_batch = mel_chunks[idx : idx + inputs.batch_size]
+        mel_batch = mel_chunks[idx : idx + batch_size]
        frame_batch = frame_batch[: len(mel_batch)]

        coords_batch, prev_faces = face_detect(
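
A minimal usage sketch of the new `pixel_format` parameter, mirroring how `retro/wav2lip.py` now calls the shared writer: OpenCV decodes frames in BGR channel order, so passing `pixel_format="bgr24"` lets raw frames be piped to ffmpeg without a per-frame color conversion, while the `"rgb24"` default keeps existing callers unchanged. The file paths and frame source below are placeholders, and the writable `stdin` on the returned `subprocess.Popen` is an assumption implied by the helper's `"-i", "pipe:0"` argument (the wav2lip change reaches the same function via `gooey_gpu`):

```python
# Hypothetical sketch -- paths and frames are placeholders, not part of this patch.
import numpy as np

from ffmpeg_util import ffmpeg_get_writer_proc

width, height, fps = 640, 480, 25.0
# Stand-in for BGR uint8 frames, e.g. from cv2 or
# ffmpeg_read_input_frames(..., pixel_format="bgr24").
frames = [np.zeros((height, width, 3), dtype=np.uint8) for _ in range(10)]

ffproc = ffmpeg_get_writer_proc(
    width=width,
    height=height,
    output_path="/tmp/out.mp4",   # placeholder
    fps=fps,
    audio_path="/tmp/audio.wav",  # placeholder
    pixel_format="bgr24",         # match OpenCV's channel order; default stays "rgb24"
)
for frame in frames:
    # rawvideo input: each write must be exactly width * height * 3 bytes
    ffproc.stdin.write(frame.tobytes())
ffproc.stdin.close()
ffproc.wait()  # the new "-shortest" flag trims output to the shorter of video/audio
```

The `"-shortest"` flag added in `ffmpeg_get_writer_proc` matters for this pattern: mel chunks and video frames rarely line up exactly, so without it the muxer would pad the output to the longer input stream.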