diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ea51bac31..651421694 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -13,5 +13,6 @@ dags/pytorch_xla @will-cromar @jonb377 @JackCaoG @vanbasten23 @zpcore
 dags/legacy_test/tests/pytorch @will-cromar @jonb377 @JackCaoG @vanbasten23 @zpcore
 
 dags/multipod @jonb377
+.github/workflows/multipod @jonb377 @tonyjohnchen
 
 dags/mlcompass @ortibazar @sganeshb @brajiang @wlzhg
diff --git a/.github/workflows/multipod/nightly_release_utils/build_and_upload_images.sh b/.github/workflows/multipod/nightly_release_utils/build_and_upload_images.sh
new file mode 100644
index 000000000..2598a1b20
--- /dev/null
+++ b/.github/workflows/multipod/nightly_release_utils/build_and_upload_images.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Builds an image from merge.Dockerfile, passing the selected merge image as the MERGE_IMAGE build argument, and pushes it to GCR.
+# The pushed image is tagged with the build date in format YYYY-MM-DD via $(date +%Y-%m-%d).
+#
+# Arguments are KEY=VALUE pairs; each one is exported as an environment variable.
+# Required: PROJECT, MODE, MERGE_IMAGE_NAME, OUTPUT_IMAGE_NAME.
+# MODE=nightly selects the ":latest" tag of the merge image; any other value selects ":stable".
+#
+# Run from the repository root: the Dockerfile path and the build context (".") are relative to the working directory.
+#
+# Example command:
+# bash build_and_upload_images.sh PROJECT=<project> MODE=nightly MERGE_IMAGE_NAME=<merge_image_name> OUTPUT_IMAGE_NAME=<output_image_name>
+
+set -euo pipefail
+
+# Export every KEY=VALUE argument as an environment variable.
+for ARGUMENT in "$@"; do
+  IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
+  export "$KEY"="$VALUE"
+  echo "$KEY"="$VALUE"
+done
+
+# Reject unset and empty values alike: an empty value would produce a malformed image reference.
+if [[ -z "${OUTPUT_IMAGE_NAME:-}" || -z "${PROJECT:-}" || -z "${MODE:-}" || -z "${MERGE_IMAGE_NAME:-}" ]]; then
+  echo "You must set OUTPUT_IMAGE_NAME, PROJECT, MODE, and MERGE_IMAGE_NAME" >&2
+  exit 1
+fi
+
+gcloud auth configure-docker --quiet
+image_date=$(date +%Y-%m-%d)
+
+if [[ "$MODE" == "nightly" ]]; then
+  merge_image="gcr.io/$PROJECT/$MERGE_IMAGE_NAME:latest"
+else
+  merge_image="gcr.io/$PROJECT/$MERGE_IMAGE_NAME:stable"
+fi
+
+output_image="gcr.io/$PROJECT/$OUTPUT_IMAGE_NAME:$image_date"
+local_image="${OUTPUT_IMAGE_NAME}_runner"
+
+docker pull "$merge_image"
+docker build --build-arg MERGE_IMAGE="$merge_image" -f .github/workflows/multipod/nightly_release_utils/merge.Dockerfile -t "$local_image" .
+docker tag "$local_image" "$output_image"
+docker push "$output_image"
diff --git a/.github/workflows/multipod/nightly_release_utils/merge.Dockerfile b/.github/workflows/multipod/nightly_release_utils/merge.Dockerfile
new file mode 100644
index 000000000..e3d74187d
--- /dev/null
+++ b/.github/workflows/multipod/nightly_release_utils/merge.Dockerfile
@@ -0,0 +1,29 @@
+# MERGE_IMAGE is the reference of the image whose root filesystem is copied into the final stage below.
+ARG MERGE_IMAGE
+FROM $MERGE_IMAGE AS merge
+
+# Use Python 3.10 as the base image
+FROM python:3.10-slim-bullseye
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y curl gnupg
+
+# Add the Google Cloud SDK package repository and its signing key.
+# The key is dearmored with gpg into the signed-by keyring because apt-key is deprecated.
+RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
+RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+
+# Install the Google Cloud SDK
+RUN apt-get update && apt-get install -y google-cloud-sdk
+
+# Set the default Python version to 3.10
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.10 1
+
+# Set environment variables for Google Cloud SDK and Python 3.10
+ENV PATH="/usr/local/google-cloud-sdk/bin:/usr/local/bin/python3.10:$PATH"
+
+# Copy everything from the merge image.
+# This overlays the merge image's root filesystem on the layers built above; files at the same path are replaced.
+COPY --from=merge / /
+
+WORKDIR /app
diff --git a/.github/workflows/upload-docker-images.yml b/.github/workflows/upload-docker-images.yml
new file mode 100644
index 000000000..9188a8991
--- /dev/null
+++ b/.github/workflows/upload-docker-images.yml
@@ -0,0 +1,39 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This workflow runs build_and_upload_images.sh to build and upload the cloud_hybridsim_nightly image.
+# It is triggered by pushes to the docker-nightly-release branch and by a daily schedule.
+
+name: Build Images
+
+on:
+  push:
+    branches:
+      - docker-nightly-release
+  schedule:
+    # Run the job daily at 12AM UTC (4PM PST / 5PM PDT on the previous day)
+    - cron: '0 0 * * *'
+
+jobs:
+  tpu:
+    strategy:
+      fail-fast: false
+      matrix:
+        device-type: ["v4-8"]
+    runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: build hybridsim nightly image
+        run: |
+          bash .github/workflows/multipod/nightly_release_utils/build_and_upload_images.sh PROJECT=tpu-prod-env-multipod MODE=nightly MERGE_IMAGE_NAME=hybridsim_nightly OUTPUT_IMAGE_NAME=cloud_hybridsim_nightly