From c49554ccf100ba45d37144f37689ca2d9d1b5683 Mon Sep 17 00:00:00 2001 From: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com> Date: Fri, 3 May 2024 13:25:49 -0700 Subject: [PATCH] Update distributed example tests in `run_python_examples.sh` (#1250) * Fix distributed test * fix parallel scripts * install dill * remove dill * run 2 gpu * remove gpucount, use default * Add examples to distributed examples * refactor distributed test * fx ERRORS overwriting * run with base dir * remove distributed from run_python_examples.sh * move basedir to source * separate init --------- Co-authored-by: Sirut Buasai --- run_distributed_examples.sh | 42 ++++++------------------------- run_python_examples.sh | 49 +++++-------------------------------- utils.sh | 38 ++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 77 deletions(-) create mode 100644 utils.sh diff --git a/run_distributed_examples.sh b/run_distributed_examples.sh index fbac38e7b1..c2260d7c78 100755 --- a/run_distributed_examples.sh +++ b/run_distributed_examples.sh @@ -10,13 +10,8 @@ # to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files. # Expects pytorch, torchvision to be installed. -BASE_DIR=`pwd`"/"`dirname $0` -EXAMPLES=`echo $1 | sed -e 's/ //g'` - -# Redirect 'python' calls to 'python3' -python() { - command python3 "$@" -} +BASE_DIR="$(pwd)/$(dirname $0)" +source $BASE_DIR/utils.sh USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())") case $USE_CUDA in @@ -35,33 +30,12 @@ case $USE_CUDA in ;; esac -ERRORS="" - -function error() { - ERR=$1 - ERRORS="$ERRORS\n$ERR" - echo $ERR -} - -function install_deps() { - echo "installing requirements" - cat $BASE_DIR/*/requirements.txt | \ - sort -u | \ - # testing the installed version of torch, so don't pip install it. 
- grep -vE '^torch$' | \ - pip install -r /dev/stdin || \ - { error "failed to install dependencies"; exit 1; } -} - -function start() { - EXAMPLE=${FUNCNAME[1]} - cd $BASE_DIR/$EXAMPLE - echo "Running example: $EXAMPLE" -} - function distributed() { start - torchrun --standalone --nnodes=1 --nproc_per_node=4 tensor_parallelism/fsdp_tp_example.py + bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed" + bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed" + bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed" + python ddp/main.py || error "ddp example failed" } function clean() { @@ -88,8 +62,8 @@ fi if [ "" == "$ERRORS" ]; then echo "Completed successfully with status $?" else - echo "Some examples failed:" - printf "$ERRORS" + echo "Some distributed examples failed:" + printf "$ERRORS\n" #Exit with error (0-255) in case of failure in one of the tests. exit 1 diff --git a/run_python_examples.sh b/run_python_examples.sh index c5665def13..66d6e23b66 100755 --- a/run_python_examples.sh +++ b/run_python_examples.sh @@ -10,13 +10,8 @@ # to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files. # Expects pytorch, torchvision to be installed. 
-BASE_DIR=`pwd`"/"`dirname $0` -EXAMPLES=`echo $1 | sed -e 's/ //g'` - -# Redirect 'python' calls to 'python3' -python() { - command python3 "$@" -} +BASE_DIR="$(pwd)/$(dirname $0)" +source $BASE_DIR/utils.sh USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())") case $USE_CUDA in @@ -35,43 +30,11 @@ case $USE_CUDA in ;; esac -ERRORS="" - -function error() { - ERR=$1 - ERRORS="$ERRORS\n$ERR" - echo $ERR -} - -function install_deps() { - echo "installing requirements" - cat $BASE_DIR/*/requirements.txt | \ - sort -u | \ - # testing the installed version of torch, so don't pip install it. - grep -vE '^torch$' | \ - pip install -r /dev/stdin || \ - { error "failed to install dependencies"; exit 1; } -} - -function start() { - EXAMPLE=${FUNCNAME[1]} - cd $BASE_DIR/$EXAMPLE - echo "Running example: $EXAMPLE" -} - function dcgan() { start python main.py --dataset fake $CUDA_FLAG --mps --dry-run || error "dcgan failed" } -function distributed() { - start - python tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed" - python tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed" - python tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed" - python ddp/main.py || error "ddp example failed" -} - function fast_neural_style() { start if [ ! -d "saved_models" ]; then @@ -223,9 +186,9 @@ function clean() { } function run_all() { - # cpp + # cpp moved to `run_cpp_examples.sh``` dcgan - distributed + # distributed moved to `run_distributed_examples.sh` fast_neural_style imagenet language_translation @@ -261,8 +224,8 @@ fi if [ "" == "$ERRORS" ]; then echo "Completed successfully with status $?" else - echo "Some examples failed:" - printf "$ERRORS" + echo "Some python examples failed:" + printf "$ERRORS\n" #Exit with error (0-255) in case of failure in one of the tests. 
 exit 1
diff --git a/utils.sh b/utils.sh
new file mode 100644
index 0000000000..b7ed613e6e
--- /dev/null
+++ b/utils.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# This script contains utility functions and initializes example scripts.
+# E.g.: run_python_examples.sh, run_distributed_examples.sh
+
+BASE_DIR="$(pwd)/$(dirname $0)"
+EXAMPLES=$(echo $1 | sed -e 's/ //g')
+
+# Redirect 'python' calls to 'python3'
+python() {
+  command python3 "$@"
+}
+
+ERRORS=${ERRORS-""}
+
+function error() {
+  ERR=$1
+  if [ "" == "$ERRORS" ]; then
+    ERRORS="$ERR"
+  else
+    ERRORS="$ERRORS\n$ERR"
+  fi
+}
+
+function install_deps() {
+  echo "installing requirements"
+  cat $BASE_DIR/*/requirements.txt | \
+    sort -u | \
+    # testing the installed version of torch, so don't pip install it.
+    grep -vE '^torch$' | \
+    pip install -r /dev/stdin || \
+    { error "failed to install dependencies"; exit 1; }
+}
+
+function start() {
+  EXAMPLE=${FUNCNAME[1]}
+  cd $BASE_DIR/$EXAMPLE
+  echo "Running example: $EXAMPLE"
+}