Skip to content

Commit

Permalink
Update distributed example tests in run_python_examples.sh (#1250)
Browse files Browse the repository at this point in the history
* Fix distributed test

* fix parallel scripts

* install dill

* remove dill

* run 2 gpu

* remove gpucount, use default

* Add examples to distributed examples

* refactor distributed test

* fix ERRORS overwriting

* run with base dir

* remove distributed from run_python_examples.sh

* move basedir to source

* separate init

---------

Co-authored-by: Sirut Buasai <[email protected]>
  • Loading branch information
sirutBuasai and Sirut Buasai authored May 3, 2024
1 parent 911816c commit c49554c
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 77 deletions.
42 changes: 8 additions & 34 deletions run_distributed_examples.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
# Expects pytorch, torchvision to be installed.

BASE_DIR=`pwd`"/"`dirname $0`
EXAMPLES=`echo $1 | sed -e 's/ //g'`

# Redirect 'python' calls to 'python3'
python() {
command python3 "$@"
}
BASE_DIR="$(pwd)/$(dirname $0)"
source $BASE_DIR/utils.sh

USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())")
case $USE_CUDA in
Expand All @@ -35,33 +30,12 @@ case $USE_CUDA in
;;
esac

ERRORS=""

function error() {
ERR=$1
ERRORS="$ERRORS\n$ERR"
echo $ERR
}

function install_deps() {
echo "installing requirements"
cat $BASE_DIR/*/requirements.txt | \
sort -u | \
# testing the installed version of torch, so don't pip install it.
grep -vE '^torch$' | \
pip install -r /dev/stdin || \
{ error "failed to install dependencies"; exit 1; }
}

function start() {
EXAMPLE=${FUNCNAME[1]}
cd $BASE_DIR/$EXAMPLE
echo "Running example: $EXAMPLE"
}

function distributed() {
start
torchrun --standalone --nnodes=1 --nproc_per_node=4 tensor_parallelism/fsdp_tp_example.py
bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
python ddp/main.py || error "ddp example failed"
}

function clean() {
Expand All @@ -88,8 +62,8 @@ fi
if [ "" == "$ERRORS" ]; then
echo "Completed successfully with status $?"
else
echo "Some examples failed:"
printf "$ERRORS"
echo "Some distributed examples failed:"
printf "$ERRORS\n"
#Exit with error (0-255) in case of failure in one of the tests.
exit 1

Expand Down
49 changes: 6 additions & 43 deletions run_python_examples.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
# Expects pytorch, torchvision to be installed.

BASE_DIR=`pwd`"/"`dirname $0`
EXAMPLES=`echo $1 | sed -e 's/ //g'`

# Redirect 'python' calls to 'python3'
python() {
command python3 "$@"
}
BASE_DIR="$(pwd)/$(dirname $0)"
source $BASE_DIR/utils.sh

USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
case $USE_CUDA in
Expand All @@ -35,43 +30,11 @@ case $USE_CUDA in
;;
esac

ERRORS=""

function error() {
ERR=$1
ERRORS="$ERRORS\n$ERR"
echo $ERR
}

function install_deps() {
echo "installing requirements"
cat $BASE_DIR/*/requirements.txt | \
sort -u | \
# testing the installed version of torch, so don't pip install it.
grep -vE '^torch$' | \
pip install -r /dev/stdin || \
{ error "failed to install dependencies"; exit 1; }
}

function start() {
EXAMPLE=${FUNCNAME[1]}
cd $BASE_DIR/$EXAMPLE
echo "Running example: $EXAMPLE"
}

# Run the dcgan example once via main.py with --dataset fake and --dry-run,
# recording "dcgan failed" through error() if the command exits non-zero.
function dcgan() {
  start
  # $CUDA_FLAG stays unquoted on purpose: if it is empty it must contribute
  # no argument at all rather than an empty string.
  if ! python main.py --dataset fake $CUDA_FLAG --mps --dry-run; then
    error "dcgan failed"
  fi
}

function distributed() {
start
python tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
python tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
python tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
python ddp/main.py || error "ddp example failed"
}

function fast_neural_style() {
start
if [ ! -d "saved_models" ]; then
Expand Down Expand Up @@ -223,9 +186,9 @@ function clean() {
}

function run_all() {
# cpp
# cpp moved to `run_cpp_examples.sh`
dcgan
distributed
# distributed moved to `run_distributed_examples.sh`
fast_neural_style
imagenet
language_translation
Expand Down Expand Up @@ -261,8 +224,8 @@ fi
if [ "" == "$ERRORS" ]; then
echo "Completed successfully with status $?"
else
echo "Some examples failed:"
printf "$ERRORS"
echo "Some python examples failed:"
printf "$ERRORS\n"
#Exit with error (0-255) in case of failure in one of the tests.
exit 1

Expand Down
38 changes: 38 additions & 0 deletions utils.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# This script contains utility functions and initializes the example scripts.
# E.g.: run_python_examples.sh, run_distributed_examples.sh

# Absolute directory of the invoked script ($0). It is resolved with
# `cd ... && pwd` so the value is valid whether the script was started through
# a relative or an absolute path; prefixing $(pwd) unconditionally produces a
# nonexistent path such as "/cwd//abs/dir" in the absolute case.
# CDPATH is cleared so `cd` cannot jump to (and print) an unrelated directory.
BASE_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" > /dev/null 2>&1 && pwd)" ||
  BASE_DIR="$(pwd)/$(dirname -- "$0")" # fallback: previous best-effort value

# Example selection taken from the first positional parameter with all
# whitespace stripped. Parameter expansion is used instead of `echo $1 | sed`
# so the value is never glob-expanded or mistaken for an echo option.
EXAMPLES="${1-}"
EXAMPLES="${EXAMPLES//[[:space:]]/}"

# Shadow the `python` command with a shell function that forwards every
# argument unchanged to the `python3` executable. `command` makes the lookup
# skip shell functions, so python3 itself is never intercepted.
function python() {
  command python3 "$@"
}

# Accumulated failure messages. A value that is already set (even to the empty
# string) is kept; otherwise the variable is initialised to empty.
: "${ERRORS=}"

# Record a failure message without printing it.
# Globals:   ERR (written), ERRORS (read and written)
# Arguments: $1 - message to record
# Messages are joined with a literal backslash-n (two characters), not a real
# newline; the escape is only expanded when ERRORS is later used as a printf
# format string.
function error() {
  ERR=$1
  if [[ -n "$ERRORS" ]]; then
    ERRORS+="\n$ERR"
  else
    ERRORS=$ERR
  fi
}

#######################################
# Install the merged, de-duplicated requirements of every
# $BASE_DIR/*/requirements.txt with pip.
# Globals:   BASE_DIR (read)
# Outputs:   progress line on stdout; failure message on stderr
# Returns:   exits the shell with status 1 if the install pipeline fails
#######################################
function install_deps() {
  echo "installing requirements"
  # BASE_DIR is quoted so a checkout path containing spaces still works; only
  # the */requirements.txt suffix is left for the shell to glob.
  # A bare "torch" line is dropped: the torch already installed is the one
  # under test, so pip must not replace it.
  sort -u -- "$BASE_DIR"/*/requirements.txt |
    grep -vE '^torch$' |
    pip install -r /dev/stdin || {
    # error() only records the message and we exit immediately, so the final
    # summary is never reached; print the reason here as well.
    error "failed to install dependencies"
    echo "failed to install dependencies" >&2
    exit 1
  }
}

#######################################
# Enter the directory of the example being run and announce it.
# The example name is the name of the calling function (FUNCNAME[1]), so this
# must be called directly from the example's own function.
# Globals:   BASE_DIR (read), EXAMPLE (written)
# Outputs:   "Running example: <name>" on stdout; a diagnostic on stderr if
#            the directory cannot be entered
# Returns:   0 on success, 1 if the cd fails
#######################################
function start() {
  EXAMPLE=${FUNCNAME[1]}
  echo "Running example: $EXAMPLE"
  # Quoted so a BASE_DIR containing spaces is passed to cd as one argument.
  cd -- "$BASE_DIR/$EXAMPLE" || {
    echo "start: cannot enter example directory: $BASE_DIR/$EXAMPLE" >&2
    return 1
  }
}

0 comments on commit c49554c

Please sign in to comment.