Commit

Merge branch 'main' into grpc-ra-tls
RodgerZhu committed Feb 17, 2022
2 parents a6e9dc8 + 5f807e0 commit 67d648f
Showing 8 changed files with 75 additions and 53 deletions.
26 changes: 18 additions & 8 deletions cczoo/horizontal_fl/README.md
@@ -66,32 +66,42 @@ Steps **②**-**⑥** will be repeated continuously during the training process.
- framework: TensorFlow 2.4.2
- model: ResNet-50
- dataset: Cifar-10
-- ps num: 2
+- ps num: 1
- worker num: 2
+- container num: 3

### Build Docker image

```shell
./build_docker_image.sh
```

-### Start container
+### Start containers and aesm services
+Start three containers (ps0, worker0, worker1) and aesm services.
```shell
-./start_container.sh <attestation ip addr>
+./start_container.sh <attestation ip addr> ps0
+/start_aesm_service.sh
+```
+```shell
+./start_container.sh <attestation ip addr> worker0
+/start_aesm_service.sh
```

-### Start aesm service
```shell
+./start_container.sh <attestation ip addr> worker1
/start_aesm_service.sh
```

### Run the training scripts
+Run the script for the corresponding job in each container.
```shell
cd hfl-tensorflow
-test-sgx.sh make
test-sgx.sh ps0
-test-sgx.sh ps1
```
```shell
cd hfl-tensorflow
test-sgx.sh worker0
```
```shell
cd hfl-tensorflow
test-sgx.sh worker1
```
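The invocations above rely on the new host-list flag defaults in train.py. test-sgx.sh (diffed below) also forwards two optional arguments to train.py, so the endpoints can be overridden per container — a hypothetical sketch with illustrative addresses, assuming the flag strings keep train.py's Python-list format:

```shell
# Hypothetical override of the default cluster endpoints (addresses are examples).
cd hfl-tensorflow
test-sgx.sh ps0 "--ps_hosts=['10.0.0.1:60002']" "--worker_hosts=['10.0.0.2:61002','10.0.0.3:61002']"
```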

2 changes: 1 addition & 1 deletion cczoo/horizontal_fl/hfl-tensorflow/Makefile
@@ -32,7 +32,7 @@ ifeq ($(SGX),1)
all: python.manifest.sgx python.sig python.token
endif

-################################ fedlearner MANIFEST ###############################
+################################ MANIFEST ###############################

python.manifest: python.manifest.template
gramine-manifest \
21 changes: 8 additions & 13 deletions cczoo/horizontal_fl/hfl-tensorflow/test-sgx.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
#
# Copyright (c) 2021 Intel Corporation
#
@@ -13,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-#!/bin/bash
set -ex

shopt -s expand_aliases
@@ -48,33 +48,28 @@ function make_custom_env() {
}

ROLE=$1
+PS_HOSTS=$2
+WORKER_HOSTS=$3

if [ "$ROLE" == "make" ]; then
rm -rf model *.log
make clean && make | make_logfilter
kill -9 `pgrep -f gramine`
elif [ "$ROLE" == "ps0" ]; then
make_custom_env
-taskset -c 0-3 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=ps --loglevel=debug 2>&1 | runtime_logfilter | tee -a ps0-gramine-python.log &
+taskset -c 0-3 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=ps $PS_HOSTS $WORKER_HOSTS 2>&1 | runtime_logfilter | tee -a ps0-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi
elif [ "$ROLE" == "ps1" ]; then
make_custom_env
taskset -c 4-7 stdbuf -o0 gramine-sgx python -u train.py --task_index=1 --job_name=ps --loglevel=debug 2>&1 | runtime_logfilter | tee -a ps1-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi

elif [ "$ROLE" == "worker0" ]; then
make_custom_env
-taskset -c 8-11 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=worker --loglevel=debug 2>&1 | runtime_logfilter | tee -a worker0-gramine-python.log &
+taskset -c 8-11 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | runtime_logfilter | tee -a worker0-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi
elif [ "$ROLE" == "worker1" ]; then
make_custom_env
-taskset -c 11-15 stdbuf -o0 gramine-sgx python -u train.py --task_index=1 --job_name=worker --loglevel=debug 2>&1 | runtime_logfilter | tee -a worker1-gramine-python.log &
+taskset -c 11-15 stdbuf -o0 gramine-sgx python -u train.py --task_index=1 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | runtime_logfilter | tee -a worker1-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi
fi
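
test-sgx.sh blocks on each launched trainer and then kills leftover gramine processes unless DEBUG=0 is set, which skips the wait/kill branch. A hypothetical non-blocking launch (log file name taken from the tee target above):

```shell
# Hypothetical: start ps0 and return immediately; DEBUG=0 bypasses the
# wait/kill cleanup branch, leaving the trainer running in the background.
cd hfl-tensorflow
DEBUG=0 ./test-sgx.sh ps0
tail -f ps0-gramine-python.log
```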

23 changes: 7 additions & 16 deletions cczoo/horizontal_fl/hfl-tensorflow/test.sh
@@ -16,43 +16,34 @@
#!/bin/bash
set -ex

-function get_env() {
-    gramine-sgx-get-token -s python.sig -o /dev/null | grep $1 | awk -F ":" '{print $2}' | xargs
-}

function make_custom_env() {
export CUDA_VISIBLE_DEVICES=""
export DNNL_VERBOSE=1
export GRPC_VERBOSITY=ERROR
export TF_CPP_MIN_LOG_LEVEL=1
-# export TF_GRPC_SGX_RA_TLS_ENABLE=""
-export TF_GRPC_SGX_RA_TLS_ENABLE=on
+export TF_GRPC_SGX_RA_TLS_ENABLE=""
export TF_DISABLE_MKL=0
export TF_ENABLE_MKL_NATIVE_FORMAT=1
export parallel_num_threads=4
export INTRA_OP_PARALLELISM_THREADS=$parallel_num_threads
export INTER_OP_PARALLELISM_THREADS=$parallel_num_threads
export KMP_SETTINGS=1
export KMP_BLOCKTIME=0
-export MR_ENCLAVE=`get_env mr_enclave`
-export MR_SIGNER=`get_env mr_signer`
-export ISV_PROD_ID=`get_env isv_prod_id`
-export ISV_SVN=`get_env isv_svn`
# network proxy
unset http_proxy https_proxy
}

ROLE=$1
+PS_HOSTS=$2
+WORKER_HOSTS=$3
if [ "$ROLE" == "ps0" ]; then
make_custom_env
-taskset -c 0-3 stdbuf -o0 python -u train.py --task_index=0 --job_name=ps --loglevel=debug 2>&1 | tee -a ps0-python.log &
-if [ "$ROLE" == "ps1" ]; then
-make_custom_env
-taskset -c 4-7 stdbuf -o0 python -u train.py --task_index=1 --job_name=ps --loglevel=debug 2>&1 | tee -a ps1-python.log &
+taskset -c 0-3 stdbuf -o0 python -u train.py --task_index=0 --job_name=ps 2>&1 $PS_HOSTS $WORKER_HOSTS | tee -a ps0-python.log &
elif [ "$ROLE" == "worker0" ]; then
make_custom_env
-taskset -c 8-11 stdbuf -o0 python -u train.py --task_index=0 --job_name=worker --loglevel=debug 2>&1 | tee -a worker0-python.log &
+taskset -c 8-11 stdbuf -o0 python -u train.py --task_index=0 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | tee -a worker0-python.log &
elif [ "$ROLE" == "worker1" ]; then
make_custom_env
-taskset -c 12-15 stdbuf -o0 python -u train.py --task_index=1 --job_name=worker --loglevel=debug 2>&1 | tee -a worker1-python.log &
+taskset -c 12-15 stdbuf -o0 python -u train.py --task_index=1 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | tee -a worker1-python.log &
fi
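
test.sh runs the same roles outside Gramine. A hypothetical single-host smoke test against the default localhost endpoints (each call backgrounds its trainer, so the three roles run concurrently):

```shell
# Hypothetical non-SGX smoke test; train.py's flag defaults put ps on
# localhost:60002 and workers on localhost:61002/61003.
cd hfl-tensorflow
./test.sh ps0
./test.sh worker0
./test.sh worker1
```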

13 changes: 9 additions & 4 deletions cczoo/horizontal_fl/hfl-tensorflow/train.py
@@ -24,15 +24,20 @@
tf.disable_eager_execution()

# Configuration of cluster
-ps_hosts = [ "localhost:60002", "localhost:60003"]
-worker_hosts = [ "localhost:61002", "localhost:61003"]

-tf.app.flags.DEFINE_string("job_name", "worker", "One of 'ps', 'worker'")
+tf.app.flags.DEFINE_string("job_name", "worker", "'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
tf.app.flags.DEFINE_string("ps_hosts", "['localhost:60002']", "ps hosts")
tf.app.flags.DEFINE_string("worker_hosts", "['localhost:61002','localhost:61003']", "worker hosts")

FLAGS = tf.app.flags.FLAGS

ps_hosts = eval(FLAGS.ps_hosts)
worker_hosts = eval(FLAGS.worker_hosts)

+# cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

-FLAGS = tf.app.flags.FLAGS

def get_batch(x_train, y_train, batch_size):
# num_epochs = 128
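train.py now builds the cluster spec from flag strings via eval. As a defensive alternative (a sketch, not code from the repo), ast.literal_eval accepts the same list-literal strings while rejecting arbitrary expressions:

```python
# Sketch only: stricter parsing of the same flag format used by train.py.
import ast

def parse_hosts(flag_value):
    """Parse a string like "['localhost:60002']" into a list of host:port strings."""
    hosts = ast.literal_eval(flag_value)
    if not isinstance(hosts, list) or not all(isinstance(h, str) for h in hosts):
        raise ValueError("expected a Python-list string of 'host:port' entries")
    return hosts

ps_hosts = parse_hosts("['localhost:60002']")
worker_hosts = parse_hosts("['localhost:61002','localhost:61003']")
```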
5 changes: 4 additions & 1 deletion cczoo/horizontal_fl/horizontal_fl.dockerfile
@@ -60,7 +60,7 @@ ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
ENV WERROR=1
ENV SGX=1

-RUN apt-get install -y gawk bison python3-click python3-jinja2 golang ninja-build python3
+RUN apt-get install -y gawk bison python3-click python3-jinja2 golang ninja-build
RUN apt-get install -y libcurl4-openssl-dev libprotobuf-c-dev python3-protobuf protobuf-c-compiler
RUN apt-get install -y libgmp-dev libmpfr-dev libmpc-dev libisl-dev

@@ -139,6 +139,9 @@ RUN cd /hfl-tensorflow && wget https://www.cs.toronto.edu/~kriz/cifar-10-binary.
RUN echo "enabled=0" > /etc/default/apport
RUN echo "exit 0" > /usr/sbin/policy-rc.d

+# make project
+RUN cd /hfl-tensorflow && test-sgx.sh make

# Clean tmp files
RUN apt-get clean all \
&& rm -rf /var/lib/apt/lists/* \
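The new build-time `test-sgx.sh make` step bakes the Gramine manifest and signature into the image, so containers no longer have to build them at run time. A hypothetical post-build check (the image name is a placeholder for whatever build_docker_image.sh tags):

```shell
# Hypothetical: rebuild the image, then confirm the pre-built Gramine artifacts.
./build_docker_image.sh
docker run --rm horizontal_fl:latest ls /hfl-tensorflow/python.manifest.sgx /hfl-tensorflow/python.sig
```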
12 changes: 10 additions & 2 deletions cczoo/horizontal_fl/start_container.sh
@@ -22,10 +22,16 @@ else
ip_addr=127.0.0.1
fi

-if [ ! -n "$2" ] ; then
+if [ -n "$2" ] ; then
+name=$2
+else
+name=ps0
+fi
+
+if [ ! -n "$3" ] ; then
tag=latest
else
-tag=$2
+tag=$3
fi

docker run -it \
@@ -34,8 +40,10 @@ docker run -it \
--security-opt seccomp=unconfined \
--device=/dev/sgx_enclave:/dev/sgx/enclave \
--device=/dev/sgx_provision:/dev/sgx/provision \
+--name=${name} \
-v /var/run/aesmd/aesm:/var/run/aesmd/aesm \
-v /home:/home/host-home \
+--net=host \
--add-host=pa.com:127.0.0.1 \
--add-host=pb.com:127.0.0.1 \
--add-host=attestation.service.com:${ip_addr} \
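start_container.sh now takes an optional container name (defaulting to ps0) and image tag (defaulting to latest) after the attestation address. A hypothetical launch of the worker0 container, with an illustrative attestation address:

```shell
# Hypothetical: attestation service at 192.168.1.10; both optional
# arguments (container name, image tag) given explicitly.
./start_container.sh 192.168.1.10 worker0 latest
```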
26 changes: 18 additions & 8 deletions (second copy of the README; file path not captured in this view)
@@ -64,32 +64,42 @@ Steps **②**-**⑥** will be repeated continuously during the training process.
- framework: TensorFlow 2.4.2
- model: ResNet-50
- dataset: Cifar-10
-- ps num: 2
+- ps num: 1
- worker num: 2
+- container num: 3

### Build Docker image

```shell
./build_docker_image.sh
```

-### Start container
+### Start containers and aesm services
+Start three containers (ps0, worker0, worker1) and aesm services.
```shell
-./start_container.sh <attestation ip addr>
+./start_container.sh <attestation ip addr> ps0
+/start_aesm_service.sh
+```
+```shell
+./start_container.sh <attestation ip addr> worker0
+/start_aesm_service.sh
```

-### Start aesm service
```shell
+./start_container.sh <attestation ip addr> worker1
/start_aesm_service.sh
```

### Run the training scripts
+Run the script for the corresponding job in each container.
```shell
cd hfl-tensorflow
-test-sgx.sh make
test-sgx.sh ps0
-test-sgx.sh ps1
```
```shell
cd hfl-tensorflow
test-sgx.sh worker0
```
```shell
cd hfl-tensorflow
test-sgx.sh worker1
```

