Commit

Merge branch 'main' into grpc-ra-tls
RodgerZhu committed Feb 17, 2022
2 parents a6e9dc8 + 5f807e0 commit 67d648f
Showing 8 changed files with 75 additions and 53 deletions.
26 changes: 18 additions & 8 deletions cczoo/horizontal_fl/README.md
@@ -66,32 +66,42 @@ Steps **②**-**⑥** will be repeated continuously during the training process.
- framework: TensorFlow 2.4.2
- model: ResNet-50
- dataset: Cifar-10
-- ps num: 2
+- ps num: 1
- worker num: 2
+- container num: 3

### Build Docker image

```shell
./build_docker_image.sh
```

-### Start container
+### Start containers and aesm services
+Start three containers (ps0, worker0, worker1) and aesm services.
```shell
-./start_container.sh <attestation ip addr>
+./start_container.sh <attestation ip addr> ps0
+/start_aesm_service.sh
+```
+```shell
+./start_container.sh <attestation ip addr> worker0
+/start_aesm_service.sh
```

-### Start aesm service
```shell
+./start_container.sh <attestation ip addr> worker1
/start_aesm_service.sh
```

### Run the training scripts
+Run the script for the corresponding job in each container.
```shell
cd hfl-tensorflow
-test-sgx.sh make
test-sgx.sh ps0
-test-sgx.sh ps1
```
```shell
cd hfl-tensorflow
test-sgx.sh worker0
```
```shell
cd hfl-tensorflow
test-sgx.sh worker1
```
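The invocations above rely on the new host-list flag defaults in train.py. test-sgx.sh (diffed below) also forwards two optional arguments to train.py, so the endpoints can be overridden per container — a hypothetical sketch with illustrative addresses, assuming the flag strings keep train.py's Python-list format:

```shell
# Hypothetical override of the default cluster endpoints (addresses are examples).
cd hfl-tensorflow
test-sgx.sh ps0 "--ps_hosts=['10.0.0.1:60002']" "--worker_hosts=['10.0.0.2:61002','10.0.0.3:61002']"
```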

2 changes: 1 addition & 1 deletion cczoo/horizontal_fl/hfl-tensorflow/Makefile
@@ -32,7 +32,7 @@ ifeq ($(SGX),1)
all: python.manifest.sgx python.sig python.token
endif

-################################ fedlearner MANIFEST ###############################
+################################ MANIFEST ###############################

python.manifest: python.manifest.template
gramine-manifest \
21 changes: 8 additions & 13 deletions cczoo/horizontal_fl/hfl-tensorflow/test-sgx.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
#
# Copyright (c) 2021 Intel Corporation
#
@@ -13,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-#!/bin/bash
set -ex

shopt -s expand_aliases
@@ -48,33 +48,28 @@ function make_custom_env() {
}

ROLE=$1
+PS_HOSTS=$2
+WORKER_HOSTS=$3

if [ "$ROLE" == "make" ]; then
rm -rf model *.log
make clean && make | make_logfilter
kill -9 `pgrep -f gramine`
elif [ "$ROLE" == "ps0" ]; then
make_custom_env
-taskset -c 0-3 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=ps --loglevel=debug 2>&1 | runtime_logfilter | tee -a ps0-gramine-python.log &
+taskset -c 0-3 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=ps $PS_HOSTS $WORKER_HOSTS 2>&1 | runtime_logfilter | tee -a ps0-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi
elif [ "$ROLE" == "ps1" ]; then
make_custom_env
taskset -c 4-7 stdbuf -o0 gramine-sgx python -u train.py --task_index=1 --job_name=ps --loglevel=debug 2>&1 | runtime_logfilter | tee -a ps1-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi

elif [ "$ROLE" == "worker0" ]; then
make_custom_env
-taskset -c 8-11 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=worker --loglevel=debug 2>&1 | runtime_logfilter | tee -a worker0-gramine-python.log &
+taskset -c 8-11 stdbuf -o0 gramine-sgx python -u train.py --task_index=0 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | runtime_logfilter | tee -a worker0-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi
elif [ "$ROLE" == "worker1" ]; then
make_custom_env
-taskset -c 11-15 stdbuf -o0 gramine-sgx python -u train.py --task_index=1 --job_name=worker --loglevel=debug 2>&1 | runtime_logfilter | tee -a worker1-gramine-python.log &
+taskset -c 11-15 stdbuf -o0 gramine-sgx python -u train.py --task_index=1 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | runtime_logfilter | tee -a worker1-gramine-python.log &
if [ "$DEBUG" != "0" ]; then
wait && kill -9 `pgrep -f gramine`
fi
fi
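
test-sgx.sh blocks on each launched trainer and then kills leftover gramine processes unless DEBUG=0 is set, which skips the wait/kill branch. A hypothetical non-blocking launch (log file name taken from the tee target above):

```shell
# Hypothetical: start ps0 and return immediately; DEBUG=0 bypasses the
# wait/kill cleanup branch, leaving the trainer running in the background.
cd hfl-tensorflow
DEBUG=0 ./test-sgx.sh ps0
tail -f ps0-gramine-python.log
```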

23 changes: 7 additions & 16 deletions cczoo/horizontal_fl/hfl-tensorflow/test.sh
@@ -16,43 +16,34 @@
#!/bin/bash
set -ex

-function get_env() {
-    gramine-sgx-get-token -s python.sig -o /dev/null | grep $1 | awk -F ":" '{print $2}' | xargs
-}

function make_custom_env() {
export CUDA_VISIBLE_DEVICES=""
export DNNL_VERBOSE=1
export GRPC_VERBOSITY=ERROR
export TF_CPP_MIN_LOG_LEVEL=1
-# export TF_GRPC_SGX_RA_TLS_ENABLE=""
-export TF_GRPC_SGX_RA_TLS_ENABLE=on
+export TF_GRPC_SGX_RA_TLS_ENABLE=""
export TF_DISABLE_MKL=0
export TF_ENABLE_MKL_NATIVE_FORMAT=1
export parallel_num_threads=4
export INTRA_OP_PARALLELISM_THREADS=$parallel_num_threads
export INTER_OP_PARALLELISM_THREADS=$parallel_num_threads
export KMP_SETTINGS=1
export KMP_BLOCKTIME=0
-export MR_ENCLAVE=`get_env mr_enclave`
-export MR_SIGNER=`get_env mr_signer`
-export ISV_PROD_ID=`get_env isv_prod_id`
-export ISV_SVN=`get_env isv_svn`
# network proxy
unset http_proxy https_proxy
}

ROLE=$1
+PS_HOSTS=$2
+WORKER_HOSTS=$3
if [ "$ROLE" == "ps0" ]; then
make_custom_env
-taskset -c 0-3 stdbuf -o0 python -u train.py --task_index=0 --job_name=ps --loglevel=debug 2>&1 | tee -a ps0-python.log &
-if [ "$ROLE" == "ps1" ]; then
-make_custom_env
-taskset -c 4-7 stdbuf -o0 python -u train.py --task_index=1 --job_name=ps --loglevel=debug 2>&1 | tee -a ps1-python.log &
+taskset -c 0-3 stdbuf -o0 python -u train.py --task_index=0 --job_name=ps 2>&1 $PS_HOSTS $WORKER_HOSTS | tee -a ps0-python.log &
elif [ "$ROLE" == "worker0" ]; then
make_custom_env
-taskset -c 8-11 stdbuf -o0 python -u train.py --task_index=0 --job_name=worker --loglevel=debug 2>&1 | tee -a worker0-python.log &
+taskset -c 8-11 stdbuf -o0 python -u train.py --task_index=0 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | tee -a worker0-python.log &
elif [ "$ROLE" == "worker1" ]; then
make_custom_env
-taskset -c 12-15 stdbuf -o0 python -u train.py --task_index=1 --job_name=worker --loglevel=debug 2>&1 | tee -a worker1-python.log &
+taskset -c 12-15 stdbuf -o0 python -u train.py --task_index=1 --job_name=worker $PS_HOSTS $WORKER_HOSTS 2>&1 | tee -a worker1-python.log &
fi
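
test.sh runs the same roles outside Gramine. A hypothetical single-host smoke test against the default localhost endpoints (each call backgrounds its trainer, so the three roles run concurrently):

```shell
# Hypothetical non-SGX smoke test; train.py's flag defaults put ps on
# localhost:60002 and workers on localhost:61002/61003.
cd hfl-tensorflow
./test.sh ps0
./test.sh worker0
./test.sh worker1
```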

13 changes: 9 additions & 4 deletions cczoo/horizontal_fl/hfl-tensorflow/train.py
@@ -24,15 +24,20 @@
tf.disable_eager_execution()

# Configuration of cluster
-ps_hosts = [ "localhost:60002", "localhost:60003"]
-worker_hosts = [ "localhost:61002", "localhost:61003"]

-tf.app.flags.DEFINE_string("job_name", "worker", "One of 'ps', 'worker'")
+tf.app.flags.DEFINE_string("job_name", "worker", "'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
tf.app.flags.DEFINE_string("ps_hosts", "['localhost:60002']", "ps hosts")
tf.app.flags.DEFINE_string("worker_hosts", "['localhost:61002','localhost:61003']", "worker hosts")

FLAGS = tf.app.flags.FLAGS

ps_hosts = eval(FLAGS.ps_hosts)
worker_hosts = eval(FLAGS.worker_hosts)

+# cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

-FLAGS = tf.app.flags.FLAGS

def get_batch(x_train, y_train, batch_size):
# num_epochs = 128
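train.py now builds the cluster spec from flag strings via eval. As a defensive alternative (a sketch, not code from the repo), ast.literal_eval accepts the same list-literal strings while rejecting arbitrary expressions:

```python
# Sketch only: stricter parsing of the same flag format used by train.py.
import ast

def parse_hosts(flag_value):
    """Parse a string like "['localhost:60002']" into a list of host:port strings."""
    hosts = ast.literal_eval(flag_value)
    if not isinstance(hosts, list) or not all(isinstance(h, str) for h in hosts):
        raise ValueError("expected a Python-list string of 'host:port' entries")
    return hosts

ps_hosts = parse_hosts("['localhost:60002']")
worker_hosts = parse_hosts("['localhost:61002','localhost:61003']")
```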
5 changes: 4 additions & 1 deletion cczoo/horizontal_fl/horizontal_fl.dockerfile
@@ -60,7 +60,7 @@ ENV LC_ALL=C.UTF-8 LANG=C.UTF-8
ENV WERROR=1
ENV SGX=1

-RUN apt-get install -y gawk bison python3-click python3-jinja2 golang ninja-build python3
+RUN apt-get install -y gawk bison python3-click python3-jinja2 golang ninja-build
RUN apt-get install -y libcurl4-openssl-dev libprotobuf-c-dev python3-protobuf protobuf-c-compiler
RUN apt-get install -y libgmp-dev libmpfr-dev libmpc-dev libisl-dev

@@ -139,6 +139,9 @@ RUN cd /hfl-tensorflow && wget https://www.cs.toronto.edu/~kriz/cifar-10-binary.
RUN echo "enabled=0" > /etc/default/apport
RUN echo "exit 0" > /usr/sbin/policy-rc.d

+# make project
+RUN cd /hfl-tensorflow && test-sgx.sh make

# Clean tmp files
RUN apt-get clean all \
&& rm -rf /var/lib/apt/lists/* \
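The new build-time `test-sgx.sh make` step bakes the Gramine manifest and signature into the image, so containers no longer have to build them at run time. A hypothetical post-build check (the image name is a placeholder for whatever build_docker_image.sh tags):

```shell
# Hypothetical: rebuild the image, then confirm the pre-built Gramine artifacts.
./build_docker_image.sh
docker run --rm horizontal_fl:latest ls /hfl-tensorflow/python.manifest.sgx /hfl-tensorflow/python.sig
```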
12 changes: 10 additions & 2 deletions cczoo/horizontal_fl/start_container.sh
@@ -22,10 +22,16 @@ else
ip_addr=127.0.0.1
fi

-if [ ! -n "$2" ] ; then
+if [ -n "$2" ] ; then
+name=$2
+else
+name=ps0
+fi
+
+if [ ! -n "$3" ] ; then
tag=latest
else
-tag=$2
+tag=$3
fi

docker run -it \
@@ -34,8 +40,10 @@ docker run -it \
--security-opt seccomp=unconfined \
--device=/dev/sgx_enclave:/dev/sgx/enclave \
--device=/dev/sgx_provision:/dev/sgx/provision \
+--name=${name} \
-v /var/run/aesmd/aesm:/var/run/aesmd/aesm \
-v /home:/home/host-home \
+--net=host \
--add-host=pa.com:127.0.0.1 \
--add-host=pb.com:127.0.0.1 \
--add-host=attestation.service.com:${ip_addr} \
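start_container.sh now takes an optional container name (defaulting to ps0) and image tag (defaulting to latest) after the attestation address. A hypothetical launch of the worker0 container, with an illustrative attestation address:

```shell
# Hypothetical: attestation service at 192.168.1.10; both optional
# arguments (container name, image tag) given explicitly.
./start_container.sh 192.168.1.10 worker0 latest
```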
26 changes: 18 additions & 8 deletions (second copy of the README; file path not captured in this view)
@@ -64,32 +64,42 @@ Steps **②**-**⑥** will be repeated continuously during the training process.
- framework: TensorFlow 2.4.2
- model: ResNet-50
- dataset: Cifar-10
-- ps num: 2
+- ps num: 1
- worker num: 2
+- container num: 3

### Build Docker image

```shell
./build_docker_image.sh
```

-### Start container
+### Start containers and aesm services
+Start three containers (ps0, worker0, worker1) and aesm services.
```shell
-./start_container.sh <attestation ip addr>
+./start_container.sh <attestation ip addr> ps0
+/start_aesm_service.sh
+```
+```shell
+./start_container.sh <attestation ip addr> worker0
+/start_aesm_service.sh
```

-### Start aesm service
```shell
+./start_container.sh <attestation ip addr> worker1
/start_aesm_service.sh
```

### Run the training scripts
+Run the script for the corresponding job in each container.
```shell
cd hfl-tensorflow
-test-sgx.sh make
test-sgx.sh ps0
-test-sgx.sh ps1
```
```shell
cd hfl-tensorflow
test-sgx.sh worker0
```
```shell
cd hfl-tensorflow
test-sgx.sh worker1
```

