From f49082d705337bfc301ff71239cb942be5c1655b Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Fri, 22 Feb 2019 06:40:14 -0500 Subject: [PATCH 01/13] Global Variables Integrated --- scripts/cluster/yarn_cluster_setup/README.md | 4 +- .../install_yarn_master_slave.sh | 40 +++++++++++++++---- .../yarn_cluster_setup/start_yarn_cluster.sh | 22 ++++++---- .../yarn_cluster_setup/stop_yarn_cluster.sh | 8 ++-- .../test_scripts/run_spark_test_job_pi.sh | 8 ++-- 5 files changed, 59 insertions(+), 23 deletions(-) diff --git a/scripts/cluster/yarn_cluster_setup/README.md b/scripts/cluster/yarn_cluster_setup/README.md index 5e7b1fc..bc02d71 100644 --- a/scripts/cluster/yarn_cluster_setup/README.md +++ b/scripts/cluster/yarn_cluster_setup/README.md @@ -15,7 +15,7 @@ Specify the hostnames of nodes as arguments. sudo bash install_yarn_cluster.sh master,worker1,worker2 ... ``` -# Start Yarn Spark Cluster +# Start Yarn Spark Cluster and Run Spark Job on Master Cluster can only be started on master node after installation is complete on all nodes and configuration files for Yarn and Spark are placed in correct folders. ``` sudo bash start_yarn_cluster.sh @@ -56,4 +56,4 @@ http://:19888/ cd .. sudo bash stop_yarn_job.sh ``` -After running this command, the web interfaces will not work. +After running this command, the web interfaces will not work. \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh index ae806b0..1ed3c47 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh @@ -7,7 +7,8 @@ HADOOP_SYMLINK=/usr/local/hadoop HADOOP_CONFIG_LOCATION=${HADOOP_HOME_INFILE}etc/hadoop/ HADOOP_VERSION=2.9.2 HADOOP_WEB_SOURCE=https://www-us.apache.org/dist/hadoop/common/ -GLOBAL_VARIABLES_SOURCE=/etc/environment +ROOT_VARIABLES_ADDRESS=/root/.bashrc +USER_VARIABLES_ADDRESS=~/.bashrc # Install Pre-Reqs apt-get update -y @@ -17,6 +18,22 @@ apt-get install -y python default-jdk wget unlink ${HADOOP_SYMLINK} && rm -rf ${HADOOP_DATA} rm -rf /usr/local/hadoop-*/ +# Remove Global Variables +sed -i /JAVA_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /HADOOP_HOME/d $ROOT_VARIABLES_ADDRESS +sed -i /hadoop/d $ROOT_VARIABLES_ADDRESS && sed -i /default-java/d $ROOT_VARIABLES_ADDRESS + +# Make Hadoop Global Variables for User and Root +echo "export JAVA_HOME="$JAVA_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS +echo "export PATH=$PATH:"$JAVA_HOME_INFILE"bin/:"$JAVA_HOME_INFILE"sbin/" >> $ROOT_VARIABLES_ADDRESS +echo "export HADOOP_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS +echo "export HADOOP_MAPRED_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS +echo "export HADOOP_COMMON_HOME="$HADOOP_HOME_INFILE>> $ROOT_VARIABLES_ADDRESS +echo "export HADOOP_HDFS_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS +echo "export YARN_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS +echo "export HADOOP_COMMON_LIB_NATIVE_DIR="$HADOOP_HOME_INFILE"lib/native" >> $ROOT_VARIABLES_ADDRESS +echo "export PATH=$PATH:"$HADOOP_HOME_INFILE"bin/:"$HADOOP_HOME_INFILE"sbin/" >> $ROOT_VARIABLES_ADDRESS +source $ROOT_VARIABLES_ADDRESS + # Make Data Directories for Hadoop mkdir -p ${HADOOP_DATA}name mkdir -p ${HADOOP_DATA}data @@ -28,20 +45,18 @@ current_directory=`pwd` if [ ! -f "${current_directory}/hadoop-$HADOOP_VERSION.tar.gz" ]; then echo "Downloading Hadoop ${HADOOP_VERSION} ..." sudo wget ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz - # wget ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -P /hadoop-${HADOOP_VERSION}.tar.gz - # sudo curl ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz > /hadoop-${HADOOP_VERSION}.tar.gz echo "Download of Hadoop ${HADOOP_VERSION} Successful!" fi # Unzip and Install Hadoop Tar tar -xzf $current_directory/hadoop-$HADOOP_VERSION.tar.gz -C /usr/local/ -# tar -xzf /hadoop-$HADOOP_VERSION.tar.gz -C /usr/local/ + +rm $current_directory/hadoop-$HADOOP_VERSION.tar.gz # Make Symbolic link ln -s /usr/local/hadoop-$HADOOP_VERSION/ $HADOOP_SYMLINK -# Copy Config Files - +# Copy Hadoop Config Files cp -a $current_directory/configs/hadoop/. $HADOOP_CONFIG_LOCATION cp $current_directory/configs/master $HADOOP_CONFIG_LOCATION cp $current_directory/configs/slaves $HADOOP_CONFIG_LOCATION @@ -52,10 +67,21 @@ SPARK_HOME_INFILE=`cd ${current_directory}/../../../.. && pwd` SPARK_CONFIG_LOCATION=$SPARK_HOME_INFILE/conf/ +# Remove Spark Global Variables +sed -i /SPARK_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /spark/d $ROOT_VARIABLES_ADDRESS + +# Make Spark Global Variables for User and Root +echo "export SPARK_HOME="$SPARK_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS +echo "export PATH=$PATH:"$SPARK_HOME_INFILE"/bin/" >> $ROOT_VARIABLES_ADDRESS +source $ROOT_VARIABLES_ADDRESS + +# Copy Spark Config Files cp -a $current_directory/configs/spark/. $SPARK_CONFIG_LOCATION cp -a $current_directory/configs/hadoop/. $SPARK_CONFIG_LOCATION cp $current_directory/configs/master $SPARK_CONFIG_LOCATION cp $current_directory/configs/slaves $SPARK_CONFIG_LOCATION # Format Namenode -/usr/local/hadoop/bin/hdfs namenode -format \ No newline at end of file +$HADOOP_HOME_INFILE/bin/hdfs namenode -format + +echo "Run the following on master node: sudo bash start_yarn_cluster.sh" \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh index 3e1c531..2474ac9 100644 --- a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh @@ -1,21 +1,29 @@ #!/bin/bash echo "STARTING HADOOP SERVICES" -/usr/local/hadoop/sbin/start-dfs.sh -/usr/local/hadoop/sbin/start-yarn.sh +$HADOOP_HOME/sbin/start-dfs.sh -/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh start historyserver +$HADOOP_HOME/sbin/start-yarn.sh -/usr/local/hadoop/bin/hdfs dfsadmin -safemode leave +$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver + +$HADOOP_HOME/bin/hdfs dfsadmin -safemode leave echo "STARTING SPARK SERVICES" -/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/start-all.sh +SPARK_HOME/sbin/start-all.sh echo "RUN jps - Java Virtual Machine Process Status Tool" jps echo "Get basic filesystem information and statistics." -/usr/local/hadoop/bin/hdfs dfsadmin -report +$HADOOP_HOME/bin/hdfs dfsadmin -report + +echo "Yarn Cluster is Active" + +master_node_ip_address=`hostname -i` -echo "Yarn Cluster is Active" \ No newline at end of file +echo "YARN Interface Available At: "$master_node_ip_address":8088/" +echo "Spark Interface Available At: "$master_node_ip_address":8080/" +echo "NameNode Interface Available At: "$master_node_ip_address":50070/" +echo "Job Master Interface Available At: "$master_node_ip_address":19888/" \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh index d1cad90..9e6a5ba 100644 --- a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh @@ -2,14 +2,14 @@ echo -e "STOPPING SPARK SERVICES" -/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/stop-all.sh +$SPARK_HOME/sbin/stop-all.sh echo -e "STOPPING HADOOP SERVICES" -/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh stop historyserver +$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver -/usr/local/hadoop/sbin/stop-dfs.sh +$HADOOP_HOME/sbin/stop-dfs.sh -/usr/local/hadoop/sbin/stop-yarn.sh +$HADOOP_HOME/sbin/stop-yarn.sh echo "Hadoop Cluster is Inactive Now" \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh index 1dcd31d..0adfae4 100644 --- a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh +++ b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh @@ -1,12 +1,14 @@ #!/bin/bash echo "SPARK TEST" -/spark-3.0.0-SNAPSHOT-bin-SparkFHE/bin/spark-submit --class org.apache.spark.examples.SparkPi \ +$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \ --master yarn \ --deploy-mode cluster \ --num-executors 1 \ --driver-memory 1g \ --executor-memory 512m \ --executor-cores 1 \ - /spark-3.0.0-SNAPSHOT-bin-SparkFHE/examples/jars/spark-examples*.jar \ - 10 \ No newline at end of file + $SPARK_HOME/examples/jars/spark-examples*.jar \ + 10 + +echo "Stop Cluster If not in Use" \ No newline at end of file From 917be6fcaf007c412bb94ed4c487c1a1198559be Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Sun, 24 Feb 2019 15:39:37 -0500 Subject: [PATCH 02/13] Security Features Added --- .../configs/hadoop/core-site.xml | 5 +++++ .../configs/hadoop/hadoop-policy.xml | 8 ++++++++ .../configs/hadoop/hdfs-site.xml | 4 ++++ .../yarn_cluster_setup/start_yarn_cluster.sh | 14 ++++++++------ .../yarn_cluster_setup/stop_yarn_cluster.sh | 8 ++++---- .../test_scripts/run_spark_test_job_pi.sh | 4 ++-- 6 files changed, 31 insertions(+), 12 deletions(-) create mode 100644 scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml index 7d1a4bb..3f4f6bd 100644 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml @@ -11,4 +11,9 @@ dfs.namenode.rpc-bind-host 0.0.0.0 + + hadoop.security.authorization + true + Service level authorization params. + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml new file mode 100644 index 0000000..e22e187 --- /dev/null +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml @@ -0,0 +1,8 @@ + + + + + security.job.client.protocol.acl + iotx-PG0 + + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml index 90ad504..09fccf8 100644 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml @@ -25,4 +25,8 @@ dfs.datanode.data.dir /data/hadoop/data + + dfs.webhdfs.enabled + false + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh index 2474ac9..643f1e8 100644 --- a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh @@ -2,22 +2,24 @@ echo "STARTING HADOOP SERVICES" -$HADOOP_HOME/sbin/start-dfs.sh +/usr/local/hadoop/sbin/start-dfs.sh -$HADOOP_HOME/sbin/start-yarn.sh +/usr/local/hadoop/sbin/start-yarn.sh -$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver +/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh start historyserver -$HADOOP_HOME/bin/hdfs dfsadmin -safemode leave +# /usr/local/hadoop/bin/hdfs dfsadmin -safemode leave + +# /usr/local/hadoop/bin/hdfs dfsadmin echo "STARTING SPARK SERVICES" -SPARK_HOME/sbin/start-all.sh +/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/start-all.sh echo "RUN jps - Java Virtual Machine Process Status Tool" jps echo "Get basic filesystem information and statistics." -$HADOOP_HOME/bin/hdfs dfsadmin -report +/usr/local/hadoop/bin/hdfs dfsadmin -report echo "Yarn Cluster is Active" diff --git a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh index 9e6a5ba..d1cad90 100644 --- a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh @@ -2,14 +2,14 @@ echo -e "STOPPING SPARK SERVICES" -$SPARK_HOME/sbin/stop-all.sh +/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/stop-all.sh echo -e "STOPPING HADOOP SERVICES" -$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver +/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh stop historyserver -$HADOOP_HOME/sbin/stop-dfs.sh +/usr/local/hadoop/sbin/stop-dfs.sh -$HADOOP_HOME/sbin/stop-yarn.sh +/usr/local/hadoop/sbin/stop-yarn.sh echo "Hadoop Cluster is Inactive Now" \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh index 0adfae4..b6d16ba 100644 --- a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh +++ b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh @@ -1,14 +1,14 @@ #!/bin/bash echo "SPARK TEST" -$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \ +/spark-3.0.0-SNAPSHOT-bin-SparkFHE/bin/spark-submit --class org.apache.spark.examples.SparkPi \ --master yarn \ --deploy-mode cluster \ --num-executors 1 \ --driver-memory 1g \ --executor-memory 512m \ --executor-cores 1 \ - $SPARK_HOME/examples/jars/spark-examples*.jar \ + /spark-3.0.0-SNAPSHOT-bin-SparkFHE/examples/jars/spark-examples*.jar \ 10 echo "Stop Cluster If not in Use" \ No newline at end of file From 85ea362cabe78df2b4d5aa30cef56b5e799509d6 Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Sun, 24 Feb 2019 20:21:51 -0500 Subject: [PATCH 03/13] Global Variables Bugs Fixed Using source --- scripts/cluster/yarn_cluster_setup/README.md | 1 + .../install_yarn_master_slave.sh | 7 +++---- .../yarn_cluster_setup/start_yarn_cluster.sh | 16 ++++++++-------- .../yarn_cluster_setup/stop_yarn_cluster.sh | 10 ++++++---- .../test_scripts/run_spark_test_job_pi.sh | 6 ++++-- 5 files changed, 22 insertions(+), 18 deletions(-) diff --git a/scripts/cluster/yarn_cluster_setup/README.md b/scripts/cluster/yarn_cluster_setup/README.md index bc02d71..4a180b5 100644 --- a/scripts/cluster/yarn_cluster_setup/README.md +++ b/scripts/cluster/yarn_cluster_setup/README.md @@ -27,6 +27,7 @@ Use the link generated after successful completion of cluster building to view t cd test_scripts sudo bash run_spark_test_job_pi.sh ``` +To view the output open http://:8088/, select the latest application, open the logs for that application, and select stdout. This should show the value for Pi calculated on the cluster. ### Useful Links: Other links can be generated by changing the port number. diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh index 1ed3c47..778cb22 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh @@ -7,8 +7,7 @@ HADOOP_SYMLINK=/usr/local/hadoop HADOOP_CONFIG_LOCATION=${HADOOP_HOME_INFILE}etc/hadoop/ HADOOP_VERSION=2.9.2 HADOOP_WEB_SOURCE=https://www-us.apache.org/dist/hadoop/common/ -ROOT_VARIABLES_ADDRESS=/root/.bashrc -USER_VARIABLES_ADDRESS=~/.bashrc +ROOT_VARIABLES_ADDRESS=/etc/profile # Install Pre-Reqs apt-get update -y @@ -19,8 +18,8 @@ unlink ${HADOOP_SYMLINK} && rm -rf ${HADOOP_DATA} rm -rf /usr/local/hadoop-*/ # Remove Global Variables -sed -i /JAVA_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /HADOOP_HOME/d $ROOT_VARIABLES_ADDRESS -sed -i /hadoop/d $ROOT_VARIABLES_ADDRESS && sed -i /default-java/d $ROOT_VARIABLES_ADDRESS +sed -i /JAVA_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /default-java/d $ROOT_VARIABLES_ADDRESS +sed -i /HADOOP_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /hadoop/d $ROOT_VARIABLES_ADDRESS # Make Hadoop Global Variables for User and Root echo "export JAVA_HOME="$JAVA_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS diff --git a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh index 643f1e8..53a8729 100644 --- a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh @@ -1,25 +1,25 @@ #!/bin/bash -echo "STARTING HADOOP SERVICES" +source /etc/profile -/usr/local/hadoop/sbin/start-dfs.sh +echo "STARTING HADOOP SERVICES" -/usr/local/hadoop/sbin/start-yarn.sh +$HADOOP_HOME/sbin/start-dfs.sh -/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh start historyserver +$HADOOP_HOME/sbin/start-yarn.sh -# /usr/local/hadoop/bin/hdfs dfsadmin -safemode leave +$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver -# /usr/local/hadoop/bin/hdfs dfsadmin +$HADOOP_HOME/bin/hdfs dfsadmin -safemode leave echo "STARTING SPARK SERVICES" -/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/start-all.sh +$SPARK_HOME/sbin/start-all.sh echo "RUN jps - Java Virtual Machine Process Status Tool" jps echo "Get basic filesystem information and statistics." -/usr/local/hadoop/bin/hdfs dfsadmin -report +$HADOOP_HOME/bin/hdfs dfsadmin -report echo "Yarn Cluster is Active" diff --git a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh index d1cad90..6e00bb5 100644 --- a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh @@ -1,15 +1,17 @@ #!/bin/bash +source /etc/profile + echo -e "STOPPING SPARK SERVICES" -/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/stop-all.sh +$SPARK_HOME/sbin/stop-all.sh echo -e "STOPPING HADOOP SERVICES" -/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh stop historyserver +$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver -/usr/local/hadoop/sbin/stop-dfs.sh +$HADOOP_HOME/sbin/stop-dfs.sh -/usr/local/hadoop/sbin/stop-yarn.sh +$HADOOP_HOME/sbin/stop-yarn.sh echo "Hadoop Cluster is Inactive Now" \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh index b6d16ba..4768ca2 100644 --- a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh +++ b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh @@ -1,14 +1,16 @@ #!/bin/bash +source /etc/profile + echo "SPARK TEST" -/spark-3.0.0-SNAPSHOT-bin-SparkFHE/bin/spark-submit --class org.apache.spark.examples.SparkPi \ +$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \ --master yarn \ --deploy-mode cluster \ --num-executors 1 \ --driver-memory 1g \ --executor-memory 512m \ --executor-cores 1 \ - /spark-3.0.0-SNAPSHOT-bin-SparkFHE/examples/jars/spark-examples*.jar \ + $SPARK_HOME/examples/jars/spark-examples*.jar \ 10 echo "Stop Cluster If not in Use" \ No newline at end of file From a75c4364bfc022ac502b05a65471935a62b89cd9 Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Sun, 3 Mar 2019 17:27:26 -0500 Subject: [PATCH 04/13] Security Improvements, Readme Updated with SSH Tunneling --- scripts/cluster/yarn_cluster_setup/README.md | 67 +++++++++++++++---- .../configs/hadoop/core-site.xml | 14 ++-- .../configs/hadoop/hadoop-policy.xml | 8 --- .../configs/hadoop/hdfs-site.xml | 8 ++- .../configs/hadoop/yarn-site-capacity.xml | 10 ++- .../configs/hadoop/yarn-site-fair.xml | 10 ++- .../configs/hadoop/yarn-site.xml | 10 ++- .../yarn_cluster_setup/start_yarn_cluster.sh | 12 ++-- 8 files changed, 96 insertions(+), 43 deletions(-) delete mode 100644 scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml diff --git a/scripts/cluster/yarn_cluster_setup/README.md b/scripts/cluster/yarn_cluster_setup/README.md index 4a180b5..ba9553f 100644 --- a/scripts/cluster/yarn_cluster_setup/README.md +++ b/scripts/cluster/yarn_cluster_setup/README.md @@ -27,30 +27,73 @@ Use the link generated after successful completion of cluster building to view t cd test_scripts sudo bash run_spark_test_job_pi.sh ``` -To view the output open http://:8088/, select the latest application, open the logs for that application, and select stdout. This should show the value for Pi calculated on the cluster. -### Useful Links: +## Web Interfaces: + +The public IP addresses of all nodes have been closed to bolster security. To view the web Interface, some additional steps will have to be performed. + +### Setup SSH Tunneling for nodes + +Open a Terminal window on local machine and type the following: + +``` +ssh -4 -ND +``` +This step will bind the local machine's port to the IP address of Master Node. + +### Get Internal IP of Master Node + +On the master node run the following to get the internal IP of Master Node: +``` +hostname -I | awk '{print $1}' +``` +This same step can be done on any of the worker nodes. + +### Configure Browser to open link + +Open Mozilla Firefox browser in the local machine. + +Click on three horizontal bars available on the top right hand side. + +Select Preferences and look for 'Network Settings' on the page. + +Once inside Network Settings, Select Manual Proxy Configuration. + +Select Socks_v5 and type in the Port Number chosen in the previous step for SOCKS Host. The IP of SOCKS Host does not need to be changed. Save the Settings. + +### List of Web Interfaces + Other links can be generated by changing the port number. -YARN Interface: +#### YARN Interface: + +http://:8088/ + +The output of test job is available in the link above. + +Select the latest application, open the logs for that application, and select stdout. This should show the value for Pi calculated on the cluster. + +#### Spark Interface: + +http://:8080/ -http://:8088/ +#### Namenode Interface: -Spark Interface: +http://:50070/ -http://:8080/ +#### JobMaster Interface: -Namenode Interface: +http://:19888/ -http://:50070/ +#### Datanode Interface: -Datanode Interface: +http://:50075/ -http://:50075/ +### Remove Browser Configuration -JobMaster Interface: +To use the Mozilla Firefox browser regularly, Select 'No Proxy' in Network Settings and Save. -http://:19888/ +Stop the SSH tunneling by Closing the Terminal Window or Hit Ctrl + C in the terminal window. # Stop the Cluster ``` diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml index 3f4f6bd..710fa1e 100644 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml @@ -8,12 +8,12 @@ - dfs.namenode.rpc-bind-host - 0.0.0.0 - - - hadoop.security.authorization - true - Service level authorization params. + dfs.namenode.rpc-bind-host + master + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml deleted file mode 100644 index e22e187..0000000 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hadoop-policy.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - security.job.client.protocol.acl - iotx-PG0 - - \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml index 09fccf8..1c8b404 100644 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml @@ -7,11 +7,11 @@ dfs.namenode.http-address - 0.0.0.0:50070 + master:50070 dfs.namenode.secondary.http-address - 0.0.0.0:50090 + master:50090 dfs.replication @@ -29,4 +29,8 @@ dfs.webhdfs.enabled false + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml index b403f69..10056c7 100755 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml @@ -27,11 +27,11 @@ yarn.resourcemanager.admin.address - 0.0.0.0:8033 + master:8033 yarn.resourcemanager.webapp.address - 0.0.0.0:8088 + master:8088 mapreduce.jobhistory.address @@ -39,6 +39,10 @@ mapreduce.jobhistory.webapp.address - 0.0.0.0:19888 + master:19888 + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml index 2903140..dec55d6 100755 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml @@ -27,11 +27,11 @@ yarn.resourcemanager.admin.address - 0.0.0.0:8033 + master:8033 yarn.resourcemanager.webapp.address - 0.0.0.0:8088 + master:8088 mapreduce.jobhistory.address @@ -39,8 +39,12 @@ mapreduce.jobhistory.webapp.address - 0.0.0.0:19888 + master:19888 + yarn.resourcemanager.scheduler.class org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml index b403f69..e046711 100755 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml @@ -27,11 +27,11 @@ yarn.resourcemanager.admin.address - 0.0.0.0:8033 + master:8033 yarn.resourcemanager.webapp.address - 0.0.0.0:8088 + master:8088 mapreduce.jobhistory.address @@ -39,6 +39,10 @@ mapreduce.jobhistory.webapp.address - 0.0.0.0:19888 + master:19888 + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh index 53a8729..10f67e3 100644 --- a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh @@ -23,9 +23,11 @@ $HADOOP_HOME/bin/hdfs dfsadmin -report echo "Yarn Cluster is Active" -master_node_ip_address=`hostname -i` +echo "Follow the instructions for Web Interfaces specified in the Readme page" -echo "YARN Interface Available At: "$master_node_ip_address":8088/" -echo "Spark Interface Available At: "$master_node_ip_address":8080/" -echo "NameNode Interface Available At: "$master_node_ip_address":50070/" -echo "Job Master Interface Available At: "$master_node_ip_address":19888/" \ No newline at end of file +master_node_ip_address_internal=`hostname -I | awk '{print $1}'` + +echo "YARN Interface Available At: "$master_node_ip_address_internal":8088/" +echo "Spark Interface Available At: "$master_node_ip_address_internal":8080/" +echo "NameNode Interface Available At: "$master_node_ip_address_internal":50070/" +echo "Job Master Interface Available At: "$master_node_ip_address_internal":19888/" \ No newline at end of file From c23b18a161f6e20ac3d60bf96fa1444364217378 Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Mon, 4 Mar 2019 01:31:08 -0500 Subject: [PATCH 05/13] Client Code Added, Code Refinements Pending --- scripts/cluster/yarn_cluster_setup/README.md | 14 +++--- .../install_yarn_cluster.sh | 45 +++++++------------ .../install_yarn_master_slave.sh | 4 +- .../test_scripts/run_spark_test_job_pi.sh | 4 +- 4 files changed, 24 insertions(+), 43 deletions(-) diff --git a/scripts/cluster/yarn_cluster_setup/README.md b/scripts/cluster/yarn_cluster_setup/README.md index ba9553f..23df2a7 100644 --- a/scripts/cluster/yarn_cluster_setup/README.md +++ b/scripts/cluster/yarn_cluster_setup/README.md @@ -1,32 +1,30 @@ Setup an experiment on Cloudlab using the SparkFHE-Dist-Ubuntu18.04 image. Use the Wisconsin server. -Please note that the scripts are designed to run on Master Node. - -# SSH into Master Node +# SSH into Client Node SSH into the master node and navigate to the address specified below: ``` cd /spark-3.0.0-SNAPSHOT-bin-SparkFHE/SparkFHE-Addon/scripts/cluster/yarn_cluster_setup ``` -# Install Hadoop and Configure Spark on all nodes through Master Node +# Install Hadoop and Configure Spark on all nodes through Client Node Specify the hostnames of nodes as arguments. ``` sudo bash install_yarn_cluster.sh master,worker1,worker2 ... ``` -# Start Yarn Spark Cluster and Run Spark Job on Master +# Start Yarn Spark Cluster and Run Spark Job on Master(Step Automatic For Now) Cluster can only be started on master node after installation is complete on all nodes and configuration files for Yarn and Spark are placed in correct folders. ``` sudo bash start_yarn_cluster.sh ``` -# Run Test Spark Job on Master -Use the link generated after successful completion of cluster building to view the web interface for Yarn. +# Run Test Spark Job on Master Through Client(Step Automatic For Now) ``` cd test_scripts sudo bash run_spark_test_job_pi.sh ``` +If the job is successfulll completed, final status is 'SUCCEEDED'. The links generated can be used by following the guide specified below. ## Web Interfaces: @@ -95,7 +93,7 @@ To use the Mozilla Firefox browser regularly, Select 'No Proxy' in Network Setti Stop the SSH tunneling by Closing the Terminal Window or Hit Ctrl + C in the terminal window. -# Stop the Cluster +# Stop the Cluster(Step Automatic For Now) ``` cd .. sudo bash stop_yarn_job.sh diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh index 90ac210..cfd6319 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh @@ -13,7 +13,7 @@ eval $(echo $cluster | awk '{split($0, array, ",");for(i in array)print "host_ar function checkSSH() { echo "Checking SSH connections" - for(( i=2;i<=${#host_array[@]};i++)) ; do + for(( i=1;i<=${#host_array[@]};i++)) ; do ssh ${host_array[i]} "hostname" if [ $? -eq 0 ] then @@ -27,11 +27,11 @@ function checkSSH() { checkSSH +current_directory=`pwd` + # Make Master and Slaves File # Clear Content from Files -current_directory=`pwd` - rm -rf $current_directory/configs/master || true touch $current_directory/configs/master rm -rf $current_directory/configs/slaves || true @@ -46,30 +46,17 @@ for(( i=2;i<=${#host_array[@]};i++)) ; do echo ${host_array[i]} >> $current_directory/configs/slaves done -echo ========================================================= -echo "Setup Yarn Master" -echo ========================================================= -echo "Installing Yarn-master" -# Setup Environment at node -bash install_yarn_master_slave.sh - -echo ========================================================= -echo "Setting up Yarn Slaves" -echo ========================================================= - -# Read addresses in slaves file -cat $current_directory/configs/slaves | while read line - -do - if [ "$line" = "-" ]; then - echo "Skip $line" - else - # Move master and slaves file to worker nodes - scp $current_directory/configs/master root@$line:$current_directory/configs - scp $current_directory/configs/slaves root@$line:$current_directory/configs - echo "Installing on $line" - echo "Installing Yarn-slave" - ssh root@$line -n "cd ${current_directory} && sudo bash install_yarn_master_slave.sh" - echo "Finished config node $line" - fi +# Move Master and Slaves File on all Nodes +# Install Cluster on all Nodes +for(( i=1;i<=${#host_array[@]};i++)) ; do + scp $current_directory/configs/master ${host_array[i]}:$current_directory/configs + scp $current_directory/configs/slaves ${host_array[i]}:$current_directory/configs + echo "Installing on "${host_array[i]} + ssh root@${host_array[i]} -n "cd ${current_directory} && sudo bash install_yarn_master_slave.sh" + echo "Finished configuration on "${host_array[i]} done + +# Trigger Scripts on Master Node +ssh root@${host_array[$master_limit]} -n "cd ${current_directory} && sudo bash start_yarn_cluster.sh" +ssh root@${host_array[$master_limit]} -n "cd ${current_directory}/test_scripts && sudo bash run_spark_test_job_pi.sh" +ssh root@${host_array[$master_limit]} -n "cd ${current_directory} && sudo bash stop_yarn_cluster.sh" \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh index 778cb22..4e9d017 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh @@ -81,6 +81,4 @@ cp $current_directory/configs/master $SPARK_CONFIG_LOCATION cp $current_directory/configs/slaves $SPARK_CONFIG_LOCATION # Format Namenode -$HADOOP_HOME_INFILE/bin/hdfs namenode -format - -echo "Run the following on master node: sudo bash start_yarn_cluster.sh" \ No newline at end of file +$HADOOP_HOME_INFILE/bin/hdfs namenode -format \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh index 4768ca2..2f37677 100644 --- a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh +++ b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh @@ -11,6 +11,4 @@ $SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \ --executor-memory 512m \ --executor-cores 1 \ $SPARK_HOME/examples/jars/spark-examples*.jar \ - 10 - -echo "Stop Cluster If not in Use" \ No newline at end of file + 10 \ No newline at end of file From bce5932b364ce8fbda3f2299c193ca2ece649f5b Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Tue, 5 Mar 2019 07:43:09 -0500 Subject: [PATCH 06/13] Client Integration completed, webHDFS enabled --- scripts/cluster/yarn_cluster_setup/README.md | 30 +++++---- .../configs/hadoop/hdfs-site.xml | 4 +- .../install_yarn_cluster.sh | 66 +++++++++---------- .../yarn_cluster_setup/start_yarn_cluster.sh | 46 ++++++++----- .../yarn_cluster_setup/stop_yarn_cluster.sh | 24 +++++-- .../test_scripts/run_spark_test_job_pi.sh | 14 ---- .../run_spark_test_job_pi_remotely.sh | 24 +++++++ 7 files changed, 119 insertions(+), 89 deletions(-) delete mode 100644 scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi.sh create mode 100644 scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi_remotely.sh diff --git a/scripts/cluster/yarn_cluster_setup/README.md b/scripts/cluster/yarn_cluster_setup/README.md index 23df2a7..9b02052 100644 --- a/scripts/cluster/yarn_cluster_setup/README.md +++ b/scripts/cluster/yarn_cluster_setup/README.md @@ -1,6 +1,8 @@ Setup an experiment on Cloudlab using the SparkFHE-Dist-Ubuntu18.04 image. Use the Wisconsin server. +Please note that all scripts are designed to run on Client node. + # SSH into Client Node SSH into the master node and navigate to the address specified below: ``` @@ -8,21 +10,21 @@ cd /spark-3.0.0-SNAPSHOT-bin-SparkFHE/SparkFHE-Addon/scripts/cluster/yarn_cluste ``` # Install Hadoop and Configure Spark on all nodes through Client Node -Specify the hostnames of nodes as arguments. +The hostnames of nodes in cluster will be picked up from etc/hosts ``` -sudo bash install_yarn_cluster.sh master,worker1,worker2 ... +sudo bash install_yarn_cluster.sh ``` -# Start Yarn Spark Cluster and Run Spark Job on Master(Step Automatic For Now) +# Start Yarn Spark Cluster Cluster can only be started on master node after installation is complete on all nodes and configuration files for Yarn and Spark are placed in correct folders. ``` sudo bash start_yarn_cluster.sh ``` -# Run Test Spark Job on Master Through Client(Step Automatic For Now) +# Run Test Spark Job on Master ``` cd test_scripts -sudo bash run_spark_test_job_pi.sh +sudo bash run_spark_test_job_pi_remotely.sh ``` If the job is successfulll completed, final status is 'SUCCEEDED'. The links generated can be used by following the guide specified below. @@ -30,6 +32,14 @@ If the job is successfulll completed, final status is 'SUCCEEDED'. The links gen The public IP addresses of all nodes have been closed to bolster security. To view the web Interface, some additional steps will have to be performed. +### Find Internal IP of Master Node + +On the client node run the following to get the internal IP of Master Node: +``` +sudo ssh master "hostname -I | awk '{print \$1}'" +``` +This same step can be done on any of the worker nodes. + ### Setup SSH Tunneling for nodes Open a Terminal window on local machine and type the following: @@ -39,14 +49,6 @@ ssh -4 -ND ``` This step will bind the local machine's port to the IP address of Master Node. -### Get Internal IP of Master Node - -On the master node run the following to get the internal IP of Master Node: -``` -hostname -I | awk '{print $1}' -``` -This same step can be done on any of the worker nodes. - ### Configure Browser to open link Open Mozilla Firefox browser in the local machine. @@ -61,7 +63,7 @@ Select Socks_v5 and type in the Port Number chosen in the previous step for SOCK ### List of Web Interfaces -Other links can be generated by changing the port number. +Different Web Interfaces can be accessed by changing the port number. #### YARN Interface: diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml index 1c8b404..4cb3516 100644 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml @@ -25,10 +25,10 @@ dfs.datanode.data.dir /data/hadoop/data - + + fs.defaultFS - hdfs://master:9000 + hdfs://master-public-ip:9000 - dfs.namenode.rpc-bind-host - master + master-public-ip - \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml index b52fb77..7fba571 100644 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml @@ -7,11 +7,15 @@ dfs.namenode.http-address - master:50070 + master-variable-ip:50070 dfs.namenode.secondary.http-address - master:50090 + master-public-ip:50090 + + + dfs.namenode.datanode.registration.ip-hostname-check + false dfs.replication @@ -29,8 +33,4 @@ dfs.webhdfs.enabled true - \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml index 10056c7..4a3ef7d 100755 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml @@ -3,7 +3,7 @@ yarn.resourcemanager.hostname - master + master-internal-ip yarn.nodemanager.aux-services @@ -15,34 +15,30 @@ yarn.resourcemanager.address - master:8032 + master-internal-ip:8032 yarn.resourcemanager.scheduler.address - master:8030 + master-internal-ip:8030 yarn.resourcemanager.resource-tracker.address - master:8031 + master-internal-ip:8031 yarn.resourcemanager.admin.address - master:8033 + master-internal-ip:8033 yarn.resourcemanager.webapp.address - master:8088 + master-internal-ip:8088 mapreduce.jobhistory.address - master:10020 + master-internal-ip:10020 mapreduce.jobhistory.webapp.address - master:19888 + master-internal-ip:19888 - \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml index dec55d6..687725d 100755 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml @@ -3,7 +3,7 @@ yarn.resourcemanager.hostname - master + master-internal-ip yarn.nodemanager.aux-services @@ -15,36 +15,32 @@ yarn.resourcemanager.address - master:8032 + master-internal-ip:8032 yarn.resourcemanager.scheduler.address - master:8030 + master-internal-ip:8030 yarn.resourcemanager.resource-tracker.address - master:8031 + master-internal-ip:8031 yarn.resourcemanager.admin.address - master:8033 + master-internal-ip:8033 yarn.resourcemanager.webapp.address - master:8088 + master-internal-ip:8088 mapreduce.jobhistory.address - master:10020 + master-internal-ip:10020 mapreduce.jobhistory.webapp.address - master:19888 + master-internal-ip:19888 - yarn.resourcemanager.scheduler.class org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-regular.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-regular.xml new file mode 100755 index 0000000..4a3ef7d --- /dev/null +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-regular.xml @@ -0,0 +1,44 @@ + + + + + yarn.resourcemanager.hostname + master-internal-ip + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + yarn.nodemanager.aux-services.mapreduce.shuffle.class + org.apache.hadoop.mapred.ShuffleHandler + + + yarn.resourcemanager.address + master-internal-ip:8032 + + + yarn.resourcemanager.scheduler.address + master-internal-ip:8030 + + + yarn.resourcemanager.resource-tracker.address + master-internal-ip:8031 + + + yarn.resourcemanager.admin.address + master-internal-ip:8033 + + + yarn.resourcemanager.webapp.address + master-internal-ip:8088 + + + mapreduce.jobhistory.address + master-internal-ip:10020 + + + mapreduce.jobhistory.webapp.address + master-internal-ip:19888 + + \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml index e046711..4a3ef7d 100755 --- a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml +++ b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site.xml @@ -3,7 +3,7 @@ yarn.resourcemanager.hostname - master + master-internal-ip yarn.nodemanager.aux-services @@ -15,34 +15,30 @@ yarn.resourcemanager.address - master:8032 + master-internal-ip:8032 yarn.resourcemanager.scheduler.address - master:8030 + master-internal-ip:8030 yarn.resourcemanager.resource-tracker.address - master:8031 + master-internal-ip:8031 yarn.resourcemanager.admin.address - master:8033 + master-internal-ip:8033 yarn.resourcemanager.webapp.address - master:8088 + master-internal-ip:8088 mapreduce.jobhistory.address - master:10020 + master-internal-ip:10020 mapreduce.jobhistory.webapp.address - master:19888 + master-internal-ip:19888 - \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/configs/hostnames b/scripts/cluster/yarn_cluster_setup/configs/hostnames new file mode 100644 index 0000000..821616e --- /dev/null +++ b/scripts/cluster/yarn_cluster_setup/configs/hostnames @@ -0,0 +1,3 @@ +master +worker1 +worker2 diff --git a/scripts/cluster/yarn_cluster_setup/configs/master b/scripts/cluster/yarn_cluster_setup/configs/master new file mode 100644 index 0000000..1f7391f --- /dev/null +++ b/scripts/cluster/yarn_cluster_setup/configs/master @@ -0,0 +1 @@ +master diff --git a/scripts/cluster/yarn_cluster_setup/configs/slaves b/scripts/cluster/yarn_cluster_setup/configs/slaves new file mode 100644 index 0000000..6e273a2 --- /dev/null +++ b/scripts/cluster/yarn_cluster_setup/configs/slaves @@ -0,0 +1,2 @@ +worker1 +worker2 diff --git a/scripts/cluster/yarn_cluster_setup/configs/spark/spark-defaults.conf b/scripts/cluster/yarn_cluster_setup/configs/spark/spark-defaults.conf new file mode 100644 index 0000000..11e7fbb --- /dev/null +++ b/scripts/cluster/yarn_cluster_setup/configs/spark/spark-defaults.conf @@ -0,0 +1,28 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://127.0.0.1:7077 +spark.eventLog.enabled true +spark.eventLog.dir file:///tmp/spark-events +spark.history.fs.logDirectory file:///tmp/spark-events +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh index 7625dbc..77e2049 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh @@ -32,7 +32,10 @@ function checkSSH() { echo "Checking SSH connections" for(( i=0;i<${#host_array[@]};i++)) ; do echo ${host_array[i]} - ssh root@${host_array[i]} "hostname" + PUBLIC_IP=`ssh root@${host_array[i]} "hostname -i"` + # Replace internal hostnames with public IP + sed -i "s/${host_array[i]}/${PUBLIC_IP}/g" "$current_directory/configs/master" + sed -i "s/${host_array[i]}/${PUBLIC_IP}/g" "$current_directory/configs/slaves" if [ $? -eq 0 ] then echo -e "Can SSH to ${host_array[i]}" @@ -45,12 +48,13 @@ function checkSSH() { checkSSH +MASTER_PUBLIC_IP=`hostname -i` + echo ========================================================= echo "Setup Yarn Master" echo ========================================================= echo "Installing Yarn-master" -bash install_yarn_master_slave.sh - +bash install_yarn_master_slave.sh $MASTER_PUBLIC_IP # Move Config Files and install_yarn_master_slave.sh # Install Cluster on all Worker Nodes @@ -62,7 +66,7 @@ for(( i=1;i<${#host_array[@]};i++)) ; do rsync -a --rsync-path="sudo rsync" $current_directory/configs/ ${host_array[i]}:$current_directory/configs/ scp $current_directory/install_yarn_master_slave.sh ${host_array[i]}:$current_directory/ echo "Installing on "${host_array[i]} - ssh root@${host_array[i]} -n "cd ${current_directory} && sudo bash install_yarn_master_slave.sh" + ssh root@${host_array[i]} -n "cd ${current_directory} && sudo bash install_yarn_master_slave.sh ${MASTER_PUBLIC_IP}" echo "Finished configuration on "${host_array[i]} echo "" done \ No newline at end of file diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh index 4e9d017..29f51fb 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh @@ -1,5 +1,13 @@ #!/bin/sh +if [ $# -eq 0 ] + then + echo "No arguments supplied, installation on node terminated" + exit 255 +fi + +# Accept Public IP of master as a parameter +MASTER_PUBLIC_IP=$1 JAVA_HOME_INFILE=/usr/lib/jvm/default-java/ HADOOP_DATA=/data/hadoop/ HADOOP_HOME_INFILE=/usr/local/hadoop/ @@ -8,6 +16,12 @@ HADOOP_CONFIG_LOCATION=${HADOOP_HOME_INFILE}etc/hadoop/ HADOOP_VERSION=2.9.2 HADOOP_WEB_SOURCE=https://www-us.apache.org/dist/hadoop/common/ ROOT_VARIABLES_ADDRESS=/etc/profile +SPARK_HISTORY_DATA=/tmp/spark-events + +# These variable values will change as node names change +MASTER_INTERNAL_NAME=master +WORKER_INTERNAL_NAME=worker +current_hostname=`hostname` # Install Pre-Reqs apt-get update -y @@ -60,10 +74,27 @@ cp -a $current_directory/configs/hadoop/. $HADOOP_CONFIG_LOCATION cp $current_directory/configs/master $HADOOP_CONFIG_LOCATION cp $current_directory/configs/slaves $HADOOP_CONFIG_LOCATION +# Editing Config Files +# Making Uniform Changes applicable to all nodes +sed -i "s/master-public-ip/${MASTER_PUBLIC_IP}/g" "$HADOOP_CONFIG_LOCATION/core-site.xml" +sed -i "s/master-public-ip/${MASTER_PUBLIC_IP}/g" "$HADOOP_CONFIG_LOCATION/hdfs-site.xml" +sed -i "s/master-internal-ip/${MASTER_INTERNAL_NAME}/g" "$HADOOP_CONFIG_LOCATION/yarn-site-capacity.xml" +sed -i "s/master-internal-ip/${MASTER_INTERNAL_NAME}/g" "$HADOOP_CONFIG_LOCATION/yarn-site-fair.xml" +sed -i "s/master-internal-ip/${MASTER_INTERNAL_NAME}/g" "$HADOOP_CONFIG_LOCATION/yarn-site-regular.xml" +sed -i "s/master-internal-ip/${MASTER_INTERNAL_NAME}/g" "$HADOOP_CONFIG_LOCATION/yarn-site.xml" + +# Following changes are different on master and worker node +if [[ $current_hostname == *$MASTER_INTERNAL_NAME* ]]; then + echo "Changing namenode IP on master" + sed -i "s/master-variable-ip/0.0.0.0/g" "$HADOOP_CONFIG_LOCATION/hdfs-site.xml" +else + echo "Changing namenode IP on worker" + sed -i "s/master-variable-ip/${MASTER_PUBLIC_IP}/g" "$HADOOP_CONFIG_LOCATION/hdfs-site.xml" +fi + echo "Hadoop Installation Complete on this node" SPARK_HOME_INFILE=`cd ${current_directory}/../../../.. && pwd` - SPARK_CONFIG_LOCATION=$SPARK_HOME_INFILE/conf/ # Remove Spark Global Variables @@ -74,9 +105,13 @@ echo "export SPARK_HOME="$SPARK_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS echo "export PATH=$PATH:"$SPARK_HOME_INFILE"/bin/" >> $ROOT_VARIABLES_ADDRESS source $ROOT_VARIABLES_ADDRESS +# Make Spark Directory for History Recording +sudo rm -rf $SPARK_HISTORY_DATA +mkdir -p $SPARK_HISTORY_DATA + # Copy Spark Config Files cp -a $current_directory/configs/spark/. $SPARK_CONFIG_LOCATION -cp -a $current_directory/configs/hadoop/. $SPARK_CONFIG_LOCATION +cp -a $HADOOP_CONFIG_LOCATION. $SPARK_CONFIG_LOCATION cp $current_directory/configs/master $SPARK_CONFIG_LOCATION cp $current_directory/configs/slaves $SPARK_CONFIG_LOCATION diff --git a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh index 15161ac..fde2de5 100644 --- a/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh @@ -1,11 +1,15 @@ #!/bin/bash -MASTER_HOSTNAME=master -CLIENT_HOSTNAME=client - -if [[ `hostname` == *${CLIENT_HOSTNAME}* ]]; then +# Master, Client Name depends on cluster config +# If cluster config changes, the variable values should change +client_name=client +master_name=master +MASTER_HOSTNAME=`ssh root@$master_name "hostname -i"` +current_hostname=`hostname` + +if [[ $current_hostname == *$client_name* ]]; then echo "Commands running from correct node" - ssh $MASTER_HOSTNAME ' + ssh root@$MASTER_HOSTNAME ' source /etc/profile echo "STARTING HADOOP SERVICES" @@ -19,6 +23,7 @@ if [[ `hostname` == *${CLIENT_HOSTNAME}* ]]; then $HADOOP_HOME/bin/hdfs dfsadmin -safemode leave echo "STARTING SPARK SERVICES" + $SPARK_HOME/sbin/start-history-server.sh $SPARK_HOME/sbin/start-all.sh echo "RUN jps - Java Virtual Machine Process Status Tool" diff --git a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh index c0fe0b4..921ef4c 100644 --- a/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh @@ -1,15 +1,19 @@ #!/bin/bash -MASTER_HOSTNAME=master -CLIENT_HOSTNAME=client - -if [[ `hostname` == *${CLIENT_HOSTNAME}* ]]; then +# Master, Client Name depends on cluster config +# If cluster config changes, the variable values should change +client_name=client +master_name=master +MASTER_HOSTNAME=`ssh root@$master_name "hostname -i"` +current_hostname=`hostname` + +if [[ $current_hostname == *$client_name* ]]; then echo "Commands running from correct node" - ssh $MASTER_HOSTNAME ' + ssh root@$MASTER_HOSTNAME ' source /etc/profile echo -e "STOPPING SPARK SERVICES" - + $SPARK_HOME/sbin/stop-history-server.sh $SPARK_HOME/sbin/stop-all.sh echo -e "STOPPING HADOOP SERVICES" diff --git a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi_remotely.sh b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi_remotely.sh index b4f7753..33eeb24 100644 --- a/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi_remotely.sh +++ b/scripts/cluster/yarn_cluster_setup/test_scripts/run_spark_test_job_pi_remotely.sh @@ -1,11 +1,15 @@ #!/bin/bash -MASTER_HOSTNAME=master -CLIENT_HOSTNAME=client +# Master, Client Name depends on cluster config +# If cluster config changes, the variable values should change +client_name=client +master_name=master +MASTER_HOSTNAME=`ssh root@$master_name "hostname -i"` +current_hostname=`hostname` -if [[ `hostname` == *${CLIENT_HOSTNAME}* ]]; then +if [[ $current_hostname == *"client"* ]]; then echo "Commands running from correct node" - ssh $MASTER_HOSTNAME ' + ssh root@$MASTER_HOSTNAME ' source /etc/profile echo "SPARK TEST" From b33c89b611cbc8b17364a34e34f6ac501240f0d2 Mon Sep 17 00:00:00 2001 From: pranavmrane Date: Sun, 21 Apr 2019 01:10:30 -0400 Subject: [PATCH 13/13] Host Verification Bug Solved by Pinging all Nodes --- .../install_yarn_cluster.sh | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh b/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh index 77e2049..4504523 100644 --- a/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh +++ b/scripts/cluster/yarn_cluster_setup/install_yarn_cluster.sh @@ -69,4 +69,25 @@ for(( i=1;i<${#host_array[@]};i++)) ; do ssh root@${host_array[i]} -n "cd ${current_directory} && sudo bash install_yarn_master_slave.sh ${MASTER_PUBLIC_IP}" echo "Finished configuration on "${host_array[i]} echo "" -done \ No newline at end of file +done + + +echo "Starting Cluster to Ping all Nodes" + +source /etc/profile + +$HADOOP_HOME/sbin/start-dfs.sh +$HADOOP_HOME/sbin/start-yarn.sh +$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver +$HADOOP_HOME/bin/hdfs dfsadmin -safemode leave +$SPARK_HOME/sbin/start-history-server.sh +$SPARK_HOME/sbin/start-all.sh +jps +$HADOOP_HOME/bin/hdfs dfsadmin -report + +echo "Stopping Cluster" +$SPARK_HOME/sbin/stop-history-server.sh +$SPARK_HOME/sbin/stop-all.sh +$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver +$HADOOP_HOME/sbin/stop-dfs.sh +$HADOOP_HOME/sbin/stop-yarn.sh \ No newline at end of file