Yarn spark cluster - Security Features, Global Variables #8

Open · wants to merge 13 commits into master
5 changes: 3 additions & 2 deletions scripts/cluster/yarn_cluster_setup/README.md
@@ -15,7 +15,7 @@ Specify the hostnames of nodes as arguments.
sudo bash install_yarn_cluster.sh master,worker1,worker2 ...
```

# Start Yarn Spark Cluster
# Start Yarn Spark Cluster and Run Spark Job on Master
The cluster can only be started on the master node, after installation has completed on all nodes and the YARN and Spark configuration files have been placed in the correct folders.
```
sudo bash start_yarn_cluster.sh
@@ -27,6 +27,7 @@ Use the link generated after successful completion of cluster building to view the
cd test_scripts
sudo bash run_spark_test_job_pi.sh
```
To view the output, open http://<MASTER_NODE_IP_ADDRESS>:8088/, select the latest application, open its logs, and select stdout. This should show the value of Pi calculated on the cluster.
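If you prefer the command line, the same output can be fetched with the standard `yarn` tools once the job has finished; a minimal sketch, assuming log aggregation is enabled (the application ID below is a hypothetical placeholder, copy the real one from the listing):
```
# List finished applications to find the application ID
yarn application -list -appStates FINISHED

# Dump the aggregated logs (including stdout) for that application
yarn logs -applicationId application_1234567890123_0001
```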

### Useful Links:
Other links can be generated by changing the port number.
@@ -56,4 +57,4 @@ http://<MASTER_NODE_IP_ADDRESS>:19888/
cd ..
sudo bash stop_yarn_job.sh
```
After running this command, the web interfaces will not work.
@@ -11,4 +11,9 @@
<name>dfs.namenode.rpc-bind-host</name>
<value>0.0.0.0</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>true</value>
<description>Service level authorization params.</description>
</property>
</configuration>

Contributor (review comment on this property): You might also want to add a property in yarn-site.xml to enable the acl
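A sketch of what that suggestion could look like; `yarn.acl.enable` and `yarn.admin.acl` are the standard YARN properties for this, and the group value is an assumption mirroring the ACL used elsewhere in this PR:
```
<!-- Hypothetical fragment to merge into the <configuration> block of yarn-site.xml -->
<property>
  <name>yarn.acl.enable</name>
  <value>true</value>
</property>
<property>
  <name>yarn.admin.acl</name>
  <value> iotx-PG0</value>
</property>
```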
@@ -0,0 +1,8 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>security.job.client.protocol.acl</name>
<value> iotx-PG0</value>
</property>
</configuration>
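For context: Hadoop service-level ACL values take the form `user1,user2 group1,group2`, so the leading space in ` iotx-PG0` means no individual users, and only members of the iotx-PG0 group may submit jobs. On a running cluster the policy file can be reloaded without a restart; a minimal sketch using the standard admin commands:
```
# Reload the service-level authorization policy on the NameNode
$HADOOP_HOME/bin/hdfs dfsadmin -refreshServiceAcl

# Reload it on the ResourceManager as well
$HADOOP_HOME/bin/yarn rmadmin -refreshServiceAcls
```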
@@ -25,4 +25,8 @@
<name>dfs.datanode.data.dir</name>
<value>/data/hadoop/data</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>false</value>
</property>
</configuration>
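With `dfs.webhdfs.enabled` set to false, the NameNode's REST endpoint should reject requests. A quick check, assuming the default Hadoop 2.x NameNode HTTP port of 50070:
```
# Expect an error response now that WebHDFS is disabled; with it enabled,
# this same call would return a JSON directory listing
curl -i "http://<MASTER_NODE_IP_ADDRESS>:50070/webhdfs/v1/?op=LISTSTATUS"
```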
39 changes: 32 additions & 7 deletions scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh
@@ -7,7 +7,7 @@ HADOOP_SYMLINK=/usr/local/hadoop
HADOOP_CONFIG_LOCATION=${HADOOP_HOME_INFILE}etc/hadoop/
HADOOP_VERSION=2.9.2
HADOOP_WEB_SOURCE=https://www-us.apache.org/dist/hadoop/common/
GLOBAL_VARIABLES_SOURCE=/etc/environment
ROOT_VARIABLES_ADDRESS=/etc/profile

# Install Pre-Reqs
apt-get update -y
@@ -17,6 +17,22 @@ apt-get install -y python default-jdk wget
unlink ${HADOOP_SYMLINK} && rm -rf ${HADOOP_DATA}
rm -rf /usr/local/hadoop-*/

# Remove Global Variables
sed -i /JAVA_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /default-java/d $ROOT_VARIABLES_ADDRESS
sed -i /HADOOP_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /hadoop/d $ROOT_VARIABLES_ADDRESS

# Make Hadoop Global Variables for User and Root
echo "export JAVA_HOME="$JAVA_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export PATH=$PATH:"$JAVA_HOME_INFILE"bin/:"$JAVA_HOME_INFILE"sbin/" >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_MAPRED_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_COMMON_HOME="$HADOOP_HOME_INFILE>> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_HDFS_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export YARN_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_COMMON_LIB_NATIVE_DIR="$HADOOP_HOME_INFILE"lib/native" >> $ROOT_VARIABLES_ADDRESS
echo "export PATH=$PATH:"$HADOOP_HOME_INFILE"bin/:"$HADOOP_HOME_INFILE"sbin/" >> $ROOT_VARIABLES_ADDRESS
source $ROOT_VARIABLES_ADDRESS

# Make Data Directories for Hadoop
mkdir -p ${HADOOP_DATA}name
mkdir -p ${HADOOP_DATA}data
@@ -28,20 +44,18 @@ current_directory=`pwd`
if [ ! -f "${current_directory}/hadoop-$HADOOP_VERSION.tar.gz" ]; then
echo "Downloading Hadoop ${HADOOP_VERSION} ..."
sudo wget ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
# wget ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -P /hadoop-${HADOOP_VERSION}.tar.gz
# sudo curl ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz > /hadoop-${HADOOP_VERSION}.tar.gz
echo "Download of Hadoop ${HADOOP_VERSION} Successful!"
fi

# Unzip and Install Hadoop Tar
tar -xzf $current_directory/hadoop-$HADOOP_VERSION.tar.gz -C /usr/local/
# tar -xzf /hadoop-$HADOOP_VERSION.tar.gz -C /usr/local/

rm $current_directory/hadoop-$HADOOP_VERSION.tar.gz

# Make Symbolic link
ln -s /usr/local/hadoop-$HADOOP_VERSION/ $HADOOP_SYMLINK

# Copy Config Files

# Copy Hadoop Config Files
cp -a $current_directory/configs/hadoop/. $HADOOP_CONFIG_LOCATION
cp $current_directory/configs/master $HADOOP_CONFIG_LOCATION
cp $current_directory/configs/slaves $HADOOP_CONFIG_LOCATION
@@ -52,10 +66,21 @@ SPARK_HOME_INFILE=`cd ${current_directory}/../../../.. && pwd`

SPARK_CONFIG_LOCATION=$SPARK_HOME_INFILE/conf/

# Remove Spark Global Variables
sed -i /SPARK_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /spark/d $ROOT_VARIABLES_ADDRESS

# Make Spark Global Variables for User and Root
echo "export SPARK_HOME="$SPARK_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export PATH=$PATH:"$SPARK_HOME_INFILE"/bin/" >> $ROOT_VARIABLES_ADDRESS
source $ROOT_VARIABLES_ADDRESS

# Copy Spark Config Files
cp -a $current_directory/configs/spark/. $SPARK_CONFIG_LOCATION
cp -a $current_directory/configs/hadoop/. $SPARK_CONFIG_LOCATION
cp $current_directory/configs/master $SPARK_CONFIG_LOCATION
cp $current_directory/configs/slaves $SPARK_CONFIG_LOCATION

# Format Namenode
/usr/local/hadoop/bin/hdfs namenode -format
$HADOOP_HOME_INFILE/bin/hdfs namenode -format

echo "Run the following on master node: sudo bash start_yarn_cluster.sh"
24 changes: 17 additions & 7 deletions scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh
@@ -1,21 +1,31 @@
#!/bin/bash

source /etc/profile

echo "STARTING HADOOP SERVICES"
/usr/local/hadoop/sbin/start-dfs.sh

/usr/local/hadoop/sbin/start-yarn.sh
$HADOOP_HOME/sbin/start-dfs.sh

$HADOOP_HOME/sbin/start-yarn.sh

/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh start historyserver
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver

/usr/local/hadoop/bin/hdfs dfsadmin -safemode leave
$HADOOP_HOME/bin/hdfs dfsadmin -safemode leave

echo "STARTING SPARK SERVICES"
/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/start-all.sh
$SPARK_HOME/sbin/start-all.sh

echo "RUN jps - Java Virtual Machine Process Status Tool"
jps

echo "Get basic filesystem information and statistics."
/usr/local/hadoop/bin/hdfs dfsadmin -report
$HADOOP_HOME/bin/hdfs dfsadmin -report

echo "Yarn Cluster is Active"

master_node_ip_address=`hostname -i`

echo "Yarn Cluster is Active"
echo "YARN Interface Available At: "$master_node_ip_address":8088/"
echo "Spark Interface Available At: "$master_node_ip_address":8080/"
echo "NameNode Interface Available At: "$master_node_ip_address":50070/"
echo "Job Master Interface Available At: "$master_node_ip_address":19888/"
10 changes: 6 additions & 4 deletions scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh
@@ -1,15 +1,17 @@
#!/bin/bash

source /etc/profile

echo -e "STOPPING SPARK SERVICES"

/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/stop-all.sh
$SPARK_HOME/sbin/stop-all.sh

echo -e "STOPPING HADOOP SERVICES"

/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh stop historyserver
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver

/usr/local/hadoop/sbin/stop-dfs.sh
$HADOOP_HOME/sbin/stop-dfs.sh

/usr/local/hadoop/sbin/stop-yarn.sh
$HADOOP_HOME/sbin/stop-yarn.sh

echo "Hadoop Cluster is Inactive Now"
@@ -1,12 +1,16 @@
#!/bin/bash

source /etc/profile

echo "SPARK TEST"
/spark-3.0.0-SNAPSHOT-bin-SparkFHE/bin/spark-submit --class org.apache.spark.examples.SparkPi \
$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode cluster \
--num-executors 1 \
--driver-memory 1g \
--executor-memory 512m \
--executor-cores 1 \
/spark-3.0.0-SNAPSHOT-bin-SparkFHE/examples/jars/spark-examples*.jar \
10
$SPARK_HOME/examples/jars/spark-examples*.jar \
10

echo "Stop Cluster If not in Use"