SpiRITlab · pranavmrane · Feb 22, 2019 · Feb 24, 2019 · Feb 25, 2019 · Mar 3, 2019
diff --git a/scripts/cluster/yarn_cluster_setup/README.md b/scripts/cluster/yarn_cluster_setup/README.md
@@ -1,7 +1,7 @@
 
-Setup an experiment on Cloudlab using the SparkFHE-Dist-Ubuntu18.04 image. Use the Wisconsin server.
+Setup an experiment on Cloudlab using the SparkFHE-YARN-Client-Ub18-HDFS image. Use the Wisconsin server.
 
-Please note that the scripts are designed to run on Master Node.
+The installation scripts are designed to run from master. The cluster start/stop scripts and example scripts are designed to work from client.
 
 # SSH into Master Node
 SSH into the master node and navigate to the address specified below:
@@ -10,50 +10,166 @@ cd /spark-3.0.0-SNAPSHOT-bin-SparkFHE/SparkFHE-Addon/scripts/cluster/yarn_cluste
 ```
 
 # Install Hadoop and Configure Spark on all nodes through Master Node
-Specify the hostnames of nodes as arguments.
+The hostnames of nodes in cluster will be picked up from etc/hosts. Read Appendix for further details about Hostnames.
 ```
-sudo bash install_yarn_cluster.sh master,worker1,worker2 ...
+sudo bash install_yarn_cluster.sh
 ```
 
-# Start Yarn Spark Cluster
-Cluster can only be started on master node after installation is complete on all nodes and configuration files for Yarn and Spark are placed in correct folders.
+# SSH into Client Node
+SSH into the Client node and navigate to the address specified below:
+```
+cd /spark-3.0.0-SNAPSHOT-bin-SparkFHE/SparkFHE-Addon/scripts/cluster/yarn_cluster_setup
+```
+
+# Start Yarn Spark Cluster and HDFS from Client Node
+Cluster can only be started on master node after installation is complete on all nodes and configuration files for Yarn and Spark are placed in correct folders. Check Appendix for HDFS Commands.
 ```
 sudo bash start_yarn_cluster.sh
 ```
 
-# Run Test Spark Job on Master
-Use the link generated after successful completion of cluster building to view the web interface for Yarn.
+# Run Test Spark Job on Master Through Client
 ```
 cd test_scripts
-sudo bash run_spark_test_job_pi.sh
+sudo bash run_spark_test_job_pi_remotely.sh
 ```
+If the job is successfulll completed, final status is 'SUCCEEDED'. The links generated can be used by following the guide specified below.
+
+# Web Interfaces:
+
+Different Web Interfaces can be accessed by changing the port number. The list is specified directly below.
 
-### Useful Links: 
-Other links can be generated by changing the port number.
+To view the web Interface, some additional steps will have to be performed. Check Appendix for SSH Tunneling Instructions.
 
-YARN Interface:
+The public IP addresses of some nodes have been closed to bolster security. Check Appendix for Security for individual aspects of cluster.
 
-http://<MASTER_NODE_IP_ADDRESS>:8088/
+## YARN Interface:
 
-Spark Interface:
+http://<MASTER_NODE_IP_ADDRESS_INTERNAL>:8088/
 
-http://<MASTER_NODE_IP_ADDRESS>:8080/
+The output of test job is available in the link above.  
 
-Namenode Interface:
+Select the latest spplications, open the logs for that application, and select stdout. This should show the value for Pi calculated on the cluster.
 
-http://<MASTER_NODE_IP_ADDRESS>:50070/
+## Spark Interface:
 
-Datanode Interface:
+http://<MASTER_NODE_IP_ADDRESS_INTERNAL>:8080/
 
-http://<WORKER_NODE_IP_ADDRESS>:50075/
+## Namenode Interface:
 
-JobMaster Interface:
+http://<MASTER_NODE_IP_ADDRESS_INTERNAL>:50070/
 
-http://<MASTER_NODE_IP_ADDRESS>:19888/
+## JobMaster Interface:
 
-# Stop the Cluster
+http://<MASTER_NODE_IP_ADDRESS_INTERNAL>:19888/
+
+## Datanode Interface:
+
+http://<WORKER_NODE_IP_ADDRESS_INTERNAL>:50075/
+
+# Stop the Cluster Through the client Node
 ```
 cd ..
 sudo bash stop_yarn_job.sh
 ```
 After running this command, the web interfaces will not work.
+
+# Appendix
+
+## Hostnames
+The current process is designed to read worker names from etc/hosts. This might not be the case for 3rd party products Amazon EC2. Changes will have to be made to the step. The user would have to manually enter public IP addresses of master and worker nodes.
+
+## HDFS Commands
+An important condition to for HDFS to work is the public IP address. Please make sure that every node in the cluster has a publicly accessible IP address. 
+
+HDFS is turned on when start_yarn_cluster.sh is executed. The individual command to turn on HDFS is <HADOOP_HOME>/sbin/start-dfs.sh. To close use <HADOOP_HOME>/sbin/stop-dfs.sh. 
+
+### HDFS Commands on cluster nodes
+Once on, following commands can be run from any of nodes in the cluster. 
+```
+# List Folders in HDFS
+hdfs dfs -ls /
+# Make Folder
+hadoop fs -mkdir -p /<DIRECTORY_TO_BE_CREATED>
+# Confirm Folder Creation
+hdfs dfs -ls /
+# Move Local file into HDFS
+hadoop fs -put <LOCAL_FILE_ADDRESS>/<FILE_NAME> /<DIRECTORY_TO_BE_CREATED>/
+# View content of file created in HDFS
+hdfs dfs -cat /<DIRECTORY_TO_BE_CREATED>/<FILE_NAME>
+```
+Additional Information can be found [here](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/FileSystemShell.html)
+
+### HDFS Commands from remote machines using webHDFS i.e. port 9000
+For the most part the HDFS commands here stay similar HDFS commands on cluster nodes. The address to access the HDFS needs to be changed in the following manner. The standard address for hdfs/hadoop can be /usr/local/hadoop/etc/hadoop
+```
+# List Folders in HDFS
+hdfs dfs -ls hdfs://<MASTER_NODE_IP_ADDRESS_PUBLIC>:9000/
+# Make Folder
+hadoop fs -mkdir -p hdfs://<MASTER_NODE_IP_ADDRESS_PUBLIC>:9000/<DIRECTORY_TO_BE_CREATED>
+# Confirm Folder Creation
+hdfs dfs -ls hdfs://<MASTER_NODE_IP_ADDRESS_PUBLIC>:9000/
+# Move Local file into HDFS
+hadoop fs -put <FILE_ADDRESS>/<FILE_NAME> hdfs://<MASTER_NODE_IP_ADDRESS_PUBLIC>:9000/<DIRECTORY_TO_BE_CREATED>/
+# View content of file created in HDFS
+hdfs dfs -cat hdfs://<MASTER_NODE_IP_ADDRESS_PUBLIC>:9000/<DIRECTORY_TO_BE_CREATED>/<FILE_NAME>
+```
+Additional Information can be found [here](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/FileSystemShell.html)
+
+### HDFS Commands from remote machines using webHDFS i.e. port 50070
+To run commands from machines outside the cluster, REST API can be used. Here are a few examples.
+```
+# Make folder
+curl -X put "http://<MASTER_NODE_IP_ADDRESS_PUBLIC>:50070/webhdfs/v1/user/<DIRECTORY_TO_BE_CREATED>?user.name=root&op=MKDIRS"
+# Create an empty file
+curl -i -X put "http://<MASTER_NODE_IP_ADDRESS_PUBLIC>:50070/webhdfs/v1/user/<DIRECTORY_TO_BE_CREATED>/<FILE_TO_BE_UPLOADED>?user.name=root&op=CREATE"
+# The command above generates a link(specified in quotes) that can be used to upload the file. Use it to append <FILE_TO_BE_UPLOADED> onto HDFS.
+curl -i -T <FILE_TO_BE_UPLOADED> "http://<MASTER_NODE_IP_ADDRESS_PUBLIC>:50075/webhdfs/v1/user/<DIRECTORY_TO_BE_CREATED>/<FILE_TO_BE_UPLOADED>?op=CREATE&user.name=root&namenoderpcaddress=master:9000&createflag=&createparent=true&overwrite=false"
+
+```
+Additional Information can be found [here](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/WebHDFS.html)
+
+## SSH Tunneling Instructions
+
+### Find Internal IP of Master/Worker Node
+
+On the client node run the following to get the internal IP of Master Node:
+```
+sudo ssh master "hostname -I | awk '{print \$1}'"
+sudo ssh worker1 "hostname -I | awk '{print \$1}'"
+```
+
+### Setup SSH Tunneling for nodes
+
+Open a Terminal window on local machine and type the following:
+
+```
+ssh -4 -ND <PORT_NUMBER> <USERNAME@MASTER_NODE_ID.SERVER_AREA.cloudlab.us>
+```
+This step will bind the local machine's port to the IP address of Master Node.
+
+### Configure Browser to open link
+
+* Open Mozilla Firefox browser in the local machine. 
+
+* Click on three horizontal bars available on the top right hand side.
+
+* Select Preferences and look for 'Network Settings' on the page.
+
+* Once inside Network Settings, Select Manual Proxy Configuration.
+
+* Select Socks_v5 and type in the Port Number chosen in the previous step for SOCKS Host. The IP of SOCKS Host does not need to be changed. Select OK.
+
+### Open Weblinks (address-format and port number specified above)
+
+### Stop SSH Tunneling
+
+* To use the Mozilla Firefox browser as usual, Select 'No Proxy' in Network Settings and Select OK.
+
+* Stop the SSH tunneling by Closing the Terminal Window or Hit Ctrl + C in the terminal window.
+
+
+## Security for individual aspects of cluster
+* YARN - Accessible only on internal IP
+* Remote HDFS(Port 9000) - Publicly accessible
+* webHDFS(Port 50070) - Publicly Accesible
+* Spark - Publicly accessible
diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/core-site.xml
@@ -1,14 +1,12 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 <configuration>
-    <!-- Default HDFS ip and port -->
     <property>
          <name>fs.defaultFS</name>
-         <value>hdfs://master:9000</value>
+         <value>hdfs://master-public-ip:9000</value>
     </property>
-    <!-- default RPC IP，and use 0.0.0.0 to represent all ips-->
     <property>
-	<name>dfs.namenode.rpc-bind-host</name>
-	<value>0.0.0.0</value>
+	   <name>dfs.namenode.rpc-bind-host</name>
+	   <value>master-public-ip</value>
     </property>
 </configuration>
diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/hdfs-site.xml
@@ -7,11 +7,15 @@
    </property>
     <property>
         <name>dfs.namenode.http-address</name>
-        <value>0.0.0.0:50070</value>
+        <value>master-variable-ip:50070</value>
     </property>
     <property>
         <name>dfs.namenode.secondary.http-address</name>
-        <value>0.0.0.0:50090</value>
+        <value>master-public-ip:50090</value>
+    </property>
+    <property>
+        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
+        <value>false</value>
     </property>
     <property>
         <name>dfs.replication</name>
@@ -25,4 +29,8 @@
         <name>dfs.datanode.data.dir</name>
         <value>/data/hadoop/data</value>
     </property>
+    <property>
+        <name>dfs.webhdfs.enabled</name>
+        <value>true</value>
+    </property>
 </configuration>
diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-capacity.xml
@@ -3,7 +3,7 @@
 <configuration>
     <property>
         <name>yarn.resourcemanager.hostname</name>
-        <value>master</value>
+        <value>master-internal-ip</value>
     </property>
     <property>
         <name>yarn.nodemanager.aux-services</name>
@@ -15,30 +15,30 @@
     </property>
   <property>
     <name>yarn.resourcemanager.address</name>
-    <value>master:8032</value>
+    <value>master-internal-ip:8032</value>
   </property>
   <property>
      <name>yarn.resourcemanager.scheduler.address</name>
-     <value>master:8030</value>
+     <value>master-internal-ip:8030</value>
   </property>
   <property>
      <name>yarn.resourcemanager.resource-tracker.address</name>
-     <value>master:8031</value>
+     <value>master-internal-ip:8031</value>
   </property>
   <property>
      <name>yarn.resourcemanager.admin.address</name>
-     <value>0.0.0.0:8033</value>
+     <value>master-internal-ip:8033</value>
    </property>
    <property>
       <name>yarn.resourcemanager.webapp.address</name>
-      <value>0.0.0.0:8088</value>
+      <value>master-internal-ip:8088</value>
    </property>
    <property>
       <name>mapreduce.jobhistory.address</name>
-      <value>master:10020</value>
+      <value>master-internal-ip:10020</value>
    </property>
    <property>
       <name>mapreduce.jobhistory.webapp.address</name>
-      <value>0.0.0.0:19888</value>
+      <value>master-internal-ip:19888</value>
    </property>
 </configuration>
diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-fair.xml
@@ -3,7 +3,7 @@
 <configuration>
     <property>
         <name>yarn.resourcemanager.hostname</name>
-        <value>master</value>
+        <value>master-internal-ip</value>
     </property>
     <property>
         <name>yarn.nodemanager.aux-services</name>
@@ -15,31 +15,31 @@
     </property>
   <property>
     <name>yarn.resourcemanager.address</name>
-    <value>master:8032</value>
+    <value>master-internal-ip:8032</value>
   </property>
   <property>
      <name>yarn.resourcemanager.scheduler.address</name>
-     <value>master:8030</value>
+     <value>master-internal-ip:8030</value>
   </property>
   <property>
      <name>yarn.resourcemanager.resource-tracker.address</name>
-     <value>master:8031</value>
+     <value>master-internal-ip:8031</value>
   </property>
   <property>
      <name>yarn.resourcemanager.admin.address</name>
-     <value>0.0.0.0:8033</value>
+     <value>master-internal-ip:8033</value>
    </property>
    <property>
       <name>yarn.resourcemanager.webapp.address</name>
-      <value>0.0.0.0:8088</value>
+      <value>master-internal-ip:8088</value>
    </property>
    <property>
       <name>mapreduce.jobhistory.address</name>
-      <value>master:10020</value>
+      <value>master-internal-ip:10020</value>
    </property>
    <property>
       <name>mapreduce.jobhistory.webapp.address</name>
-      <value>0.0.0.0:19888</value>
+      <value>master-internal-ip:19888</value>
    </property>
   <property>
     <name>yarn.resourcemanager.scheduler.class</name>

diff --git a/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-regular.xml b/scripts/cluster/yarn_cluster_setup/configs/hadoop/yarn-site-regular.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>master-internal-ip</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+         <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
+         <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+    </property>
+  <property>
+    <name>yarn.resourcemanager.address</name>
+    <value>master-internal-ip:8032</value>
+  </property>
+  <property>
+     <name>yarn.resourcemanager.scheduler.address</name>
+     <value>master-internal-ip:8030</value>
+  </property>
+  <property>
+     <name>yarn.resourcemanager.resource-tracker.address</name>
+     <value>master-internal-ip:8031</value>
+  </property>
+  <property>
+     <name>yarn.resourcemanager.admin.address</name>
+     <value>master-internal-ip:8033</value>
+   </property>
+   <property>
+      <name>yarn.resourcemanager.webapp.address</name>
+      <value>master-internal-ip:8088</value>
+   </property>
+   <property>
+      <name>mapreduce.jobhistory.address</name>
+      <value>master-internal-ip:10020</value>
+   </property>
+   <property>
+      <name>mapreduce.jobhistory.webapp.address</name>
+      <value>master-internal-ip:19888</value>
+   </property>
+</configuration>