Yarn spark cluster - Security Features, Global Variables #8

Open · wants to merge 13 commits into master
5 changes: 3 additions & 2 deletions scripts/cluster/yarn_cluster_setup/README.md
@@ -15,7 +15,7 @@ Specify the hostnames of nodes as arguments.
sudo bash install_yarn_cluster.sh master,worker1,worker2 ...
```

# Start Yarn Spark Cluster
# Start Yarn Spark Cluster and Run Spark Job on Master
The cluster can only be started on the master node, after installation has completed on all nodes and the YARN and Spark configuration files have been placed in the correct folders.
```
sudo bash start_yarn_cluster.sh
@@ -27,6 +27,7 @@ Use the link generated after successful completion of cluster building to view the
cd test_scripts
sudo bash run_spark_test_job_pi.sh
```
To view the output, open http://<MASTER_NODE_IP_ADDRESS>:8088/, select the latest application, open its logs, and select stdout. This should show the value of Pi calculated on the cluster.
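If you prefer the command line, the same output can be fetched with the standard `yarn` tools once the job has finished; a minimal sketch, assuming log aggregation is enabled (the application ID below is a hypothetical placeholder, copy the real one from the listing):
```
# List finished applications to find the application ID
yarn application -list -appStates FINISHED

# Dump the aggregated logs (including stdout) for that application
yarn logs -applicationId application_1234567890123_0001
```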

### Useful Links:
Other links can be generated by changing the port number.
@@ -56,4 +57,4 @@ http://<MASTER_NODE_IP_ADDRESS>:19888/
cd ..
sudo bash stop_yarn_job.sh
```
After running this command, the web interfaces will not work.
@@ -11,4 +11,9 @@
<name>dfs.namenode.rpc-bind-host</name>
<value>0.0.0.0</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>true</value>
<description>Service level authorization params.</description>
</property>
</configuration>

Contributor (review comment on this property): You might also want to add a property in yarn-site.xml to enable the acl
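A sketch of what that suggestion could look like; `yarn.acl.enable` and `yarn.admin.acl` are the standard YARN properties for this, and the group value is an assumption mirroring the ACL used elsewhere in this PR:
```
<!-- Hypothetical fragment to merge into the <configuration> block of yarn-site.xml -->
<property>
  <name>yarn.acl.enable</name>
  <value>true</value>
</property>
<property>
  <name>yarn.admin.acl</name>
  <value> iotx-PG0</value>
</property>
```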
@@ -0,0 +1,8 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>security.job.client.protocol.acl</name>
<value> iotx-PG0</value>
</property>
</configuration>
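For context: Hadoop service-level ACL values take the form `user1,user2 group1,group2`, so the leading space in ` iotx-PG0` means no individual users, and only members of the iotx-PG0 group may submit jobs. On a running cluster the policy file can be reloaded without a restart; a minimal sketch using the standard admin commands:
```
# Reload the service-level authorization policy on the NameNode
$HADOOP_HOME/bin/hdfs dfsadmin -refreshServiceAcl

# Reload it on the ResourceManager as well
$HADOOP_HOME/bin/yarn rmadmin -refreshServiceAcls
```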
@@ -25,4 +25,8 @@
<name>dfs.datanode.data.dir</name>
<value>/data/hadoop/data</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>false</value>
</property>
</configuration>
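With `dfs.webhdfs.enabled` set to false, the NameNode's REST endpoint should reject requests. A quick check, assuming the default Hadoop 2.x NameNode HTTP port of 50070:
```
# Expect an error response now that WebHDFS is disabled; with it enabled,
# this same call would return a JSON directory listing
curl -i "http://<MASTER_NODE_IP_ADDRESS>:50070/webhdfs/v1/?op=LISTSTATUS"
```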
39 changes: 32 additions & 7 deletions scripts/cluster/yarn_cluster_setup/install_yarn_master_slave.sh
@@ -7,7 +7,7 @@ HADOOP_SYMLINK=/usr/local/hadoop
HADOOP_CONFIG_LOCATION=${HADOOP_HOME_INFILE}etc/hadoop/
HADOOP_VERSION=2.9.2
HADOOP_WEB_SOURCE=https://www-us.apache.org/dist/hadoop/common/
GLOBAL_VARIABLES_SOURCE=/etc/environment
ROOT_VARIABLES_ADDRESS=/etc/profile

# Install Pre-Reqs
apt-get update -y
@@ -17,6 +17,22 @@ apt-get install -y python default-jdk wget
unlink ${HADOOP_SYMLINK} && rm -rf ${HADOOP_DATA}
rm -rf /usr/local/hadoop-*/

# Remove Global Variables
sed -i /JAVA_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /default-java/d $ROOT_VARIABLES_ADDRESS
sed -i /HADOOP_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /hadoop/d $ROOT_VARIABLES_ADDRESS

# Make Hadoop Global Variables for User and Root
echo "export JAVA_HOME="$JAVA_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export PATH=$PATH:"$JAVA_HOME_INFILE"bin/:"$JAVA_HOME_INFILE"sbin/" >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_MAPRED_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_COMMON_HOME="$HADOOP_HOME_INFILE>> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_HDFS_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export YARN_HOME="$HADOOP_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export HADOOP_COMMON_LIB_NATIVE_DIR="$HADOOP_HOME_INFILE"lib/native" >> $ROOT_VARIABLES_ADDRESS
echo "export PATH=$PATH:"$HADOOP_HOME_INFILE"bin/:"$HADOOP_HOME_INFILE"sbin/" >> $ROOT_VARIABLES_ADDRESS
source $ROOT_VARIABLES_ADDRESS

# Make Data Directories for Hadoop
mkdir -p ${HADOOP_DATA}name
mkdir -p ${HADOOP_DATA}data
@@ -28,20 +44,18 @@ current_directory=`pwd`
if [ ! -f "${current_directory}/hadoop-$HADOOP_VERSION.tar.gz" ]; then
echo "Downloading Hadoop ${HADOOP_VERSION} ..."
sudo wget ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
# wget ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -P /hadoop-${HADOOP_VERSION}.tar.gz
# sudo curl ${HADOOP_WEB_SOURCE}hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz > /hadoop-${HADOOP_VERSION}.tar.gz
echo "Download of Hadoop ${HADOOP_VERSION} Successful!"
fi

# Unzip and Install Hadoop Tar
tar -xzf $current_directory/hadoop-$HADOOP_VERSION.tar.gz -C /usr/local/
# tar -xzf /hadoop-$HADOOP_VERSION.tar.gz -C /usr/local/

rm $current_directory/hadoop-$HADOOP_VERSION.tar.gz

# Make Symbolic link
ln -s /usr/local/hadoop-$HADOOP_VERSION/ $HADOOP_SYMLINK

# Copy Config Files

# Copy Hadoop Config Files
cp -a $current_directory/configs/hadoop/. $HADOOP_CONFIG_LOCATION
cp $current_directory/configs/master $HADOOP_CONFIG_LOCATION
cp $current_directory/configs/slaves $HADOOP_CONFIG_LOCATION
@@ -52,10 +66,21 @@ SPARK_HOME_INFILE=`cd ${current_directory}/../../../.. && pwd`

SPARK_CONFIG_LOCATION=$SPARK_HOME_INFILE/conf/

# Remove Spark Global Variables
sed -i /SPARK_HOME/d $ROOT_VARIABLES_ADDRESS && sed -i /spark/d $ROOT_VARIABLES_ADDRESS

# Make Spark Global Variables for User and Root
echo "export SPARK_HOME="$SPARK_HOME_INFILE >> $ROOT_VARIABLES_ADDRESS
echo "export PATH=$PATH:"$SPARK_HOME_INFILE"/bin/" >> $ROOT_VARIABLES_ADDRESS
source $ROOT_VARIABLES_ADDRESS

# Copy Spark Config Files
cp -a $current_directory/configs/spark/. $SPARK_CONFIG_LOCATION
cp -a $current_directory/configs/hadoop/. $SPARK_CONFIG_LOCATION
cp $current_directory/configs/master $SPARK_CONFIG_LOCATION
cp $current_directory/configs/slaves $SPARK_CONFIG_LOCATION

# Format Namenode
/usr/local/hadoop/bin/hdfs namenode -format
$HADOOP_HOME_INFILE/bin/hdfs namenode -format

echo "Run the following on master node: sudo bash start_yarn_cluster.sh"
24 changes: 17 additions & 7 deletions scripts/cluster/yarn_cluster_setup/start_yarn_cluster.sh
@@ -1,21 +1,31 @@
#!/bin/bash

source /etc/profile

echo "STARTING HADOOP SERVICES"
/usr/local/hadoop/sbin/start-dfs.sh

/usr/local/hadoop/sbin/start-yarn.sh
$HADOOP_HOME/sbin/start-dfs.sh

$HADOOP_HOME/sbin/start-yarn.sh

/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh start historyserver
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver

/usr/local/hadoop/bin/hdfs dfsadmin -safemode leave
$HADOOP_HOME/bin/hdfs dfsadmin -safemode leave

echo "STARTING SPARK SERVICES"
/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/start-all.sh
$SPARK_HOME/sbin/start-all.sh

echo "RUN jps - Java Virtual Machine Process Status Tool"
jps

echo "Get basic filesystem information and statistics."
/usr/local/hadoop/bin/hdfs dfsadmin -report
$HADOOP_HOME/bin/hdfs dfsadmin -report

echo "Yarn Cluster is Active"

master_node_ip_address=`hostname -i`

echo "Yarn Cluster is Active"
echo "YARN Interface Available At: "$master_node_ip_address":8088/"
echo "Spark Interface Available At: "$master_node_ip_address":8080/"
echo "NameNode Interface Available At: "$master_node_ip_address":50070/"
echo "Job Master Interface Available At: "$master_node_ip_address":19888/"
10 changes: 6 additions & 4 deletions scripts/cluster/yarn_cluster_setup/stop_yarn_cluster.sh
@@ -1,15 +1,17 @@
#!/bin/bash

source /etc/profile

echo -e "STOPPING SPARK SERVICES"

/spark-3.0.0-SNAPSHOT-bin-SparkFHE/sbin/stop-all.sh
$SPARK_HOME/sbin/stop-all.sh

echo -e "STOPPING HADOOP SERVICES"

/usr/local/hadoop/sbin/mr-jobhistory-daemon.sh stop historyserver
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver

/usr/local/hadoop/sbin/stop-dfs.sh
$HADOOP_HOME/sbin/stop-dfs.sh

/usr/local/hadoop/sbin/stop-yarn.sh
$HADOOP_HOME/sbin/stop-yarn.sh

echo "Hadoop Cluster is Inactive Now"
@@ -1,12 +1,16 @@
#!/bin/bash

source /etc/profile

echo "SPARK TEST"
/spark-3.0.0-SNAPSHOT-bin-SparkFHE/bin/spark-submit --class org.apache.spark.examples.SparkPi \
$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode cluster \
--num-executors 1 \
--driver-memory 1g \
--executor-memory 512m \
--executor-cores 1 \
/spark-3.0.0-SNAPSHOT-bin-SparkFHE/examples/jars/spark-examples*.jar \
10
$SPARK_HOME/examples/jars/spark-examples*.jar \
10

echo "Stop Cluster If not in Use"