Add a script to run a jupyter notebook on Raijin #13

Open
wants to merge 1 commit into base: master
scripts/raijin_jupyter: 171 additions, 0 deletions
@@ -0,0 +1,171 @@
#!/bin/bash

# Scott Wales 20190522

print_help() {
cat <<EOF
Run a Jupyter notebook on Raijin's compute nodes, presenting the interface in a
browser on the local machine

General Options:
-h: Print help
-l: Raijin username
-L: Raijin login node (default 'raijin.nci.org.au')

Queue Options:
-q QUEUE: Queue name
-n NCPU: Use NCPU cpus
-m MEM: Memory allocation (default 2*NCPU GB)
-t TIME: Walltime limit (default 1 hour)
-J JOBFS: Jobfs allocation (default 100 GB)
-P PROJ: Submit job under project PROJ

EOF
}
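
# Example invocation from the local machine (the username and project below are
# placeholders, not real accounts):
#
#   ./raijin_jupyter -l abc123 -P a00 -n 4 -t 2:00:00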

set -eu

USER=''
PROJECT=''
LOGINNODE='raijin.nci.org.au'
QUEUE='express'
NCPUS='1'
MEM=''
WALLTIME=1:00:00
JOBFS=100gb

# Handle arguments
optspec="hl:L:q:n:m:t:J:P:"
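# (a trailing ':' in optspec means that option takes an argument)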
while getopts "$optspec" optchar; do
    case "${optchar}" in
        h)
            print_help
            exit 2
            ;;
        l)
            USER="${OPTARG}"
            ;;
        L)
            LOGINNODE="${OPTARG}"
            ;;
        q)
            QUEUE="${OPTARG}"
            ;;
        n)
            NCPUS="${OPTARG}"
            ;;
        m)
            MEM="${OPTARG}"
            ;;
        t)
            WALLTIME="${OPTARG}"
            ;;
        J)
            JOBFS="${OPTARG}"
            ;;
        P)
            PROJECT="-P '${OPTARG}'"
            ;;
        *)
            print_help
            exit 2
            ;;
    esac
done

# This gets evaluated on Raijin in the SSH script
WORKDIR=\$TMPDIR/runjp

SSH='ssh -oBatchMode=yes'
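# (BatchMode makes ssh fail immediately instead of prompting for a password
# when key-based authentication is not available)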
if [ -n "$USER" ]; then
SSH="${SSH_ARGS} -l ${USER}"
fi
if [ -z "$MEM" ]; then
MEM="$(( NCPUS * 2 ))gb"
fi

SUBMITOPTS="-N jupyter-notebook $PROJECT -q '$QUEUE' -l 'ncpus=${NCPUS},mem=${MEM},walltime=${WALLTIME},jobfs=${JOBFS}'"
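# With the defaults above this expands to roughly:
#   -N jupyter-notebook -q 'express' -l 'ncpus=1,mem=2gb,walltime=1:00:00,jobfs=100gb'
# plus "-P '<project>'" when -P was given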

echo "Starting notebook on raijin..."

# Check connection
$SSH "$LOGINNODE" true

echo "qsub ${SUBMITOPTS}"

# Kill the job if this top-level script is cancelled while the job is still in the queue
trap "{ echo 'Stopping queued job...' ; $SSH \"$LOGINNODE\" <<< \"qdel \\\$(cat \\$WORKDIR/jobid)\" ; }" EXIT

message=$(
$SSH -q "$LOGINNODE" <<EOF

set -eu

WORKDIR="$WORKDIR"
mkdir -p "\$WORKDIR"
rm -f "\$WORKDIR/message"
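
# Submit the notebook job; '-j oe' merges the job's stdout and stderr into
# pbs.log, and the job id is saved to \$WORKDIR/jobid for later cleanup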

qsub $SUBMITOPTS -j oe -o "\$WORKDIR/pbs.log" > \$WORKDIR/jobid <<EOQ

set -eu

# Jupyter security token
TOKEN=\\\$(uuidgen)

# Write message file with info for the local connection
echo "\\\$HOSTNAME \\\$TOKEN \\\$PBS_JOBID" > "\$WORKDIR/message"

echo "runjp log dir \$WORKDIR"
cat "\$WORKDIR/message"

module purge
module use /g/data3/hh5/public/modules
module load pbs
module load conda/analysis3-unstable
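
# Defaults for the dask-labextension "new cluster" button in JupyterLab:
# clusters are started as dask_jobqueue PBSCluster jobs on the normal queue,
# communicating over the InfiniBand interface ib0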

export DASK_LABEXTENSION__FACTORY__MODULE='dask_jobqueue'
export DASK_LABEXTENSION__FACTORY__CLASS='PBSCluster'
export DASK_LABEXTENSION__FACTORY__KWARGS__QUEUE='normal'
export DASK_LABEXTENSION__FACTORY__KWARGS__CORES='8'
export DASK_LABEXTENSION__FACTORY__KWARGS__MEMORY='16gb'
export DASK_LABEXTENSION__FACTORY__KWARGS__RESOURCE_SPEC='ncpus=8,mem=16gb'
export DASK_LABEXTENSION__FACTORY__KWARGS__INTERFACE='ib0'

jupyter notebook --NotebookApp.token="\\\$TOKEN" --no-browser --ip="\\\$HOSTNAME" --port 8888
EOQ


# Wait for the message file to appear, then return to the local process
while [ ! -f "\$WORKDIR/message" ]; do
    sleep 5
done
cat "\$WORKDIR/message"
EOF
)

# Grab info from the PBS job
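# (one line of the form "<compute node hostname> <notebook token> <PBS job id>",
# written by the job script above)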
read jobhost token jobid <<< "$message"

echo "Notebook running as PBS job ${jobid}"
echo
echo "Starting tunnel..."
$SSH -N -L "8888:$jobhost:8888" -L "8787:$jobhost:8787" "$LOGINNODE" &
tunnelid=$!

# Shut everything down on exit
trap "{ echo 'Closing connections...' ; kill $tunnelid ; $SSH "$LOGINNODE" qdel $jobid ; }" EXIT

# Wait for startup then open a browser
sleep 5
URL="http://localhost:8888/lab?token=${token}"

echo
echo "Opening ${URL}"
if [ "$(uname)" = "Darwin" ]; then
open "$URL"
else
xdg-open "$URL"
fi

# Keep open as long as the tunnel exists
wait "$tunnelid"