Skyline transfer #36

Merged: 31 commits, Feb 8, 2024
725f091
fix: align mg assembly rules to expected outputs
Feb 1, 2024
d1d46e1
fix: sample sheet processing and run id searching fixes from bigsky t…
rroutsong Feb 2, 2024
74c84e8
fix: update resource config for bcl2fastq
rroutsong Feb 2, 2024
27c1e5e
feat: add execution context documentation
rroutsong Feb 2, 2024
020d289
feat: add execution docs to nav
rroutsong Feb 2, 2024
9f8dd83
fix: update execution docs for sbatch execution on biowulf
rroutsong Feb 2, 2024
56736e8
fix: temporarily use branch version of profiles
rroutsong Feb 2, 2024
4d39d40
feat: update execution documentation
rroutsong Feb 2, 2024
4c21fd1
fix: update regex for miseq run id compatibility
rroutsong Feb 2, 2024
1183c31
fix: update miseq regex use . instead of \N
rroutsong Feb 2, 2024
c3d2d20
fix: don't error if progressbar is not installed
rroutsong Feb 5, 2024
8e3e736
fix: process different read indicators in the sample sheet
rroutsong Feb 5, 2024
83cbb1d
fix: clean up requirements
rroutsong Feb 5, 2024
60095f6
fix: update profiles
rroutsong Feb 5, 2024
eaa0b8f
fix: fix cpu allocation bug, make it only possible to get 2 cpus per …
rroutsong Feb 5, 2024
5932bd7
fix: close sample sheet endedness bug
rroutsong Feb 6, 2024
15c6620
fix: rework resources for demuxing and qc of certain rules
rroutsong Feb 6, 2024
7985866
fix: correct overindentation
rroutsong Feb 6, 2024
7166ad1
fix: correct bugs with lscratch usage for fastqc rule
rroutsong Feb 6, 2024
ad0258a
fix: formatting
rroutsong Feb 7, 2024
bb1ac3b
fix: update biowulf scheduler interactions for lscratch
rroutsong Feb 7, 2024
e6d7ab6
feat: switch to main branch on submodule
rroutsong Feb 7, 2024
3beb9e3
fix: update dry-run snakemake args and refactor tmpdir handling in fa…
rroutsong Feb 8, 2024
9f77c59
fix: ignore test dry run output
rroutsong Feb 8, 2024
66778f3
fix: correct git ignore path
rroutsong Feb 8, 2024
606b446
fix: recursive gitignore
rroutsong Feb 8, 2024
dce0ff9
fix: correct github action issues with escape and snakemake args
rroutsong Feb 8, 2024
fae01fb
fix: remove -r from snakemake
rroutsong Feb 8, 2024
7de775a
fix: remove profile designation from dry run execution
rroutsong Feb 8, 2024
7ba731a
fix: run ci on snakemake 7.32.4 in parallel to stable version
rroutsong Feb 8, 2024
f50e75c
fix: addresssing incompatibility with snakemake>=8.0.0
rroutsong Feb 8, 2024
20 changes: 20 additions & 0 deletions config/skyline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"sif": "/data/openomics/SIFs/",
"mounts": {
"kaiju": {
"to": "/opt/kaiju",
"from": "/data/openomics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10",
"mode": "ro"
},
"kraken2" : {
"to": "/opt/kraken2",
"from": "/data/openomics/references/weave/kraken2/k2_pluspfp_20230605",
"mode": "ro"
},
"fastq_screen" : {
"to": "/fdb/fastq_screen/FastQ_Screen_Genomes",
"from": "/data/openomics/references/weave/FastQ_Screen_Genomes",
"mode": "ro"
}
}
}
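The `mounts` map above pairs a host path (`from`) with a container path (`to`) and an access mode. A runner could translate each entry into a Singularity/Apptainer bind specification; the following is a minimal sketch under that assumption (`mounts_to_binds` is a hypothetical helper, not weave's actual code):

```python
import json

def mounts_to_binds(config: dict) -> list:
    """Turn a 'mounts' map like the one above into 'from:to:mode'
    bind strings (hypothetical helper, not weave's implementation)."""
    binds = []
    for name, mount in config.get("mounts", {}).items():
        binds.append(f"{mount['from']}:{mount['to']}:{mount['mode']}")
    return binds

cfg = json.loads("""{
  "mounts": {
    "kaiju": {
      "to": "/opt/kaiju",
      "from": "/data/openomics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10",
      "mode": "ro"
    }
  }
}""")
print(mounts_to_binds(cfg))
```

Each string produced here is the argument form accepted by `singularity exec --bind`.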
117 changes: 117 additions & 0 deletions docs/execution.md
@@ -0,0 +1,117 @@
**weave** can automatically distribute its pipeline jobs across a Slurm cluster. The context for its initial execution can also vary.

Execution context is closely tied to the configuration and setup of a particular cluster. weave is currently configured to work with the NIH clusters **skyline**, **biowulf**, and **bigsky**.


Typical contexts of execution include:

# srun (real-time execution, non-interactive)

The **weave** pipeline can be triggered from a head node in a non-interactive fashion:

## Bigsky/Skyline

!!! Note
Dependency files for skyline and bigsky differ <br />
**Bigsky: `/gs1/RTS/OpenOmics/bin/dependencies.sh`** <br />
**Skyline: `/data/openomics/bin/dependencies.sh`**

```bash
source ${dependencies}
srun --export=ALL weave run [keyword args] ${run_id}
```

!!! Note
    srun <a href="https://slurm.schedmd.com/srun.html#OPT_export">by default</a> exports all environment variables from the submitting environment, so `--export=ALL` may be omitted.

## Biowulf

```bash
srun --export=ALL bash -c "module load snakemake singularity; weave run [keyword args] ${run_id}"
```

# srun (real-time execution, interactive)

## Bigsky/Skyline

!!! Note
Dependency files for skyline and bigsky differ <br />
**Bigsky: `/gs1/RTS/OpenOmics/bin/dependencies.sh`** <br />
**Skyline: `/data/openomics/bin/dependencies.sh`**

```bash
# <head node>
srun --pty bash
# <compute node>
source ${dependencies}
weave run [keyword args] ${run_id}
```

## Biowulf

```bash
# <head node>
sinteractive
# <compute node>
module purge
module load snakemake singularity
weave run [keyword args] ${run_id}
```

Biowulf uses environment modules to manage software. After running the commands above, you should see messages similar to:

> [+] Loading snakemake 7.XX.X on cnXXXX<br />
> [+] Loading singularity 4.X.X on cnXXXX<br />

# sbatch (deferred execution)

## Bigsky/Skyline

### sbatch template
```bash title="<b>bigsky-skyline sbatch template</b>"
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH --export=ALL
#SBATCH --time=01-00:00:00
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH --mem=8g
#SBATCH --output=<stdout_file>_%j.out
source ${dependencies}
weave run \
-s /sequencing/root/dir \
-o output_dir \
<run_id>
```

The script above can serve as a template for a weave sbatch script. Update the pseudo-variables to suit your needs, then submit it with the `sbatch` command:

```bash
sbatch weave_script.sbatch
```

## Biowulf

### sbatch template
```bash title="<b>biowulf sbatch template</b>"
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH --export=ALL
#SBATCH --time=01-00:00:00
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH --mem=8g
#SBATCH --output=<stdout_file>_%j.out
module purge
module load snakemake singularity
weave run \
-s /sequencing/root/dir \
-o output_dir \
<run_id>
```

Submission works the same as on bigsky/skyline:

```bash
sbatch weave_script.sbatch
```
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -109,5 +109,6 @@ nav:
- weave run: usage/run.md
- weave cache: usage/cache.md
- Installation: install.md
- Execution context: execution.md
- Reference: ref/reference.md
- License: license.md
5 changes: 1 addition & 4 deletions requirements.txt
@@ -1,6 +1,3 @@
pandas
requests
terminaltables
pyyaml
tabulate
progressbar
python-dateutil
10 changes: 8 additions & 2 deletions scripts/cache.py
@@ -7,7 +7,6 @@
import subprocess
import json
import urllib.request
import progressbar
from argparse import ArgumentTypeError
from pathlib import Path
from urllib.parse import urlparse
@@ -81,7 +80,14 @@ def handle_download(output_dir, resource, protocol, url):
if protocol in ('http', 'https', 'ftp'):
info_download(f"Getting web resource {resource}...")
fnurl = Path(urlparse(url).path).stem
urllib.request.urlretrieve(uri, filename=Path(output_dir, fnurl), reporthook=DownloadProgressBar())
try:
import progressbar
urllib.request.urlretrieve(uri, filename=Path(output_dir, fnurl), reporthook=DownloadProgressBar())
except ModuleNotFoundError:
print('Downloading resources....')
urllib.request.urlretrieve(uri, filename=Path(output_dir, fnurl))
print('....done.')

elif protocol in ('docker'):
info_download(f"Getting docker resource {resource}...")
docker_tag = url.split('/')[-1]
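The change above imports `progressbar` lazily and degrades to a plain download when the package is missing. That guard can be factored into a small helper; this is a sketch, not weave's code, and `make_hook` stands in for the repo's `DownloadProgressBar` factory:

```python
import importlib.util
import urllib.request

def retrieve(url, dest, make_hook=None):
    """Download url to dest, attaching a urlretrieve reporthook only
    when the optional 'progressbar' package is importable (sketch of
    the fallback added above)."""
    hook = None
    if make_hook is not None and importlib.util.find_spec('progressbar') is not None:
        hook = make_hook()
    urllib.request.urlretrieve(url, filename=dest, reporthook=hook)
    return hook is not None  # whether a progress bar was used
```

Using `find_spec` avoids the try/except around the import while keeping `progressbar` strictly optional.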
36 changes: 31 additions & 5 deletions scripts/config.py
@@ -1,8 +1,12 @@
import re
import json
import traceback
import logging
from pathlib import Path
from os import access as check_access, R_OK
from os.path import expandvars, expanduser
from socket import gethostname
from uuid import uuid4
from collections import defaultdict


@@ -20,11 +24,11 @@ def get_current_server():
re_biowulf_head = (r"biowulf\.nih\.gov", "biowulf")
re_biowulf_compute = (r"cn\d{4}", "biowulf")

# locus hostnames
re_locus_head = (r"ai\-submit\d{1}", "locus")
re_locus_compute = (r"ai\-hpcn\d{3}", "locus")
# skyline hostnames
re_skyline_head = (r"ai-hpc(submit|n)(\d+)?", "skyline")
re_skyline_compute = (r"ai-hpc(submit|n)(\d+)?", "skyline")

host_profiles = [re_bigsky, re_biowulf_compute, re_biowulf_head, re_locus_compute, re_locus_head]
host_profiles = [re_bigsky, re_biowulf_compute, re_biowulf_head, re_skyline_head, re_skyline_compute]

host = None
for pat, this_host in host_profiles:
@@ -91,13 +95,14 @@ def get_resource_config():
return json.load(open(resource_json))


def base_config(keys=None, qc=True):
def base_config(keys=None, qc=True, slurm_id=None):
base_keys = ('runs', 'run_ids', 'project', 'rnums', 'bcl_files', \
'sample_sheet', 'samples', 'sids', 'out_to', 'demux_input_dir', \
'bclconvert', 'demux_data')
this_config = {k: [] for k in base_keys}
this_config['resources'] = get_resource_config()
this_config['runqc'] = qc
this_config['use_scratch'] = True if slurm_id else False

if keys:
for elem_key in keys:
@@ -147,6 +152,22 @@ def get_bigsky_seq_dirs():
return seq_dirs


def get_tmp_dir(host):
TMP_CONFIGS = {
'skyline': {'user': '/data/scratch/$USER/$SLURM_JOBID', 'global': '/data/scratch/$USER/' + str(uuid4())},
'bigsky': {'user': '/gs1/Scratch/$USER/$SLURM_JOBID', 'global': '/gs1/Scratch/$USER/' + str(uuid4())},
'biowulf': {'user': '/lscratch/$SLURM_JOBID', 'global': '/tmp/$USER/' + str(uuid4())}
}

this_tmp = TMP_CONFIGS[host]['user']

# use the per-job scratch path if its parent directory exists,
# otherwise fall back to the uuid-named global scratch path
if Path(this_tmp).parents[0].exists():
return this_tmp
else:
return TMP_CONFIGS[host]['global']


DIRECTORY_CONFIGS = {
"bigsky": {
"seqroot": "/gs1/RTS/NextGen/SequencerRuns/",
@@ -157,6 +178,11 @@ def get_bigsky_seq_dirs():
"seqroot": "/data/RTB_GRS/SequencerRuns/",
"seq": get_biowulf_seq_dirs(),
"profile": Path(Path(__file__).parent.parent, "utils", "profiles", "biowulf").resolve(),
},
"skyline": {
"seqroot": "/data/rtb_grs/SequencerRuns/",
"seq": get_bigsky_seq_dirs(),
"profile": Path(Path(__file__).parent.parent, "utils", "profiles", "skyline").resolve(),
}
}

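The new `get_tmp_dir` helper above picks a per-job scratch path when its parent directory exists and otherwise falls back to a uuid-named global path. Below is a standalone sketch of that selection rule; `pick_tmp_dir` is a hypothetical name, and unlike the diff it applies `expandvars` first, since `Path` treats `$USER`/`$SLURM_JOBID` as literal text (an assumption about the intent):

```python
from os.path import expandvars
from pathlib import Path

def pick_tmp_dir(host, tmp_configs):
    """Sketch of get_tmp_dir's rule: prefer the per-job scratch path
    when its parent directory exists; otherwise use the global
    fallback path. Expands $USER/$SLURM_JOBID before checking."""
    this_tmp = expandvars(tmp_configs[host]['user'])
    if Path(this_tmp).parent.exists():
        return this_tmp
    return tmp_configs[host]['global']
```

The parent-directory check matters because the job-specific directory itself (e.g. `/lscratch/$SLURM_JOBID` on biowulf) is created by the scheduler, not by weave.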
4 changes: 4 additions & 0 deletions scripts/files.py
@@ -155,6 +155,10 @@ def get_run_directories(runids, seq_dir=None, sheetname=None):
host = get_current_server()
seq_dirs = Path(seq_dir).absolute() if seq_dir else Path(DIRECTORY_CONFIGS[host]['seqroot'])
seq_contents = [_child for _child in seq_dirs.iterdir()]
for firstchild in seq_dirs.iterdir():
if firstchild.is_dir():
for secondchild in firstchild.iterdir():
seq_contents.append(secondchild)
seq_contents_names = [child for child in map(lambda d: d.name, seq_contents)]

run_paths, invalid_runs = [], []
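The loop added above extends the run-directory search one level below the sequencing root, so runs nested inside a vendor or project folder are still found. An equivalent standalone sketch (`collect_seq_contents` is a hypothetical name, using the same `pathlib` calls):

```python
from pathlib import Path

def collect_seq_contents(seq_dirs):
    """Return the top-level entries of seq_dirs plus the children of
    any top-level directory (the two-level scan added above)."""
    contents = list(Path(seq_dirs).iterdir())
    for child in Path(seq_dirs).iterdir():
        if child.is_dir():
            contents.extend(child.iterdir())
    return contents
```

Run names are then matched against `[p.name for p in contents]`, so a run id resolves whether it sits at the root or one level down.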
24 changes: 16 additions & 8 deletions scripts/samplesheet.py
@@ -65,16 +65,24 @@ def parse_sheet(self, sheet):
def process_v1_reads_section(self, section):
section = [x for x in section if set(x) != {','}]
r1, r2 = None, None
for i, line in enumerate(section, start=1):
if line.split(',')[0].isnumeric() and i == 1:
r1 = int(line.split(',')[0])
elif line.split(',')[0].isnumeric() and i == 2:
r2 = int(line.split(',')[0])
for i, line in enumerate(section):
this_line_name = line.split(',')[0]
this_line_val = line.split(',')[1]
if this_line_name.lower() in ('read01', 'read02') and this_line_val.isnumeric():
if this_line_name.endswith('1') and int(this_line_val) > 0:
r1 = int(this_line_val)
elif this_line_name.endswith('2') and int(this_line_val) > 0:
r2 = int(this_line_val)
elif this_line_name.isnumeric() and int(this_line_name) > 0:
if i == 0:
r1 = int(this_line_name)
elif i == 1:
r2 = int(this_line_name)
else:
self.process_simple_section([line])
if r1:
self.process_simple_section([line])
if r1 and r1 > 0:
setattr(self, 'Read01', r1)
if r2:
if r2 and r2 > 0:
setattr(self, 'Read02', r2)
return

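The reworked loop above accepts both v1 Reads-section layouts: bare cycle counts on their own lines, or named `Read01,<n>` / `Read02,<n>` pairs. A simplified standalone sketch of that logic (`parse_reads_section` is a hypothetical function, without the class's `process_simple_section` fallback):

```python
def parse_reads_section(lines):
    """Return (r1, r2) cycle counts from a v1 Reads section that uses
    either bare numbers ('151,') or named pairs ('Read01,151')."""
    r1, r2 = None, None
    rows = [x for x in lines if set(x) != {','}]  # drop ',,,'-only rows
    for i, line in enumerate(rows):
        name, _, rest = line.partition(',')
        val = rest.split(',')[0]
        if name.lower() in ('read01', 'read02') and val.isnumeric():
            if name.endswith('1'):
                r1 = int(val)
            else:
                r2 = int(val)
        elif name.isnumeric() and int(name) > 0:
            # bare numbers: first row is read 1, second is read 2
            if i == 0:
                r1 = int(name)
            elif i == 1:
                r2 = int(name)
    return r1, r2
```

Either layout yields the same `(Read01, Read02)` attributes, which is what closes the sample-sheet endedness bug the commit list mentions.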