Skip to content

Commit

Permalink
feat: add disambiguate functionality to pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
Ryan Routsong committed Jan 10, 2024
1 parent 679ad59 commit e857d0f
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/doc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ jobs:
restore-keys: |
mkdocs-material-
- run: pip install -r docs/requirements.txt
- run: mkdocs gh-deploy --force
- run: mkdocs gh-deploy --forcescreen -r
6 changes: 3 additions & 3 deletions config/bigsky.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
"mounts": {
"kaiju": {
"to": "/opt/kaiju",
"from": "/gs1/RTS/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10",
"from": "/gs1/RTS/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10",
"mode": "ro"
},
"kraken2" : {
"to": "/opt/kraken2",
"from": "/gs1/RTS/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605",
"from": "/gs1/RTS/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605",
"mode": "ro"
},
"fastq_screen" : {
"to": "/fdb/fastq_screen/FastQ_Screen_Genomes",
"from": "/gs1/RTS/OpenOmics/references/Dmux/FastQ_Screen_Genomes",
"from": "/gs1/RTS/OpenOmics/references/weave/FastQ_Screen_Genomes",
"mode": "ro"
}
}
Expand Down
4 changes: 2 additions & 2 deletions config/biowulf.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
"mounts": {
"kaiju": {
"to": "/opt/kaiju",
"from": "/data/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10",
"from": "/data/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10",
"mode": "ro"
},
"kraken2" : {
"to": "/opt/kraken2",
"from": "/data/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605",
"from": "/data/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605",
"mode": "ro"
},
"fastq_screen" : {
Expand Down
38 changes: 38 additions & 0 deletions scripts/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,42 @@ def get_bigsky_seq_dirs():
"seq": get_biowulf_seq_dirs(),
"profile": Path(Path(__file__).parent.parent, "utils", "profiles", "biowulf").resolve(),
}
}


GENOME_CONFIGS = {
"biowulf": {
"hg19": "/vf/users/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz",
"grch37": "/vf/users/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz",
"hg38": "/vf/users/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz",
"grch38": "/vf/users/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz",
"mm9": "/vf/users/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz",
"grcm37": "/vf/users/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz",
"mm10": "/vf/users/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz",
"grcm38": "/vf/users/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz",
"mm39": "/vf/users/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz",
"grcm39": "/vf/users/OpenOmics/references/genomes/mouse/GRCm39/GRCm3s9.genome.fa.gz",
"rhemac10": "/vf/users/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz",
"mmul10": "/vf/users/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz",
"agm": "/vf/users/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz",
"mesaur": "/vf/users/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz",
"cynomac": "/vf/users/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz",
},
"bigsky": {
"hg19": "/gs1/RTS/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz",
"grch37": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz",
"hg38": "/gs1/RTS/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz",
"grch38": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz",
"mm9": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz",
"grcm37": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz",
"mm10": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz",
"grcm38": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz",
"mm39": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz",
"grcm39": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm39/GRCm39.genome.fa.gz",
"rhemac10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz",
"mmul10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz",
"agm": "/gs1/RTS/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz",
"mesaur": "/gs1/RTS/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz",
"cynomac": "/gs1/RTS/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz",
},
}
47 changes: 45 additions & 2 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

# ~~~ internals ~~~
from .files import parse_samplesheet, mk_or_pass_dirs
from .config import SNAKEFILE, DIRECTORY_CONFIGS, get_current_server, get_resource_config
from .config import SNAKEFILE, DIRECTORY_CONFIGS, GENOME_CONFIGS, get_current_server, get_resource_config


host = get_current_server()
Expand All @@ -40,6 +40,19 @@ def default(self, obj):
return super(self).default(obj)


def get_alias_table():
pp_tbl = lambda x: "\n".join([y.lstrip().rstrip() for y in x.split('\n')])
return pp_tbl("""+----------------+-------------------------------------------+
| Organism | Genomes supported (aka) |
+----------------+-------------------------------------------+
| HUMAN | hg19(grch37) / hg38(grch38) |
+----------------+-------------------------------------------+
| MOUSE | mm9(grcm37) / mm10(grcm38) / mm39(grcm39) |
+----------------+-------------------------------------------+
| RHESUS MACAQUE | RHEMAC10(mmul10) |
+----------------+-------------------------------------------+""")


def valid_runid(id_to_check):
'''
Given an input ID get it's validity against the run id format:
Expand Down Expand Up @@ -233,6 +246,9 @@ def exec_pipeline(configs, dry_run=False, local=False):
if not bclcon_log_dir.exists():
bclcon_log_dir.mkdir(mode=0o755, parents=True)
extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw")
if this_config['disambiguate']:
extra_to_mount.append(Path(this_config['host_genome']).parent)
extra_to_mount.append(Path(this_config['pathogen_genome']).parent)
singularity_binds = get_mounts(*extra_to_mount)
config_file = Path(this_config['out_to'], '.config', f'config_job_{str(i)}.json').absolute()
json.dump(this_config, open(config_file, 'w'), cls=PathJSONEncoder, indent=4)
Expand Down Expand Up @@ -275,4 +291,31 @@ def is_bclconvert(samplesheet):
check = False
if samplesheet.instrument in BCLCONVERT_INSTRUMENTS or samplesheet.platform in BCLCONVERT_PLATFORMS:
check = True
return check
return check


def valid_host_pathogen_genomes(host, pathogen):
g1, g2 = False, False
genomes = GENOME_CONFIGS[get_current_server()]

if Path(host).absolute().exists():
g1 = True
host = str(Path(host).absolute())
elif host.lower() in genomes:
g1 = True
host = genomes[host.lower()]

if Path(g2).absolute().exists():
g2 = True
pathogen = str(Path(pathogen).absolute())
elif pathogen.lower() in genomes:
g2 = True
pathogen = genomes[pathogen.lower()]

if not all([g1, g2]):
if not g1:
raise ValueError('Host genome does not exist on the file system or in the aliases.')
if not g2:
raise ValueError('Pathogen genome does not exist on the file system or in the aliases.')

return host, pathogen
24 changes: 18 additions & 6 deletions weave
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@ def run(args):
exec_config['bcl_files'].append(bcls)
exec_config['demux_data'].append(files.check_if_demuxed(rundir))

# ~~~ disambiguate genome configuration ~~~
exec_config['host_genome'] = None
exec_config['pathogen_genome'] = None
if all([args.host, args.pathogen]):
utils.valid_host_pathogen_genomes(args.host, args.pathogen)
exec_config['disambiguate'] = True
exec_config['host_genome'] = config.get_genome_path(args.host)
exec_config['pathogen_genome'] = config.get_genome_path(args.pathogen)
else:
assert(not any([args.host, args.pathogen])), 'Must specify both host and pathogen genometype!'

# ~~~ QC/QA configuration ~~~
exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet))
exec_config['run_ids'].append(rundir.name)
Expand Down Expand Up @@ -86,12 +97,13 @@ if __name__ == '__main__':
help='Name of the sample sheet file to look for (default is SampleSheet.csv)')
parser_run.add_argument('-l', '--local', action='store_true',
help='Execute pipeline locally without a dispatching executor')
# force endedness flags
endedness = parser_run.add_mutually_exclusive_group()
endedness.add_argument('--single_end', default=None, action='store_true',
help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to paired_end')
endedness.add_argument('--paired_end', default=None, action='store_true',
help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to single_end')

# disambiguate arguments
parser_run.add_argument('-h', '--host', type=files.valid_fasta, default=None,
help='Execute pipeline locally without a dispatching executor.')
alias_table = "\n" + utils.get_alias_table() + "\n"
parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None,
help='Execute pipeline locally without a dispatching executor. ' + alias_table)

parser_cache = sub_parsers.add_parser('cache')
parser_cache.add_argument('cachedir', metavar='<cache directory>', type=cache.valid_dir,
Expand Down
9 changes: 9 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ else:
if config["runqc"]:
all_outputs.extend(qa_qc_outputs)

if config['disambiguate']:
all_outputs.extend(flatten([
expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", **demux_expand_args),
expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", **demux_expand_args),
expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam", **demux_expand_args),
expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam", **demux_expand_args),
expand("{out_to}/{project}/{sids}/disambiguate/{sids}_summary.txt", **demux_expand_args),
]))


rule all:
input:
Expand Down
63 changes: 55 additions & 8 deletions workflow/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ else:

rule fastqc_untrimmed:
input:
samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz",
samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz",
output:
html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html",
fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip",
html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html",
fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip",
params:
output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/"
output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/"
log: config['out_to'] + "/logs/" + "/" + config["project"] + "/fastqc_untrimmed/{sids}_R{rnums}.log"
threads: 4
containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif"
Expand All @@ -36,12 +36,12 @@ rule fastqc_untrimmed:

rule fastqc_trimmed:
input:
in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz",
in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz",
output:
html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html",
fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip",
html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html",
fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip",
params:
output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/"
output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/"
containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif"
threads: 4
resources: mem_mb = 8096
Expand All @@ -53,6 +53,53 @@ rule fastqc_trimmed:
"""


rule bwa:
input:
in_read1 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config['disambiguate'] else [],
in_read2 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config['disambiguate'] and len(config['rnums']) == 2 else [],
output:
aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam"
aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam"
params:
host_genome = config['host_genome']
path_genome = config['pathogen_genome']
threads: 32
resources: mem_mb = 64768
containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.2.sif"
log: config['out_to'] + "/logs/" + config["project"] + "/bwa_mem/{sids}.log"
shell:
"""
bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoA} -
bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoB} -
"""


rule disambiguate:
input:
aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam"
aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam"
output:
ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam"
ambiguousB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam"
disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam"
disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam"
ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt"
params:
host_genome = config['host_genome']
path_genome = config['pathogen_genome']
this_sid = lambda wc: wc.sids
out_dir = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/"
containerized: config["resources"]["sif"] + "ngs_disambiguate_2018.05.03.sif"
log: config['out_to'] + "/logs/" + config["project"] + "/disambiguate/{sids}.log"
threads: 32
resources: mem_mb = 64768
shell:
"""
ngs_disambiguate -s {params.this_sid} -o {params.out_dir} -a bwa {input.aligntoA} {input.aligntoB}
"""



rule multiqc_report:
input:
# demux status
Expand Down

0 comments on commit e857d0f

Please sign in to comment.