Skip to content

Commit

Permalink
Adding quinine QC functions, and quinine_pipelines (resolves BD2KGeno…
Browse files Browse the repository at this point in the history
…mics#380)

* Adds the quinine tool (https://github.com/bigdatagenomics/quinine) along with
  helper functions for running QC on targeted sequencing panels, RNA-seq data,
  and for estimating sample contamination.
* Instantiates these as command line tools in
  `toil_scripts.quinine_pipelines.metrics`.
  • Loading branch information
fnothaft committed Jul 26, 2016
1 parent 3f2d497 commit fef077d
Show file tree
Hide file tree
Showing 4 changed files with 479 additions and 0 deletions.
Empty file.
57 changes: 57 additions & 0 deletions src/toil_scripts/quinine_pipelines/adam_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from subprocess import check_call

from toil_scripts.adam_pipeline.adam_preprocessing import call_adam

def __call_adam_native(cmd, memory, native_path):
'''
Calls ADAM running in Spark local mode, where ADAM is not in a docker container.
:param list<str> cmd: ADAM command line arguments
:param int memory: Amount of memory in GB to allocate.
:param str native_path: String path to the ADAM install directory.
'''

check_call(['%s/bin/adam-submit' % native_path,
'--master', 'local[*]',
'--conf', 'spark.driver.memory=%dg' % memory,
'--'] + cmd)


def bam_to_adam_native(bam, parquet, memory, native_path):
'''
Converts a BAM file into an ADAM AlignmentRecord Parquet file.
:param str bam: Path to input SAM/BAM file.
:param str parquet: Path to save Parquet file at.
:param int memory: Amount of memory in GB to allocate.
:param str native_path: String path to the ADAM install directory.
'''

__call_adam_native(['transform', bam, parquet], memory, native_path)


def feature_to_adam_native(feature, parquet, memory, native_path):
'''
Converts a feature file (e.g., BED, GTF, GFF) into an ADAM Feature Parquet
file.
:param str feature: Path to input BED/GTF/GFF/NarrowPeak file.
:param str parquet: Path to save Parquet file at.
:param int memory: Amount of memory in GB to allocate.
:param str native_path: String path to the ADAM install directory.
'''

__call_adam_native(['features2adam', feature, parquet], memory, native_path)


def vcf_to_adam_native(vcf, parquet, memory, native_path):
'''
Converts a VCF file into an ADAM Genotype Parquet file.
:param str bam: Path to input VCF file.
:param str parquet: Path to save Parquet file at.
:param int memory: Amount of memory in GB to allocate.
:param str native_path: String path to the ADAM install directory.
'''

__call_adam_native(['vcf2adam', vcf, parquet], memory, native_path)
255 changes: 255 additions & 0 deletions src/toil_scripts/quinine_pipelines/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#!/usr/bin/env python2.7

import argparse
import os

from toil.job import Job

from toil_scripts.tools.qc import ( call_quinine_af_native,
call_quinine_contamination_native,
call_quinine_hs_native,
call_quinine_rna_native )
from toil_scripts.quinine_pipelines.adam_helpers import ( bam_to_adam_native,
feature_to_adam_native )

def run_rna_qc(job,
reads, transcriptome, output_path,
memory,
adam_native_path, quinine_native_path):
'''
Runs a job that computes various RNA-seq quality control statistics.
:param toil.job.Job job: The toil job running this function.
:param str reads: Path to the input reads in SAM/BAM format.
:param str transcriptome: Path to the transcriptome definition (GTF/GFF).
:param str output_path: Path to write the statistics at.
:param int memory: GB of memory to allocate.
:param str adam_native_path: The path where ADAM is installed.
:param str quinine_native_path: The path where Quinine is installed.
'''

# get a temp work directory
local_dir = job.fileStore.getLocalTempDir()

# convert the reads to ADAM format
adam_reads = os.path.join(local_dir, 'reads.adam')
bam_to_adam_native(reads, adam_reads, memory, adam_native_path)

# convert the features to ADAM format
adam_features = os.path.join(local_dir, 'transcriptome.adam')
feature_to_adam_native(transcriptome, adam_features, memory, adam_native_path)

# run the qc job
call_quinine_rna_native(adam_reads, adam_features, output_path,
local_dir,
memory, quinine_native_path)


def run_targeted_qc(job,
reads, bait, targets, output_path,
memory,
adam_native_path, quinine_native_path):
'''
Runs a job that computes various quality control statistics for reads
sequenced using a hybrid-selection panel that requires targeting.
:param toil.job.Job job: The toil job running this function.
:param str reads: Path to the input reads in SAM/BAM format.
:param str bait: Path to the description of the baited regions.
:param str targets: Path to the description of the regions targeted.
:param str output_path: Path to write the statistics at.
:param int memory: GB of memory to allocate.
:param str adam_native_path: The path where ADAM is installed.
:param str quinine_native_path: The path where Quinine is installed.
'''

# get a temp work directory
local_dir = job.fileStore.getLocalTempDir()

# convert the reads to ADAM format
adam_reads = os.path.join(local_dir, 'reads.adam')
bam_to_adam_native(reads, adam_reads, memory, adam_native_path)

# convert the bait features to ADAM format
adam_bait = os.path.join(local_dir, 'bait.adam')
feature_to_adam_native(bait, adam_bait, memory, adam_native_path)

# convert the target features to ADAM format
adam_targets = os.path.join(local_dir, 'targets.adam')
feature_to_adam_native(targets, adam_targets, memory, adam_native_path)

# run the metrics
call_quinine_hs_native(adam_reads, adam_targets, adam_bait, output_path,
local_dir,
memory, quinine_native_path)


def run_contamination_estimation(job,
reads, population, sample_vcf, output_path,
memory,
adam_native_path, quinine_native_path):
'''
Runs a job that computes various quality control statistics for reads
sequenced using a hybrid-selection panel that requires targeting.
:param toil.job.Job job: The toil job running this function.
:param str reads: Path to the input reads in SAM/BAM format.
:param str bait: Path to the description of the baited regions.
:param str targets: Path to the description of the regions targeted.
:param str output_path: Path to write the statistics at.
:param int memory: GB of memory to allocate.
:param str adam_native_path: The path where ADAM is installed.
:param str quinine_native_path: The path where Quinine is installed.
'''

# get a temp work directory
local_dir = job.fileStore.getLocalTempDir()

# convert the reads to ADAM format
adam_reads = os.path.join(local_dir, 'reads.adam')
bam_to_adam_native(reads, adam_reads, memory, adam_native_path)

# convert the population vcf to ADAM format
adam_population = os.path.join(local_dir, 'population.adam')
vcf_to_adam_native(population, adam_population, memory, adam_native_path)

# convert the sample vcf to ADAM format
adam_calls = os.path.join(local_dir, 'sample.adam')
vcf_to_adam_native(sample_vcf, adam_calls, memory, adam_native_path)

# compute MAF's
maf_annotations = os.path.join(local_dir, 'mafs.adam')
call_quinine_af_native(adam_population, maf_annotations,
local_dir,
memory,
quinine_native_path)

# estimate contamination
call_quinine_contamination_native(adam_reads,
adam_calls,
maf_annotations,
output_path,
local_dir,
memory,
native_path)


def __add_common_args(parser):
'''
Adds commonly used arguments to a subparser.
:param argparse.Subparser parser: A subparser to add arguments to.
'''

parser.add_argument('--output',
help='Location to write outputs to.',
required=True)
parser.add_argument('--memory',
help='The amount of memory to allocate, in GB. Defaults to 1.',
type=int,
default=1)
parser.add_argument('--adam_native_path',
help='The native path where ADAM is installed.'
'Defaults to /opt/cgl-docker-lib/adam',
default='/opt/cgl-docker-lib/adam',
type=str)
parser.add_argument('--quinine_native_path',
help='The native path where Quinine is installed.'
'Defaults to /opt/cgl-docker-lib/quinine',
default='/opt/cgl-docker-lib/quinine',
type=str)
Job.Runner.addToilOptions(parser)


def main():
'''
Parses arguments and starts the job.
'''

# build the argument parser
parser = argparse.ArgumentParser()

# we run three different commands: hs, cont, rna
subparsers = parser.add_subparsers(dest='command')
parser_rna = subparsers.add_parser('rna', help='Runs the RNA QC metrics.')
parser_hs = subparsers.add_parser('targeted',
help='Runs the QC metrics for a targeted sequencing protocol.')
parser_cont = subparsers.add_parser('contamination',
help='Runs the contamination estimator.')

# add arguments to the rna panel
parser_rna.add_argument('--reads',
help='The RNA-seq reads.',
type=str,
required=True)
parser_rna.add_argument('--transcriptome',
help='The transcriptome description (e.g., a GENCODE GTF)',
type=str,
required=True)
__add_common_args(parser_rna)

# add arguments to the hs panel
parser_hs.add_argument('--reads',
help='The aligned reads.',
type=str,
required=True)
parser_hs.add_argument('--bait',
help='The bait used for capturing this panel.',
type=str,
required=True)
parser_hs.add_argument('--targets',
help='The regions covered by this panel.',
type=str,
required=True)
__add_common_args(parser_hs)

# add arguments for contaimination estimation
parser_cont.add_argument('--reads',
help='The aligned reads.',
type=str,
required=True)
parser_cont.add_argument('--population',
help='The VCF to derive allele frequencies from.',
type=str,
required=True)
parser_cont.add_argument('--sample-vcf',
help='A VCF containing known genotypes for the sample.',
type=str,
required=True)
__add_common_args(parser_cont)

# parse the arguments
args = parser.parse_args()

# check which command got called, and set up and run
if args.command == 'rna':
Job.Runner.startToil(Job.wrapJobFn(run_rna_qc,
args.reads,
args.transcriptome,
args.output,
args.memory,
args.adam_native_path,
args.quinine_native_path), args)
elif args.command == 'targeted':
Job.Runner.startToil(Job.wrapJobFn(run_targeted_qc,
args.reads,
args.bait,
args.output,
args.targets,
args.memory,
args.adam_native_path,
args.quinine_native_path), args)
elif args.command == 'contamination':
Job.Runner.startToil(Job.wrapJobFn(run_contamination_estimation,
args.reads,
args.population,
args.sample_vcf,
args.output,
args.memory,
args.adam_native_path,
args.quinine_native_path), args)
else:
raise ValueError('Unknown command: %s' % args.command)

if __name__ == '__main__':
main()
Loading

0 comments on commit fef077d

Please sign in to comment.