Add mix and match functionality for mutation calling (resolves #265)

BD2KGenomics · May 15, 2018 · 466c800 · 466c800
1 parent 101565a
commit 466c800
Show file tree

Hide file tree

Showing 12 changed files with 87 additions and 26 deletions.
diff --git a/MANUAL.md b/MANUAL.md
@@ -362,8 +362,10 @@ be substituted with S3 links. Descriptions for creating all files can be found i
             java_Xmx: 5G                                          -> The heap size to use for MuTect
                                                                      per job (i.e. per chromosome)
             version: 1.1.7
+            run: True                                             -> Switch to skip calling mutect
         muse:
             version: 1.0rc_submission_b391201
+            run: True                                             -> Switch to skip calling muse
         radia:                                                -> Radia uses perchrom bed files in
                                                                  folders as references.
             cosmic_beds: /path/to/radia_cosmic.tar.gz
@@ -372,17 +374,20 @@ be substituted with S3 links. Descriptions for creating all files can be found i
             pseudogene_beds: /path/to/radia_pseudogenes.tar.gz
             gencode_beds: /path/to/radia_gencode.tar.gz
             version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
+            run: True                                             -> Switch to skip calling radia
         somaticsniper:
             version: 1.0.4
             samtools:                                           -> pileup reads
                 version: 0.1.8
             bam_readcount:                                      -> obtain readcounts
                 version: 0.7.4
+            run: True                                            -> Switch to skip calling somaticsniper
         strelka:
             version: 1.0.15
             config_file: /path/to/strelka_config.ini.tar.gz       -> The Strelka config file for a
                                                                      bwa run (modified for a WXS run
                                                                      if necessary)
+            run: True                                           -> Switch to skip calling strelka
         star_fusion:
             run: True                                           -> Switch to skip fusion calling
             version: 1.0.0

diff --git a/src/protect/mutation_annotation/snpeff.py b/src/protect/mutation_annotation/snpeff.py
@@ -39,14 +39,15 @@ def run_snpeff(job, merged_mutation_file, univ_options, snpeff_options):
     :return: fsID for the snpeffed vcf
     :rtype: toil.fileStore.FileID
     """
+    if merged_mutation_file is None:
+        return None
     work_dir = os.getcwd()
     input_files = {
         'merged_mutations.vcf': merged_mutation_file,
         'snpeff_index.tar.gz': snpeff_options['index']}
     input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
     input_files['snpeff_index'] = untargz(input_files['snpeff_index.tar.gz'], work_dir)
     input_files = {key: docker_path(path) for key, path in input_files.items()}
-
     parameters = ['eff',
                   '-dataDir', input_files['snpeff_index'],
                   '-c', '/'.join([input_files['snpeff_index'],

diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py
@@ -63,12 +63,30 @@ def run_mutation_aggregator(job, mutation_results, univ_options):
     """
     # Setup an input data structure for the merge function
     out = {}
-    for chrom in mutation_results['mutect'].keys():
-        out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
-                                       univ_options).rv()
-    merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
-    job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
-    return merged_snvs.rv()
+    chroms = {}
+    # Extract the chromosomes from a mutation caller if at least one mutation caller is selected.  All callers should
+    # have the same chromosomes.
+    for caller in mutation_results:
+        if mutation_results[caller] is None:
+            continue
+        else:
+            if caller == 'strelka':
+                if mutation_results['strelka']['snvs'] is None:
+                    continue
+                chroms = mutation_results['strelka']['snvs'].keys()
+            else:
+                chroms = mutation_results[caller].keys()
+            break
+    if chroms:
+        for chrom in chroms:
+            out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
+                                           univ_options).rv()
+        merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
+        job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
+        return merged_snvs.rv()
+    else:
+        return None
+
 
 
 def merge_perchrom_mutations(job, chrom, mutations, univ_options):
@@ -100,30 +118,26 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
                      'indels': {'strelka_indels': process_strelka_vcf
                                 }
                      }
-    #                 'fusions': lambda x: None,
-    #                 'indels': lambda x: None}
-    # For now, let's just say 2 out of n need to call it.
-    # num_preds = len(mutations)
-    # majority = int((num_preds + 0.5) / 2)
-    majority = {'snvs': 2,
-                'indels': 1}
-
     accepted_hits = defaultdict(dict)
-
     for mut_type in vcf_processor.keys():
         # Get input files
         perchrom_mutations = {caller: vcf_processor[mut_type][caller](job, mutations[caller][chrom],
                                                                       work_dir, univ_options)
-                              for caller in vcf_processor[mut_type]}
+                              for caller in vcf_processor[mut_type]
+                              if mutations[caller] is not None}
+        if not perchrom_mutations:
+            continue
         # Process the strelka key
-        perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
-        perchrom_mutations.pop('strelka_' + mut_type)
+        if 'strelka' + mut_type in perchrom_mutations:
+            perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
+            perchrom_mutations.pop('strelka_' + mut_type)
+        majority = 1 if len(perchrom_mutations) <= 2 else (len(perchrom_mutations) + 1) / 2
         # Read in each file to a dict
         vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()}
         all_positions = list(set(itertools.chain(*vcf_lists.values())))
         for position in sorted(all_positions):
             hits = {caller: position in vcf_lists[caller] for caller in perchrom_mutations.keys()}
-            if sum(hits.values()) >= majority[mut_type]:
+            if sum(hits.values()) >= majority:
                 callers = ','.join([caller for caller, hit in hits.items() if hit])
                 assert position[1] not in accepted_hits[position[0]]
                 accepted_hits[position[0]][position[1]] = (position[2], position[3], callers)

diff --git a/src/protect/mutation_calling/indel.py b/src/protect/mutation_calling/indel.py
@@ -27,7 +27,9 @@ def run_indel_caller(job, tumor_bam, normal_bam, univ_options, indel_options):
     :return: fsID to the merged fusion calls
     :rtype: toil.fileStore.FileID
     """
-    job.fileStore.logToMaster('INDELs are currently unsupported.... Skipping.')
+    job.fileStore.logToMaster('INDELs currently occur only through strelka.')
+    if indel_options['run'] is False:
+        return None
     indel_file = job.fileStore.getLocalTempFile()
     output_file = job.fileStore.writeGlobalFile(indel_file)
     job.fileStore.logToMaster('Ran INDEL on %s successfully' % univ_options['patient'])

diff --git a/src/protect/mutation_calling/muse.py b/src/protect/mutation_calling/muse.py
@@ -78,6 +78,8 @@ def run_muse(job, tumor_bam, normal_bam, univ_options, muse_options):
                  +- 'chrM': fsID
     :rtype: dict
     """
+    if muse_options['run'] is False:
+        return None
     # Get a list of chromosomes to handle
     if muse_options['chromosomes']:
         chromosomes = muse_options['chromosomes']

diff --git a/src/protect/mutation_calling/mutect.py b/src/protect/mutation_calling/mutect.py
@@ -75,6 +75,8 @@ def run_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
                  +- 'chrM': fsID
     :rtype: dict
     """
+    if mutect_options['run'] is False:
+        return None
     # Get a list of chromosomes to handle
     if mutect_options['chromosomes']:
         chromosomes = mutect_options['chromosomes']

diff --git a/src/protect/mutation_calling/radia.py b/src/protect/mutation_calling/radia.py
@@ -87,6 +87,8 @@ def run_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options):
                  +- 'chrM': fsID
     :rtype: dict
     """
+    if radia_options['run'] is False:
+        return None
     if 'rna_genome' in rna_bam.keys():
         rna_bam = rna_bam['rna_genome']
     elif set(rna_bam.keys()) == {'rna_genome_sorted.bam', 'rna_genome_sorted.bam.bai'}:

diff --git a/src/protect/mutation_calling/somaticsniper.py b/src/protect/mutation_calling/somaticsniper.py
@@ -85,6 +85,8 @@ def run_somaticsniper(job, tumor_bam, normal_bam, univ_options, somaticsniper_op
                  +- 'chrM': fsID
     :rtype: toil.fileStore.FileID|dict
     """
+    if somaticsniper_options['run'] is False:
+        return None
     # Get a list of chromosomes to handle
     if somaticsniper_options['chromosomes']:
         chromosomes = somaticsniper_options['chromosomes']

diff --git a/src/protect/mutation_calling/strelka.py b/src/protect/mutation_calling/strelka.py
@@ -45,7 +45,10 @@ def run_strelka_with_merge(job, tumor_bam, normal_bam, univ_options, strelka_opt
     :param dict univ_options: Dict of universal options used by almost all tools
     :param dict strelka_options: Options specific to strelka
     :return: fsID to the merged strelka calls
-    :rtype: toil.fileStore.FileID
+    :rtype: dict(str, toil.fileStore.FileID)
+              |-'snvs': fsID
+              |
+              +-'indels': fsID
     """
     spawn = job.wrapJobFn(run_strelka, tumor_bam, normal_bam, univ_options,
                           strelka_options, split=False).encapsulate()
@@ -63,8 +66,9 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
     :param dict univ_options: Dict of universal options used by almost all tools
     :param dict strelka_options: Options specific to strelka
     :param bool split: Should the results be split into perchrom vcfs?
-    :return: Either the fsID to the genome-level vcf or a dict of results from running strelka
-             on every chromosome
+    :return: Either the a dict of results from running strelka on every chromosome, or a dict of running strelka on the
+             whole genome
+
              perchrom_strelka:
                  |- 'chr1':
                  |      |-'snvs': fsID
@@ -77,8 +81,16 @@ def run_strelka(job, tumor_bam, normal_bam, univ_options, strelka_options, split
                  +- 'chrM':
                         |-'snvs': fsID
                         +-'indels': fsID
-    :rtype: toil.fileStore.FileID|dict
+
+             genome_strelka:
+                 |-'snvs': fsID
+                 |
+                 +-'indels': fsID
+
+    :rtype: dict
     """
+    if strelka_options['run'] is False:
+            return {'snvs': None, 'indels': None}
     if strelka_options['chromosomes']:
         chromosomes = strelka_options['chromosomes']
     else:

diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py
@@ -354,6 +354,8 @@ def _parse_config_file(job, config_file, max_cores=None):
 
     # Flags to check for presence of encryption keys if required
     gdc_inputs = ssec_encrypted = False
+    # Flag to check if a sample without an input vcf/bedpe was provided
+    sample_without_variants = False
     for key in input_config.keys():
         if key == 'patients':
             # Ensure each patient contains the required entries
@@ -373,6 +375,9 @@ def _parse_config_file(job, config_file, max_cores=None):
                         raise ParameterError('Cannot run ProTECT using GDC RNA bams. Please fix '
                                              'sample %s' % sample_name)
                     gdc_inputs = True
+                if ('mutation_vcf' not in sample_set[sample_name]
+                        and 'fusion_bedpe' not in sample_set[sample_name]):
+                    sample_without_variants = True
         else:
             # Ensure the required entries exist for this key
             _ensure_set_contains(input_config[key], required_keys[key], key)
@@ -427,6 +432,10 @@ def _parse_config_file(job, config_file, max_cores=None):
                                                                 'token.')
     # Get all the tool inputs
     job.fileStore.logToMaster('Obtaining tool inputs')
+    if (sample_without_variants and all(tool_options[k]['run'] is False
+                                        for k in mutation_caller_list
+                                        if k not in ('indexes', 'fusion_inspector'))):
+        raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.")
     process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options,
                                             mutation_caller_list=mutation_caller_list)
     job.fileStore.logToMaster('Obtained tool inputs')
@@ -670,7 +679,7 @@ def launch_protect(job, patient_data, univ_options, tool_options):
                                      bam_files['normal_dna'].rv(), univ_options,
                                      tool_options['strelka']).encapsulate(),
             'indels': job.wrapJobFn(run_indel_caller, bam_files['tumor_dna'].rv(),
-                                    bam_files['normal_dna'].rv(), univ_options, 'indel_options',
+                                    bam_files['normal_dna'].rv(), univ_options, {'run': False},
                                     disk='100M', memory='100M', cores=1)}
         for sample_type in 'tumor_dna', 'normal_dna':
             for caller in mutations:

diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml
@@ -60,17 +60,22 @@ mutation_calling:
     mutect:
         java_Xmx: 2G
         version: 1.1.7
+        run: True
     muse:
         version: 1.0rc_submission_b391201
+        run: True
     radia:
         version: bcda721fc1f9c28d8b9224c2f95c440759cd3a03
+        run: True
     somaticsniper:
+        run: True
         version: 1.0.4
         samtools:
             version: 0.1.8
         bam_readcount:
             version: 0.7.4
     strelka:
+        run: True
         version: 1.0.15
 
     star_fusion:

diff --git a/src/protect/pipeline/input_parameters.yaml b/src/protect/pipeline/input_parameters.yaml
@@ -95,17 +95,21 @@ mutation_calling:
         dbsnp_tbi: S3://protect-data/hg38_references/dbsnp_coding.vcf.gz.tbi
     mutect:
         java_Xmx: 2G
+        # run: True
         # version: 1.1.7
     muse:
+        # run: True
         # version: 1.0rc_submission_b391201
     radia:
         cosmic_beds: S3://protect-data/hg38_references/radia_cosmic.tar.gz
         dbsnp_beds: S3://protect-data/hg38_references/radia_dbsnp.tar.gz
         retrogene_beds: S3://protect-data/hg38_references/radia_retrogenes.tar.gz
         pseudogene_beds: S3://protect-data/hg38_references/radia_pseudogenes.tar.gz
         gencode_beds: S3://protect-data/hg38_references/radia_gencode.tar.gz
+        # run: True
         # version: 398366ef07b5911d8082ed61cbf03d487a41f286
     somaticsniper:
+        # run: True
         # version: 1.0.4
         samtools:
             # version: 0.1.8
@@ -118,6 +122,7 @@ mutation_calling:
         #run_trinity: True
         #version: 1.0.1
     strelka:
+        # run: True
         # version: 1.0.15
         config_file: S3://protect-data/hg38_references/strelka_bwa_WXS_config.ini.tar.gz