Fix for #26, #27 and additional changes

Issues: - #26 VCF was not converted to correctly when Spectre did not find any CNVs - #27 VCF FORMAT issue with DP tag. Tag was changed from DP to CD (coverage read depth) - issue with merging overlapping CNVs. They are now merging correctly - min-cnv-len default value has been changed to 100kb (previously 1mb) Further changes: - Adding an execution timer for Spectre
fritzsedlazeck · Jul 15, 2024 · 07e61de · 07e61de
1 parent b80fd32
commit 07e61de
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 18 deletions.
diff --git a/spectre/analysis/analysis.py b/spectre/analysis/analysis.py
@@ -70,7 +70,7 @@ def __init__(self, coverage_file, outbed, outvcf, sample_id, spectre_args, metad
         # SNFJ
         self.snfj_breakpoints = False
         # DEV
-        self.min_chr_length = 1e6
+        self.min_chr_length = 1e5  # 100kb
         self.dist_min_overwrite = 10000  # 10kb TODO
         self.max_std_outlier_rm = self.spectre_args.dev_max_std_outlier_rm  # 5
         self.mosdepth_cov_genome_chr_diff = self.spectre_args.dev_mosdepth_cov_genome_chr_diff  # 0.10  # 10%
@@ -523,8 +523,9 @@ def clean_by_overlap(self, merged_candidates):
         offset = 0
 
         # Walk through candidates by index (idx)
-        for idx in range(len(merged_candidates) - 1):
-            current_cand = merged_candidates[idx]
+        idx = 0
+        while idx + offset < number_of_candidates:
+            current_cand = merged_candidates[idx+offset]
             early_stop = False
 
             # Walk through all following candidates (offset)
@@ -592,6 +593,7 @@ def clean_by_overlap(self, merged_candidates):
                     # # Remove the slave candidate from the list.
                     remove_idx.append(remove_candidate)
                     offset += 1
+            idx += 1
 
         # remove the candidates that overlap
         for idx in range(len(merged_candidates)):
@@ -640,7 +642,7 @@ def clean_by_threshold(self, merged_candidates):
             n_outside_ci = del_outside + dup_outside
             # check if more than 50% of the values are outside the confidence interval
             has_more_values_outside_of_normal_cov_space = n_outside_ci > len(
-                each_cand.cov) * 0.5  # reduce FP due to noise
+                each_cand.cov) * self.spectre_args.qc_noise_allow # reduce FP due to noise. Allow by default is 50%
 
             # if candidate coverage mean is outside the thresholds, keep it
             if not (del_threshold <= threshold <= dup_threshold) and has_more_values_outside_of_normal_cov_space:
@@ -799,6 +801,11 @@ def write_intermediate_candidates(self) -> str:
             loh_cnv_calls_list_dict = intermediate_output_writer.convert_candidates_to_dictionary(
                 self.snv_loh)
 
+        # check strict thresholds If None, they were not used due to the cancer mode
+        if self.cnv_metrics.strict_del_threshold is None:
+            self.cnv_metrics.strict_del_threshold = self.lower_2n_threshold
+            self.cnv_metrics.strict_dup_threshold = self.upper_2n_threshold
+
         analysis_dict = {
             "metadata": {"source": "spectre", "spectre_version": "0.2.alpha", "bin_size": int(self.bin_size),
                          "min_cn_len": int(self.candidate_final_threshold),

diff --git a/spectre/spectre.py b/spectre/spectre.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import os
 import sys
+import time
 
 sys.dont_write_bytecode = True
 import argparse
@@ -32,6 +33,7 @@ def __init__(self):
         self.ploidy_chr_list = ""
         self.min_cnv_len = 1000000
         self.sample_coverage_overwrite = None
+        self.qc_noise_allow = 0.5
         self.call_from_console = False
         self.is_cancer = False
         # dev/debug + hidden params
@@ -67,6 +69,7 @@ def set_params_from_args(self, user_args):
         self.ploidy_chr_list = user_args.ploidy_chr
         self.min_cnv_len = user_args.min_cnv_len
         self.sample_coverage_overwrite = user_args.sample_coverage_overwrite
+        self.qc_noise_allow = user_args.qc_noise_allow
         self.n_size = user_args.n_size
         self.threads = user_args.threads
         self.run_population_mode = user_args.run_population_mode
@@ -149,7 +152,7 @@ def outside_spectre_worker(si: dict):
 class Spectre:
     def __init__(self, as_dev=False):
         # version
-        self.version = "0.2.1"
+        self.version = "0.2.1-patch-july15"
         # get a custom logger & set the logging level
         self.logger = logger.setup_log(__name__, as_dev)
 
@@ -518,7 +521,7 @@ def get_arguments():
     subparser_cnv_caller.add_argument('-n', '--n-size', type=int, required=False, dest='n_size', default=5,
                                       help='..., default = 5')
     subparser_cnv_caller.add_argument('-mcl', '--min-cnv-len', type=int, required=False, dest='min_cnv_len',
-                                      default=1000000, help='..., default = 1000000')
+                                      default=100000, help='..., default = 100000')
     subparser_cnv_caller.add_argument('-t', '--threads', type=int, required=False, dest='threads', default=1,
                                       help='..., default = 1')
     subparser_cnv_caller.add_argument('-i', '--population', action='store_true', required=False,
@@ -532,7 +535,8 @@ def get_arguments():
 
     subparser_cnv_caller.add_argument('-a', '--cancer', action='store_true', required=False, dest='is_cancer',
                                       default=False, help='..., default = False')
-
+    subparser_cnv_caller.add_argument('-qcna', '--qc-noise-allow', type=float, required=False,
+                                      dest='qc_noise_allow', default=0.5, help='..., float. default = 0.5')
     # Loh
     subparser_cnv_caller.add_argument('-lohkb', '--loh-min-snv-perkb', type=int, required=False, default=10,
                                       dest='loh_min_snv_perkb', help='default = 10')
@@ -636,6 +640,8 @@ def main():
     main_logger.info(f"Spectre input: {command} {' '.join(sys.argv[2:])}")
     main_logger.debug(f'Debug output is enabled') if run_as_dev else None
     spectre_run = Spectre(run_as_dev)
+    # add timer
+    start_time = time.time()
     if command == "CNVCaller":
         # ARGS:  bin_size, coverage_file_bed, sample_id, output_dir, reference="", metadata="", ...
         spectre_run.spectre_args.set_params_from_args(spectre_args)
@@ -655,6 +661,8 @@ def main():
     else:
         print(spectre_help)
         pass
+    end_time = time.time() - start_time
+    main_logger.info(f"Execution time: {end_time:.2f} seconds")
     sys.exit(0)
 
 

diff --git a/spectre/util/outputWriter.py b/spectre/util/outputWriter.py
@@ -49,7 +49,7 @@ def __init__(self):
         self.QUAL = "."
         self.FILTER = "."
         self.INFO = "."
-        self.FORMAT = "GT:HO:GQ:DP"
+        self.FORMAT = "GT:HO:GQ:CD"
         self.format_data = []
         self.sample_format_data = {}
         self.supp_vec = {}
@@ -119,7 +119,7 @@ def make_vcf_header(self):
         vcf_header += ['##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
                        '##FORMAT=<ID=HO,Number=2,Type=Float,Description="Homozygosity proportion">',
                        '##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">',
-                       '##FORMAT=<ID=DP,Number=2,Type=Float,Description="Read depth">',
+                       '##FORMAT=<ID=CD,Number=2,Type=Float,Description="Coverage read depth">',
                        '##FORMAT=<ID=ID,Number=1,Type=String,Description="Population ID of supporting CNV calls">'
                        ]
 
@@ -176,27 +176,27 @@ def vcf_result(self, chromosome_list, cnv_candidate_list):
                             ids = []
                             scores = []
                             gts = []
-                            dps = []  # raw read depth
+                            cds = []  # raw read depth
                             # sample_id =""
                             for candidate in list(sample_value):
                                 ids.append(candidate.id)
                                 scores.append(np.float16(candidate.statistics['z-score']['sample_score']))
                                 gts.append(candidate.gt)
                                 # check if median_raw_cov exists
                                 if candidate.median_raw_cov:
-                                    dps.append(candidate.median_raw_cov)
+                                    cds.append(candidate.median_raw_cov)
                             gt = Counter(gts).most_common(1)[0][0]
-                            # check if scores or dps contains only nan values
+                            # check if scores or cds contains only nan values
                             if all(np.isnan(x) for x in scores):
                                 score = np.nan
                             else:
                                 score = np.nanmean(scores)
 
-                            if all(np.isnan(x) for x in dps):
-                                dp = np.nan
+                            if all(np.isnan(x) for x in cds):
+                                cd = np.nan
                             else:
-                                dp = np.nanmean(dps)
-                            vcf_line.sample_format_data[sample_key] = [gt, "0.0", score, round(dp, 2), ",".join(ids)]
+                                cd = np.nanmean(cds)
+                            vcf_line.sample_format_data[sample_key] = [gt, "0.0", score, round(cd, 2), ",".join(ids)]
                             vcf_line.supp_vec[sample_key] = 1
                         else:
                             vcf_line.sample_format_data[sample_key] = ["./.", "0.0", 0, 0.0, "NULL"]
@@ -223,9 +223,10 @@ def make_vcf(self, chromosome_list, cnv_candidate_list, sample_id, population_sa
         vcf_lines = self.vcf_result(chromosome_list, cnv_candidate_list)
         file_handler.write(f'{vcf_header}\n')
         file_handler.write(f'{vcf_sample_header}\n')
-        file_handler.write(f'{vcf_lines}\n')
+        if len(vcf_lines) > 0:
+            file_handler.write(f'{vcf_lines}\n')
         file_handler.close()
-        if "gz" in self.output_vcf and len(vcf_lines) > 0:
+        if "gz" in self.output_vcf: # and len(vcf_lines) > 0:
             # sort vcf file
             bcftools.sort(f"-o", tmp_out + ".sort.tmp", tmp_out, catch_stdout=False)