Fix: #20

# Fixed Issues: - GQ always showed the default score of the variant. This has been changed to show a phread like score. This score is based on a Z-score which is used to determine how likely it is that the variant is true variant. - The SVSUPPORT flag was always visible even if no SNFJ file was used. This has been updated to be only present in the VCF if a SNFJ file has been used.
fritzsedlazeck · Apr 22, 2024 · 06135b8 · 06135b8
1 parent 0e96275
commit 06135b8
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 35 deletions.
diff --git a/spectre/analysis/analysis.py b/spectre/analysis/analysis.py
@@ -67,6 +67,8 @@ def __init__(self, coverage_file, outbed, outvcf, sample_id, spectre_args, metad
         self.merged_candidates = None
         # output
         self.chromosome_names_out = None
+        # SNFJ
+        self.snfj_breakpoints = False
         # DEV
         self.min_chr_length = 1e6
         self.dist_min_overwrite = 10000  # 10kb TODO
@@ -353,6 +355,7 @@ def refine_cnv_calls(self):
     def check_snfj_breakpoints(self, snfspc_file):
         bp = Breakpoints(snfspc_file, self.as_dev)
         bp.correlate_cnvs_with_breakpoints(cnv_candidates=self.cnv_calls_list, bin_size=self.bin_size)
+        self.snfj_breakpoints = True
 
     # Candidate CNV merge by distance and CNV type
     def merge_candidates(self, candidates_cnv_list, chromosome_name):
@@ -742,7 +745,7 @@ def cnv_result_vcf(self, method=""):
         # TODO add LoH if self.snv_loh is not None
         # TODO modify make_vcf
         output_vcf = os.path.join(os.path.join(self.output_directory, f'{method}{self.sample_id}.vcf.gz'))
-        vcf_output = outputWriter.VCFOutput(output_vcf, self.genome_info)
+        vcf_output = outputWriter.VCFOutput(output_vcf, self.genome_info, self.snfj_breakpoints)
         vcf_output.make_vcf(self.chromosome_names_out, self.merged_candidates, self.sample_id)
 
     # Plots

diff --git a/spectre/analysis/cnv_metrics.py b/spectre/analysis/cnv_metrics.py
@@ -449,7 +449,7 @@ def check_del_dup_borders(self, del_threshold: float, dup_threshold: float) -> t
                 dup_threshold = self.ploidy + 1
                 self.logger.warning(f"Using fallback to threshold of: {dup_threshold}")
         return del_threshold, dup_threshold
-    def evaluate_cnvs(self, cnv_calls=None, refined_cnvs: bool = False) -> dict:
+    def evaluate_cnvs(self, refined_cnvs: bool = False) -> dict:
         """
         Evaluating all submitted CNV calls, by applying statistical tests. Optionally, plotting the location of the CNVs
         on the global coverage samples per chr.
@@ -483,31 +483,31 @@ def evaluate_cnvs(self, cnv_calls=None, refined_cnvs: bool = False) -> dict:
                                              "pvalue": cnv_metrics_Z["pvalue"],
                                              "sample_score": cnv_metrics_Z["sample_score"]}
 
-                if self.as_dev and False:  # BUG line 385
-                    if not np.isnan(cnv_metrics_Z["cnv_call_mean"]):
-
-                        # pre check --> avoide div by 0
-                        if cnv_metrics_Z["cnv_call_mean"] == 0:
-                            x_index = 0
-                        else:
-                            if not np.isinf(cnv_metrics_Z["cnv_call_mean"]):
-                                x_index = int(cnv_metrics_Z["cnv_call_mean"] / width)
-                            else:
-                                x_index = int(x[-1])
-                        # check if out of bounds
-                        if x_index >= len(x):
-                            cnv_y = x[-1]
-                        else:
-                            cnv_y = int(x[x_index])
-                        cnv_y = int(cnv_y)
-                        annotation_endpoint_offset = 5 * (100 - x_index) * (
-                                1 - (cnv_y / 150)) + (max(x) / 10)  # np.random.randint(100,300)
-
-                        # Add
-                        ax.annotate(f"{cnv.id}", xy=(cnv_metrics_Z["cnv_call_mean"], cnv_y),
-                                    xytext=(cnv_metrics_Z["cnv_call_mean"], cnv_y + annotation_endpoint_offset),
-                                    rotation=60,
-                                    fontsize=12, arrowprops=dict(facecolor='green', shrink=0.05))
+                # if self.as_dev and False:  # BUG line 385
+                #     if not np.isnan(cnv_metrics_Z["cnv_call_mean"]):
+                #
+                #         # pre check --> avoide div by 0
+                #         if cnv_metrics_Z["cnv_call_mean"] == 0:
+                #             x_index = 0
+                #         else:
+                #             if not np.isinf(cnv_metrics_Z["cnv_call_mean"]):
+                #                 x_index = int(cnv_metrics_Z["cnv_call_mean"] / width)
+                #             else:
+                #                 x_index = int(x[-1])
+                #         # check if out of bounds
+                #         if x_index >= len(x):
+                #             cnv_y = x[-1]
+                #         else:
+                #             cnv_y = int(x[x_index])
+                #         cnv_y = int(cnv_y)
+                #         annotation_endpoint_offset = 5 * (100 - x_index) * (
+                #                 1 - (cnv_y / 150)) + (max(x) / 10)  # np.random.randint(100,300)
+                #
+                #         # Add
+                #         ax.annotate(f"{cnv.id}", xy=(cnv_metrics_Z["cnv_call_mean"], cnv_y),
+                #                     xytext=(cnv_metrics_Z["cnv_call_mean"], cnv_y + annotation_endpoint_offset),
+                #                     rotation=60,
+                #                     fontsize=12, arrowprops=dict(facecolor='green', shrink=0.05))
         if self.as_dev:
             self.logger.debug(f"Creating CNV distribution plot")
             fig, ax = plt.subplots()

diff --git a/spectre/util/outputWriter.py b/spectre/util/outputWriter.py
@@ -67,11 +67,12 @@ def format_vcf_line(self):
 
 
 class VCFOutput(object):
-    def __init__(self, output_file, genome_info):
+    def __init__(self, output_file, genome_info, sv_support: bool = False):
         self.supp_vec = {}
         self.output_vcf = output_file
         self.population_sample_ids = []
         self.genome_info = genome_info
+        self.sv_support = sv_support
         # example {'chromosomes': ['chr6'], 'chr_lengths': {'chr6': 170805979}}
 
     @staticmethod
@@ -105,7 +106,8 @@ def make_vcf_header(self):
                        '##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the SV">',
                        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of copy number variant">',
                        '##INFO=<ID=CN,Number=1,Type=Integer,Description="Estimated copy number status">',
-                       '##INFO=<ID=SVSUPPORT,Number=1,Type=String,Description="SV support">']
+                       '##INFO=<ID=SVSUPPORT,Number=1,Type=String,Description="Indicator if a SV support was found in a provided SNFJ file">',
+                       ]
         if population_mode:
             vcf_header += ['##INFO=<ID=SUPP_VEC,Number=1,Type=String,Description="Support vector">']
 
@@ -151,14 +153,15 @@ def vcf_result(self, chromosome_list, cnv_candidate_list):
                 vcf_line.QUAL = str(each_candidate.quality)
                 vcf_line.INFO = f"END={each_candidate.end};SVLEN={each_candidate.size};SVTYPE={each_candidate.type};" \
                                 f"CN={each_candidate.cn_status};"
-                vcf_line.INFO += f"SVSUPPORT={'TRUE' if each_candidate.sv_support else 'FALSE'};"
+                if self.sv_support:
+                    vcf_line.INFO += f"SVSUPPORT={'TRUE' if each_candidate.sv_support else 'FALSE'};"
 
                 # checking if any other CNV through merging supported the given CNV
                 vcf_line.supp_vec = self.supp_vec.copy()
                 if not each_candidate.support_cnv_calls:
                     vcf_line.format_data = [each_candidate.gt, f'{round(each_candidate.het_score, 2)}',
                                             f"{int(each_candidate.statistics['z-score']['sample_score'])}",
-                                            f'{round(each_candidate.median_raw_cov,2)}']
+                                            f'{round(each_candidate.median_raw_cov, 2)}']
                 else:
                     vcf_line.format_data = []
                     vcf_line.FORMAT += ":ID"  # ADD ID tag in format as everything that is following are IDs
@@ -169,7 +172,7 @@ def vcf_result(self, chromosome_list, cnv_candidate_list):
                             ids = []
                             scores = []
                             gts = []
-                            dps = [] # raw read depth
+                            dps = []  # raw read depth
                             # sample_id =""
                             for candidate in list(sample_value):
                                 ids.append(candidate.id)
@@ -189,10 +192,10 @@ def vcf_result(self, chromosome_list, cnv_candidate_list):
                                 dp = np.nan
                             else:
                                 dp = np.nanmean(dps)
-                            vcf_line.sample_format_data[sample_key] = [gt, "0.0", score,round(dp,2), ",".join(ids)]
+                            vcf_line.sample_format_data[sample_key] = [gt, "0.0", score, round(dp, 2), ",".join(ids)]
                             vcf_line.supp_vec[sample_key] = 1
                         else:
-                            vcf_line.sample_format_data[sample_key] = ["./.", "0.0", 0, 0.0,"NULL"]
+                            vcf_line.sample_format_data[sample_key] = ["./.", "0.0", 0, 0.0, "NULL"]
                     # add support vector only if population mode is active
                     vcf_line.INFO += ";SUPP_VEC=" + "".join([str(s) for s in vcf_line.supp_vec.values()])
                 vcf_lines.append(vcf_line.format_vcf_line())
@@ -226,11 +229,12 @@ def make_vcf(self, chromosome_list, cnv_candidate_list, sample_id, population_sa
 
         # clean up
         if os.path.isfile(self.output_vcf) and os.stat(self.output_vcf).st_size != 0:
-            os.remove(tmp_out) # remove the original unsorted vcf
+            os.remove(tmp_out)  # remove the original unsorted vcf
             # sorting file
             if os.path.exists(tmp_out + ".sort.tmp"):
                 os.remove(tmp_out + ".sort.tmp")
 
+
 class IntermediateFile(object):
     def __init__(self, output_dir: str, as_dev=False):
         self.output_dir = output_dir