Skip to content

Commit

Permalink
Fix for #26, #27 and additional changes
Browse files Browse the repository at this point in the history
Issues:
- #26 VCF was not converted to correctly when Spectre did not find any CNVs
- #27 VCF FORMAT issue with DP tag. Tag was changed from DP to CD (coverage read depth)
- issue with merging overlapping CNVs. They are now merging correctly
- min-cnv-len default value has been changed to 100kb (previously 1mb)

Further changes:
- Adding an execution timer for Spectre
  • Loading branch information
philippesanio committed Jul 15, 2024
1 parent b80fd32 commit 07e61de
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 18 deletions.
15 changes: 11 additions & 4 deletions spectre/analysis/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, coverage_file, outbed, outvcf, sample_id, spectre_args, metad
# SNFJ
self.snfj_breakpoints = False
# DEV
self.min_chr_length = 1e6
self.min_chr_length = 1e5 # 100kb
self.dist_min_overwrite = 10000 # 10kb TODO
self.max_std_outlier_rm = self.spectre_args.dev_max_std_outlier_rm # 5
self.mosdepth_cov_genome_chr_diff = self.spectre_args.dev_mosdepth_cov_genome_chr_diff # 0.10 # 10%
Expand Down Expand Up @@ -523,8 +523,9 @@ def clean_by_overlap(self, merged_candidates):
offset = 0

# Walk through candidates by index (idx)
for idx in range(len(merged_candidates) - 1):
current_cand = merged_candidates[idx]
idx = 0
while idx + offset < number_of_candidates:
current_cand = merged_candidates[idx+offset]
early_stop = False

# Walk through all following candidates (offset)
Expand Down Expand Up @@ -592,6 +593,7 @@ def clean_by_overlap(self, merged_candidates):
# # Remove the slave candidate from the list.
remove_idx.append(remove_candidate)
offset += 1
idx += 1

# remove the candidates that overlap
for idx in range(len(merged_candidates)):
Expand Down Expand Up @@ -640,7 +642,7 @@ def clean_by_threshold(self, merged_candidates):
n_outside_ci = del_outside + dup_outside
# check if more than 50% of the values are outside the confidence interval
has_more_values_outside_of_normal_cov_space = n_outside_ci > len(
each_cand.cov) * 0.5 # reduce FP due to noise
each_cand.cov) * self.spectre_args.qc_noise_allow # reduce FP due to noise. Allow by default is 50%

# if candidate coverage mean is outside the thresholds, keep it
if not (del_threshold <= threshold <= dup_threshold) and has_more_values_outside_of_normal_cov_space:
Expand Down Expand Up @@ -799,6 +801,11 @@ def write_intermediate_candidates(self) -> str:
loh_cnv_calls_list_dict = intermediate_output_writer.convert_candidates_to_dictionary(
self.snv_loh)

# check strict thresholds If None, they were not used due to the cancer mode
if self.cnv_metrics.strict_del_threshold is None:
self.cnv_metrics.strict_del_threshold = self.lower_2n_threshold
self.cnv_metrics.strict_dup_threshold = self.upper_2n_threshold

analysis_dict = {
"metadata": {"source": "spectre", "spectre_version": "0.2.alpha", "bin_size": int(self.bin_size),
"min_cn_len": int(self.candidate_final_threshold),
Expand Down
14 changes: 11 additions & 3 deletions spectre/spectre.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import os
import sys
import time

sys.dont_write_bytecode = True
import argparse
Expand Down Expand Up @@ -32,6 +33,7 @@ def __init__(self):
self.ploidy_chr_list = ""
self.min_cnv_len = 1000000
self.sample_coverage_overwrite = None
self.qc_noise_allow = 0.5
self.call_from_console = False
self.is_cancer = False
# dev/debug + hidden params
Expand Down Expand Up @@ -67,6 +69,7 @@ def set_params_from_args(self, user_args):
self.ploidy_chr_list = user_args.ploidy_chr
self.min_cnv_len = user_args.min_cnv_len
self.sample_coverage_overwrite = user_args.sample_coverage_overwrite
self.qc_noise_allow = user_args.qc_noise_allow
self.n_size = user_args.n_size
self.threads = user_args.threads
self.run_population_mode = user_args.run_population_mode
Expand Down Expand Up @@ -149,7 +152,7 @@ def outside_spectre_worker(si: dict):
class Spectre:
def __init__(self, as_dev=False):
# version
self.version = "0.2.1"
self.version = "0.2.1-patch-july15"
# get a custom logger & set the logging level
self.logger = logger.setup_log(__name__, as_dev)

Expand Down Expand Up @@ -518,7 +521,7 @@ def get_arguments():
subparser_cnv_caller.add_argument('-n', '--n-size', type=int, required=False, dest='n_size', default=5,
help='..., default = 5')
subparser_cnv_caller.add_argument('-mcl', '--min-cnv-len', type=int, required=False, dest='min_cnv_len',
default=1000000, help='..., default = 1000000')
default=100000, help='..., default = 100000')
subparser_cnv_caller.add_argument('-t', '--threads', type=int, required=False, dest='threads', default=1,
help='..., default = 1')
subparser_cnv_caller.add_argument('-i', '--population', action='store_true', required=False,
Expand All @@ -532,7 +535,8 @@ def get_arguments():

subparser_cnv_caller.add_argument('-a', '--cancer', action='store_true', required=False, dest='is_cancer',
default=False, help='..., default = False')

subparser_cnv_caller.add_argument('-qcna', '--qc-noise-allow', type=float, required=False,
dest='qc_noise_allow', default=0.5, help='..., float. default = 0.5')
# Loh
subparser_cnv_caller.add_argument('-lohkb', '--loh-min-snv-perkb', type=int, required=False, default=10,
dest='loh_min_snv_perkb', help='default = 10')
Expand Down Expand Up @@ -636,6 +640,8 @@ def main():
main_logger.info(f"Spectre input: {command} {' '.join(sys.argv[2:])}")
main_logger.debug(f'Debug output is enabled') if run_as_dev else None
spectre_run = Spectre(run_as_dev)
# add timer
start_time = time.time()
if command == "CNVCaller":
# ARGS: bin_size, coverage_file_bed, sample_id, output_dir, reference="", metadata="", ...
spectre_run.spectre_args.set_params_from_args(spectre_args)
Expand All @@ -655,6 +661,8 @@ def main():
else:
print(spectre_help)
pass
end_time = time.time() - start_time
main_logger.info(f"Execution time: {end_time:.2f} seconds")
sys.exit(0)


Expand Down
23 changes: 12 additions & 11 deletions spectre/util/outputWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __init__(self):
self.QUAL = "."
self.FILTER = "."
self.INFO = "."
self.FORMAT = "GT:HO:GQ:DP"
self.FORMAT = "GT:HO:GQ:CD"
self.format_data = []
self.sample_format_data = {}
self.supp_vec = {}
Expand Down Expand Up @@ -119,7 +119,7 @@ def make_vcf_header(self):
vcf_header += ['##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=HO,Number=2,Type=Float,Description="Homozygosity proportion">',
'##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">',
'##FORMAT=<ID=DP,Number=2,Type=Float,Description="Read depth">',
'##FORMAT=<ID=CD,Number=2,Type=Float,Description="Coverage read depth">',
'##FORMAT=<ID=ID,Number=1,Type=String,Description="Population ID of supporting CNV calls">'
]

Expand Down Expand Up @@ -176,27 +176,27 @@ def vcf_result(self, chromosome_list, cnv_candidate_list):
ids = []
scores = []
gts = []
dps = [] # raw read depth
cds = [] # raw read depth
# sample_id =""
for candidate in list(sample_value):
ids.append(candidate.id)
scores.append(np.float16(candidate.statistics['z-score']['sample_score']))
gts.append(candidate.gt)
# check if median_raw_cov exists
if candidate.median_raw_cov:
dps.append(candidate.median_raw_cov)
cds.append(candidate.median_raw_cov)
gt = Counter(gts).most_common(1)[0][0]
# check if scores or dps contains only nan values
# check if scores or cds contains only nan values
if all(np.isnan(x) for x in scores):
score = np.nan
else:
score = np.nanmean(scores)

if all(np.isnan(x) for x in dps):
dp = np.nan
if all(np.isnan(x) for x in cds):
cd = np.nan
else:
dp = np.nanmean(dps)
vcf_line.sample_format_data[sample_key] = [gt, "0.0", score, round(dp, 2), ",".join(ids)]
cd = np.nanmean(cds)
vcf_line.sample_format_data[sample_key] = [gt, "0.0", score, round(cd, 2), ",".join(ids)]
vcf_line.supp_vec[sample_key] = 1
else:
vcf_line.sample_format_data[sample_key] = ["./.", "0.0", 0, 0.0, "NULL"]
Expand All @@ -223,9 +223,10 @@ def make_vcf(self, chromosome_list, cnv_candidate_list, sample_id, population_sa
vcf_lines = self.vcf_result(chromosome_list, cnv_candidate_list)
file_handler.write(f'{vcf_header}\n')
file_handler.write(f'{vcf_sample_header}\n')
file_handler.write(f'{vcf_lines}\n')
if len(vcf_lines) > 0:
file_handler.write(f'{vcf_lines}\n')
file_handler.close()
if "gz" in self.output_vcf and len(vcf_lines) > 0:
if "gz" in self.output_vcf: # and len(vcf_lines) > 0:
# sort vcf file
bcftools.sort(f"-o", tmp_out + ".sort.tmp", tmp_out, catch_stdout=False)

Expand Down

0 comments on commit 07e61de

Please sign in to comment.