Skip to content

Commit

Permalink
Add new --regions-overlap option
Browse files Browse the repository at this point in the history
Add new --regions-overlap option which allows to take into account overlapping deletions
that start out of the fasta file target region.

Note this will need to be updated if the issue samtools/htslib#1746
is resolved.

Resolves #2091
  • Loading branch information
pd3 committed Feb 12, 2024
1 parent 7a4f801 commit d25e3f1
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 26 deletions.
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@

Changes affecting specific commands:

* bcftools consensus

- Add new --regions-overlap option which allows to take into account overlapping deletions
that start out of the fasta file target region.

* bcftools norm

- Change the order of atomization and multiallelic splitting (when both -a,-m are given)
Expand Down
95 changes: 69 additions & 26 deletions consensus.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2014-2023 Genome Research Ltd.
Copyright (c) 2014-2024 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -118,7 +118,7 @@ typedef struct
char **argv;
int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied;
uint8_t *iupac_bitmask, *iupac_als;
int miupac_bitmask, miupac_als;
int miupac_bitmask, miupac_als, regions_overlap;
char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
char mark_del, mark_ins, mark_snv;
smpl_ilist_t *smpl;
Expand Down Expand Up @@ -348,20 +348,28 @@ static void init_region(args_t *args, char *line)
args->prev_base_pos = -1;
args->fa_buf.l = 0;
args->fa_length = 0;
args->fa_end_pos = to;
args->fa_ori_pos = from;
args->fa_src_pos = from;
args->fa_end_pos = to; // 0-based
args->fa_ori_pos = from; // 0-based
args->fa_src_pos = from; // 0-based
args->fa_mod_off = 0;
args->fa_frz_pos = -1;
args->fa_frz_mod = -1;
args->fa_case = -1;
args->vcf_rbuf.n = 0;


// bcf_sr_set_regions accepts 1-based coordinates
kstring_t str = {0,0,0};
if ( from==0 ) from = 1;
if ( to==0 ) to = HTS_POS_MAX;
if ( !from ) from = 1;
else from++;
#ifndef MAX_CSI_COOR
#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1)
#endif
if ( to==0 ) to = MAX_CSI_COOR - 1;
else to++;
ksprintf(&str,"%s:%"PRIhts_pos"-%"PRIhts_pos,line,from,to);
bcf_sr_set_regions(args->files,line,0);
bcf_sr_set_opt(args->files,BCF_SR_REGIONS_OVERLAP,args->regions_overlap);
bcf_sr_set_regions(args->files,str.s,0);
free(str.s);

if ( tmp_ptr ) *tmp_ptr = tmp;
Expand Down Expand Up @@ -790,15 +798,42 @@ static void apply_variant(args_t *args, bcf1_t *rec)

}

char *ref_allele = rec->d.allele[0];
char *alt_allele = rec->d.allele[ialt];
int rmme_alt = 0;

int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; // position of the variant within the modified fasta sequence
if ( idx<0 )
{
fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
return;
if ( alt_allele[0]=='<' ) // symbolic allele
{
rec->pos -= idx + 1;
rec->rlen += idx + 1;
idx = -1;
}
else if ( strlen(ref_allele) < -idx ) // the ref allele is shorter but overlaps the fa sequence? This should never happen
{
assert(0);
fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
return;
}
else if ( strlen(alt_allele) > -idx ) // the ref allele overlaps the fa and so does the alt allele
{
rec->pos -= idx;
rec->rlen += idx;
ref_allele -= idx;
alt_allele -= idx;
idx = 0;
}
else // the ref allele overlaps the fa but alt allele does not: trim to leave one base before
{
rec->pos -= idx + 1;
rec->rlen += idx + 1;
ref_allele -= idx + 1;
alt_allele += strlen(alt_allele) - 1;
idx = -1;
}
}
if ( rec->rlen > args->fa_buf.l - idx )
{
Expand All @@ -813,7 +848,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
}
}
if ( idx>=args->fa_buf.l )
if ( idx>0 && idx>=args->fa_buf.l )
error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);

// sanity check the reference base
Expand All @@ -828,7 +863,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( !strcasecmp(alt_allele,"<DEL>") )
{
static int multibase_ref_del_warned = 0;
if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned )
if ( ref_allele[1]!=0 && !multibase_ref_del_warned )
{
fprintf(stderr,
"Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n"
Expand All @@ -837,15 +872,15 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
if ( args->mark_del ) // insert dashes instead of delete sequence
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del);
alt_allele = mark_del(ref_allele, rec->rlen, NULL, args->mark_del);
alen = rec->rlen;
len_diff = 0;
rmme_alt = 1;
}
else
{
len_diff = 1-rec->rlen;
alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event
alt_allele = ref_allele; // according to VCF spec, the first REF base must precede the event
alen = 1;
}
}
Expand All @@ -856,18 +891,18 @@ static void apply_variant(args_t *args, bcf1_t *rec)
return;
}
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
else if ( idx>=0 && strncasecmp(ref_allele,args->fa_buf.s+idx,rec->rlen) )
{
// This is hacky, handle a special case: if SNP or an insert follows a deletion (AAC>A, C>CAA),
// the reference base in fa_buf is lost and the check fails. We do not keep a buffer
// with the original sequence as it should not be necessary, we should encounter max
// one base overlap

int fail = 1;
if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
if ( args->prev_base_pos==rec->pos && toupper(ref_allele[0])==toupper(args->prev_base) )
{
if ( rec->rlen==1 ) fail = 0;
else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
else if ( !strncasecmp(ref_allele+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
}

if ( fail )
Expand All @@ -883,7 +918,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
" REF .vcf: [%s]\n"
" ALT .vcf: [%s]\n"
" REF .fa : [%s]%c%s\n",
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx,
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, ref_allele, alt_allele, args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
Expand All @@ -892,7 +927,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)

if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
len_diff = 0;
rmme_alt = 1;
Expand All @@ -905,23 +940,24 @@ static void apply_variant(args_t *args, bcf1_t *rec)

if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alt_allele = mark_del(ref_allele, rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
len_diff = 0;
rmme_alt = 1;
}
}

args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER;
int safe_idx = idx<0 ? 0 : idx; // idx can be negative in case of overlapping deletion
args->fa_case = toupper(args->fa_buf.s[safe_idx])==args->fa_buf.s[safe_idx] ? TO_UPPER : TO_LOWER;
if ( args->fa_case==TO_UPPER )
for (i=0; i<alen; i++) alt_allele[i] = toupper(alt_allele[i]);
else
for (i=0; i<alen; i++) alt_allele[i] = tolower(alt_allele[i]);

if ( args->mark_ins && len_diff>0 )
mark_ins(rec->d.allele[0], alt_allele, args->mark_ins);
mark_ins(ref_allele, alt_allele, args->mark_ins);
if ( args->mark_snv )
mark_snv(rec->d.allele[0], alt_allele, args->mark_snv);
mark_snv(ref_allele, alt_allele, args->mark_snv);

if ( len_diff <= 0 )
{
Expand Down Expand Up @@ -953,7 +989,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
// 1 C T
// 1 C CAA
int ibeg = 0;
while ( ibeg<alen && rec->d.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
while ( ibeg<alen && ref_allele[ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++;
for (i=ibeg; i<alen; i++)
args->fa_buf.s[idx+i] = alt_allele[i];

Expand All @@ -962,7 +998,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if (args->chain && len_diff != 0)
{
// If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0)
if ( strncasecmp(ref_allele,alt_allele,1) == 0)
{
// ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
Expand Down Expand Up @@ -1135,6 +1171,7 @@ static void usage(args_t *args)
fprintf(stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n");
fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
fprintf(stderr, " -p, --prefix STRING Prefix to add to output sequence names\n");
fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n");
fprintf(stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
fprintf(stderr, "Examples:\n");
Expand All @@ -1151,6 +1188,7 @@ int main_consensus(int argc, char *argv[])
{
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
args->regions_overlap = 1;

static struct option loptions[] =
{
Expand All @@ -1172,6 +1210,7 @@ int main_consensus(int argc, char *argv[])
{"absent",1,0,'a'},
{"chain",1,0,'c'},
{"prefix",required_argument,0,'p'},
{"regions-overlap",required_argument,0,5},
{0,0,0,0}
};
int c;
Expand All @@ -1192,6 +1231,10 @@ int main_consensus(int argc, char *argv[])
else if ( !optarg[1] && optarg[0]>32 && optarg[0]<127 ) args->mark_snv = optarg[0];
else error("The argument is not recognised: --mark-snv %s\n",optarg);
break;
case 5 :
args->regions_overlap = parse_overlap_option(optarg);
if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
break;
case 'p': args->chr_prefix = optarg; break;
case 's': args->sample = optarg; break;
case 'S': args->sample_fname = optarg; break;
Expand Down
10 changes: 10 additions & 0 deletions test/consensus.22.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1:11-19
ACGTACGT
>2:11-19
ACGTACGT
>3:11-19
ACGTACGT
>4:11-19
ACGTACGT
>5:11-19
ACGTACGT
20 changes: 20 additions & 0 deletions test/consensus.22.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
##fileformat=VCFv4.2
##reference=file://some/path/human_g1k_v37.fasta
##contig=<ID=1>
##contig=<ID=2>
##contig=<ID=3>
##contig=<ID=4>
##contig=<ID=5>
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a
1 8 . ACGA A . . . GT 1/1
1 12 . C A . . . GT 1/1
2 8 . ACGA TTTT . . . GT 1/1
2 12 . C A . . . GT 1/1
3 10 . A T . . . GT 1/1
3 12 . C A . . . GT 1/1
4 8 . A <DEL> . . END=11 GT 1/1
4 12 . C A . . . GT 1/1
5 8 . ACGA ACG . . . GT 1/1
5 12 . C A . . . GT 1/1
10 changes: 10 additions & 0 deletions test/consensus22.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1:11-19
AAGTACGT
>2:11-19
AAGTACGT
>3:11-19
AAGTACGT
>4:11-19
AAGTACGT
>5:11-19
AAGTACGT
10 changes: 10 additions & 0 deletions test/consensus22.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1:11-19
.AGTACGT
>2:11-19
TAGTACGT
>3:11-19
AAGTACGT
>4:11-19
.AGTACGT
>5:11-19
.AGTACGT
10 changes: 10 additions & 0 deletions test/consensus22.3.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>1:11-19
AGTACGT
>2:11-19
TAGTACGT
>3:11-19
AAGTACGT
>4:11-19
AGTACGT
>5:11-19
AGTACGT
6 changes: 6 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,12 @@
run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.3.out',fa=>'consensus.20.fa',args=>'-M . -s b');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.20',out=>'consensus20.4.out',fa=>'consensus.20.fa',args=>'-M . -s a');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.21',out=>'consensus21.1.out',fa=>'consensus.21.fa',args=>'');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.1.out',fa=>'consensus.22.fa',args=>'--regions-overlap 0 --mark-del .');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.2.out',fa=>'consensus.22.fa',args=>'--regions-overlap 1 --mark-del .');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.2.out',fa=>'consensus.22.fa',args=>'--regions-overlap 2 --mark-del .');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.1.out',fa=>'consensus.22.fa',args=>'--regions-overlap 0');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.3.out',fa=>'consensus.22.fa',args=>'--regions-overlap 1');
run_test(\&test_vcf_consensus,$opts,in=>'consensus.22',out=>'consensus22.3.out',fa=>'consensus.22.fa',args=>'--regions-overlap 2');
run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1);
run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite
run_test(\&test_mpileup,$opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite
Expand Down

0 comments on commit d25e3f1

Please sign in to comment.