-
Notifications
You must be signed in to change notification settings - Fork 2
/
gfa_variants.pl
executable file
·368 lines (258 loc) · 9.54 KB
/
gfa_variants.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#!/usr/bin/env perl
use Getopt::Long;
use Pod::Usage;
use strict;
use warnings;
my $gfa_file;
my $chr_delim = '\.';
my $chr_prefix;
my $help;
my %paths = ();
my %seqs = ();
my %inv_head_nodes = ();
my @genos = ();
my $use_w_lines = 0;
parse_args();
parse_gfa_file();
call_vars();
exit(0);
sub call_vars {
my $print_header = 1;
foreach my $chr (sort keys %paths) {
my %node_counts = ();
my %links = ();
my %geno_linked_nodes = ();
my @ordered_nodes = ();
foreach my $geno (sort keys %{$paths{$chr}}) {
my ($rec_type, $path_name, $nodes, $overlaps);
my @nodes = ();
if ($use_w_lines == 0) {
($rec_type, $path_name, $nodes, $overlaps) = split(/\t/, $paths{$chr}{$geno});
@nodes = split(',', $nodes);
}
else {
my ($rec_type, $sample, $hap_index, $seq_id, $seq_start, $seq_end, $walk) = split(/\t/, $paths{$chr}{$geno});
my @walk_comps = split(/(<|>)/, $walk);
$path_name = $seq_id;
my $index = 1;
while ($index < $#walk_comps) {
my $orientation = '+';
if ($walk_comps[$index] eq '<') {
$orientation = '-';
}
my $node = "$walk_comps[$index + 1]"."$orientation";
push(@nodes, $node);
$index += 2;
}
}
foreach my $node_index (0..$#nodes - 1) {
my $node1 = $nodes[$node_index];
my $node2 = $nodes[$node_index + 1];
my $node1_orientation = '+';
my $node2_orientation = '+';
if ($node1 =~ s/([\-\+])$//) {
$node1_orientation = $1;
}
if ($node2 =~ s/([\-\+])$//) {
$node2_orientation = $1;
}
if ($node1_orientation eq '-') {
if ($node2_orientation eq '-') {
# double inversion, invert nodes and process as +/+ link
my $tmp = $node1;
$node1 = $node2;
$node2 = $tmp;
}
elsif ($node2_orientation eq '+') {
# negate node1 id to differentiate from +node1
$node1 = "-$node1";
$inv_head_nodes{$node1} = "path: $path_name\tnode1: $node1\tnode2: $node2";
}
}
if (! exists($links{$node1}{$node2})) {
$node_counts{$node1}++;
$links{$node1}{$node2} = $node_counts{$node1} - 1;
push(@ordered_nodes, $node1);
}
$geno_linked_nodes{$node1}{$geno} = $node2;
}
}
if ($print_header == 1) {
print(STDOUT "## fileformat=VCFvPanPipes\n");
print(STDOUT "## https://github.com/USDA-ARS-GBRU/PanPipes\n");
print(STDOUT "## input graph: $gfa_file\n");
print(STDOUT '#', join("\t", 'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', @genos), "\n");
$print_header = 0;
}
# head_node -> var_node -> link_node
my %proc_head_nodes = ();
foreach my $head_node (@ordered_nodes) {
if (exists($proc_head_nodes{$head_node})) {
next();
}
$proc_head_nodes{$head_node}++;
# monomorphic
if ($node_counts{$head_node} <= 1) {
next();
}
if (exists($inv_head_nodes{$head_node})) {
print(STDERR "warning: inverted head node\t$inv_head_nodes{$head_node}\n");
}
my $ref_node;
my $ref_seq;
my @alt_nodes;
my @alt_seqs;
foreach my $var_node (sort { $a <=> $b } keys %{$links{$head_node}}) {
if ($links{$head_node}{$var_node} == 0) {
$ref_node = $var_node;
}
else {
my $alt_index = $links{$head_node}{$var_node} - 1;
$alt_nodes[$alt_index] = $var_node;
}
}
my $alt_count = $#alt_nodes + 1;
my $ref_link_count = keys %{$links{$ref_node}};
my $alt_link_count = keys %{$links{$alt_nodes[0]}};
if ($alt_count == 1 && $ref_link_count == 1 && $alt_link_count == 1) {
my @ref_link_nodes = keys %{$links{$ref_node}};
my $ref_link_node = $ref_link_nodes[0];
my @alt_link_nodes = keys %{$links{$alt_nodes[0]}};
my $alt_link_node = $alt_link_nodes[0];
if ($ref_link_node eq $alt_nodes[0]) {
# insertion as ref
$ref_seq = $seqs{$ref_node};
@alt_seqs = ('-');
}
elsif ($alt_link_node eq $ref_node) {
# deletion as ref
$ref_seq = ('-');
@alt_seqs = ($seqs{$alt_nodes[0]});
}
elsif ($ref_link_node eq $alt_link_node) {
# substitution
$ref_seq = $seqs{$ref_node};
@alt_seqs = ($seqs{$alt_nodes[0]});
}
else {
$ref_seq = "long\.$ref_node";
@alt_seqs = @alt_nodes;
}
}
elsif ($alt_count > 1) {
$ref_seq = "multipath\.$ref_node";
@alt_seqs = @alt_nodes;
}
else {
$ref_seq = "dense\.$ref_node";
@alt_seqs = @alt_nodes;
}
my $vcf_rec = join("\t", $chr, $head_node, "$ref_node-$alt_nodes[0]", $ref_seq, join(',', @alt_seqs), '.', '.', '.', 'GT');
foreach my $geno (@genos) {
my $call = '.';
if (exists($geno_linked_nodes{$head_node}{$geno})) {
my $linked_node = $geno_linked_nodes{$head_node}{$geno};
$call = $links{$head_node}{$linked_node};
}
$vcf_rec .= "\t$call";
}
print(STDOUT "$vcf_rec\n");
}
delete($paths{$chr});
}
return(0);
}
sub parse_gfa_file {
my %genos = ();
open(GFA, '<', $gfa_file) or error("can't open $gfa_file: $!");
while (my $line = <GFA>) {
chomp($line);
if ($line =~ /^S\t/) {
my ($rec_type, $seg, $seq) = split(/\t/, $line);
$seqs{$seg} = $seq;
}
elsif ($line =~ /^P\t/) {
my ($rec_type, $path_name, $nodes, $overlaps) = split(/\t/, $line);
my ($path_geno, $path_chr) = split(/$chr_delim/, $path_name);
if (defined($chr_prefix)) {
$path_chr = "${chr_prefix}${path_chr}";
}
if (! defined($path_chr)) {
print(STDERR "warning: can't determine path chr from $path_name, using chr0\n");
print(STDERR "\tif $gfa_file contains chromosomes in the path names, use -d/--delim option to configure path name parsing\n");
$path_chr = 'chr0';
}
$genos{$path_geno}++;
$paths{$path_chr}{$path_geno} = $line;
}
elsif ($line =~ /^W\t/) {
my ($rec_type, $sample, $hap_index, $seq_id, $seq_start, $seq_end, $walk) = split(/\t/, $line);
if ($sample eq '_MINIGRAPH_') {
next();
}
$genos{$sample}++;
$paths{$seq_id}{$sample} = $line;
$use_w_lines = 1;
}
}
close(GFA);
@genos = sort(keys %genos);
print(STDERR "the following genotypes were found: ", join(', ', @genos), "\n");
print(STDERR "the following chromosomes were found: ", join(', ', sort(keys %paths)), "\n");
return(0);
}
sub error {
my $msg = shift();
if (defined($msg)) {
print(STDERR "error: $msg\n");
}
exit(0);
}
sub arg_error {
my $msg = shift();
if (defined($msg)) {
print(STDERR "error: $msg\n\n");
}
print("use option -h or --help to display help menu\n");
exit(0);
}
sub parse_args {
if ($#ARGV == -1) {
pod2usage(-exitval => 0, -verbose => 1);
}
GetOptions ('g|gfa=s' => \$gfa_file,
'd|delim=s' => \$chr_delim,
'p|prefix=s' => \$chr_prefix,
'h|help' => \$help) or error('cannot parse arguments');
if (defined($help)) {
pod2usage(-exitval => 0, -verbose => 1);
}
if (! defined($gfa_file)) {
arg_error('gfa file required');
}
return(0);
}
__END__
=head1 NAME
gfa_variants.pl
=head1 SYNOPSIS
gfa_variants.pl -g graph.gfa [options] > graph.vcf
gfa_variants.pl writes node-based GFA graph variants to a VCF-like file
Variants are based on branch positions in the graph. gfa_variants.pl understands when a reverse-oriented variant is the same as a forward variant and so produces biologically appropriate variants within long inversions that are ortholgous despite their structural difference. This behavior constrasts to available tools that we know of.
Output is VCF-like, but instead of the 'POS' field being expressed in linear coordinates they are instead ordered graph node ids. If you have used xmfa_tools.pl with sorting enabled (https://github.com/brianabernathy/xmfa_tools), the node order will reflect the primary sort reference that you specified (then secondary, etc.). In other words, they are collinear with the genome sequence of that reference. Though these node ids do not represent useful physical distances, they can be determined using gfa_nodes_to_linear_coords.pl and vcf_nodes_to_linear_coords.pl to convert graph-based vcfs to linear coordinates.
The gfa_variants.pl output POS field refers to the node id corresponding to the 'head node'. 'Allelic nodes', which serve as the REF and ALT fields, are the two possible branches extending from the head node. If these two branches return to the same node over after a single allelic node, node sequences are used to determine the explicit base change. Otherwise, variants are encoded as 'long', 'dense' or 'multipath' with the relevant node id suffix.
Simple indels are encoded differently than conventional vcf format. Instead of giving the reference base and the insertion (or vice versus for deletions), gfa_variants.pl uses the '-' character and the indel sequence. We find this more intuitive and in keeping with the variation graph concept, but this difference may break tools expecting a traditional vcf.
Inversion variants can result in a negative sign prefix in the 'POS' field head node id. Conventionally, this field represents the position in the reference genome and negative values may cause issues with tools that use vcfs. Note, the reciprocal variant of the inversion should be called regardless, so the variant information is still retained for most practical purposes.
=head1 AUTHOR
Brian Abernathy
=head1 OPTIONS
-g --gfa genome gfa file, vg-based (required)
-d --delim pattern used to split genotype from chromosome in path names
default: '\.'
-p --prefix prefix prepended to chromosome names
useful if part of the chromosome name is used to split path
names and therefore discarded
ex: -d '\.chr' -p 'chr'
default: none
-h --help display help menu
=cut