-
Notifications
You must be signed in to change notification settings - Fork 5
/
split.pl
84 lines (68 loc) · 3.05 KB
/
split.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
# split.pl takes a VCF file and splits it into many files, each with approximately the given number of variant lines, ensuring that multiline variants all end up in the same file. The input VCF files can be bgzipped or uncompressed.
my $linenum = 1;
my $filenum = 1;
sub main::HELP_MESSAGE(); # Declaration to make perl happy
$Getopt::Std::STANDARD_HELP_VERSION = 1; # Make --help and --version flags halt execution
$main::VERSION = '0.1';
my %options = ();
getopts('hn:o:',\%options);
main::HELP_MESSAGE() && exit if $options{'h'} || !defined $ARGV[0];
my $linesperfile = (defined $options{'n'} ? $options{'n'} : 10000);
my $outputdir = (defined $options{'o'} ? $options{'o'} : ".");
mkdir $outputdir unless -d $outputdir;
die "Please provide a valid input file" unless -s $ARGV[0];
my $infile = $ARGV[0];
my $catprefix = ($infile =~ /\.gz$/ ? "z" : "");
print STDERR "Grabbing header\n";
system("${catprefix}cat $infile | grep \"^#\" > $outputdir/header.tmp") && die "Could not extract header from $infile: $!"; # Extract header
print STDERR "Sorting file\n";
open(IN,"${catprefix}cat $infile | grep -v \"^#\" | sort -k3,3n |") || die "Could not open $infile: $!"; # Remove header, sort, and open
open(OUT,"| sort -k1,1V -k2,2n | cat $outputdir/header.tmp - | bgzip -c > $outputdir/split1.vcf.gz") || die "Cannot pipe to bgzip: $!";
my $lastid;
print STDERR "Starting file 1\n";
while(my $line = <IN>) {
my $id = (split(/\s+/,$line))[2];
if ($linenum >= $linesperfile) { # Need to split
if ($id =~ /_/) { # Last line is multiline variant
my ($event) = ($id =~ /^(.*)_/);
my ($lastevent) = ($lastid =~ /^(.*)_/);
while($lastevent ne $event) { # Keep printing until a new event is found
print OUT $line;
$lastevent = $event;
$line = <IN>;
$id = (split(/\s+/,$line))[2];
if ($id =~ /_/) { # Get next line's event ID from its variant ID
($event) = ($id =~ /^(.*)_/);
} else {
$event = $id;
}
}
}
# Close old file and open a new one
print OUT $line;
close OUT;
$filenum++;
$linenum = 1;
#my $filenumstring = ($filenum < 10 ? "0$filenum" : $filenum);
open(OUT,"| sort -k1,1V -k2,2n | cat $outputdir/header.tmp - | bgzip -c > $outputdir/split$filenum.vcf.gz") || die "Cannot pipe to bgzip: $!";
print STDERR "Starting file $filenum\n";
} else { # Don't need to split
print OUT $line;
$linenum++;
}
$lastid = $id;
}
close OUT;
unlink("$outputdir/header.tmp") || die "Could not delete header.tmp: $!";
sub main::HELP_MESSAGE() {
print STDERR "split.pl takes a VCF file and splits it into many files, each with approximately the given number of variant lines, ensuring that multiline variants all end up in the same file. The input VCF files can be bgzipped or uncompressed.
usage: perl split.pl [-n linesperfile] [-o outputdir] input.vcf[.gz]
-n Number of lines per file (1000)
-o Output directory (defaults to current directory)i
-h Display this message
--help Display this message
--version Display version\n"
}