-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_ANNOVAR.py
160 lines (133 loc) · 5.91 KB
/
run_ANNOVAR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import sys
import re
import os
import csv
vcf= ""
caller = "Veritas"
damaging = "use"
ancestry1KG = "EUR"
for arg in sys.argv:
vcfResult = re.search("^--vcf=(.*)",arg)
callerResult = re.search("^--caller=(.*)",arg)
damagingResult = re.search("^--damaging=(.*)",arg)
ancestryResult = re.search("^--ancestry=(.*)",arg)
helpResult = re.search("^--help",arg)
if vcfResult:
vcf = vcfResult.group(1)
if callerResult:
caller = callerResult.group(1)
if damagingResult:
damaging = damagingResult.group(1)
if ancestryResult:
ancestry1KG = ancestryResult.group(1)
if helpResult:
print "Usage: python combine_bams.py --vcf=file.vcf --caller=Veritas --damaging=use --ancestry=EUR\n"
print "--vcf : VCF file containing variants\n"
print "--damaging : Use damaging prediction for SNPs and small indels? 'skip' or 'use'\n"
print "--ancestry : 1000 Genomes Ancestry: EUR (European), AFR (African), AMR (Mixed/Central American), SAS (South Asian), EAS (East Asian), or ALL\n"
print "--caller : Algorithm used to call variants\n"
sys.exit()
if vcf =="":
print "You must provide a --vcf file name"
sys.exit()
if caller == "Veritas":
if damaging != "skip":
print "**Downloading 4 ANNOVAR Databases (might take a several minutes)**"
print "WARNING: If you have limited space on your hard drive, you may want to re-run this script with --damaging=skip"
else:
print "**Downloading 3 ANNOVAR Databases (might take a several minutes)**"
command = "annovar/annotate_variation.pl -buildver hg19 -downdb -webfrom annovar refGene annovar/humandb/"
os.system(command)
command = "annovar/annotate_variation.pl -buildver hg19 -downdb -webfrom annovar clinvar_20160302 annovar/humandb/"
os.system(command)
command = "annovar/annotate_variation.pl -buildver hg19 -downdb -webfrom annovar popfreq_all_20150413 annovar/humandb/"
os.system(command)
if damaging != "skip":
command = "annovar/annotate_variation.pl -buildver hg19 -downdb -webfrom annovar ljb26_all annovar/humandb/"
os.system(command)
print "**Creating Annotation Table (the whole-genome WGS file is quite large, so this may take several minutes)**"
print "NOTE: You may see error messages pop up - don't worry about those. You'll still probably get your final results."
annotationPrefix = "annovar_" + re.sub(".vcf$","",vcf)
annovarVar = annotationPrefix + ".avinput"
command = "annovar/convert2annovar.pl -format vcf4 -coverage 10 -fraction 0.3 " + vcf + " > " + annovarVar
os.system(command)
command = "annovar/table_annovar.pl " + annovarVar +" annovar/humandb/ -csvout -buildver hg19 -out " + annotationPrefix +" -protocol refGene,clinvar_20160302,popfreq_all_20150413 -operation g,f,f -nastring NA"
os.system(command)
annovarTable = annotationPrefix + ".hg19_multianno.csv"
filteredVar2 = annotationPrefix + "_ClinVar_Pathogenic_plus_Rare.txt"
outHandle = open(filteredVar2, 'w')
with open(annovarTable, 'rb') as csvfile:
reader = csv.reader(csvfile)
lineInfo = reader.next()
lineCount = 0
while len(lineInfo) > 1:
lineCount += 1
if lineCount > 1:
chr = lineInfo[0]
start = lineInfo[1]
stop = lineInfo[2]
ref = lineInfo[3]
alt = lineInfo[4]
genomicRegion = lineInfo[5]
geneResult = re.search("exonic|splicing|UTR|downstream|upstream",genomicRegion)
if geneResult:
ClinVar = lineInfo[10]
pathResult = re.search("Pathogenic|pathogenic",ClinVar)
if pathResult:
text = chr + "\t" + start + "\t" + stop + "\t" + ref + "\t" + alt + "\t" + "\n"
outHandle.write(text)
else:
#1KG
freq1 = 0
#ESP
freq2 = 0
#CG
freq3 = 0
if ancestry1KG == "ALL":
if (lineInfo[16] != "NA") & (lineInfo[16] != "."):
freq1 = float(lineInfo[16])
if (lineInfo[30] != "NA") & (lineInfo[30] != "."):
freq2 = float(lineInfo[30])
elif ancestry1KG == "AFR":
if (lineInfo[17] != "NA") & (lineInfo[17] != "."):
freq1 = float(lineInfo[17])
if (lineInfo[31] != "NA") & (lineInfo[31] != "."):
freq2 = float(lineInfo[31])
elif ancestry1KG == "AMR":
if (lineInfo[18] != "NA") & (lineInfo[18] != "."):
freq1 = float(lineInfo[18])
if (lineInfo[30] != "NA") & (lineInfo[30] != "."):
freq2 = float(lineInfo[30])
elif ancestry1KG == "EAS":
if (lineInfo[19] != "NA") & (lineInfo[19] != "."):
freq1 = float(lineInfo[19])
if (lineInfo[30] != "NA") & (lineInfo[30] != "."):
freq2 = float(lineInfo[30])
elif ancestry1KG == "EUR":
if (lineInfo[20] != "NA") & (lineInfo[20] != "."):
freq1 = float(lineInfo[20])
if (lineInfo[32] != "NA") & (lineInfo[32] != "."):
freq2 = float(lineInfo[32])
elif ancestry1KG == "SAS":
if (lineInfo[21] != "NA") & (lineInfo[21] != "."):
freq1 = float(lineInfo[21])
if (lineInfo[30] != "NA") & (lineInfo[30] != "."):
freq2 = float(lineInfo[30])
else:
print "Ancestry must be EUR, AFR (African), AMR (Mixed/Central American), SAS (South Asian), EAS (East Asian), or ALL"
sys.exit
if (lineInfo[33] != "NA") & (lineInfo[33] != "."):
freq3 = float(lineInfo[33])
if (freq1 < 0.01) & (freq2 < 0.01) & (freq3 < 0.01):
text = chr + "\t" + start + "\t" + stop + "\t" + ref + "\t" + alt + "\t" + "\n"
outHandle.write(text)
try:
lineInfo = reader.next()
except:
break
if damaging != "skip":
damPrefix = annotationPrefix + "_ClinVar_Pathogenic_plus_Rare_with_Damaging_Score"
command = "annovar/table_annovar.pl " + filteredVar2 +" annovar/humandb/ -csvout -buildver hg19 -out " + damPrefix +" -protocol refGene,ljb26_all -operation g,f -nastring NA"
os.system(command)
else:
print "Template currently only written for Veritas / VCF annoations"