-
Notifications
You must be signed in to change notification settings - Fork 10
/
calculate_phastcons.py
61 lines (45 loc) · 1.7 KB
/
calculate_phastcons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
'''
the script is written by Mikael Hussius @ SciLifeLab, https://github.com/hussius/gff-phastcons-human.
slightly modified to work after the bigwig file is downloaded locally.
'''
import pyBigWig as pw
import numpy as np
import sys
if len(sys.argv)<4:
sys.exit("USAGE: python " + sys.argv[0] + "<GFF file with regions> <BigWig file> <output file>")
# Need to download hg19.100way.phastCons.bw first, use the command:
# wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons.bw
infile = sys.argv[1]
bw_file = sys.argv[2]
outfile = sys.argv[3]
bw = pw.open(bw_file)
oF = open(outfile, "w")
header = ["Peptide","phastcon_max_score","phastcon_mean_score"]
oF.write("\t".join(header)+"\n")
pep_dic = {}
if not infile.endswith("gff") and not infile.endswith("gff3"):
sys.exit("The region file needs to be a GFF(3) file!")
for line in open(infile):
if not line.startswith("chr"):
continue
fields = line.strip().split()
(chr, start, end, pept) = (fields[0], fields[3], fields[4], fields[8])
if not pept.startswith("Parent="): continue
pept = pept.replace("Parent=","") # Remove "Parent="
try:
values = bw.values(chr, int(start), int(end))
max_val = np.max(values)
mean_val = np.mean(values)
except:
print("Encountered error for line: " + line.strip())
max_val = -1
min_val = -1
if pept not in pep_dic:
pep_dic[pept]=[max_val, mean_val]
else:
pep_dic[pept][0]=max(max_val,pep_dic[pept][0])
pep_dic[pept][1]=np.mean([mean_val,pep_dic[pept][1]])
bw.close()
for pept in pep_dic:
oF.write("%s\t%f\t%f\n" % (pept,pep_dic[pept][0],pep_dic[pept][1]))
oF.close()