forked from hall-lab/svtools
-
Notifications
You must be signed in to change notification settings - Fork 8
/
splitterToBreakpoint
executable file
·123 lines (114 loc) · 6.47 KB
/
splitterToBreakpoint
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python
import sys
from optparse import OptionParser
import os
def splitterToBreakpoint(contigFile,splitReadSlop,localRange):
#Define whether event is inv, del, or dup (classifying translocations as if they were local intrachromosomal events)
if contigFile == "stdin":
contigs = sys.stdin
else:
contigs = open(str(contigFile))
for line in contigs:
split = line.split()
if (split[0] > split[3]) or ((split[0] == split[3]) and (int(split[1]) > int(split[4]))):
chrom1 = split[3]
start1 = int(split[4])
end1 = int(split[5])
chrom2 = split[0]
start2 = int(split[1])
end2 = int(split[2])
ID = split[6]
score = int(split[7])
strand1 = split[9]
strand2 = split[8]
queryStart1 = int(split[12])
queryEnd1 = int(split[13])
queryStart2 = int(split[10])
queryEnd2 = int(split[11])
minNonOverlap = int(split[14])
queryLength = int(split[15])
if len(split) > 16: qualScores = split[16]
else: qualScores = '*'
else:
chrom1 = split[0]
start1 = int(split[1])
end1 = int(split[2])
chrom2 = split[3]
start2 = int(split[4])
end2 = int(split[5])
ID = split[6]
score = int(split[7])
strand1 = split[8]
strand2 = split[9]
queryStart1 = int(split[10])
queryEnd1 = int(split[11])
queryStart2 = int(split[12])
queryEnd2 = int(split[13])
minNonOverlap = int(split[14])
queryLength = int(split[15])
if len(split) > 16: qualScores = split[16]
else: qualScores = '*'
if (queryStart1 == queryStart2):
raise StandardError("spitterToBreakpoint cannot handle bedpe with equal starting query offsets in " + ID);
if strand1 != strand2: #Classifying Inversions
if chrom1 == chrom2 and abs(end2 - start1) <= localRange:
variant = "local_inversion"
else:
variant = "distant_inversion"
elif ((strand1=="+" and strand2=="+" and queryStart1 < queryStart2) or (strand1=="-" and strand2=="-" and queryStart1 > queryStart2)): #Classifying Deletions
if chrom1 == chrom2 and (end2 - start1) <= localRange:
variant = "local_deletion"
else:
variant = "distant_deletion"
elif ((strand1=="+" and strand2=="+" and queryStart1 > queryStart2) or (strand1=="-" and strand2=="-" and queryStart1 < queryStart2)): #Classifying Tandem Duplications
if chrom1 == chrom2 and (end2 - start1) <= localRange:
variant = "local_duplication"
else:
variant = "distant_duplication"
else:
print "ERROR: variant " + ID + " is lost in variant classification step. Contact Mitchell"
# Below modifed by GGF to account for zero-based half-open coordinate system.
# When a start takes its coordinate from an end, subtract one.
# When an end takes its coordinate from a start, add one.
if variant == "local_inversion" or variant == "distant_inversion":
if ((queryStart1 < queryStart2) and (strand1 == "-") and (strand2 == "+")) or ((queryStart1 > queryStart2) and (strand1 == "+") and (strand2 == "-")):
breakStart1 = str(start1-splitReadSlop)
breakEnd1 = str(start1+1+splitReadSlop)
breakStart2 = str(start2-splitReadSlop)
breakEnd2 = str(start2+1+splitReadSlop)
elif ((queryStart1 < queryStart2) and (strand1 == "+") and (strand2 == "-")) or ((queryStart1 > queryStart2) and (strand1 == "-") and (strand2 == "+")):
breakStart1 = str(end1-1-splitReadSlop)
breakEnd1 = str(end1+splitReadSlop)
breakStart2 = str(end2-1-splitReadSlop)
breakEnd2 = str(end2+splitReadSlop)
elif variant == "local_deletion" or variant == "distant_deletion":
breakStart1 = str(end1-1-splitReadSlop)
breakEnd1 = str(end1+splitReadSlop)
breakStart2 = str(start2-splitReadSlop)
breakEnd2 = str(start2+1+splitReadSlop)
elif variant == "local_duplication" or variant == "distant_duplication":
breakStart1 = str(start1-splitReadSlop)
breakEnd1 = str(start1+1+splitReadSlop)
breakStart2 = str(end2-1-splitReadSlop)
breakEnd2 = str(end2+splitReadSlop)
else:
print "ERROR: " + ID + " is lost in breakpoint locus prediction step. Contact Mitchell"
print '\t'.join(map(str, [chrom1, breakStart1, breakEnd1, chrom2, breakStart2, breakEnd2, ID, score, strand1, strand2, queryStart1, queryEnd1, queryStart2, queryEnd2, minNonOverlap, queryLength, qualScores, variant]))
def main():
usage = "%prog -b <bedpe> -c <contigFile.bedpe> [options]\nVersion: 0.2\nAuthor: Mitchell L. Leibowitz\nEdited: Aug, 2011 by Colby Chiang (details below)\n \
\nEdits included:\n 1) When switching ordering of bedpe positioning (if start2>start1), also switch orientation \n 2) Makes sure all comparisons are integer/integer comparisons or \
string/string comparisons. \n 3) Allowed for bedpe files of larger lengths (17 fields; see -s help).\n\n \
Note: Add slop at your own risk.\n\nPrints modified bedpe so bedpe entry 1 is always less than bedpe entry 2\n\nModified by GGF on 2/13/2012 to account for zero-base half-open coordinates."
parser = OptionParser(usage)
parser.add_option("-i", dest="contigFile", metavar="FILE", help="BEDPE file containing the split contigs; requires 10 column bedpe plus 11-14 as query coordinates, field 15 as query length \
and field 16 minimum non overlap.")
parser.add_option("-s", dest="splitReadSlop", metavar="INT", type="int", default=1, help="Bidirectional slop around the breakpoint [default = 1]")
parser.add_option("-r", dest="localRange", default=1000000, type="int", metavar="INT", help="the range of coordinates considered local (for breakpoint classification) [default = 1000000]; Calculated by subtracting field 6 from field 2.")
(opts, args) = parser.parse_args()
if opts.contigFile is None:
parser.print_help()
print
else:
splitterToBreakpoint(opts.contigFile, opts.splitReadSlop, opts.localRange)
if __name__ == "__main__":
sys.exit(main())