-
Notifications
You must be signed in to change notification settings - Fork 0
/
datatypes.py
63 lines (53 loc) · 2.38 KB
/
datatypes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
""" Data types for genomic information representation
"""
import re
class Segment(object):
    """Wrapper class for SEGMENT files.

    Holds the regions of one segment as a list of ``[start, end]``
    integer pairs (end exclusive) in ``self.regions``, in file order.
    """

    def __init__(self, segment_file):
        """Parse and validate ``segment_file``.

        segment_file -- iterable of text rows (for example an open file),
                        one region per row
        """
        self.regions = self.parseSegmentFile(segment_file)

    def parseSegmentFile(self, segment_file):
        """Parse a SEGMENT file into a list of ``[start, end]`` pairs.

        Each row must hold exactly two integers separated by one or more
        tabs. Trailing tabs and the line terminator are ignored.

        Returns the list of regions after it passed ``verifyRegions``.

        Raises ValueError if a row does not split into exactly two fields,
        if a field is not an integer (raised by ``int``), or if
        ``verifyRegions`` rejects the regions.
        """
        regions = []
        for line in segment_file:
            # Strip '\r' together with '\n' and '\t': otherwise a CRLF row
            # with trailing tabs keeps a stray "\r" field and is rejected,
            # although the same row with an LF ending is accepted.
            fields = re.split(r'\t+', line.rstrip('\r\n\t'))
            if len(fields) != 2:
                raise ValueError("Invalid input structure. Each row should contain two tab separated numbers!")
            start = int(fields[0])
            end = int(fields[1])
            regions.append([start, end])
        self.verifyRegions(regions)
        return regions

    def verifyRegions(self, regions):
        """Verify that no two neighbouring regions overlap.

        Only adjacent entries are compared, so the regions are assumed to
        be sorted. Because the end coordinate is exclusive, a region may
        start exactly where the previous one ends.

        Raises ValueError if a region starts before the previous one ends.
        """
        # Pair every region with its successor; empty and single-element
        # lists produce no pairs and therefore always pass.
        for current, following in zip(regions, regions[1:]):
            if following[0] < current[1]:
                raise ValueError("Invalid input structure. Coordinates overlap within segment!")
class Function(object):
    """Wrapper class for FUNCTION files.

    Stores the values read from the file, one float per row, in
    ``self.positions``.
    """

    def __init__(self, function_file, genome_length):
        """Read ``function_file`` and check it holds ``genome_length`` rows.

        function_file -- iterable of text rows (for example an open file)
        genome_length -- number of values the file is expected to contain
        """
        self.positions = self.parseFunctionFile(function_file, genome_length)

    def parseFunctionFile(self, function_file, genome_length):
        """Convert every row to a float and return the resulting list.

        Raises ValueError if a row cannot be converted by ``float`` or if
        the number of rows differs from ``genome_length``.
        """
        # TODO check if value has decimals so that it is in the valid float format
        values = [float(row) for row in function_file]
        self.verifyPositions(values, genome_length)
        return values

    def verifyPositions(self, positions, genome_length):
        """Raise ValueError unless ``positions`` has ``genome_length`` entries."""
        count = len(positions)
        if genome_length == count:
            return
        raise ValueError(
            'Genome length ({0}) is not the same as number of values in '
            'input file ({1})'.format(str(genome_length), str(count))
        )