-
Notifications
You must be signed in to change notification settings - Fork 3
/
funs2.R
80 lines (69 loc) · 2.18 KB
/
funs2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
readTableData =
#
# Given the lines containing just the data part of a table in the file
# and the column positions, read the data into a data.frame.
#
#  txt           - character vector, one element per data row (no header).
#  columnStarts  - start position of each column followed by the end
#                  position of the last column. The consecutive
#                  differences are used as the fixed field widths.
#  percentColumn - index (or indices) of the columns holding values such
#                  as "98%" that are converted to numbers. Defaults to 7,
#                  the cover column. Use NULL to skip the conversion.
#
# Returns the data.frame produced by read.fwf() with white space trimmed
# from the character columns and the percent column(s) made numeric.
#
function(txt, columnStarts, percentColumn = 7)
{
    widths = diff(columnStarts)

    # read.fwf() always starts cutting at character 1. If the first column
    # starts further right, skip the leading characters with a negative
    # width so every field lines up with columnStarts.
    lead = columnStarts[1] - 1
    if (isTRUE(lead > 0))
        widths = c(-lead, widths)

    # read.fwf() only closes connections it opened itself; a textConnection
    # is already open, so we are responsible for closing it.
    con = textConnection(txt)
    on.exit(close(con), add = TRUE)
    ans = read.fwf(con, widths = widths)

    w = vapply(ans, is.character, logical(1))
    if (any(w))
        ans[w] = lapply(ans[w], trimws)

    # Convert x% to numbers. Only a trailing % is removed, so a value
    # without the % sign keeps all of its digits.
    for (j in percentColumn)
        ans[[j]] = as.numeric(sub("%$", "", trimws(ans[[j]])))

    ans
}
readTable =
#
# High-level function to read one NCBI table from its text lines.
#
#  txt          - character vector of the table's lines. The first element
#                 is always dropped before the rest is handed to
#                 readTableData() as the data rows, even when header is
#                 supplied explicitly.
#  header       - the line holding the column names; defaults to txt[1].
#  columnStarts - start position of each column followed by the end
#                 position of the last column; defaults to
#                 findColumnStarts(header).
#
# Returns the data.frame from readTableData() with its names taken from
# the header line.
#
function(txt, header = txt[1], columnStarts = findColumnStarts(header))
{
    # read the data part
    ans = readTableData(txt[-1], columnStarts)

    # Get the column names by cutting the header at the column boundaries.
    # Slicing the string directly avoids opening a text connection that
    # would never be closed, and keeps the names as plain strings rather
    # than values type-converted by read.table().
    first = columnStarts[-length(columnStarts)]
    last = columnStarts[-1] - 1
    names(ans) = trimws(substring(header, first, last))
    ans
}
readNCBITables =
#
# Top-level function to read all the tables in the NCBI file.
#
#  file  - passed to readLines() when lines is not supplied.
#  lines - character vector holding the contents of the file.
#
# Every line matching "Query #" is paired, in order, with a line matching
# "Alignments:" and each pair is handed to readNCBITable().
# Returns a list with one element per pair.
#
function(file, lines = readLines(file))
{
    queryLines = grep("Query #", lines)
    alignmentLines = grep("Alignments:", lines)

    readOne = function(from, to)
                  readNCBITable(from, to, lines = lines)

    Map(readOne, queryLines, alignmentLines)
}
readNCBITable =
#
# Read a single table out of the lines of the file.
#
#  start, end - line numbers bracketing the table. The lines used run
#               from start + 3 to end - 1.
#  lines      - character vector holding the contents of the file.
#
# Empty lines are discarded, then the first remaining line is dropped and
# the rest is passed to readTable(), which takes its first line as the header.
#
function(start, end, lines)
{
    block = lines[(start + 3):(end - 1)]
    keep = block != ""
    nonBlank = block[keep]
    readTable(nonBlank[-1])
}
findColumnStarts =
# Compute the starting point of each column
# using the start of each word in the line h.
# Split the text into characters and find
# the locations of the capital letters, which start
# each word except "cover".
# For words like "cover", also find any lower case letter that is
# preceded by a " ", a space, or that is the very first character
# of the line.
#
# Also append the position just past the last character, so that
# diff() of the result gives the full width of every column,
# including the last one.
#
#  h - a single string, the header line.
#
# Returns a sorted integer vector: the column starts followed by
# nchar(h) + 1.
function(h)
{
    chars = strsplit(h, "")[[1]]
    upper = which(chars %in% LETTERS)
    lower = which(chars %in% letters)

    # Character in front of each lower-case letter. Prepending a space
    # shifts the vector by one, so position 1 sees a space instead of
    # indexing element 0, which would shorten the mask and misalign it.
    before = c(" ", chars)[lower]

    sort(c(upper, lower[before == " "], nchar(h) + 1L))
}