-
Notifications
You must be signed in to change notification settings - Fork 0
/
seq_alignment.R
executable file
·80 lines (61 loc) · 3.01 KB
/
seq_alignment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/mlab/data/software/R-3.2.1-F22/bin/Rscript
##!/usr/bin/Rscript
############################################################################################
##
## Written by Robert Flight, 07/20/2016
## Copyright Sen Yao, Robert Flight, and Hunter Moseley, 07/20/2016. All rights reserved.
##
###########################################################################################
## ------------------------------------------------------------------------
library(atom2seq)
library(Biobase)
library(Biostrings)
## ----testAlign-----------------------------------------------------------
# need to add a test for non-standards existing as insertion character.
atom <- AAStringSet(c(rmFrontAddBack="MDDTEKMSMKLT", addFrontRmBack="TSMDDTEKMSMK", insertion="MDDAATEKMSMKL", deletion="SMTEKMSMKL", indel="SMTEKMSMAAKL", nonAAInsert="SMDD--TEKMSMKL", nonAAInsertRmFront="MDD--TEKMSMKL"))
seq <- AAStringSet(c(rep("SMDDTEKMSMKL", 5), nonAAInsert="SMDDTEKMSMKL", nonAAInsertRmFront="SMDDTEKMSMKL"))
refNum <- list(rmFrontAddBack=c(2,3,4,5,6,7,8,9,10,11,12,0),
addFrontRmBack=c(0,1,2,3,4,5,6,7,8,9,10,11),
insertion=c(2,3,4,0,0,5,6,7,8,9,10,11,12),
deletion=c(1,2,5,6,7,8,9,10,11,12),
indel=c(1,2,5,6,7,8,9,10,0,0,11,12),
nonAAInsert=c(1,2,3,4,0,0,5,6,7,8,9,10,11,12),
nonAAInsertRmFront=c(2,3,4,0,0,5,6,7,8,9,10,11,12))
names(seq) <- names(atom)
useNames <- names(atom) # do a small sample for testing
## ----genAlignTest--------------------------------------------------------
outTest <- lapply(useNames, function(x){genAlign(atom[[x]], seq[[x]])})
outNum <- lapply(outTest, function(x){x$atomNum})
names(outNum) <- useNames
all.equal(outNum, refNum)
## ----loadData------------------------------------------------------------
#useDir <- "/mlab/data/rmflight/Documents/projects/work/sen/coordination_families"
#setwd("../output_allMetal/")
args = commandArgs(trailingOnly=TRUE)
setwd(args[1])
## ----read_sequences----------------------------------------
library(Biostrings)
seq <- readAAStringSet("seqs.SEQRES.shell.txt", use.names=TRUE)
atom <- readAAStringSet("seqs.ATOM.shell.txt", use.names=TRUE)
## ----sanityCheck-------------------------------------------
splitDot <- function(inNames){
allSplit <- strsplit(inNames, ".", fixed=TRUE)
outNames <- sapply(allSplit, function(x){x[[1]]})
}
seqNames <- splitDot(names(seq))
atomNames <- splitDot(names(atom))
all.equal(seqNames, atomNames)
seqSort <- sort(names(seq))
atomSort <- sort(names(atom))
all.equal(seqSort, atomSort)
## ----runAlign------------------------------------------------
useNames <- sample(names(atom), 100)
outAlign <- mclapply(useNames, function(x){genAlign(atom[[x]], seq[[x]])}, mc.cores=6)
## ----runAllAlign---------------------------------------------
useNames <- names(atom)
outNum <- mclapply(useNames, function(x){
tmpRes <- genAlign(atom[[x]], seq[[x]])
tmpRes$atomNum
}, mc.cores=6) ## This takes about 20 min
names(outNum) <- useNames
save(outNum, file="atom2seq.RData")