-
Notifications
You must be signed in to change notification settings - Fork 16
/
pyssw.py
275 lines (236 loc) · 10.7 KB
/
pyssw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@package pyssw
@brief Python standalone program for ssw alignment using the C library
Complete-Striped-Smith-Waterman-Library
Biopython module is require for fastq/fastq parsing
@copyright [The MIT licence](http://opensource.org/licenses/MIT)
@author Adrien Leger - 2014
* [Github](https://github.com/a-slide)
* [Atlantic Gene Therapies - INSERM 1089] (http://www.atlantic-gene-therapies.fr/)
"""
#~~~~~~~GLOBAL IMPORTS~~~~~~~#
# Standard library packages
import optparse
import sys
from time import time
import gzip
#~~~~~~~MAIN FUNCTION~~~~~~~#
def main (opt):
print ("Inport subject sequence")
# Import fasta subject
if opt.subject.rpartition(".")[2].lower() == "gz":
subject_handle = gzip.open(opt.subject, "r")
else:
subject_handle = open(opt.subject, "r")
subject = SeqIO.read(subject_handle, "fasta")
print ("Inport query sequences and count the number of sequences")
# Import fasta subject
if opt.query.rpartition(".")[2].lower() == "gz":
nseq = count_seq(opt.query, opt.qtype, True)
query_handle = gzip.open(opt.query, "r")
else:
nseq = count_seq(opt.query, opt.qtype, False)
query_handle = open(opt.query, "r")
query_gen = SeqIO.parse(query_handle, opt.qtype)
print("{} contains {} sequences to align".format(opt.query, nseq))
# Calculate a step list for the progress bar
nseq_list = [int(nseq*i/100.0) for i in range(5,101,5)]
print ("Initialize ssw aligner with the subject sequence")
# Init the an Aligner object with the reference value
ssw = Aligner(
str(subject.seq),
match=int(opt.match),
mismatch=int(opt.mismatch),
gap_open=int(opt.gap_open),
gap_extend= int(opt.gap_extend),
report_secondary=False,
report_cigar=True)
# Write the header of the SAM file
with open("result.sam", "w") as f:
f.write("@HD\tVN:1.0\tSO:unsorted\n")
f.write("@SQ\tSN:{}\tLN:{}\n".format(subject.id, len(subject.seq)))
f.write("@PG\tID:Striped-Smith-Waterman\tPN:pyssw\tVN:0.1\n")
f.write("@CO\tScore_values = match {}, mismatch {}, gap_open {}, gap_extend {}\n".format(
opt.match,
opt.mismatch,
opt.gap_open,
opt.gap_extend))
f.write("@CO\tFilter Options = min_score {}, min_len {}\n".format(
opt.min_score,
opt.min_len))
print ("Starting alignment of queries against the subject sequence")
start = time()
# Align each query along the subject an write result in a SAM file
i = 0
for query in query_gen:
# Find the best alignment
if opt.reverse:
al, orient = find_best_align (ssw, query, float(opt.min_score), int(opt.min_len))
else:
al, orient = ssw.align(str(query.seq), float(opt.min_score), int(opt.min_len)), True
# If valid match found
if al:
f.write(sam_line(
qname=query.id,
flag=0 if orient else 16,
rname=subject.id,
pos=al.ref_begin+1,
cigar=al.cigar_string,
seq=str(query.seq),
qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*",
tags=["AS:i:{}".format(al.score)]))
# If no valid match found and -u flag activated (report unaligned)
elif opt.unaligned:
f.write(sam_line(
qname=query.id,
flag=4,
seq=str(query.seq),
qual=SeqIO.QualityIO._get_sanger_quality_str(query) if opt.qtype == "fastq" else "*"))
# Else = match unreported
# Progress bar
i+=1
if i in nseq_list:
frac = i/float(nseq)
t = time()-start
print ("{} sequences \t{}% \tRemaining time = {}s".format(i, int(frac*100), round(t/frac-t, 2)))
print ("\n{} Sequences processed in {}s".format(i, round(time()-start, 2)))
#~~~~~~~HELPER FUNCTIONS~~~~~~~#
def sam_line (qname='*', flag=4, rname='*', pos=0, mapq=0, cigar='*', rnext='*', pnext=0, tlen=0, seq='*', qual='*', tags=None):
"""
Return a minimal sam line = by default return an undetermined sam line. Check the document
[SAM Format Specification](http://samtools.sourceforge.net/SAM1.pdf) for a full description.
@param qname Query template NAME
@param flag bitwise FLAG
@param rname Reference sequence NAME of the alignment
@param pos 1-based leftmost mapping POSition of the first matching base
@param mapq MAPping Quality
@param cigar CIGAR string
@param rnext Reference sequence name of the primary alignment of the mate
@param pnext 1-based leftmost position of the primary alignment of the mate
@param tlen signed observed Template LENgth
@param seq segment SEQuence
@param qual ASCII of base QUALity plus 33
@param tags list of optional tags
@return A Sam alignment line
"""
if tags:
return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, " ".join(tags))
else:
return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual)
def find_best_align (ssw, query, min_score, min_len):
# Align reverse and forward query
forward_al = ssw.align(str(query.seq), min_score, min_len)
reverse_al = ssw.align(str(query.seq.reverse_complement()), min_score, min_len)
# Decision tree to return the best aligned sequence taking into acount the absence of result
# by ssw_wrap in case of score filtering
if not forward_al:
if not reverse_al:
return (None, None)
else:
return (reverse_al, False)
else:
if not reverse_al:
return (forward_al, True)
else:
if forward_al.score >= reverse_al.score:
return (forward_al, True)
else:
return (reverse_al, False)
def count_seq (filename, seq_type="fasta", gziped=False):
"""
Count the number of sequences in a fastq or a fastq file
@param filename Path to a valid readeable file
@param file_type Should be either fastq or fastq. Default fasta
@param gziped Boolean indicating if the file is gziped or not. Default False
"""
#Standard library import
import gzip
from mmap import mmap
# Verify if the file is fasta or fastq type
assert seq_type in ["fasta", "fastq"], "The file has to be either fastq or fasta format"
# Open the file
if gziped:
f = gzip.open(filename, "r")
else:
f = open(filename, "r")
# FASTA Find a start line seq character ">" an increment the counter each time
if seq_type == "fasta":
nline = 0
for line in f:
if line[0] == ">":
nline+=1
f.close()
return nline
# FASTQ No motif to find, but 4 lines correspond to 1 sequence
else:
nline = 0
for line in f:
nline+=1
f.close()
return nline/4
def optparser():
print("Parse command line options")
# Usage and version strings
program_name = "pyssw"
program_version = 0.1
version_string = "{}\t{}".format(program_name, program_version)
usage_string = "{}.py -s subject.fasta -q fastq (or fasta) [Facultative options]".format(program_name)
optparser = optparse.OptionParser(usage = usage_string, version = version_string)
# Define optparser options
hstr = "Path of the fasta file containing the subject genome sequence. Can be gziped. [REQUIRED] "
optparser.add_option( '-s', '--subject', dest="subject", help=hstr)
hstr = "Path of the fastq or fasta file containing the short read to be aligned. Can be gziped. [REQUIRED]"
optparser.add_option( '-q', '--query', dest="query", help=hstr)
hstr = "Type of the query file = fastq or fasta. [default: fastq]"
optparser.add_option( '-t', '--qtype', dest="qtype", default="fastq", help=hstr)
hstr = "Positive integer for weight match in genome sequence alignment. [default: 2]"
optparser.add_option( '-m', '--match', dest="match",default=2, help=hstr)
hstr = "Positive integer. The negative value will be used as weight mismatch in genome sequence alignment. [default: 2]"
optparser.add_option( '-x', '--mismatch', dest="mismatch", default=2, help=hstr)
hstr = "Positive integer. The negative value will be used as weight for the gap opening. [default: 3]"
optparser.add_option( '-o', '--gap_open', dest="gap_open", default=3, help=hstr)
hstr = "Positive integer. The negative value will be used as weight for the gap opening. [default: 1]"
optparser.add_option( '-e', '--gap_extend', dest="gap_extend", default=1, help=hstr)
hstr = "Integer. Consider alignments having a score <= as not aligned. [default: 0]"
optparser.add_option( '-f', '--min_score', dest="min_score", default=0, help=hstr)
hstr = "Integer. Consider alignments having a length <= as not aligned. [default: 0]"
optparser.add_option( '-l', '--min_len', dest="min_len", default=0, help=hstr)
hstr = "Flag. Align query in forward and reverse orientation and choose the best alignment. [Set by default]"
optparser.add_option( '-r', '--reverse', dest="reverse", action="store_true", default=True, help=hstr)
hstr = "Flag. Write unaligned reads in sam output [Unset by default]"
optparser.add_option( '-u', '--unaligned', dest="unaligned", action="store_true", default=False, help=hstr)
# Parse arg and return a dictionnary_like object of options
opt, args = optparser.parse_args()
if not opt.subject:
print ("\nERROR: a subject fasta file has to be provided (-s option)\n")
optparser.print_help()
sys.exit()
if not opt.query:
print ("\nERROR: a query fasta or fastq file has to be provided (-q option)\n")
optparser.print_help()
sys.exit()
return opt
#~~~~~~~TOP LEVEL INSTRUCTIONS~~~~~~~#
if __name__ == '__main__':
# try to import Third party and local packages
try:
from Bio import SeqIO
except ImportError:
print ("ERROR: Please install Biopython package")
sys.exit()
try:
from ssw_wrap import Aligner
except ImportError:
print ("ERROR: Please place ssw_wrap in the current directory or add its dir to python path")
sys.exit()
# Parse command line arguments
opt = optparser()
# Run the main function
main(opt)