#!/usr/bin/env python3
from __future__ import division
import sys
import os
import time
import random
import re
import subprocess
from argparse import ArgumentParser
from tabulate import tabulate
import collections
import pprint
import json
import gzip
from shutil import which as find_executable  # distutils.spawn is deprecated; shutil.which is the drop-in equivalent
from cgecore.blaster import Blaster
from cgecore.cgefinder import CGEFinder
class CholeraeFinder(CGEFinder):
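    """Find and type Vibrio cholerae marker genes with KMA or BLAST.

    Loads a database config file and a notes file, runs the chosen search
    method against the configured gene databases and writes JSON,
    tab-separated and (optionally) extended text/FASTA output.
    """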
def __init__(self, db_conf_file, notes, db_path, db_path_kma=None,
databases=None):
"""
"""
self.db_path = db_path
if(db_path_kma is None):
self.db_path_kma = db_path + "/kma_indexing"
else:
self.db_path_kma = db_path_kma
self.configured_dbs = dict()
self.description_dbs = dict()
self.kma_db_files = None
self.load_db_config(db_conf_file=db_conf_file)
self.databases = []
self.load_databases(databases=databases)
self.phenos = dict()
self.load_notes(notes=notes)
self.blast_results = None
# self.kma_results = None
# self.results = None
def blast(self, inputfile, out_path, min_cov=0.9, threshold=0.6,
blast="blastn", allowed_overlap=0):
"""
"""
blast_run = Blaster(inputfile=inputfile, databases=self.databases,
db_path=self.db_path, out_path=out_path,
min_cov=min_cov, threshold=threshold, blast=blast,
allowed_overlap=allowed_overlap)
self.blast_results = blast_run.results
return blast_run
def typer(self, json_results):
"""
Reads the json_results to type the vibrio cholerae specie and prepares
the results for the wrapper
"""
prot_class = {}
list_genes = []
type_cholerae = {"specie": "Not able to determine",
"biotype": "Not able to determine",
"serogroup": "Not able to determine",
"wave": "Not able to determine",
"pandemic": "Not able to determine"}
for i in json_results:
if json_results[i][i.lower()] == "No hit found":
prot_class[i] = []
else:
headers = json_results[i][i.lower()].keys()
genes = []
for head in list(headers):
gene = head.split(":")[0]
genes.append(gene)
list_genes.extend(genes)
prot_class[i] = genes
if 'ompW' not in list_genes:
type_cholerae["specie"] = "Not Vibrio Cholerae"
else:
type_cholerae["specie"] = "Vibrio Cholerae"
if "rfbV_O1" in list_genes and "wbfZ_O139" not in list_genes:
type_cholerae["serogroup"] = "O1"
if "VC2346" in list_genes:
type_cholerae["pandemic"] = True
else:
type_cholerae["pandemic"] = False
if ("rstR_cc" in list_genes or "tcpA_cc" in list_genes) and ("rstR_et" not in list_genes and "tcpA_et3" not in list_genes):
type_cholerae["biotype"] = "Classical"
elif "rstR_et" in list_genes or "tcpA_et3" in list_genes:
if "ctxB_1" in list_genes and type_cholerae["pandemic"]:
type_cholerae["biotype"] = "El Tor Variant"
type_cholerae["wave"] = "Early Wave 3"
elif "ctxB_3" in list_genes and type_cholerae["pandemic"]:
type_cholerae["biotype"] = "El Tor"
type_cholerae["wave"] = "Wave 1"
elif "ctxB_7" in list_genes and type_cholerae["pandemic"]:
type_cholerae["biotype"] = "El Tor Variant"
type_cholerae["wave"] = "Late Wave 3"
else:
type_cholerae["biotype"] = "El Tor"
elif "rstR_cc" not in list_genes and "tcpA_cc" not in list_genes and "rstR_et" not in list_genes or "tcpA_et3" not in list_genes:
if type_cholerae["pandemic"]:
if "ctxB_1" in list_genes:
type_cholerae["biotype"] = "El Tor Variant"
type_cholerae["wave"] = "Early Wave 3"
elif "ctxB_3" in list_genes:
type_cholerae["biotype"] = "El Tor"
type_cholerae["wave"] = "Wave 1"
elif "ctxB_7" in list_genes:
type_cholerae["biotype"] = "El Tor Variant"
type_cholerae["wave"] = "Late Wave 3"
else:
if "ctxB_1" in list_genes:
type_cholerae["biotype"] = "Classical"
elif "wbfZ_O139" in list_genes and "rfbV_O1" not in list_genes:
type_cholerae["serogroup"] = "O139"
elif "rfvB_O1" not in list_genes and "wbfZ_O139" not in list_genes:
type_cholerae["serogroup"] = "No O1 / No O139"
else:
type_cholerae["serogroup"] = "Not able to determine. Both rfvB-O1 and wbfZ-O139 have been found. Please look at the Gene Description section to interpretate the results."
return type_cholerae, prot_class
def create_results(self, results_method, outdir):
"""
"""
results = results_method.results
query_aligns = results_method.gene_align_query
homo_aligns = results_method.gene_align_homo
sbjct_aligns = results_method.gene_align_sbjct
json_results = dict()
titles = dict()
rows = dict()
headers = dict()
txt_file_seq_text = dict()
split_print = collections.defaultdict(list)
hits = []
for db in results:
contig_res = {}
if db == 'excluded':
continue
db_name = str(db).capitalize()
if db_name not in json_results:
json_results[db_name] = {}
if db not in json_results[db_name]:
json_results[db_name][db] = {}
if results[db] == "No hit found":
json_results[db_name][db] = "No hit found"
else:
for contig_id, hit in results[db].items():
identity = float(hit["perc_ident"])
coverage = float(hit["perc_coverage"])
                    # Skip hits below the coverage or identity thresholds
                    # (min_cov and threshold are module-level values set in __main__)
if coverage < (min_cov * 100) or identity < (threshold * 100):
continue
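                    # Score each hit by identity * coverage; despite the name
                    # this is a simple ranking value, not a true BLAST bit score.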
bit_score = identity * coverage
if contig_id not in contig_res:
contig_res[contig_id] = []
contig_res[contig_id].append([hit["query_start"],
hit["query_end"], bit_score,
hit])
                # Sort the hits on each contig by query start position
for contig_id, hit_lsts in contig_res.items():
hit_lsts.sort(key=lambda x: x[0])
hits = [hit[3] for hit in hit_lsts]
for hit in hits:
header = hit["sbjct_header"]
tmp = header.split(":")
try:
gene = tmp[0]
note = tmp[1]
acc = "_".join(tmp[2:])
except IndexError:
gene = ":".join(tmp)
note = ""
acc = ""
try:
variant = tmp[3]
except IndexError:
variant = ""
identity = hit["perc_ident"]
coverage = hit["perc_coverage"]
                        # Debug output: flag hits reported with more than 100% coverage
                        if coverage > 100.:
                            print(gene)
sbj_length = hit["sbjct_length"]
HSP = hit["HSP_length"]
positions_contig = "%s..%s" % (hit["query_start"],
hit["query_end"])
positions_ref = "%s..%s" % (hit["sbjct_start"],
hit["sbjct_end"])
contig_name = hit["contig_name"]
description = self.phenos.get(gene, "Warning: gene is missing from "
"Notes file. Please inform curator.")
pheno = description.split(":")[0].strip()
note = description.split(":")[-1].strip()
# Write JSON results dict
json_results[db_name][db].update({header: {}})
json_results[db_name][db][header] = {
"vibrio_cholerae_gene": gene,
"identity": round(identity, 2),
"HSP_length": HSP,
"template_length": sbj_length,
"position_in_ref": positions_ref,
"contig_name": contig_name,
"positions_in_contig": positions_contig,
"note": note,
"accession": acc,
"predicted_phenotype": pheno,
"coverage": round(coverage, 2),
"hit_id": contig_id}
# Get run info for JSON file
service = os.path.basename(__file__).replace(".py", "")
date = time.strftime("%d.%m.%Y")
time_ = time.strftime("%H:%M:%S")
# Make JSON genes output file
data = {service: {}}
userinput = {"filename(s)": args.inputfile,
"method": method,
"file_format": file_format}
run_info = {"date": date, "time": time_}
type_cholerae, prot_class = self.typer(json_results)
data[service]["user_input"] = userinput
data[service]["run_info"] = run_info
data[service]["results"] = json_results
data[service]["typing_cholerae"] = type_cholerae
data[service]["list_proteins"] = prot_class
pprint.pprint(data)
# Save json genes output
result_file = "{}/data_CholeraeFinder.json".format(outdir)
with open(result_file, "w") as outfile:
json.dump(data, outfile)
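        # The JSON written above has the layout:
        # {service: {"user_input": ..., "run_info": ..., "results": ...,
        #            "typing_cholerae": ..., "list_proteins": ...}}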
# Getting and writing out the results
header = ["Vibrio Cholerae Gene", "Identity", "Query / Template length",
"Contig", "Position in contig", "Predicted phenotype",
"Accession number"]
if args.extented_output:
            # Define extended output file names
table_filename = "{}/results_tab.tsv".format(outdir)
query_filename = "{}/Hit_in_genome_seq.fsa".format(outdir)
sbjct_filename = "{}/Cholerae_seqs.fsa".format(outdir)
result_filename = "{}/results.txt".format(outdir)
table_file = open(table_filename, "w")
query_file = open(query_filename, "w")
sbjct_file = open(sbjct_filename, "w")
result_file = open(result_filename, "w")
# Make results file
result_file.write("{} Gene(s)\n\n"
.format(service))
# Write tsv table
##TODO##
rows = [["Database"] + header]
for species, dbs_info in json_results.items():
for db_name, db_hits in dbs_info.items():
result_file.write("*" * len("\t".join(header)) + "\n")
result_file.write(self.description_dbs[species.lower()] + "\n")
db_rows = []
                    # Check whether any hits were found for this database
if isinstance(db_hits, str):
content = [''] * len(header)
content[int(len(header) / 2)] = db_hits
result_file.write(CholeraeFinder.text_table(header, [content]) + "\n")
continue
for gene_id, gene_info in sorted(db_hits.items(),key=lambda x: (x[1]['vibrio_cholerae_gene'],x[1]['accession'])):
res_gene = gene_info["vibrio_cholerae_gene"]
identity = str(gene_info["identity"])
coverage = str(gene_info["coverage"])
template_HSP = (
"{hsp_len} / {template_len}".
format(hsp_len=gene_info["HSP_length"],
template_len=gene_info["template_length"]))
position_in_ref = gene_info["position_in_ref"]
position_in_contig = gene_info["positions_in_contig"]
predicted_phenotype = gene_info["predicted_phenotype"]
acc = gene_info["accession"]
contig_name = gene_info["contig_name"]
# Add rows to result tables
db_rows.append([res_gene, identity, template_HSP, contig_name,
position_in_contig, predicted_phenotype, acc])
rows.append([db_name, res_gene, identity, template_HSP,
contig_name, position_in_contig, predicted_phenotype,
acc])
# Write query fasta output
hit_name = gene_info["hit_id"]
query_seq = query_aligns[db_name][hit_name]
sbjct_seq = sbjct_aligns[db_name][hit_name]
                        # identity and coverage were converted to str above,
                        # so compare them as floats to detect a perfect match
                        if float(coverage) == 100. and float(identity) == 100.:
match = "PERFECT MATCH"
else:
match = "WARNING"
qry_header = (">{}:{} ID:{}% COV:{}% Best_match:{}\n"
.format(res_gene, match, identity, coverage,
gene_id))
query_file.write(qry_header)
for i in range(0, len(query_seq), 60):
query_file.write(query_seq[i:i + 60] + "\n")
# Write template fasta output
sbj_header = ">{}\n".format(gene_id)
sbjct_file.write(sbj_header)
for i in range(0, len(sbjct_seq), 60):
sbjct_file.write(sbjct_seq[i:i + 60] + "\n")
# Write db results tables in results file and table file
result_file.write(CholeraeFinder.text_table(header, db_rows) + "\n")
result_file.write("\n")
for row in rows:
table_file.write("\t".join(row) + "\n")
            # Write alignment output
result_file.write("\n\nExtended Output:\n\n")
self.make_aln(result_file, json_results, query_aligns, homo_aligns,
sbjct_aligns)
# Close all files
query_file.close()
sbjct_file.close()
table_file.close()
result_file.close()
if args.quiet:
f.close()
def make_aln(self, file_handle, json_data, query_aligns, homol_aligns, sbjct_aligns):
for dbs_info in json_data.values():
for db_name, db_info in dbs_info.items():
if isinstance(db_info, str):
continue
for gene_id, gene_info in sorted(
db_info.items(),
key=lambda x: (x[1]['vibrio_cholerae_gene'],
x[1]['accession'])):
seq_name = ("{gene}_{acc}"
.format(gene=gene_info["vibrio_cholerae_gene"],
acc=gene_info["accession"]))
hit_name = gene_info["hit_id"]
seqs = ["", "", ""]
seqs[0] = sbjct_aligns[db_name][hit_name]
seqs[1] = homol_aligns[db_name][hit_name]
seqs[2] = query_aligns[db_name][hit_name]
self.write_align(seqs, seq_name, file_handle)
def write_align(self, seq, seq_name, file_handle):
file_handle.write("# {}".format(seq_name) + "\n")
sbjct_seq = seq[0]
homol_seq = seq[1]
query_seq = seq[2]
for i in range(0, len(sbjct_seq), 60):
file_handle.write("%-10s\t%s\n" % ("template:", sbjct_seq[i:i + 60]))
file_handle.write("%-10s\t%s\n" % ("", homol_seq[i:i + 60]))
file_handle.write("%-10s\t%s\n\n" % ("query:", query_seq[i:i + 60]))
@staticmethod
def text_table(headers, rows, empty_replace='-'):
''' Create text table
USAGE:
>>> from tabulate import tabulate
>>> headers = ['A','B']
>>> rows = [[1,2],[3,4]]
>>> print(text_table(headers, rows))
**********
A B
**********
1 2
3 4
==========
'''
        # Replace empty cells with the placeholder
        rows = [[cell if cell else empty_replace for cell in row]
                for row in rows]
# Create table
table = tabulate(rows, headers, tablefmt='simple').split('\n')
# Prepare title injection
width = len(table[0])
        # Switch horizontal line
table[1] = '*' * (width + 2)
# Update table with title
table = (("%s\n" * 3)
% ('*' * (width + 2), '\n'.join(table), '=' * (width + 2)))
return table
def load_notes(self, notes):
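        """Parse the notes file into the self.phenos lookup table.

        Each non-comment line is expected to hold colon-separated fields,
        roughly gene:phenotype:note (inferred from the parsing below); an
        "Alternate name; X" note also registers X as an extra key.
        """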
with open(notes, 'r') as f:
for line in f:
                line = line.strip()
                # Skip comments and blank lines
                if not line or line.startswith("#"):
                    continue
                line = line + ":"
tmp = line.split(":")
self.phenos[tmp[0]] = "%s: %s: %s" % (tmp[1], tmp[2], tmp[3])
if(tmp[2].startswith("Alternate name; ")):
self.phenos[tmp[2][16:]] = "%s %s" % (tmp[1], tmp[2])
def load_databases(self, databases):
"""
"""
# Check if databases and config file are correct/correponds
if databases is '':
sys.exit("Input Error: No database was specified!\n")
elif databases is None:
# Choose all available databases from the config file
self.databases = self.configured_dbs.keys()
else:
# Handle multiple databases
databases = databases.split(',')
# Check that the CholeraeFinder DBs are valid
for db_prefix in databases:
if db_prefix in self.configured_dbs:
self.databases.append(db_prefix)
else:
sys.exit("Input Error: Provided database was not "
"recognised! (%s)\n" % db_prefix)
def load_db_config(self, db_conf_file):
"""
"""
extensions = []
with open(db_conf_file) as f:
for line in f:
line = line.strip()
if not line:
continue
if line[0] == '#':
if 'extensions:' in line:
extensions = [s.strip() for s
in line.split('extensions:')[-1].split(',')]
continue
tmp = line.split('\t')
if len(tmp) != 3:
sys.exit(("Input Error: Invalid line in the database"
" config file!\nA proper entry requires 3 tab "
"separated columns!\n%s") % (line))
db_prefix = tmp[0].strip()
name = tmp[1].split('#')[0].strip()
description = tmp[2]
# Check if all db files are present
for ext in extensions:
db = "%s/%s.%s" % (self.db_path, db_prefix, ext)
if not os.path.exists(db):
sys.exit(("Input Error: The database file (%s) "
"could not be found!") % (db_path))
if db_prefix not in self.configured_dbs:
self.configured_dbs[db_prefix] = []
self.configured_dbs[db_prefix].append(name)
self.description_dbs[db_prefix] = description
if len(self.configured_dbs) == 0:
sys.exit("Input Error: No databases were found in the "
"database config file!")
        # Load paths for the KMA-indexed databases; the index directory and
        # the database prefix are joined with a path separator. Note that only
        # the file names for the last prefix end up in self.kma_db_files.
        for db_prefix in self.configured_dbs:
            kma_db = "%s/%s" % (self.db_path_kma, db_prefix)
            self.kma_db_files = [kma_db + ".b", kma_db + ".length.b",
                                 kma_db + ".name.b", kma_db + ".align.b"]
def is_gzipped(file_path):
    ''' Returns True if the file is gzipped and False otherwise.
    The result is inferred from the first two bytes of the file read
    from the input path; for gzip these are the magic bytes 1f 8b.
    Theoretically there could be exceptions to this test, but they are
    unlikely as long as the input files are otherwise expected to be
    encoded in UTF-8.
    '''
with open(file_path, mode='rb') as fh:
bit_start = fh.read(2)
if(bit_start == b'\x1f\x8b'):
return True
else:
return False
def get_file_format(input_files):
"""
Takes all input files and checks their first character to assess
the file format. Returns one of the following strings; fasta, fastq,
other or mixed. fasta and fastq indicates that all input files are
of the same format, either fasta or fastq. other indiates that all
files are not fasta nor fastq files. mixed indicates that the inputfiles
are a mix of different file formats.
"""
# Open all input files and get the first character
file_format = []
invalid_files = []
for infile in input_files:
if is_gzipped(infile):
f = gzip.open(infile, "rb")
fst_char = f.read(1)
else:
f = open(infile, "rb")
fst_char = f.read(1)
f.close()
# Assess the first character
if fst_char == b"@":
file_format.append("fastq")
elif fst_char == b">":
file_format.append("fasta")
        else:
            invalid_files.append(infile)
    # If none of the files looked like fasta or fastq, report "other"
    if not file_format:
        return "other"
    if len(set(file_format)) != 1:
        return "mixed"
    return ",".join(set(file_format))
if __name__ == '__main__':
##########################################################################
# PARSE COMMAND LINE OPTIONS
##########################################################################
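    # Typical invocations (a sketch; file and database paths are hypothetical):
    #   choleraefinder.py -i assembly.fasta -p /path/to/db -mp blastn -o outdir -x
    #   choleraefinder.py -i reads_1.fastq.gz reads_2.fastq.gz -p /path/to/db -mp kma -o outdir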
parser = ArgumentParser()
parser.add_argument("-i", "--inputfile",
dest="inputfile",
help="FASTA or FASTQ input files.",
nargs="+",
required=True)
parser.add_argument("-o", "--outputPath",
dest="out_path",
help="Path to blast output",
default='.')
parser.add_argument("-tmp", "--tmp_dir",
help=("Temporary directory for storage of the results "
"from the external software."))
parser.add_argument("-mp", "--methodPath",
dest="method_path",
help="Path to method to use (kma or blastn)")
parser.add_argument("-p", "--databasePath",
dest="db_path",
help="Path to the databases",
default='')
parser.add_argument("-d", "--databases",
dest="databases",
help="Databases chosen to search in - if none are \
specified all are used",
default=None)
parser.add_argument("-l", "--min_cov",
dest="min_cov",
help="Minimum coverage",
default=0.60)
parser.add_argument("-t", "--threshold",
dest="threshold",
help="Blast threshold for identity",
default=0.90)
parser.add_argument("-ao", "--acq_overlap",
help="Genes are allowed to overlap this number of\
nucleotides. Default: 30.",
type=int,
default=30)
parser.add_argument("-matrix", "--matrix",
help="Gives the counts all all called bases at each\
position in each mapped template. Columns are: reference\
base, A count, C count, G count, T count, N count,\
- count.",
dest="kma_matrix",
action='store_true',
default=False)
parser.add_argument("-x", "--extented_output",
help=("Give extented output with allignment files, "
"template and query hits in fasta and a tab "
"seperated file with gene profile results"),
action="store_true")
parser.add_argument("-q", "--quiet",
action="store_true",
default=False)
args = parser.parse_args()
##########################################################################
# MAIN
##########################################################################
if args.quiet:
f = open('/dev/null', 'w')
sys.stdout = f
    # Defining variables
min_cov = float(args.min_cov)
threshold = float(args.threshold)
method_path = args.method_path
# Check if valid database is provided
    if not args.db_path:
sys.exit("Input Error: No database directory was provided!\n")
elif not os.path.exists(args.db_path):
sys.exit("Input Error: The specified database directory does not "
"exist!\n")
else:
# Check existence of config file
db_config_file = '%s/config' % (args.db_path)
if not os.path.exists(db_config_file):
sys.exit("Input Error: The database config file could not be found!")
# Save path
db_path = args.db_path
# Check existence of notes file
notes_path = "%s/notes.txt" % (args.db_path)
if not os.path.exists(notes_path):
sys.exit('Input Error: notes.txt not found! (%s)' % (notes_path))
# Check if valid input files are provided
if args.inputfile is None:
sys.exit("Input Error: No input file was provided!\n")
elif not os.path.exists(args.inputfile[0]):
sys.exit("Input Error: Input file does not exist!\n")
elif len(args.inputfile) > 1:
if not os.path.exists(args.inputfile[1]):
sys.exit("Input Error: Input file does not exist!\n")
infile = args.inputfile
else:
infile = args.inputfile
# Check if valid output directory is provided
if not os.path.exists(args.out_path):
sys.exit("Input Error: Output dirctory does not exists!\n")
else:
out_path = args.out_path
# Add kma_matrix
if (args.kma_matrix):
extra_args = "-matrix"
else:
extra_args = None
# Check if valid tmp directory is provided
if args.tmp_dir:
if not os.path.exists(args.tmp_dir):
sys.exit("Input Error: Tmp dirctory, {}, does not exist!\n"
.format(args.tmp_dir))
else:
tmp_dir = os.path.abspath(args.tmp_dir)
else:
tmp_dir = out_path
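    # Pick the search backend from the input type: raw reads (fastq) are
    # handled with KMA, assembled genomes (fasta) with blastn.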
file_format = get_file_format(infile)
if file_format == "fastq":
if not method_path:
method_path = "kma"
if find_executable(method_path) is None:
sys.exit("No valid path to a kma program was provided. Use the -mp "
"flag to provide the path.")
# Check the number of files
if len(infile) == 1:
infile_1 = infile[0]
infile_2 = None
elif len(infile) == 2:
infile_1 = infile[0]
infile_2 = infile[1]
else:
sys.exit("Only 2 input file accepted for raw read data,\
if data from more runs is avaliable for the same\
sample, please concatinate the reads into two files")
sample_name = os.path.basename(sorted(args.inputfile)[0])
method = "kma"
finder = CholeraeFinder(db_conf_file=db_config_file,
databases=args.databases, db_path=db_path,
db_path_kma=args.db_path, notes=notes_path)
# if input_fastq2 is None, it is ignored by the kma method.
kma_run = finder.kma(inputfile_1=infile_1, inputfile_2=infile_2,
out_path=tmp_dir, databases=finder.databases,
db_path_kma=finder.db_path_kma,
min_cov=min_cov, threshold=threshold,
kma_path=method_path, kma_add_args=extra_args)
finder.create_results(results_method=kma_run, outdir=out_path)
elif file_format == "fasta":
if not method_path:
method_path = "blastn"
if find_executable(method_path) is None:
sys.exit("No valid path to a blastn program was provided. Use the "
"-mp flag to provide the path.")
# Assert that only one fasta file is inputted
assert len(infile) == 1, "Only one input file accepted for assembled data"
infile = infile[0]
method = "blast"
finder = CholeraeFinder(db_conf_file=db_config_file,
databases=args.databases, db_path=db_path,
notes=notes_path)
blast_run = finder.blast(inputfile=infile, out_path=tmp_dir,
min_cov=min_cov, threshold=threshold,
blast=method_path,
allowed_overlap=args.acq_overlap)
finder.create_results(results_method=blast_run, outdir=out_path)