-
Notifications
You must be signed in to change notification settings - Fork 0
/
FASTQcharacterize.sh
executable file
·210 lines (196 loc) · 6.68 KB
/
FASTQcharacterize.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/bin/bash
#This class finds the read length histogram, quality score values, insertion/deletion/mutation rates, and other parameters
#for an input fastq file
# @author Anna Shcherbina (mailto: [email protected])
# Copyrigth 2013 Anna Shcherbina
#License: GNU GPL license (http://www.gnu.org/licenses/gpl.html)
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
unset inputfastq
unset ref
unset blastresults
unset full
unset sam
unset bam
plothistogram=""
#split input fasta file into blocks of size 1000
blocksize=1000
#default number of blocks to use for the BLAST algorithm
defaultblocksforblast=5
array=( $@ )
numinputs=$#
for ((i=0; i<$numinputs; i++))
do
if [ ${array[$i]} = -input ]
then
inputfastq=${array[$i+1]}
fi
if [ ${array[$i]} = -reference ]
then
ref=${array[$i+1]}
fi
if [ ${array[$i]} = -blastresults ]
then
blastresults=${array[i+1]}
fi
if [ ${array[$i]} = -full ]
then
full=true
fi
if [ ${array[$i]} = -plothistogram ]
then
plothistogram="-plothistogram"
fi
if [ ${array[$i]} = -sam ]
then
sam=${array[i+1]}
echo "sam file identified: $sam"
fi
if [ ${array[$i]} = -bam ]
then
bam=${array[i+1]}
echo "bam file identified: $bam"
fi
done
printhelp=false;
if [[ -z "$inputfastq" ]]
then
printhelp=true
elif [[ -z "$ref" ]]
then
if [[ -z "$blastresults" ]]
then
printhelp=true
fi
fi
if [ ${printhelp} = true ]
then
echo "FASTQcharacterize can be used with the BLAST alignment tool or with an input SAM/BAM file generated by an external aligner"
echo ""
echo "To use with BLAST aligner:"
echo ""
echo "FASTQcharacterize.sh -input <input fastq file to characterize> -reference <.fasta reference file> "
echo "options: "
echo ""
echo "-blastresults <blast results file> If a BLAST results file is specified, which might be preferred for speed purposes, the software will parse this input and use it as an input for dataset characterization" | fold -w 80
echo "-reference This can be specified as an alternative to 'blastresults'. If specified, the software will run BLAST of the input fastq file against the input fasta reference" | fold -w 80
echo "-full If a reference is provided and the -full flag is set, blast will be performed on the entire input fastq file. This can be slow, so by default only the first 5000 sequences of the input fastq file (or less) are used to perform blast against the reference dataset." | fold -w 80
echo ""
echo "To use with BAM/SAM input file:"
echo ""
echo "FASTQcharacterize.sh -input <input fastq file to characterize > -reference <.fasta reference file> -bam[-sam] <.bam or .sam file > "
echo ""
echo "-plothistogram flag will cause histograms to be generated as the dataset is characterized." | fold -w 80
exit
fi
echo "converting input fastq into fasta and quality files"
fasta=`echo $inputfastq | sed 's/.fastq/.fasta'/`
fasta=`echo $fasta | sed 's/.fq/.fasta'/`
qual=`echo $inputfastq | sed 's/.fastq/.qual'/`
qual=`echo $qual | sed 's/.fq/.qual'/`
basename=`echo $fasta | sed 's/.fasta/'/`
echo "fasta = $fasta"
echo "qual = $qual"
python src/fastq_extract.py $inputfastq
echo "complete"
echo "getting read histogram..."
python src/readHist.py $fasta $plothistogram
echo "complete"
echo "getting quality histogram..."
python src/qualityHist.py $qual $plothistogram
echo "complete"
readHistName=`echo $fasta | sed 's/.fasta/readHist.csv'/`
qualHistName=`echo $qual | sed 's/.qual/qualHist.csv'/`
#if a BAM file is specified, convert to sam
if [[ -n "$bam" ]]
then
echo "converting input bam file to sam format"
sam=`echo $bam | sed 's/.bam/.sam'/`
./samtools view $bam > $sam
fi
#annotate the matching bases and single base mutations in the sam file
if [[ -n "$sam" ]]
then
echo "annotating single-base mutations in sam file"
sam_annotated=`echo $sam | sed 's/.sam/_annotated.sam'/`
echo "sam_annotated: $sam_annotated"
echo "sam: $sam"
echo "reference: $ref"
./samtools calmd -e -S $sam $ref > $sam_annotated
echo "obtaining msa from sam file"
python src/msa_from_sam.py $sam_annotated $basename $plothistogram
echo "complete!"
fi
if [[ -z $sam ]]
then
if [[ -z "$blastresults" ]]
then
echo "splitting fasta into blocks"
numblocks=`python src/fasta_blocks.py $fasta $blocksize`
echo "complete"
echo "creating BLAST database"
./makeblastdb -in $ref -dbtype nucl
success=`echo $?`
if [[ "$success" != "0" ]]
then
echo "makeblastdb failed to create BLAST databsae. Exiting now ..."
exit;
fi
echo "complete"
if [[ -z "$full" ]]
then
if [[ $numblocks -gt $defaultblocksforblast ]]
then
numblocks=$defaultblocksforblast
fi
fi
for ((i=0; i<$numblocks; i++))
do
echo "running blast on input block $i ..."
./blastn -num_alignments 10 -num_threads 2 -query $fasta\_$i -db $ref -out $basename\_b_$i.blast
success=`echo $?`
if [[ "$success" != "0" ]]
then
echo "BLAST command failed. Exiting now ... "
exit
fi
echo "done"
echo "Parsing blast block $i..."
java -jar BlastParser.jar $basename\_b_$i.blast > $basename\_parsed.csv_$i
echo "done"
done
cat $basename\_parsed.csv_* > $basename\_parsed.csv
else
java -jar BlastParser.jar $blastresults > $basename\_parsed.csv
fi
python src/getBlastStats.py $fasta $basename\_parsed.csv $plothistogram
echo "writing summary file for all statistics files generated"
summary=$basename\summary.csv
rm $summary
touch $summary
echo "${basename}delCount.csv" >> $summary
echo "${basename}delsByRead.csv" >> $summary
echo "${basename}delSize.csv" >> $summary
echo "${basename}insertCount.csv" >> $summary
echo "${basename}insertsByRead.csv" >> $summary
echo "${basename}insertSize.csv" >> $summary
echo "${basename}mutationCount.csv" >> $summary
echo "${basename}mutationType.csv" >> $summary
echo "${basename}posCount.csv" >> $summary
echo "${basename}primerCheck.csv" >> $summary
echo "${basename}qualHist.csv" >> $summary
echo "${basename}readHist.csv" >> $summary
echo "${basename}Usage.csv" >> $summary
fi