-
Notifications
You must be signed in to change notification settings - Fork 0
/
Nano-Pal.SVA_F.sh
410 lines (294 loc) · 15.5 KB
/
Nano-Pal.SVA_F.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
echo "Nano-Pal Version1.0 Jan 22nd 2021"
#PB_DATA='/home/arthurz/arthur_remillsscr/19.01.22.NA12878_PacBio_pbmm2/19.01.29.alignment'
lib='lib'
#####
echo "Enter the your output prefix: "
read FC
#FC='FAO08905'
#####
echo "Enter the your element (SVA): "
read MEI
MEI='SVA'
#####
echo "Enter the your input data: "
read RAW
#RAW='/home/arthurz/arthur_remillsscr/19.10.26.Nanopore.validaton/workspace.12.29.20.rebatch/data/'${FC}
######
echo "Enter the your input reference GRCh38.fa file: "
read REF
#REF='/nfs/turbo/dcmb-brainsom/technical/reference/GRCh38_bsm_reference_genome/GRCh38_BSM.fa'
######
echo "Enter the your reference version (GRCh38 only for now): "
read REF_VER
#REF_VER='GRCh38'
######
echo "Enter the path of minimap2: "
read MINIMAP
#MINIMAP='/home/arthurz/app/minimap2/minimap2'
######
echo "Enter the path of PALMER: "
read PALMER
#PALMER='/home/arthurz/arthur_remillsscr/18.02.23.PALMER.upgrade/v1.7.2/PALMER'
#MEI reference sequence
L1Hs=${lib}'/L1.3'
AluYa5=${lib}'/AluYa5'
AluYb8=${lib}'/AluYb8'
SVA_E=${lib}'/SVA_E'
SVA_F=${lib}'/SVA_F'
#ref MEI information from RepeatMasker
S_refLINE=${lib}'/hg38.RM.L1.ref'
S_refALU=${lib}'/hg38.RM.ALU.ref'
S_refSVA=${lib}'/hg38.RM.SVA.ref'
#PALMER call for NA12878
S_origLINE=${lib}'/PALMER.NA12878.L1.txt'
S_origALU=${lib}'/PALMER.NA12878.ALU.txt'
S_origSVA=${lib}'/PALMER.NA12878.SVA.txt'
#P&P call for NA12878
S_PP_LINE=${lib}'/union.1214/L1.inter.fi'
S_PP_ALU=${lib}'/union.1214/ALU.inter.fi'
S_PP_SVA=${lib}'/union.1214/SVA.inter.fi'
mv ${FC}.workspace.${MEI}.svaf ${FC}.workspace.old
mkdir ${FC}.workspace.${MEI}.svaf
cd ${FC}.workspace.${MEI}.svaf
##RAW fastq data & generate fasta
cat ${RAW}/*.fastq > batch.fastq
cat batch.fastq | awk '{if(NR%4==1) {printf(">%s\n",substr($0,2));} else if(NR%4==2) print;}' > batch.fasta
wait
##alignment
${MINIMAP} -ax map-ont ${REF} batch.fastq > Nanopore.sam
samtools sort Nanopore.sam > Nanopore.sorted.bam
samtools index Nanopore.sorted.bam
wait
mkdir palmer
##PALMER
##pull out L1Hs signals
while read -r chr
do
${PALMER} --input Nanopore.sorted.bam --workdir palmer/ --output ${chr} --ref_ver ${REF_VER} --ref_fa ${REF} --type ${MEI} --chr ${chr}
done < ${lib}/chr.list
wait
##pull out all reads having putative L1Hs signal reported by PALMER
fasta_i=batch.fasta
rm blastn_refine.all.txt
cat palmer/chr*_*_*/blastn_refine.txt >> blastn_refine.all.txt
###Tool: Cigar information parsing
cp ${lib}/cigar.parser.1106.o .
samtools view Nanopore.sorted.bam -q 10 -F 0x100 -F 0x200 -F 0x800 -F 0x400 | awk '{print $1,$3,$4,$6}' > mapped.info.txt
rm cigar_results.all.txt
while read -r a b c d
do
echo ${d} > input_cigar
./cigar.parser.1106.o
wait
cat cigar_results >> cigar_results.all.txt
done < mapped.info.txt
##Get the information of start and end for each read with L1Hs signal
awk '{print $4+$6+$10}' cigar_results.all.txt > cigar.ref.txt
paste mapped.info.txt cigar.ref.txt | awk '{print $1,$2,$3,$3+$5}' > mapped.info.final.txt
rm 5.plus.txt
rm 5.minus.txt
rm 3.plus.txt
rm 3.minus.txt
rm read.length.txt
rm read.name.txt
mkdir blastn
#blastn -evalue 0.001 -task blastn -gapopen 4 -query ${L1Hs} -subject ${fasta_i} -outfmt "7 qacc sacc evalue qstart qend sstart send" > blastn/all.blastn.txt
line=$(wc -l $fasta_i| awk '{print $1}')
##Grep the L1Hs sequence details (in L1Hs reference sequence) using blastn
for((i=0;i<${line}/2;i++))
do
let t=${line}-i*2
tail -n ${t} ${fasta_i} | head -n 2 > blastn/${t}.fasta
#######MEI
blastn -evalue 0.001 -task blastn -gapopen 4 -query ${SVA_F} -subject blastn/${t}.fasta -outfmt "7 qacc sacc evalue qstart qend sstart send" > blastn/${t}.blastn.txt
awk 'NR%2' blastn/${t}.fasta | awk '{print $1}' | sed -e "s/>//g" >> read.name.txt
len=$(awk '!(NR%2)' blastn/${t}.fasta | awk '{print length($1)}')
let len_fix=len-100
echo ${len} >> read.length.txt
#######MEI
cat blastn/${t}.blastn.txt | grep -v "#" | awk 'BEGIN{flag=0}{if($5>1006&&($6<100||$7<100)&&($6<$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' >> 5.plus.txt
cat blastn/${t}.blastn.txt | grep -v "#" | awk 'BEGIN{flag=0}{if($5>1006&&($6<100||$7<100)&&($6>$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' >> 5.minus.txt
cat blastn/${t}.blastn.txt | grep -v "#" | awk 'BEGIN{flag=0}{if($5>1006&&($6>'$len_fix'||$7>'$len_fix')&&($6<$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' >> 3.plus.txt
cat blastn/${t}.blastn.txt | grep -v "#" | awk 'BEGIN{flag=0}{if($5>1006&&($6>'$len_fix'||$7>'$len_fix')&&($6>$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' >> 3.minus.txt
wait
done
paste read.name.txt read.length.txt 5.plus.txt 5.minus.txt 3.plus.txt 3.minus.txt > read.all.txt
wait
rm palmer.5.plus.txt
rm palmer.5.minus.txt
rm palmer.3.minus.txt
rm palmer.3.plus.txt
rm palmer.map.txt
##Grep the polymorphic L1Hs sequence details (in L1Hs reference sequence) from PALMER
while read -r a b c d e f
do
let b_fix=b-100
#######MEI
awk 'BEGIN{flag=0}{if($2~"'$a'"&&$5>1006&&($6<100||$7<100)&&($6<$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' blastn_refine.all.txt >> palmer.5.plus.txt
awk 'BEGIN{flag=0}{if($2~"'$a'"&&$5>1006&&($6<100||$7<100)&&($6>$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' blastn_refine.all.txt >> palmer.5.minus.txt
awk 'BEGIN{flag=0}{if($2~"'$a'"&&$5>1006&&($6>'$b_fix'||$7>'$b_fix')&&($6<$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' blastn_refine.all.txt >> palmer.3.plus.txt
awk 'BEGIN{flag=0}{if($2~"'$a'"&&$5>1006&&($6>'$b_fix'||$7>'$b_fix')&&($6>$7)) flag+=1} END{if(flag>=1) print "1";else print "0"}' blastn_refine.all.txt >> palmer.3.minus.txt
awk 'BEGIN{flag=0}{if($1=="'$a'") {flag+=1;print $2,$3,$4}} END{if(flag==0) print "NON","0","0"}' mapped.info.final.txt >> palmer.map.txt
wait
done < read.all.txt
paste read.all.txt palmer.5.plus.txt palmer.5.minus.txt palmer.3.plus.txt palmer.3.minus.txt palmer.map.txt > read.all.palmer.final.txt
wait
#cp read.all.palmer.txt read.all.palmer.final.txt
mkdir inter
##Tool: intersect with reference L1NE data from RepeatMasker and PALMER calls
cp ${lib}/inter inter/
##Tool: exclude redundant intersection for each read
cp ${lib}/RM_collapse.0921.o inter/
cd inter
awk '{print $11,$12,$12+1,$1}' ../read.all.palmer.final.txt | grep -v "NON" > read.5.Q.txt
awk '{print $11,$13,$13+1,$1}' ../read.all.palmer.final.txt | grep -v "NON" > read.3.Q.txt
cp read.5.Q.txt Q.txt
#######MEI
cp ${S_refSVA} S.txt
./inter
awk '{if($8=="") print $1,$2,$3,$4,"NON"; else print $1,$2,$3,$4,$8}' inter.txt > input_rm_cluster.txt
./RM_collapse.0921.o
mv output_rm_cluster.txt read.5.Q.ref.txt
#######MEI
cp ${S_origSVA} S.txt
./inter
awk '{if($8=="") print $1,$2,$3,$4,"NON"; else print $1,$2,$3,$4,$8}' inter.txt > read.5.Q.P.txt
cp read.3.Q.txt Q.txt
#######MEI
cp ${S_refSVA} S.txt
./inter
awk '{if($8=="") print $1,$2,$3,$4,"NON"; else print $1,$2,$3,$4,$8}' inter.txt > input_rm_cluster.txt
./RM_collapse.0921.o
mv output_rm_cluster.txt read.3.Q.ref.txt
#######MEI
cp ${S_origSVA} S.txt
./inter
awk '{if($8=="") print $1,$2,$3,$4,"NON"; else print $1,$2,$3,$4,$8}' inter.txt > read.3.Q.P.txt
cd ..
wait
sort -k 4 inter/read.5.Q.P.txt > 5.P.inter.txt
sort -k 4 inter/read.3.Q.P.txt > 3.P.inter.txt
sort -k 4 inter/read.5.Q.ref.txt > 5.ref.inter.txt
sort -k 4 inter/read.3.Q.ref.txt > 3.ref.inter.txt
paste 5.P.inter.txt 3.P.inter.txt 5.ref.inter.txt 3.ref.inter.txt | awk '{print $4,$5,$10,$15,$20}' > read.RM.txt
rm read.RM.2.txt
while read -r a b c d e f g h i j k l m
do
awk 'BEGIN{score=0}{if($1=="'$a'") {score+=1;print $2,$3,$4,$5}} END{if(score==0) print "NA","NA","NA","NA"}' read.RM.txt >> read.RM.2.txt
wait
done < read.all.palmer.final.txt
##Summary table of L1Hs for each read
paste read.all.palmer.final.txt read.RM.2.txt > summary.final.txt
##Tool: interscetion
mkdir inter.2
cp ${lib}/inter.0118.o inter.2/
##different types of LINE
#######MEI
less ${S_refSVA} | grep SVA_E > ref.SVA_E
less ${S_refSVA} | grep SVA_F > ref.SVA_F
less ${S_refSVA} | grep -v SVA_E | grep -v SVA_F | grep SVA > ref.SVA
##Valid mapping reads
samtools view Nanopore.sorted.bam -q 10 -F 0x100 -F 0x200 -F 0x800 -F 0x400 -f 0x10 | awk '{print $1}' > RC.all.list
rm RC.tag
while read -r aa b c d e f g h i j k l m n o p q r
do
awk 'BEGIN{flag=0}{if($1=="'$aa'") flag+=1} END{if(flag>=1) print "1";else print "0"}' RC.all.list >> RC.tag
wait
done < summary.final.txt
paste summary.final.txt RC.tag > summary.final.2.txt
SIGNAL1=$(less summary.final.2.txt |awk '{if((($3+$4)!=0&&($5+$6)==0)||(($3+$4)==0&&($5+$6)!=0)) print $0}' | wc -l)
SIGNAL2=$(less summary.final.2.txt |awk '{if(($3+$4)!=0&&($5+$6)!=0) print $0}' | wc -l)
FAIL=$(less summary.final.2.txt |awk '{if(($3+$4+$5+$6)==0) print $0}' | wc -l)
less summary.final.2.txt |awk '{if($11=="NON") print $0}' > summary.final.unmap.read.txt
less summary.final.2.txt |awk '{if($11!="NON") print $0}' |awk '{if(($3+$4+$5+$6)==0&&($7+$8+$9+$10)!=0) print $0}'> summary.final.odd.read.txt
less summary.final.2.txt |awk '{if($11!="NON") print $0}' |awk '{if(($3+$4+$5+$6+$7+$8+$9+$10)==0) print $0}'> summary.final.no.read.txt
less summary.final.2.txt |awk '{if($11!="NON") print $0}' |awk '{if(($3+$4+$5+$6)!=0) print $0}'> summary.final.all.read.txt
#less summary.final.2.txt |awk '{if($11!="NON") print $0}' |awk '{if(($3+$4+$5+$6)!=0&&($7+$8+$9+$10)!=0) print $0}'> summary.final.PALMER.L1.read.txt
less summary.final.2.txt |awk '{if($11!="NON") print $0}' |awk '{if(($3+$4+$5+$6)!=0&&($7+$8+$9+$10)!=0) print $0}'> summary.final.PALMER.read.txt
less summary.final.2.txt |awk '{if($11!="NON") print $0}' |awk '{if(($3+$4+$5+$6)!=0&&($7+$8+$9+$10)==0) print $0}'> summary.final.ref.read.txt
#less summary.final.PALMER.L1.read.txt |awk '{if(($7+$8)>0&&($9+$10)==0) print $11,$12,$14,$18,"5"; else if (($7+$8)==0&&($9+$10)>0) print $11,$13,$15,$18,"3"; else if(($7+$8)>0&&($9+$10)>0) print $11,$12,$14,$18,"5""\n"$11,$13,$15,$18,"3"}' | sort -k 2 -n | sort -k 1 > capture.loci.palmer
less summary.final.PALMER.read.txt |awk '{if(($7+$8)>0&&($9+$10)==0&&$7>0) print $11,$12,$14,$18,"5","+";else if(($7+$8)>0&&($9+$10)==0&&$8>0) print $11,$12,$14,$18,"5","-";else if (($7+$8)==0&&($9+$10)>0&&$9>0) print $11,$13,$15,$18,"3","+";else if (($7+$8)==0&&($9+$10)>0&&$10>0) print $11,$13,$15,$18,"3","-"; else if(($7+$8)>0&&($9+$10)>0&&$7>0&&$9>0) print $11,$12,$14,$18,"5","+""\n"$11,$13,$15,$18,"3","+";else if(($7+$8)>0&&($9+$10)>0&&$7>0&&$10>0) print $11,$12,$14,$18,"5","+""\n"$11,$13,$15,$18,"3","-";else if(($7+$8)>0&&($9+$10)>0&&$8>0&&$9>0) print $11,$12,$14,$18,"5","-""\n"$11,$13,$15,$18,"3","+";else if(($7+$8)>0&&($9+$10)>0&&$8>0&&$10>0) print $11,$12,$14,$18,"5","-""\n"$11,$13,$15,$18,"3","-"}' | sort -k 2 -n | sort -k 1 > capture.loci.palmer
less summary.final.PALMER.read.txt |awk '{if(($7+$8)>0&&($9+$10)==0&&$18==0&&($5+$6)>0) print $11,$13,$17,$18,"3"; else if(($7+$8)>0&&($9+$10)==0&&$18==1&&($3+$4)>0) print $11,$13,$17,$18,"3"; else if (($7+$8)==0&&($9+$10)>0&&$18==0&&($3+$4)>0) print $11,$12,$16,$18,"5"; else if (($7+$8)==0&&($9+$10)>0&&$18==1&&($5+$6)>0) print $11,$12,$16,$18,"5"}' | sort -k 2 -n | sort -k 1 > capture.loci.ref.add
less summary.final.ref.read.txt |awk '{if(($3+$4)>0&&($5+$6)==0&&$18==0) print $11,$12,$16,$18,"5"; else if(($3+$4)>0&&($5+$6)==0&&$18==1) print $11,$13,$17,$18,"3"; else if (($3+$4)==0&&($5+$6)>0&&$18==0) print $11,$13,$17,$18,"3"; else if (($3+$4)==0&&($5+$6)>0&&$18==1) print $11,$12,$16,$18,"5"; else if(($3+$4)>0&&($5+$6)>0) print $11,$12,$16,$18,"5""\n"$11,$13,$17,$18,"3"}' | sort -k 2 -n | sort -k 1 > capture.loci.ref
less capture.loci.palmer | grep cluster | awk '{print $1,$2,$2+1,$3,$4,$5,"Nanopore"}' > capture.loci.palmer.process
#less capture.loci.palmer | grep -v cluster | awk '{print $1,$2,$2+1,$3,$4,$5,"Nanopore"}' > capture.loci.potential.process
less capture.loci.palmer | grep -v cluster | awk '{print $1,$2,$2+1,$3,$4,$5,$6,"Nanopore"}' > capture.loci.potential.process
cat capture.loci.ref capture.loci.ref.add > capture.loci.ref.all
less capture.loci.ref.all | grep SVA_F | awk '{print $1,$2,$2+1,$3,$4,$5,"Nanopore"}' > capture.loci.r.SVA_F
less capture.loci.ref.all | grep -v SVA_F | grep SVA_E | awk '{print $1,$2,$2+1,$3,$4,$5,"Nanopore"}' > capture.loci.r.SVA_E
less capture.loci.ref.all | grep -v SVA_E | grep -v SVA_F | grep SVA | awk '{print $1,$2,$2+1,$3,$4,$5,"Nanopore"}' > capture.loci.r.SVAother
less capture.loci.ref.all | grep -v SVA_E | grep -v SVA_F | grep -v SVA | awk '{print $1,$2,$2+1,$3,$4,$5,"Nanopore"}' > capture.loci.r.SVAnon
cd inter.2
#######MEI
cp ${S_PP_SVA} Q.txt
cp ../capture.loci.palmer.process S.txt
./inter.0118.o
wait
mv inter.txt inter.palmer.txt
cp ../capture.loci.potential.process S.txt
./inter.0118.o
wait
mv inter.txt inter.palmer.add.txt
#######MEI
cp ../ref.SVA_E Q.txt
cp ../capture.loci.r.SVA_E S.txt
./inter.0118.o
wait
mv inter.txt inter.r.SVA_E.txt
#######MEI
cp ../ref.SVA_F Q.txt
cp ../capture.loci.r.SVA_F S.txt
./inter.0118.o
wait
mv inter.txt inter.r.SVA_F.txt
#######MEI
cp ../ref.SVA Q.txt
cp ../capture.loci.r.SVAother S.txt
./inter.0118.o
wait
mv inter.txt inter.r.SVA.txt
##report the numbers of captured events
#cat inter.palmer.txt inter.palmer.add.txt | grep Nano | awk '{print $4,$30 ,$17, $(NF-1)}' | sort | uniq -c > p.txt.fi.2
cat inter.palmer.txt inter.palmer.add.txt | grep Nano | awk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' | sort | uniq -c > p.txt.fi
#######MEI
less inter.r.SVA_E.txt | grep Nano | awk '{print $1,$2,$3}' | sort | uniq -c > r.SVA_E.txt.fi
less inter.r.SVA_F.txt | grep Nano | awk '{print $1,$2,$3}' | sort | uniq -c > r.SVA_F.txt.fi
less inter.r.SVA.txt | grep Nano | awk '{print $1,$2,$3, $4}' | sort | uniq -c > r.SVA.txt.fi
less r.SVA.txt.fi | grep SVA_D > r.SVA_D.txt.fi
less r.SVA.txt.fi | grep -v SVA_D > r.SVAother.txt.fi
cp ../capture.loci.potential.process Q.txt
#######MEI
cp ${S_PP_SVA} S.txt
./inter.0118.o
wait
less inter.txt | grep -v cluster > inter.potential.txt
cd ..
#######MEI
cp ${lib}/cluster.1219.o .
cp inter.2/inter.potential.txt input_cluster.txt
./cluster.1219.o
cp clustered.txt potential.clustered.txt.fi
pwd
echo There are ${SIGNAL1} reads caputuring putative ${MEI} signals on one end.
echo There are ${SIGNAL2} reads caputuring putative ${MEI} signals on both ends.
echo There are ${FAIL} reads having no putative L1 signals.
less inter.2/p.txt.fi | awk '{sum+=$1} END {print "Non-reference SVA reads sum = ", sum}'
less inter.2/r.SVA_E.txt.fi | awk '{sum+=$1} END {print "Reference SVA_E reads sum = ", sum}'
less inter.2/r.SVA_F.txt.fi | awk '{sum+=$1} END {print "Reference SVA_F reads sum = ", sum}'
less inter.2/r.SVA_D.txt.fi | awk '{sum+=$1} END {print "Reference SVA_D reads sum = ", sum}'
less inter.2/r.SVAother.txt.fi | awk '{sum+=$1} END {print "Other reference SVA reads sum = ", sum}'
less potential.clustered.txt.fi | awk '{sum+=$4} END {print "Potential specific non-reference SVA reads sum = ", sum}'
echo "Number of non_SVA reads"
less capture.loci.r.SVAnon | wc -l | awk '{print $1}'
echo "Number of non-reference SVA and the file (number of suppoting reads + coordinate + overlap information)"
wc -l inter.2/p.txt.fi
echo "Number of reference SVA_E and the file (number of suppoting reads + coordinate)"
wc -l inter.2/r.SVA_E.txt.fi
echo "Number of reference SVA_F and the file (number of suppoting reads + coordinate)"
wc -l inter.2/r.SVA_F.txt.fi
echo "Number of reference SVA_D and the file (number of suppoting reads + coordinate)"
wc -l inter.2/r.SVA_D.txt.fi
echo "Number of other reference SVA and the file (number of suppoting reads + coordinate)"
wc -l inter.2/r.SVAother.txt.fi
echo "Number of potential specific non-reference SVA and the file (coordinate + number of suppoting reads + number of left suppoting reads + number of right suppoting reads + strand)"
wc -l potential.clustered.txt.fi