-
Notifications
You must be signed in to change notification settings - Fork 3
/
virtual-normal-correction-SVs.sh
executable file
·137 lines (111 loc) · 4.85 KB
/
virtual-normal-correction-SVs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash
### location of cgatools binary, change as needed, binaries may be downloaded from Complete Genomics website here: http://cgatools.sourceforge.net/
# latest version as of the writing of this code is included in this repository (v1.8)
cgatools="cgatools18"
### if invalid options were given, print usage instructions
function usage(){
echo ""
echo "Usage: $0 "
echo " --variants <CG junctions file> "
echo " --reference <reference .crr file> "
echo " --VN_junctionfiles_list <list with paths to CG-sequenced normals, one file per line> "
echo " --scoreThresholdA <default 10. The minimum number of discordant mate pair alignments supporting the junction from input genome> "
echo " --scoreThresholdB <default 10. The minimum number of discordant mate pair alignments supporting the junction from normal genomes> "
echo " --distance <default 200. Maximum distance between coordinates of potentially compatible junctions.>"
echo " --minlength <default 500. Minimum deletion junctions length to be included into the difference file>"
echo " --output_prefix <where to store final output, for example output/mysample> "
exit
}
### set some defaults
distance=200
minlength=500
scoreThresholdA=10
scoreThresholdB=10
### Parse input parameters
set -- `getopt -n$0 -u -a --longoptions="variants: reference: VN_junctionfiles_list: output_prefix: scoreThresholdA: scoreThresholdB: distance: minlength: " "h:" "$@"` || usage
[ $# -eq 0 ] && usage
while [ $# -gt 0 ]
do
case "$1" in
--variants) variants=$2;shift;;
--reference) crr=$2;shift;;
--VN_junctionfiles_list) VN_junctionfiles_list=$2;shift;;
--output_prefix) output_filtered="$2_filtered.tsv";output_report="$2_report.txt";output_prefix=$2;shift;;
--scoreThresholdA) scoreThresholdA=$2;shift;;
--scoreThresholdB) scoreThresholdB=$2;shift;;
--distance) distance=$2;shift;;
--minlength) minlength=$2;shift;;
-h) shift;;
--) shift;break;;
-*) usage;;
*) break;;
esac
shift
done
### check that we have all necessary information
if [[ $variants == "" || $crr == "" || $VN_junctionfiles_list == "" || $output_prefix == "" ]]
then
echo "A mandatory parameter was missing, make sure you provided all required parameters"
echo " variants: $variants"
echo " reference: $crr"
echo " VN_junctionfiles_list: $VN_junctionfiles_list"
echo " output_prefix: $output_prefix"
usage
fi
echo "********************************************************"
echo "* Virtual Normal Correction *"
echo "********************************************************"
echo ""
echo " variants: $variants"
echo " reference: $crr"
echo " VN_junctionfiles_list: $VN_junctionfiles_list"
echo " output_prefix: $output_prefix"
echo " distance: $distance "
echo " minlength: $minlength "
echo " scoreThresholdA: $scoreThresholdA"
echo " scoreThresholdB: $scoreThresholdB"
echo ""
echo " NOTE: performing multiple runs in the same directory simultaneously may give problems with the report output file"
echo ""
### make copy of input junctions file, as this file will be altered
outputdir=`dirname $output_prefix`
outputname=`basename $output_prefix`
junctions="${output_prefix}_currentjunctions.tsv"
cp $variants $junctions
### run JunctionDiff against all of the VN junctionfiles
echo "--> Running Virtual Normal Correction against each of the VN genomes"
count=0
while read line
do
if [[ $line != "" ]] # catch empty lines
then
count=$[$count+1]
$cgatools junctiondiff \
--beta \
--statout \
--reference $crr \
--junctionsA $junctions \
--junctionsB $line \
--scoreThresholdA $scoreThresholdA \
--scoreThresholdB $scoreThresholdB \
--distance $distance \
--minlength $minlength
# concatenate all reports
echo -e "report of run $count:\n----------------------" >> $output_report
cat report.tsv >> $output_report
rm report.tsv
echo "" >> $output_report
# rename output file to junctions file for next iteration
rm $junctions
mv "diff-${outputname}_currentjunctions.tsv" $junctions
fi
done < $VN_junctionfiles_list
### cleanup and move
cp $junctions $output_filtered
rm $junctions
### Run finished, tell user where to find result
echo "--> Virtual Normal Correction Completed."
echo "--> Output files created: "
echo " - Filtered junctions file: $output_filtered "
echo " - Report file: $output_report "
echo ""