# (Removed GitHub web-interface scrape artifacts: "Notifications / Fork" navigation text
#  and the gutter line numbers 1-68, which are not part of the script and would be
#  executed as commands if left in place.)
#!/bin/bash
# This script has been developed at the Institute of Biological Chemistry of the School of Exact and Natural Science (IQUIBICEN)
# of University of Buenos Aires.
#____________________________________________________________________________________________________________________________
# After GATK-CNV analysis with gatk_case.sh, this pipeline performs sample and variant filtering:
#   1) Sample filtering: remove samples with more than 100 CNV calls.
#   2) Variant filtering: remove common CNVs.  The original header said "frequency > 1%",
#      but the implementation drops any interval called in 3 or more samples
#      -- NOTE(review): confirm the intended threshold against the cohort size.
set -u  # treat use of an unset variable as an error

#######################################
# Reformat each per-sample GATK-CNV VCF segment file into a _type.bed file.
# Keeps VCF columns CHROM, POS, ALT, QUAL, INFO (with the "END=" prefix
# stripped) and the sample column (FORMAT sub-fields split on ':'), and
# discards rows whose ALT is "." (no CNV called).
# Arguments:
#   $1 - directory holding one sub-directory per sample (case-results)
#   $2 - output directory for the generated *_type.bed files
#######################################
make_type_beds() {
  local case_results=$1 newfiles=$2
  local sample_dir
  for sample_dir in "${case_results}"/*; do
    [ -d "${sample_dir}" ] || continue  # skip stray files / unmatched glob
    zcat "${sample_dir}/genotyped-segments-case-sample_0.vcf" \
      | grep -v "#" \
      | cut -f1,2,5,6,8,10 \
      | sed 's/END=//g' | sed 's/:/\t/g' \
      | awk '{if ($3!=".") print $1"\t"$2"\t"$5"\t"$3"\t"$4"\t"$6"\t"$7"\t"$8"\t"}' \
      > "${newfiles}/$(basename "${sample_dir}")_type.bed"
  done
}

#######################################
# 1) Sample filtering: delete *_type.bed files with more than max_cnvs calls.
# Arguments:
#   $1 - directory containing the *_type.bed files
#   $2 - maximum number of CNV calls per sample (default 100)
#######################################
filter_samples() {
  local newfiles=$1 max_cnvs=${2:-100}
  local bed count
  for bed in "${newfiles}"/*_type.bed; do
    [ -e "${bed}" ] || continue   # glob matched nothing
    count=$(wc -l < "${bed}")     # one CNV call per line
    if [ "${count}" -gt "${max_cnvs}" ]; then
      rm -- "${bed}"
    fi
  done
}

#######################################
# 2) Variant filtering: build unique_cnvs.bed.
# The chr/start/end/cnv_type fields of every interval are joined with '@',
# identical intervals are counted across samples, and only intervals seen at
# most max_count times are kept (more frequent CNVs are considered common and
# dropped).  Survivors are sorted by chromosome/position and written to
# $1/unique_cnvs.bed.
# Arguments:
#   $1 - directory containing the *_type.bed files
#   $2 - maximum across-sample count for an interval to be kept (default 2)
#######################################
build_unique_cnvs() {
  local newfiles=$1 max_count=${2:-2}
  local bed
  for bed in "${newfiles}"/*_type.bed; do
    [ -e "${bed}" ] || continue
    awk '{print $1,$2,$3,$4}' "${bed}" | sed 's/ /@/g'
  done \
    | sort | uniq -c | sort -nrk1,1 \
    | awk -v max="${max_count}" '{if ($1 <= max) print $2}' \
    | sed 's/@/\t/g' | sed 's/^chr//g' \
    | sort -nk1,1 -nk2,2 | sed 's/^/chr/g' \
    > "${newfiles}/unique_cnvs.bed"
}

#######################################
# Keep, per sample, only the CNV calls whose chr/start/end/type appear in
# unique_cnvs.bed; the filtered calls go to $2/<sample>.bed.
# Arguments:
#   $1 - directory containing unique_cnvs.bed and the *_type.bed files
#   $2 - output directory for the filtered per-sample .bed files
#######################################
apply_unique_filter() {
  local newfiles=$1 unique_cnvs=$2
  local bed
  for bed in "${newfiles}"/*_type.bed; do
    [ -e "${bed}" ] || continue
    awk '(FNR==NR){a[$1,$2,$3,$4];next} (($1,$2,$3,$4) in a) {print $0}' \
      "${newfiles}/unique_cnvs.bed" "${bed}" \
      > "${unique_cnvs}/$(basename "${bed}" _type.bed).bed"
  done
}

main() {
  # Paths to directories required for this pipeline.  The original assignments
  # 'case-results=...' (hyphen is not a valid identifier) and 'newfiles= path'
  # (space after '=' runs the path as a command) were shell errors; fixed here.
  local output_dir=${HOME}/output_dir
  local case_dir=${output_dir}/case
  local case_results=${case_dir}/case-results
  local newfiles=${output_dir}/newfiles
  local unique_cnvs=${newfiles}/unique_cnvs

  mkdir -p "${newfiles}" "${unique_cnvs}"

  # Re-run guard: only reformat the VCFs when no *_type.bed exists yet.
  # (Checks for *_type.bed files rather than directory emptiness, because
  # unique_cnvs lives inside newfiles and would make the dir look non-empty.)
  if compgen -G "${newfiles}/*_type.bed" > /dev/null; then
    echo "Case VCF segment files were processed to access CNV calls"
  else
    make_type_beds "${case_results}" "${newfiles}"
  fi

  filter_samples "${newfiles}" 100

  if [ ! -f "${newfiles}/unique_cnvs.bed" ]; then
    build_unique_cnvs "${newfiles}" 2
  fi

  if [ -z "$(ls -A "${unique_cnvs}")" ]; then
    apply_unique_filter "${newfiles}" "${unique_cnvs}"
  else
    echo "Samples Common CNVs filtering already done "
  fi
}

main "$@"