Skip to content

Commit

Permalink
adds scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Geert van Geest committed Nov 29, 2024
1 parent 3058976 commit 1956fcb
Show file tree
Hide file tree
Showing 9 changed files with 159 additions and 0 deletions.
62 changes: 62 additions & 0 deletions scripts/01_download_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@

# Download every input needed for the somatic variant calling exercises:
#   data/resources  panel of normals + gnomAD germline resource (and chr6/chr17 subsets)
#   data/reference  exome intervals + chr6/chr17 reference genome
#   data/reads      paired-end FASTQ files for the tumor and the matched normal
#
# Run from the repository root; all paths are created relative to the
# current working directory. Requires: aws, wget, tar, bcftools.

# Fail fast: a failed download or cd must not let later rm/mv/tar steps run
# in the wrong directory or on incomplete files.
set -euo pipefail

IGENOMES_BUNDLE=s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle
WORKSHOP_URL=https://genomedata.org/pmbio-workshop

# -p everywhere so the script can be re-run without failing on existing dirs.
mkdir -p data/resources data/reference data/reads
data_dir="$PWD"/data

cd "$data_dir"/resources

# panel of normals and germline resource
# The gnomAD file is subset further down, so it has to be fetched as well.
# Files that are already present (e.g. placed there manually) are kept.
for resource in \
  1000g_pon.hg38.vcf.gz \
  1000g_pon.hg38.vcf.gz.tbi \
  af-only-gnomad.hg38.vcf.gz \
  af-only-gnomad.hg38.vcf.gz.tbi
do
  if [[ ! -f "$resource" ]]; then
    aws s3 \
      --no-sign-request --region eu-west-1 \
      cp \
      "$IGENOMES_BUNDLE"/"$resource" \
      .
  fi
done

cd "$data_dir"/reference

# exome intervals
wget "$WORKSHOP_URL"/references/exome/chr6_and_chr17/exome_regions.bed
wget "$WORKSHOP_URL"/references/exome/chr6_and_chr17/exome_regions.bed.interval_list

# reference genome
wget "$WORKSHOP_URL"/references/genome/chr6_and_chr17/ref_genome.tar
tar xvf ref_genome.tar
rm ref_genome.tar

# reads
cd "$data_dir"/reads

wget "$WORKSHOP_URL"/fastqs/chr6_and_chr17/Exome_Norm.tar
wget "$WORKSHOP_URL"/fastqs/chr6_and_chr17/Exome_Tumor.tar

tar xvf Exome_Norm.tar
rm Exome_Norm.tar
tar xvf Exome_Tumor.tar
rm Exome_Tumor.tar

# Flatten the archives into <sample>_R{1,2}.fastq.gz, the naming the
# alignment script expects.
mv Exome_Norm/Exome_Norm_R1.fastq.gz normal_R1.fastq.gz
mv Exome_Norm/Exome_Norm_R2.fastq.gz normal_R2.fastq.gz

mv Exome_Tumor/Exome_Tumor_R1.fastq.gz tumor_R1.fastq.gz
mv Exome_Tumor/Exome_Tumor_R2.fastq.gz tumor_R2.fastq.gz

rm -r Exome_Norm
rm -r Exome_Tumor

# subset vcf
# Restrict both resources to chr6 and chr17 (the only contigs in the
# reference) and index the results. -r needs the .tbi indexes fetched above.
cd "$data_dir"/resources
for vcf in af-only-gnomad.hg38 1000g_pon.hg38; do
  bcftools view -Oz -r chr6,chr17 -o "$vcf".subset.vcf.gz "$vcf".vcf.gz
  bcftools index --tbi "$vcf".subset.vcf.gz
done
6 changes: 6 additions & 0 deletions scripts/02_build_reference.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

# Build the BWA index for the reference genome. The index files are written
# next to ref_genome.fa inside REFDIR.
set -euo pipefail

REFDIR=/config/data/reference

# NOTE: the previous `mkdir -p "$ALIGNDIR"` was removed: ALIGNDIR is not
# defined in this script (so it expanded to `mkdir -p ""` and errored), and
# indexing the reference does not need an alignment directory.
bwa index "$REFDIR"/ref_genome.fa
18 changes: 18 additions & 0 deletions scripts/03_alignment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

# Align the paired-end reads of the tumor and normal samples to the reference
# with bwa mem and write one coordinate-sorted BAM per sample to ALIGNDIR.
# bwa's stderr goes to <sample>.bwa.log in the same directory.

# pipefail matters here: without it a crashing `bwa mem` would go unnoticed
# and leave a truncated BAM behind, because only the last pipeline stage's
# exit status would be checked.
set -euo pipefail

REFDIR=/config/data/reference
READDIR=/config/data/reads
ALIGNDIR=/config/data/alignments

mkdir -p "$ALIGNDIR"

for sample in tumor normal; do
  # samtools sort already emits BAM, so it writes the final file directly
  # ("-" = read from stdin); no extra `samtools view -bh` stage is needed.
  bwa mem \
    "$REFDIR"/ref_genome.fa \
    "$READDIR"/"$sample"_R1.fastq.gz \
    "$READDIR"/"$sample"_R2.fastq.gz \
    2> "$ALIGNDIR"/"$sample".bwa.log \
    | samtools sort -o "$ALIGNDIR"/"$sample".bam -
done
15 changes: 15 additions & 0 deletions scripts/04_add_readgroups.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env bash

# Attach read-group tags to the tumor and normal alignments.
# Reads  <sample>.bam  from ALIGNDIR and writes  <sample>.rg.bam  next to it.
# The sample name is used for the library (RGLB) and sample (RGSM) tags and
# as the suffix of the read-group ID.

ALIGNDIR=/config/data/alignments
platform_unit=HWI-ST466.C1TD1ACXX

for sample in tumor normal; do
  # Collect the options in an array so each one stays a separate word.
  rg_args=(
    --INPUT "${ALIGNDIR}/${sample}.bam"
    --OUTPUT "${ALIGNDIR}/${sample}.rg.bam"
    --RGLB "${sample}"
    --RGPU "${platform_unit}"
    --RGPL ILLUMINA
    --RGSM "${sample}"
    --RGID "${platform_unit}.${sample}"
  )
  gatk AddOrReplaceReadGroups "${rg_args[@]}"
done
13 changes: 13 additions & 0 deletions scripts/05_mark_duplicates.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

# Mark duplicate reads in the read-group-tagged BAMs of both samples and
# index the result.
# Reads   <sample>.rg.bam
# Writes  <sample>.rg.md.bam (+ index) and marked_dup_metrics_<sample>.txt
# All files live in ALIGNDIR.

# Stop at the first failure so `samtools index` never runs on a BAM that
# MarkDuplicates failed to produce or left incomplete.
set -euo pipefail

ALIGNDIR=/config/data/alignments

for sample in tumor normal; do
  gatk MarkDuplicates \
    --INPUT "$ALIGNDIR"/"$sample".rg.bam \
    --OUTPUT "$ALIGNDIR"/"$sample".rg.md.bam \
    --METRICS_FILE "$ALIGNDIR"/marked_dup_metrics_"$sample".txt

  samtools index "$ALIGNDIR"/"$sample".rg.md.bam
done
18 changes: 18 additions & 0 deletions scripts/06_run_mutect2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Call somatic variants with Mutect2 on the tumor/normal pair, restricted to
# the exome intervals. Writes somatic.vcf.gz to VARIANTDIR.

set -euo pipefail

ALIGNDIR=/config/data/alignments
REFDIR=/config/data/reference
RESOURCEDIR=/config/data/resources
VARIANTDIR=/config/data/variants

mkdir -p "$VARIANTDIR"

# `-normal normal` names the normal sample; the value must equal the RGSM
# tag that 04_add_readgroups.sh assigns to the normal BAM.
gatk Mutect2 \
  -R "$REFDIR"/ref_genome.fa \
  --intervals "$REFDIR"/exome_regions.bed.interval_list \
  -I "$ALIGNDIR"/tumor.rg.md.bam \
  -I "$ALIGNDIR"/normal.rg.md.bam \
  -normal normal \
  --germline-resource "$RESOURCEDIR"/af-only-gnomad.hg38.subset.vcf.gz \
  --panel-of-normals "$RESOURCEDIR"/1000g_pon.hg38.subset.vcf.gz \
  -O "$VARIANTDIR"/somatic.vcf.gz
14 changes: 14 additions & 0 deletions scripts/07_get_pileups.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Summarise read support at known germline variant sites for the tumor and
# the normal sample. Writes <sample>.pileups.table to VARIANTDIR.

set -euo pipefail

ALIGNDIR=/config/data/alignments
RESOURCEDIR=/config/data/resources
VARIANTDIR=/config/data/variants

# Use the chr6/chr17 subset created by 01_download_data.sh (the same file
# 06_run_mutect2.sh uses): the reference and the BAMs only contain these two
# contigs, so the full-genome VCF would list sites on contigs they lack.
GERMLINE_SITES="$RESOURCEDIR"/af-only-gnomad.hg38.subset.vcf.gz

# Do not rely on an earlier script having created the output directory.
mkdir -p "$VARIANTDIR"

for sample in tumor normal; do
  gatk GetPileupSummaries \
    -I "$ALIGNDIR"/"$sample".rg.md.bam \
    -V "$GERMLINE_SITES" \
    -L "$GERMLINE_SITES" \
    -O "$VARIANTDIR"/"$sample".pileups.table
done
10 changes: 10 additions & 0 deletions scripts/08_calculate_contamination.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

#!/usr/bin/env bash

# Estimate contamination in the tumor sample from the pileup summary tables
# of the tumor and its matched normal. Writes contamination.table to
# VARIANTDIR.

set -euo pipefail

# VARIANTDIR was previously never defined, so every path below resolved to
# the filesystem root (e.g. /tumor.pileups.table). The unused ALIGNDIR and
# RESOURCEDIR assignments were dropped.
VARIANTDIR=/config/data/variants

# Output name fixed from the typo "ontamination.table".
gatk CalculateContamination \
  -I "$VARIANTDIR"/tumor.pileups.table \
  -matched "$VARIANTDIR"/normal.pileups.table \
  -O "$VARIANTDIR"/contamination.table
3 changes: 3 additions & 0 deletions scripts/start_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

# Start the course container with the repository mounted at /config and the
# container's port 8443 published on the host.
set -euo pipefail

# Locate the repository root relative to this script (scripts/ sits directly
# below it) instead of hard-coding one user's absolute path, so the script
# works from any checkout location and any working directory.
repo_root=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)
cd "$repo_root"

# $PWD is quoted so checkout paths containing spaces are mounted correctly.
docker run --rm -v "$PWD":/config -p 8443:8443 geertvangeest/cancer-variants-vscode:latest

0 comments on commit 1956fcb

Please sign in to comment.