Skip to content

Commit

Permalink
updated coverage analysis scripts and promoter region annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
mezewudo committed Oct 27, 2016
1 parent 92edd89 commit 2775ec3
Show file tree
Hide file tree
Showing 7 changed files with 7,261 additions and 255 deletions.
7,111 changes: 7,065 additions & 46 deletions bin/bed_list.txt

Large diffs are not rendered by default.

36 changes: 22 additions & 14 deletions bin/del_parse.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,46 @@
#! /usr/bin/python

## iterates through coverage files to fine loci with no coverage
## iterates through coverage files to find loci with no coverage

import sys
import re
from string import join

dele = False
loci = []
ids = ["Rv0005","Rv0006","Rv0407","Rv0486","Rv0635","Rv0667","Rv0676c","Rv0677c","Rv0678","Rv0682","Rv0701","Rv1137","Rv1258c","Rv1305","rrs","rrl","fabG11/inhA promoter","Rv1483","Rv1484","Rv1694","Rv1704c","Rv1908c","Rv1909c","Rv1988","Rv2043c","pncA promoter","Rv2416c","eis promoter","Rv2428","ahpC promoter","Rv2447c","Rv2671","Rv2763c","Rv2764c","Rv3197A","whiB7 promoter","Rv3261","Rv3262","Rv3547","Rv3792","Rv3793","Rv3794","Rv3795","Rv3806c","Rv3854c","Rv3919c"]
ids = []
names = []
loci = []

input3 = sys.argv[3]
fh3 = open(input3,'r')
for lines in fh3:
fields = lines.rstrip("\r\n").split("\t")
ids.append(fields[4])
names.append(fields[3])
fh3.close()

input1 = sys.argv[1]
input2 = sys.argv[2]

fh1 = open(input1 + "_Resistance_Region_Coverage.txt",'r')
fh1 = open(input1 + "_genome_region_coverage.txt",'r')
print "Sample ID" + "\t" + "CHROM" + "\t" + "POS" + "\t" + "REF" + "\t" + "ALT" + "\t" + "Read Depth" + "\t" + "Quality" + "\t" + "Percent Alt allele" + "\t" + "Annotation" + "\t" + "Variant Type" + "\t" + "Nucleotide Change" + "\t" + "Position within CDS " + "\t" + "Amino acid change" + "\t" + "REF Amino acid" + "\t" + "ALT Amino Acid" + "\t" + "Codon Position" + "\t" "Gene name" + "\t" + "Gene ID" + "\t" + "Transcript ID" + "\t" + "Annotation details"

for lines in fh1:
if lines.startswith("Chrom"):
continue
fields = lines.rstrip("\r\n").split("\t")
if int(fields[4]) < 10:
ind = ids.index(fields[4])
if float(fields[5]) < 10.0:
dele = True
print input2 + "\t" + fields[3] + "\t" + "Complete deletion"
elif int(fields[5]) < 90:
print input2 + "\t" + "NC_000962" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "Complete deletion" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + names[ind] + "\t" + fields[4] + "\t" + "NA" + "\t" + "NA"
elif int(fields[6]) < 96:
dele = True
print input2 + "\t" + fields[3] + "\t" + "Partial deletion"
loci.append(fields[3])
print input2 + "\t" + "NC_000962" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "Partial deletion" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + names[ind] + "\t" + fields[4] + "\t" + "NA" + "\t" + "NA"
loci.append(fields[4])
for genes in ids:
if genes not in loci:
ind = ids.index(genes)
dele = True
print input2 + "\t" + genes + "\t" + "Complete deletion"
print input2 + "\t" + "NC_000962" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "Complete deletion" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + names[ind] + "\t" + genes + "\t" + "NA" + "\t" + "NA"
if dele == False:
print input2 + "\tNo gene deletion inferred"



print input2 + "\t" + "NC_000962" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "No gene deletion inferred" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA" + "\t" + "NA"
73 changes: 73 additions & 0 deletions bin/mutation_loci.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
H37Rv gene start stop type coding strand
gyrB upstream 4998 5239 intergenic forward
gyrA upstream 7268 7301 intergenic forward
ponA1 upstream 53245 53662 intergenic forward
ldtA upstream 141023 141199 intergenic reverse
ldtE upstream 223531 223563 intergenic forward
inbR upstream 331659 331747 intergenic reverse
fgd1 upstream 490706 490782 intergenic forward
lprQ upstream 571649 571709 intergenic forward
mshA upstream 575301 575347 intergenic forward
Rv0565c upstream 657471 657547 intergenic reverse
hadA upstream 731880 731929 intergenic forward
mmaA4 upstream 737204 737267 intergenic reverse
mmaA3 upstream 738150 738296 intergenic reverse
mmaA2 upstream 739161 739326 intergenic reverse
rpoB upstream 759310 759806 intergenic forward
rpoC upstream 763326 763369 intergenic forward
mmpR upstream 778906 778989 intergenic forward
rpsL upstream 781312 781559 intergenic forward
rplC upstream 800793 800808 intergenic forward
rpfB upstream 1127884 1128090 intergenic forward
mshB upstream 1300125 1300303 intergenic forward
fbiC upstream 1302682 1302930 intergenic forward
sigI upstream 1332011 1332091 intergenic forward
folP2 upstream 1351126 1351190 intergenic forward
tap upstream 1407341 1408238 CDS reverse
embR upstream 1417348 1417657 intergenic reverse
atpE upstream 1460997 1461044 intergenic forward
rrs upstream 1471743 1471845 intergenic forward
rrs 1471846 1473382 rRNA forward
rrl upstream 1473383 1473657 intergenic forward
rrl 1473658 1476795 rRNA forward
ldtD upstream 1611271 1611433 intergenic forward
fabG1 upstream 1673300 1673439 intergenic forward
inhA upstream 1674184 1674201 intergenic forward
pykA upstream 1816082 1816188 intergenic forward
rpsA upstream 1833380 1833541 intergenic forward
cycA upstream 1931457 1932654 intergenic reverse
eccB5 upstream 2017477 2017739 intergenic forward
ndh upstream 2103043 2103183 intergenic reverse
katG upstream 2156112 2156148 intergenic reverse
furA upstream 2156593 2156705 intergenic reverse
Rv1979c upstream 2223165 2223342 intergenic reverse
erm(37) upstream 2231455 2231679 intergenic forward
pncA upstream 2289242 2289281 intergenic reverse
blaC upstream 2326810 2326943 intergenic reverse
mshC upstream 2392460 2392516 intergenic reverse
eis upstream 2715333 2715471 intergenic reverse
ahpC upstream 2726088 2726192 intergenic forward
ldtB upstream 2835336 2835493 intergenic reverse
pepQ upstream 2860419 2860451 intergenic reverse
Rv2671 upstream 2985731 2986838 CDS forward
thyX upstream 3067946 3068188 intergenic reverse
dfrA upstream 3073610 3073679 intergenic reverse
thyA upstream 3074472 3074635 intergenic reverse
ald upstream 3086755 3086819 intergenic forward
gpsI upstream 3092598 3092950 intergenic reverse
dacB2 upstream 3218271 3218338 forward
ddlA upstream 3337918 3337994 intergenic reverse
serB2 upstream 3403163 3403199 intergenic reverse
mymA upstream 3448427 3448503 intergenic forward
whiB7 upstream 3568680 3569108 intergenic reverse
nudC upstream 3572544 3572601 intergenic reverse
fbiA upstream 3640142 3640542 intergenic forward
alr upstream 3841421 3841713 intergenic reverse
rpoA upstream 3878508 3878658 intergenic reverse
ddn upstream 3986733 3986843 intergenic forward
nat upstream 4008183 4008433 CDS reverse
folP1 upstream 4049981 4050585 CDS reverse
embA upstream 4243148 4243232 intergenic forward
ubiA upstream 4269834 4269839 intergenic reverse
ethA upstream 4327474 4327548 intergenic reverse
gidB upstream 4408203 4408333 intergenic reverse
204 changes: 58 additions & 146 deletions bin/parse_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

input1 = sys.argv[1]
input2 = sys.argv[2]
input3 = sys.argv[3]


position = ""
reference = ""
Expand Down Expand Up @@ -48,6 +50,19 @@
transcript1 = ""
annotation_details1 = ""
Block = False
(genez,start,stop,gene_anot,strand) = ([],[],[],[],[])
nuc_change = ""

fh3 = open(input3,'r')
for lines in fh3:
lined = lines.rstrip("\r\n").split("\t")
if lines.startswith("H37Rv"):
continue
genez.append(lined[0])
start.append(lined[1])
stop.append(lined[2])
gene_anot.append(lined[3])
strand.append(lined[4])

fh1 = open(input1,'r')
print "Sample ID" + "\t" + "CHROM" + "\t" + "POS" + "\t" + "REF" + "\t" + "ALT" + "\t" + "Read Depth" + "\t" + "Quality" + "\t" + "Percent Alt allele" + "\t" + "Annotation" + "\t" + "Variant Type" + "\t" + "Nucleotide Change" + "\t" + "Position within CDS " + "\t" + "Amino acid change" + "\t" + "REF Amino acid" + "\t" + "ALT Amino Acid" + "\t" + "Codon Position" + "\t" "Gene name" + "\t" + "Gene ID" + "\t" + "Transcript ID" + "\t" + "Annotation details"
Expand Down Expand Up @@ -75,152 +90,50 @@
subannot = annot.split(",")
smallannot = subannot[0].split("|")
if smallannot[2] == "MODIFIER":
if "rrs" in annot and 1471845 < int(position) < 1473382 :
annotation = 'Intragenic_variant'
nuc_change = str(int(position) - 1471845)
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'rrs'
gene_id = 'MTB000019'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
elif "rrl" in annot and 1473657 < int(position) < 1476495 :
annotation = 'Intragenic_variant'
nuc_change = str(int(position) - 1473657)
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'rrl'
gene_id = 'MTB000020'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
elif 1673338 < int(position) < 1673440 :
annotation = 'Intragenic_variant'
nuc_change = str(int(position) - 1673440)
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'inhA promoter'
gene_id = 'inhA promoter'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
elif 2289241 < int(position) < 2289282 :
annotation = 'Intragenic_variant'
nuc_change = str(2289241 - int(position))
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'pncA promoter'
gene_id = 'pncA promoter'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
elif 2715332 < int(position) < 2715384 :
annotation = 'Intragenic_variant'
nuc_change = str(2715332 - int(position))
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'eis promoter'
gene_id = 'eis promoter'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
elif 2725991 < int(position) < 2726193 :
annotation = 'Intragenic_variant'
nuc_change = str(int(position) - 2726193)
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'aphC promoter'
gene_id = 'aphC promoter'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
elif 3568679 < int(position) < 3569109 :
annotation = 'Intragenic_variant'
nuc_change = str(3568679 - int(position))
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
for x in range(0,71):
if (int(start[x]) -1) < int(position) < (int(stop[x]) + 1):
annotation = gene_anot[x]
if genez[x] == 'rrs':
nuc_change = str((int(position)) - (int(start[x]) - 1))
gene_id = 'MTB000019'
elif genez[x] == 'rrl':
nuc_change = str((int(position)) - (int(start[x]) - 1))
gene_id = 'MTB000020'
elif strand[x] == 'forward':
gene_id = genez[x]
nuc_change = str((int(position)) - (int(stop[x]) + 1))
elif strand[x] == 'reverse':
gene_id = genez[x]
nuc_change = str((int(start[x]) -1) - int(position))
gene_name = genez[x]
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
if len(fields[4]) > len(fields[3]):
variant = "Insertion"
elif len(fields[3]) > len(fields[4]):
variant = "Deletion"
else:
variant = "SNP"
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
break
else:
variant = "SNP"
nucleotide_change = "c." + nuc_change + reference + ">" + alternate
amino_acid_change = 'NA'
gene_name = 'whiB7 promoter'
gene_id = 'whiB7 promoter'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[0:])
else:
annotation = 'Non-Coding'
variant = 'NA'
nucleotide_change = smallannot[9]
amino_acid_change = 'NA'
gene_name = 'NA'
gene_id = 'NA'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[1:])
annotation = 'Non-Coding'
variant = 'NA'
nucleotide_change = smallannot[9]
amino_acid_change = 'NA'
gene_name = 'NA'
gene_id = 'NA'
transcript = 'NA'
transcript_pos = 'NA'
orig_aacid = 'NA'
new_aacid = 'NA'
codon_pos = 'NA'
annotation_details = ','.join(subannot[1:])
if len(position1) != 0:
if Block == True:
print input2 + "\t" + fields[0] + "\t" + position1 + "\t" + reference1 + "\t" + alternate1 + "\t" + read_depth1 + "\t" + quality1 + "\t" + perc_alt11 + "\t" + annotation1 + "\t" + 'MNV' + "\t" + nucleotide_change1 + "\t" + transcript_pos1 + "\t" + amino_acid_change1 + "\t" + orig_aacid1 + "\t" + 'Block_Substitution' + "\t" + codon_pos1 + "\t" + gene_name1 + "\t" + gene_id1 + "\t" + transcript1 + "\t" + annotation_details1
Expand Down Expand Up @@ -317,4 +230,3 @@
else:
print input2 + "\t" + fields[0] + "\t" + position1 + "\t" + reference1 + "\t" + alternate1 + "\t" + read_depth1 + "\t" + quality1 + "\t" + perc_alt11 + "\t" + annotation1 + "\t" + variant1 + "\t" + nucleotide_change1 + "\t" + transcript_pos1 + "\t" + amino_acid_change1 + "\t" + orig_aacid1 + "\t" + new_aacid1 + "\t" + codon_pos1 + "\t" + gene_name1 + "\t" + gene_id1 + "\t" + transcript1 + "\t" + annotation_details1


Loading

0 comments on commit 2775ec3

Please sign in to comment.