Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

12 improve test data #13

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Reference genome indexes
*.bt2

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
80 changes: 80 additions & 0 deletions .tests/data/generateSinData.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np
import matplotlib.pyplot as plt

def generate_reads(genome, amplitude, shift, read_length):
"""Generate reads over the genome with a sin wave distribution of coverage."""
genome_length = len(genome)
subsample_factor = 50
# Use half the period for the sine wave
coverage = (amplitude * np.sin(np.pi * np.arange(genome_length) / (genome_length))) + shift
coverage = coverage[0::subsample_factor]

print(f'Generating {sum(coverage) / 1000000} million reads')

for i in range(int(len(coverage) / subsample_factor)):
num_reads_at_position = int(coverage[i])
for _ in range(num_reads_at_position):
start = max(0, i - read_length // 2)
end = min(genome_length, start + read_length)
read = genome[start:end]
yield read

def complement_read(read):
"""Generate the complement of a DNA sequence."""
comp = ""
for c in read:
if c == "A":
comp += "T"
elif c == "T":
comp += "A"
elif c == "G":
comp += "C"
elif c == "C":
comp += "G"
return comp

def write_fastq(output_prefix, reads, index, id):
"""Write reads to paired FASTQ files as they are generated."""
with open(f'{output_prefix}_{index}_1.fastq', 'a') as f1, open(f'{output_prefix}_{index}_2.fastq', 'a') as f2:
for i, read in enumerate(reads):
f1.write(f'@Read{i + 1}_{id}/1\n{read}\n+\n{"I" * len(read)}\n')

# Generate reversed and complemented sequence for the second FASTQ file
comp_read = complement_read(read[::-1])
f2.write(f'@Read{i + 1}_{id}/2\n{comp_read}\n+\n{"I" * len(comp_read)}\n')

def plot_coverage(coverage):
"""Plot the coverage distribution."""
plt.plot(coverage)
plt.title('Coverage Distribution')
plt.xlabel('Genome Position')
plt.ylabel('Coverage')
plt.show()

def generate_and_write_reads(output_prefix, genome, amplitudes, shift, read_length, id):
"""Generate and write n paired read files with specified amplitudes."""
for amplitude in amplitudes:
reads_generator = generate_reads(genome, amplitude, shift, read_length)
write_fastq(output_prefix, reads_generator, amplitude, id)

# Parameters
amplitudes = [2, 3, 4] # Specify the amplitudes for each set of reads
shift = 1
read_length = 150
output_prefix = 'triple'

# Generate synthetic genome
with open("reference/Ecoli.fasta") as f:
f.readline() # Info line
ecoli = "".join([l.strip() for l in f.readlines()])
with open("reference/Bfragilis.fasta") as f:
f.readline() # Info line
bfragilis = "".join([l.strip() for l in f.readlines()])
with open("reference/akk-genome.fasta") as f:
f.readline() # Info line
akk = "".join([l.strip() for l in f.readlines()])

# Generate and write n paired read files with specified amplitudes
generate_and_write_reads(output_prefix, ecoli, amplitudes, shift, read_length, "ecoli")
generate_and_write_reads(output_prefix, bfragilis, amplitudes, shift, read_length, "bfragilis")
generate_and_write_reads(output_prefix, akk, amplitudes, shift, read_length, "akk")
118 changes: 104 additions & 14 deletions .tests/data/generateTestData.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from io import TextIOWrapper
import random
import matplotlib.pyplot as plt
from collections import Counter
import gzip
import os
import math


def triangle(position: float, PTR: float, min_reads: int) -> int:
Expand All @@ -22,6 +24,16 @@ def get_read(genome: str, start_index: int, read_length: int) -> str:
else:
spillover = start_index + read_length - genome_length
return genome[start_index:] + genome[:spillover]

def adjust_genome(genome: str, name_str: str) -> str:
# Return genome str adjusted to have the replication origin in the middle if its location is known
ref_dict = {"Bacteroides_fragilis_638R": 105766, "Bifidobacterium_animalis_lactis_B420": 1886, "Streptomyces_scabiei_87_22": 5138728}
for species, rep_index in ref_dict.items():
if species in name_str:
start_at_rep = genome[rep_index:] + genome[:rep_index]
return start_at_rep[len(genome) // 2:] + start_at_rep[:len(genome) // 2]

return genome


# Generates a triangular distribution of reads from input sequence data
Expand All @@ -36,6 +48,7 @@ def generateTestData(genome_fp: str, reads: str, num_reads: int, PTR: float) ->
for r in genome_f.readlines():
if r[0] != ">":
genome += r.strip()
genome = adjust_genome(genome, genome_fp)

with open(reads + "_R1.fastq", "w") as readsF_R1, open(
reads + "_R2.fastq", "w"
Expand All @@ -45,10 +58,66 @@ def generateTestData(genome_fp: str, reads: str, num_reads: int, PTR: float) ->
# from that to add to the fastq file
c: Counter = Counter()
genome_length: int = len(genome)
read_length: int = 250
gap: int = 0
read_length: int = 150
min_triangle: int = 100

def D(x):
# Sine probability density function
return (1 / genome_length) * (1 - ((1 - PTR) / (1 + PTR)) * math.cos((2 * math.pi * x) / genome_length))

for i in range(num_reads):
# Randomly sample sine probability distribution
x = random.uniform(0, genome_length)
y = random.uniform(0, D(0))
while y > D(x):
x = random.uniform(0, genome_length)
y = random.uniform(0, D(0))

start_index = int(x) - read_length
forward_read = get_read(genome, start_index, read_length)
reverse_read = get_read(genome, start_index + read_length, read_length)

if random.random() < 0.05:
writeReads(readsF_R1, forward_read, i)
writeReads(readsF_R2, complementRead(reverse_read[::-1]), i)

return c



for initial_index in [0, 10, 20]:
i = initial_index
read_index = 0
while i < genome_length:
forward_read = get_read(genome, i, read_length)
reverse_read = get_read(genome, i + read_length, read_length)

writeReads(readsF_R1, forward_read, read_index)
writeReads(readsF_R2, complementRead(reverse_read[::-1]), read_index)
read_index += 1

# Counter
#for j in range(i, i + (2 * read_length)):
# c[j] += 1

if i < genome_length / 2:
mirrored_index = i
else:
mirrored_index = genome_length - i

#increment = int(read_length - ((2 * read_length * (1 - 1 / PTR)) / genome_length) * mirrored_index)
# linear increase, top gets chopped off too aggressively
#increment = read_length + int(read_length / PTR) - int((4 * read_length / (genome_length ^ 2)) * (1 - 1 / PTR) * (mirrored_index ^ 2) + read_length / PTR)
# quadratic increase
increment = int(read_length / (math.cos(((2 * math.pi) / genome_length) * i) * ((PTR - 1) / 2) + (PTR + 1) / 2))
# sine increase

assert increment > 0
i += 2 * increment # double because we're doing both forward and reverse reads

return c


# num_reads = triangle_area + underneath_rect_area
# num_reads = 0.5 * (PTR * min_triangle - min_triangle) * num_steps + min_triangle * num_steps
# num_reads = (0.5 * PTR + 1 - 0.5) * min_triangle * num_steps
Expand Down Expand Up @@ -86,6 +155,11 @@ def writeReads(readsF: TextIOWrapper, read: str, it: int) -> None:
readsF.write("@TEST" + str(it) + "\n")
readsF.write(read + "\n")
readsF.write("+\n")
# Create a random quality string
#rand_qual = ""
#for i in range(len(read)):
# rand_qual += chr(random.randint(33, 126))
#readsF.write(rand_qual + "\n")
readsF.write("~" * len(read) + "\n")


Expand All @@ -111,18 +185,23 @@ def complementRead(read: str) -> str:
# @param n is the number of read file pairs to create
def generateN(genome: str, reads: str, n: int, reads_per_sample: int = 50000):
for i in range(n):
c = generateTestData(genome, reads + str(i), reads_per_sample, i / 2 + 2)
print(f"{reads + str(i)}: {sorted(c.items())}")

with open(reads + str(i) + "_R1.fastq", "rb") as r1, open(
reads + str(i) + "_R2.fastq", "rb"
) as r2, gzip.open(reads + str(i) + "_R1.fastq.gz", "wb") as w1, gzip.open(
reads + str(i) + "_R2.fastq.gz", "wb"
ptr = i / 4 + 1.5
ptr_str = str(ptr).replace(".", "-")
c = generateTestData(genome, reads + ptr_str, reads_per_sample, ptr)
print(f"Finished {reads + ptr_str}")
#print(f"{reads + ptr_str}: {sorted(c.items())}")
#plt.bar(c.keys(), c.values())
#plt.show()

with open(reads + ptr_str + "_R1.fastq", "rb") as r1, open(
reads + ptr_str + "_R2.fastq", "rb"
) as r2, gzip.open(reads + ptr_str + "_R1.fastq.gz", "wb") as w1, gzip.open(
reads + ptr_str + "_R2.fastq.gz", "wb"
) as w2:
w1.writelines(r1)
w2.writelines(r2)
os.remove(reads + str(i) + "_R1.fastq")
os.remove(reads + str(i) + "_R2.fastq")
os.remove(reads + ptr_str + "_R1.fastq")
os.remove(reads + ptr_str + "_R2.fastq")


# multi-reads
Expand All @@ -137,11 +216,22 @@ def generateN(genome: str, reads: str, n: int, reads_per_sample: int = 50000):
# generateN("reference/Bfragilis.fasta", "new/Bfrag", 5, 100000)

# new multi-reads
generateN("reference/Bfragilis.fasta", "new-multi/Bfrag", 3, 100000)
generateN("reference/Ecoli.fasta", "new-multi/Ecoli", 3, 100000)
generateN("reference/akk-genome.fasta", "new-multi/Akk", 3, 100000)
#generateN("reference/Bfragilis.fasta", "new-multi/Bfrag", 3, 25000)
#generateN("reference/Ecoli.fasta", "new-multi/Ecoli", 3, 25000)
#generateN("reference/akk-genome.fasta", "new-multi/Akk", 3, 25000)

# tiny-multi-reads
# generateN("tiny-ref/akk-genome.fasta", "tiny-multi-reads/Akk", 3, 1500)
# generateN("tiny-ref/Bfragilis.fasta", "tiny-multi-reads/Bfrag", 3, 1500)
# generateN("tiny-ref/Ecoli.fasta", "tiny-multi-reads/Ecoli", 3, 1500)

# source-forge multi-reads
#generateN("source-forge-ref/Bacteroides_fragilis_638R_uid84217__NC_016776.fna", "source-forge-multi/Bfragilis", 3, 500000)
#generateN("source-forge-ref/Bifidobacterium_animalis_lactis_B420_uid163691__NC_017866.fna", "source-forge-multi/Bifidobacterium", 3, 500000)
#generateN("source-forge-ref/Streptomyces_scabiei_87_22_uid46531__NC_013929.fna", "source-forge-multi/Streptomyces", 3, 500000)

# source-forge reads
#generateN("source-forge-ref/Bacteroides_fragilis_638R_uid84217__NC_016776.fna", "source-forge-reads/Bfragilis", 4, 50000)

# sparse reads
generateN("source-forge-ref/Bacteroides_fragilis_638R_uid84217__NC_016776.fna", "sparse/Bfragilis", 6, 500000)
Loading
Loading