Case 3
This example is run through SLURM on a Linux-based compute cluster.
If you want to follow this example, make sure you have the following software installed: doepipeline, krakenuniq (the pipeline steps activate a conda environment named krakenuniq) and Python 3.
Download the MinION reads from the SRA (run accessions SRR9290761 and SRR9290851):
# FSC200
wget -P ~/case_3/data https://sra-download.ncbi.nlm.nih.gov/traces/sra3/SRZ/009290/SRR9290761/CEPA-francisella-FSC200_20171107.fastq
# FSC454
wget -P ~/case_3/data https://sra-download.ncbi.nlm.nih.gov/traces/sra23/SRZ/009290/SRR9290851/CEPA-francisella-FSC454_20180201.fastq
To replicate the test data, subsample the downloaded reads with nanopore_max_length.py (see the script below; the random seed is fixed, so the subsampling is reproducible):
python nanopore_max_length.py CEPA-francisella-FSC200_20171107.fastq target.fastq 50000
python nanopore_max_length.py CEPA-francisella-FSC454_20180201.fastq near_neighbor.fastq 15000
The script trims each read to at most 3,000 bp, randomly selects the requested number of reads (50,000 for the target and 15,000 for the near neighbour), writes the selected reads sorted by their original length, and stores the selected read IDs in a matching .readids file.
nanopore_max_length.py
#!/usr/bin/env python3
'''Trim nanopore reads to a maximum length and subsample a fixed number of reads (seeded).'''
import sys
import random

outputfile = open(sys.argv[2], "wt")
readidfile = open(sys.argv[2].replace("fastq", "readids"), "wt")

if len(sys.argv) > 3:
    samplesize = int(sys.argv[3])
else:
    samplesize = 50000

### PARAMS
SEED = 454
random.seed(SEED)
cut = 3000
#samplesize = 50000

## Count the total number of reads
nreads = 0
with open(sys.argv[1], "rt") as f:
    for row in f:
        if row.startswith("@"):
            nreads += 1
print("Number of reads in total: ", nreads)
print("Sample size: ", samplesize)
samples_selection = set(random.sample(range(nreads), samplesize))

## Count
count = 0
ncut = 0
all = 0
readids = []
readDict = []
with open(sys.argv[1], "rt") as f:
    while True:
        all += 1
        header = f.readline().strip()
        readids.append(header)
        if not header:
            break
        bases = f.readline().strip()
        length = len(bases)
        qh = f.readline().strip()
        qual = f.readline().strip()
        if len(bases) > cut:
            bases = bases[:cut]
            qual = qual[:cut]
            ncut += 1
        if len(set([all - 1]) & samples_selection) > 0:
            count += 1
            readDict.append([header, bases, qh, qual, length])
            print(header.split(" ")[0], file=readidfile)

## Write the selected reads, longest original read first
readDict = sorted(readDict, key=lambda x: x[4], reverse=True)
for header, bases, qh, qual, length in readDict:
    print(header, file=outputfile)
    print(bases, file=outputfile)
    print(qh, file=outputfile)
    print(qual, file=outputfile)

print(sys.argv[1])
print("N selected, {}".format(count))
print("Reads trimmed {}".format(ncut))
outputfile.close()
readidfile.close()
parse_pipeline_results.py
'''Parse results from doepipeline'''
import argparse
import sys
import os


class ParseKrakenResults(object):
    """Parse krakenuniq report files and compute the pipeline responses."""

    def __init__(self, kraken_results, target=False):
        super(ParseKrakenResults, self).__init__()
        self.kraken_results = kraken_results
        self.taxonomy_target = target
        self.tree = {"0": "1"}
        self.results = False
        self.levelsum = False
        self.maxcov = 0
        self.maxcovTaxid = 0

    def print_results(self, output, answer=False):
        if not answer:
            ## The most likely case of this script failing here is that
            ## no reads were found for the target species.
            ## This is not bugtested 100%
            answer = [["F1Score", str(0)]]
        for ans in answer:
            print(",".join(ans), file=output)

    def read_kraken_results(self, kraken_results=False, single=True, filtered=False):
        '''Read the kraken result file and print taxid cov, thits, fhits'''
        oldformat = False
        results = {}
        levelsum = {}
        savetree = False
        coverage = 0
        if not kraken_results:
            kraken_results = self.kraken_results
        ## Skip comment lines at the top of the report file
        line = kraken_results.readline()
        while True:
            if line.startswith("%") or filtered:
                break
            else:
                line = kraken_results.readline()
        self.header = line.strip("\n").strip().split("\t")
        ### Check if the file is in the old (krakenuniq) report format
        if self.header[3] == "U":
            oldformat = True
            taxid = self.header[4]
            results[taxid] = self.header + [0]
        levelsum["0"] = 0
        for row in kraken_results:
            if row.startswith("%"):
                continue
            data = row.strip("\n").split("\t")
            nbase = data[-1].split(" ")
            dcol = 1
            try:
                taxReads = int(data[1])  ### cumulative reads; tax-specific reads would be data[2]
            except ValueError:
                ## Most likely no reads were found for the requested target:
                ## return False so that an F1 score of 0 is printed.
                print("Warning: taxReads not found in row", data)
                return False
            level = 0
            while True:
                if len(nbase) == 0 or nbase == "":
                    break
                if oldformat:
                    name = nbase
                spl = nbase.pop(0)
                if "" == spl:
                    level += 1
                else:
                    name = " ".join([spl] + nbase)
                    break
            level = level / 2
            try:
                levelsum[level] += taxReads
            except:
                levelsum[level] = taxReads
            data[-1] = name
            data += [level]
            ### This part handles the two report versions: the original report and the filtered report.
            ### Unfortunately the cov value cannot be assessed for the filtered data.
            try:
                taxid = data[6]
                coverage = float(data[5])
                if coverage > self.maxcov and coverage < 1:
                    self.maxcov = coverage
                    self.maxcovTaxid = taxid
            except:
                taxid = data[4]
            results[taxid] = data
        self.results = results
        self.levelsum = levelsum
        return results

    def parse_results(self, results=False, target=False, max=False, output=None):
        '''Take the results and a target and print tcount, fcount and cov, as well as the top cov.'''
        if not results:
            results = self.results
        if not target:
            target = self.taxonomy_target
        if max:
            target = self.maxcovTaxid
            if target in max:
                #print("Correct target already in result, skip")
                return
        tres = results[target]
        level = tres[-1]
        cov = float(tres[5])
        taxReads = int(tres[2])  ## Tax reads is tres[2]
        levelsum = self.levelsum[level]
        fp = levelsum - taxReads
        print('''
        level: {level}
        coverage: {cov}
        taxReads: {treads}
        FalseP: {fp}
        levelsum: {levsum}
        t/f ratio: {frac}
        '''.format(level=level, cov=cov, treads=taxReads, levsum=levelsum, fp=fp,
                   frac=(float(taxReads) / levelsum)), file=output)

    def parse_result(self, results, target):
        try:
            tres = results[target]
        except KeyError:
            return [0, 0, 0]
        level = tres[-1]
        try:
            cov = float(tres[5])
        except:
            cov = False
        taxReads = int(tres[2])
        levelsum = self.levelsum[level]
        fp = levelsum - taxReads
        return [taxReads, fp, cov]

    def pipeline_results(self, kraken_files=False, target=False, output=None, nearNeighbour=True):
        '''Print results for the pipeline'''
        ### Four files have to be analyzed per sample: kraken cov and the kraken filter TP/FP fraction in the form of an F1 score.
        allRes = []
        unclassifiedRes = []
        ### results[0] is the unclassified result which will be added as a response
        for kf in kraken_files:
            #print(kf.name)
            if kf.name.endswith("filtered.report"):
                kraken_res = self.read_kraken_results(kf, filtered=True)
            else:
                kraken_res = self.read_kraken_results(kf)
            if not kraken_res:  ## no reads were found at all
                self.print_results(output)
                return
            results = self.parse_result(kraken_res, target=target)
            unclassified = self.parse_result(kraken_res, target="0")
            allRes.append(results)
            unclassifiedRes.append(unclassified)
        ### results[0] is the unclassified result which will be added as a response
        nunclassified = unclassifiedRes[1][0]
        ## The number of unclassified reads reflects the number of unclassified reads after the filter is applied.
        ## As coverage can only be assessed for unfiltered data, this reflects the change after the filter.
        tcov = allRes[0][2]  ## kmer coverage for the species; taken from before the filter as it is not calculated for filtered reports. Could be multiplied with 1/tp
        ttp = allRes[1][0]   ## After filter, target
        tfp = allRes[3][0]   ## After filter, near-neighbour reads hitting the target but actually near neighbour
        fn = allRes[1][1]    ## After filter, target reads not hitting the correct species, false negatives
        div = float((2 * ttp + tfp + fn))  ## (2TP + FP + FN) = div
        if div == float(0):  ## div = 0 is not allowed
            div = 1
        F1 = float(2 * ttp) / div  # F1 = 2TP / (2TP + FP + FN)
        if ttp < 3:
            F1 = 0
        tcov = tcov * ttp / allRes[0][0]  ### tcov is adjusted from the original number of reads to the number of reads after the filter.
        answer = [
            ["F1Score", str(F1)]
        ]
        self.print_results(output, answer=answer)


def arguments():
    import argparse
    parser = argparse.ArgumentParser(description='parse kraken results from doepipeline')
    parser.add_argument('kraken_results', type=argparse.FileType("r"), nargs="+",
                        help="name of kraken report file(s)")
    parser.add_argument('--target', type=str,
                        help="set target tax id; multiple targets may be separated with commas (263,119857,351581)")
    parser.add_argument('--pipeline', type=str,
                        help="set target tax id and print the answer formatted for the pipeline")
    parser.add_argument('-o', '--output', default=None, type=argparse.FileType("w"),
                        help="output to file instead of stdout")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="add extra informative output for debugging")
    return parser.parse_args()


def main(args):
    PK = ParseKrakenResults(args.kraken_results)
    if args.verbose:
        print(args)
    if not args.pipeline:
        results = PK.read_kraken_results()
        for target in args.target.split(","):
            PK.parse_results(results, target=target, output=args.output)
        PK.parse_results(max=args.target.split(","), output=args.output)
    else:
        #print(args.kraken_results[0].name)
        PK.pipeline_results(kraken_files=args.kraken_results, target=args.pipeline, output=args.output)


if __name__ == '__main__':
    main(arguments())
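The F1 response produced by pipeline_results() can be illustrated with a small standalone sketch. The read counts below are invented placeholders, not values from the actual reports; they only show how the four report files (target and near neighbour, each before and after filtering) map onto TP, FP and FN in the F1 formula.

# Hedged sketch of the F1 response calculation in pipeline_results().
# The four parsed reports correspond to allRes[0..3] in the order they are
# passed on the command line:
#   allRes[0]: target, unfiltered      allRes[1]: target, filtered
#   allRes[2]: near neighbour, unfilt. allRes[3]: near neighbour, filtered
# Each entry is [taxReads, fp, cov]. All numbers below are made up.
allRes = [
    [12000, 800, 0.35],   # target, unfiltered
    [11500, 300, False],  # target, filtered
    [900,   50,  0.05],   # near neighbour, unfiltered
    [400,   20,  False],  # near neighbour, filtered
]

ttp = allRes[1][0]  # true positives: target reads still hitting the target after filtering
tfp = allRes[3][0]  # false positives: near-neighbour reads still hitting the target
fn = allRes[1][1]   # false negatives: target reads assigned elsewhere at the same level
div = float(2 * ttp + tfp + fn) or 1.0     # guard against division by zero, as in the script
F1 = (2 * ttp) / div if ttp >= 3 else 0    # F1 = 2TP / (2TP + FP + FN); fewer than 3 TP gives 0
print("F1Score,{}".format(F1))             # same "RESPONSE,VALUE" format written to the results file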
config.yaml
# Specify the design
design:
  type: ccf
  factors:
    # Each factor to vary is specified here.
    # min/max: the smallest/largest values allowed
    # low_init/high_init: the initial design space min and max. Only used when not running the GSD screening step in the beginning.
    # type: ordinal (integer), qualitative (categorical), quantitative (float)
    FILTER:
      min: 0
      max: 0.5
      low_init: 0.05
      high_init: 0.1
      type: quantitative
    PRECISION:
      min: 10
      max: 18
      low_init: 10
      high_init: 18
      type: ordinal
    MINHITS:
      min: 1
      max: 100
      low_init: 5
      high_init: 50
      type: ordinal
  responses:
    # Each response to measure is specified here
    # criterion: minimize/maximize
    F1Score:
      criterion: maximize
      low_limit: 0.3
      target: 1

# All work and output files go here
working_directory: "francisella_nano_grid_final"

# The name of the file containing the results (response values).
# Each response in this file should be formatted like "RESPONSE,VALUE", one response per line.
results_file: "results.txt"

# Name the different pipeline steps to run
## krakendb_GTDB
pipeline:
  - KRAKEN
  - KRAKEN2
  - KRAKENFILTER
  - KRAKENFILTER2
  - CalculateResponse

KRAKEN:
  script: >
    conda activate krakenuniq;
    krakenuniq \
      --db krakendb_GTDB/ \
      --threads 20 --fastq-input --hll-precision {% PRECISION %} --quick --min-hits {% MINHITS %} \
      --report-file kraken.report --output kraken.output \
      input_data/target.fastq;
  factors:
    PRECISION:
      substitute: true
    # KMERSIZE:
    #   substitute: true
    MINHITS:
      substitute: true
  SLURM:
    A: <proj>
    p: node
    n: 20
    t: 0-16:00:00
    C: mem256GB
    J: run_kraken_optkmers1
    o: slurm_KRAKENHLL1-Francisella.out
    e: slurm_KRAKENHLL1-Francisella.err

KRAKEN2:
  script: >
    conda activate krakenuniq;
    krakenuniq \
      --db krakendb_GTDB/ \
      --threads 20 --fastq-input --hll-precision {% PRECISION %} --quick --min-hits {% MINHITS %} \
      --report-file kraken_near_neighbor.report --output kraken_near_neighbor.output \
      input_data/near_neighbor.fastq;
  # Specify the factors that are used in the script, and how to insert them into the script string.
  # Here every factor is simply substituted.
  factors:
    PRECISION:
      substitute: true
    # KMERSIZE:
    #   substitute: true
    MINHITS:
      substitute: true
  SLURM:
    A: <proj>
    p: node
    n: 20
    t: 0-16:00:00
    C: mem256GB
    J: run_kraken_optkmers2
    o: slurm_KRAKENHLL2-Francisella.out
    e: slurm_KRAKENHLL2-Francisella.err

KRAKENFILTER:
  script: >
    conda activate krakenuniq;
    krakenuniq-filter --db krakendb_GTDB/ --threshold {% FILTER %} kraken.output > kraken-filtered.output;
    krakenuniq-report --db krakendb_GTDB/ kraken-filtered.output > kraken.filtered.report
  # Specify the factors that are used in the script, and how to insert them into the script string.
  # Here every factor is simply substituted.
  factors:
    FILTER:
      substitute: true
    # KMERSIZE:
    #   substitute: true
  SLURM:
    A: <proj>
    p: node
    n: 20
    t: 0-16:00:00
    C: mem256GB
    J: run_kraken_filter_optkmers1
    o: slurm_KRAKENHLL_filter1-Francisella.out
    e: slurm_KRAKENHLL_filter1-Francisella.err

KRAKENFILTER2:
  script: >
    conda activate krakenuniq;
    krakenuniq-filter --db krakendb_GTDB/ --threshold {% FILTER %} kraken_near_neighbor.output > kraken-filtered_near_neighbor.output;
    krakenuniq-report --db krakendb_GTDB/ kraken-filtered_near_neighbor.output > kraken_near_neighbor.filtered.report
  # Specify the factors that are used in the script, and how to insert them into the script string.
  # Here every factor is simply substituted.
  factors:
    FILTER:
      substitute: true
    # KMERSIZE:
    #   substitute: true
  SLURM:
    A: <proj>
    p: node
    n: 20
    t: 0-16:00:00
    C: mem256GB
    J: run_kraken_filter_optkmers2
    o: slurm_KRAKENHLL_filter2-Francisella.out
    e: slurm_KRAKENHLL_filter2-Francisella.err

# Final pipeline step: the response is calculated.
# No factors are used here and only a single core is requested from SLURM.
# Specify the output file with {% results_file %}
CalculateResponse:
  script: >
    python scripts/pipeline_parse_results.py --pipeline 358 -o {% results_file %} kraken.report kraken.filtered.report kraken_near_neighbor.report kraken_near_neighbor.filtered.report;
  SLURM:
    A: <proj>
    p: core
    n: 1
    t: 0-0:01:00
    #C: mem128GB
    J: run_kraken_calculate_response
    o: slurm_KRAKENHLL-Francisella-CalculateResponse.out
    e: slurm_KRAKENHLL-Francisella.CalculateResponse.err
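The {% FACTOR %} placeholders in each script string are filled in by doepipeline with the factor values chosen for the current design point. The snippet below is only a minimal sketch of that substitution, assuming plain string replacement and made-up factor values; it is not doepipeline's actual implementation.

# Minimal sketch of how the {% FACTOR %} placeholders could be filled in,
# assuming simple string replacement (hypothetical, not doepipeline's real code).
script_template = (
    "krakenuniq --db krakendb_GTDB/ --threads 20 --fastq-input "
    "--hll-precision {% PRECISION %} --quick --min-hits {% MINHITS %} "
    "--report-file kraken.report --output kraken.output input_data/target.fastq"
)

# Hypothetical values for one design point, chosen within the min/max ranges above.
factors = {"PRECISION": 14, "MINHITS": 25}

script = script_template
for name, value in factors.items():
    script = script.replace("{{% {} %}}".format(name), str(value))
print(script)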
Submit_script.sh
#!/bin/bash -l
#SBATCH -A PROJECT
#SBATCH -n 1
#SBATCH -t 3-00:00:00
#SBATCH -J optimizeKrakenuniq
#SBATCH -o optimizeKrakenuniq.out
#SBATCH -e optimizeKrakenuniq.error
set -e
set -x
# Changes here
_WORKDIR="francisella"
_OUTDIR="${_WORKDIR}/results"
_DOE="bin/doepipeline"
_YAML="optimizeKrakenKmersFrancisella.yaml"
_LOG="${_OUTDIR}/run.log"
_EXECUTION="slurm"
_ITERS="4"
_MODEL_SELECTION="brute"
_SHRINK="0.9"
_REDUCTION=""
PARAMETERS=$1
# Run doepipeline
echo "Starting analysis $(date)"
if [ ! -d ${_OUTDIR} ]; then
    echo "Creating working directory: ${_OUTDIR}"
    mkdir -p ${_OUTDIR}
fi
${_DOE} -e ${_EXECUTION} -s ${_SHRINK} -m ${_MODEL_SELECTION} ${_REDUCTION} -i ${_ITERS} -l ${_LOG} $PARAMETERS ${_YAML}
echo "Finishing analysis $(date)"