
k-mer taxonomic classification of ONT MinION reads

Before you start

This example runs through SLURM on a Linux-based compute cluster.

If you want to follow this example, make sure to have the following software installed: doepipeline, KrakenUniq (activated via conda in the pipeline steps), and Python 3.

Data preparation

Download the MinION reads from the SRA (run accessions SRR9290761 and SRR9290851):

# FSC200
wget -P ~/case_3/data https://sra-download.ncbi.nlm.nih.gov/traces/sra3/SRZ/009290/SRR9290761/CEPA-francisella-FSC200_20171107.fastq

# FSC454
wget -P ~/case_3/data https://sra-download.ncbi.nlm.nih.gov/traces/sra23/SRZ/009290/SRR9290851/CEPA-francisella-FSC454_20180201.fastq

To replicate the test data, subsample the downloaded reads with nanopore_max_length.py (see Scripts below); the random seed is fixed in the script, so the selection is reproducible:

python nanopore_max_length.py CEPA-francisella-FSC200_20171107.fastq target.fastq 50000
python nanopore_max_length.py CEPA-francisella-FSC454_20180201.fastq near_neighbor.fastq 15000
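To confirm that the subsampling produced the expected number of reads (50,000 and 15,000, respectively), a minimal check along these lines can be used; the file names are the ones produced by the commands above, so adjust the paths if you ran them elsewhere.

    # Optional sanity check: count FASTQ records (4 lines per record) in the subsampled files.
    for path in ["target.fastq", "near_neighbor.fastq"]:
        with open(path) as fq:
            n_records = sum(1 for _ in fq) // 4
        print(path, n_records, "reads")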


Scripts

nanopore_max_length.py


    #!/usr/bin/env python3
    import sys
    import random

    outputfile = open(sys.argv[2], "wt")
    readidfile = open(sys.argv[2].replace("fastq","readids"), "wt")

    if len(sys.argv) > 3:
        samplesize = int(sys.argv[3])
    else:
        samplesize = 50000
    ### Params
    SEED = 454   # fixed seed so the subsampling is reproducible
    random.seed(SEED)
    cut = 3000   # reads longer than this are truncated to 3000 bp

    ## Count the number of reads in the input; FASTQ records are four lines each
    ## (counting lines avoids miscounting quality strings that happen to start with "@")
    with open(sys.argv[1], "rt") as f:
        nreads = sum(1 for _ in f) // 4
    print("Number of reads in total: ", nreads)
    print("Sample size: ",samplesize)

    samples_selection = set(random.sample(range(nreads),samplesize))

    ## Counters
    count = 0   # reads selected for output
    ncut = 0    # reads truncated to the cut length
    nseen = 0   # running read index in the input file

    selected_reads = []

    with open(sys.argv[1], "rt") as f:
        while True:
            header = f.readline().strip()
            if not header:
                break
            bases = f.readline().strip()
            length = len(bases)
            qh = f.readline().strip()
            qual = f.readline().strip()

            if len(bases) > cut:
                bases = bases[:cut]
                qual = qual[:cut]
                ncut += 1

            if nseen in samples_selection:
                count += 1
                selected_reads.append([header, bases, qh, qual, length])
            print(header.split(" ")[0], file=readidfile)
            nseen += 1

    ## Write the selected reads sorted by original read length, longest first
    selected_reads.sort(key=lambda x: x[4], reverse=True)

    for header, bases, qh, qual, length in selected_reads:
        print(header, file=outputfile)
        print(bases, file=outputfile)
        print(qh, file=outputfile)
        print(qual, file=outputfile)


    print(sys.argv[1])
    print("N selected,      {}".format(count))
    print("Reads trimmed    {}".format(ncut))

    outputfile.close()
    readidfile.close()
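In addition to the subsampled FASTQ file given as the second argument, the script writes a companion file with the fastq suffix replaced by readids, containing the ID of every read in the input, and prints the number of selected and truncated reads to standard output.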
parse_pipeline_results.py


'''Parse results from doepipeline'''

import argparse

class ParseKrakenResults(object):
    """docstring for ParseKrakenResults."""
    def __init__(self, kraken_results,target=False):
        super(ParseKrakenResults, self).__init__()
        self.kraken_results = kraken_results
        self.taxonomy_target = target
        self.tree = {"0":"1"}
        self.results = False
        self.levelsum = False
        self.maxcov = 0
        self.maxcovTaxid = 0

    def print_results(self,output,answer=False):
        if not answer:  ## The most likely reason to end up here is that no reads were
                        ## assigned to the target species; in that case an F1 score of 0
                        ## is reported. (This path is not fully tested.)
            answer = [["F1Score", str(0)]]
        for ans in answer:
            print(",".join(ans),file=output)

    def read_kraken_results(self,kraken_results=False,single=True,filtered=False):
        '''Read a kraken report file and store per-taxid results (coverage, true/false hits) in self.results'''
        oldformat = False
        results = {}
        levelsum = {}
        coverage = 0
        if not kraken_results:
            kraken_results = self.kraken_results
        ## Skip any comment lines until the header row (filtered reports are read from the first line)
        line = kraken_results.readline()
        while line and not (line.startswith("%") or filtered):
            line = kraken_results.readline()
        self.header = line.strip("\n").strip().split("\t")
        ### If the "header" row is actually the unclassified data row ("U"), the report is in the old format
        if self.header[3] == "U":
            oldformat = True
            taxid = self.header[4]
            results[taxid] = self.header + [0]
            levelsum["0"] = 0
        for row in kraken_results:
            if row.startswith("%"): continue
            data = row.strip("\n").split("\t")
            nbase = data[-1].split(" ")
            try:
                taxReads = int(data[1])  ## cumulative reads; data[2] would give tax-specific reads
            except ValueError:
                ## Most likely no reads were assigned to the requested target:
                ## return False so that an F1 score of 0 can be reported.
                print("Warning: taxReads not found in row", data)
                return False
            ## The last column holds the taxon name, indented by two spaces per taxonomic level
            level = 0
            while nbase:
                if oldformat:
                    name = " ".join(nbase)
                spl = nbase.pop(0)
                if spl == "":
                    level += 1
                else:
                    name = " ".join([spl] + nbase)
                    break
            level = level / 2
            try:
                levelsum[level] += taxReads
            except KeyError:
                levelsum[level] = taxReads
            data[-1] = name
            data += [level]
            ### This part handles the two report variants: the original report and the filtered report.
            ### Unfortunately the coverage value cannot be assessed for the filtered data.
            try:
                taxid = data[6]
                coverage = float(data[5])
                if coverage > self.maxcov and coverage < 1:
                    self.maxcov = coverage
                    self.maxcovTaxid = taxid
            except (IndexError, ValueError):
                taxid = data[4]
            results[taxid] = data
        self.results = results
        self.levelsum = levelsum
        return results

    def parse_results(self,results=False,target=False,max=False,output=None):
        '''Take the results and a target and print the target read count, false positives and coverage, as well as the top coverage'''
        if not results:
            results = self.results
        if not target:
            target = self.taxonomy_target
        if max:
            target = self.maxcovTaxid
            if target in max:
                #print("Correct target already in result, skip")
                return
        tres = results[target]
        level = tres[-1]
        cov = float(tres[5])
        taxReads = int(tres[2])   ## Tax reads is tres[2]
        levelsum = self.levelsum[level]
        fp = levelsum - taxReads
        print( '''
            level: {level}
            coverage: {cov}
            taxReads: {treads}
            FalseP: {fp}
            levelsum: {levsum}
            t/f ratio: {frac}
        '''.format(level=level, cov=cov,treads=taxReads, levsum=levelsum, fp = fp, frac=(float(taxReads)/levelsum)), file=output)

    def parse_result(self,results, target):
        try:
            tres = results[target]
        except KeyError:
            return [0,0,0]
        level = tres[-1]
        try:
            cov = float(tres[5])
        except (ValueError, IndexError):
            cov = False
        taxReads = int(tres[2])
        levelsum = self.levelsum[level]
        fp = levelsum - taxReads
        return [taxReads, fp, cov]

    def pipeline_results(self,kraken_files = False,target=False,output=None, nearNeighbour=True):
        '''Print results for pipeline'''
        ### Four report files are analyzed per sample: the unfiltered and filtered reports for the
        ### target and for the near neighbour. The response is the TP/FP balance expressed as an F1 score.
        allRes = []
        unclassifiedRes = []

        ### The report files are expected in the order given in the CalculateResponse step:
        ### target, filtered target, near neighbour, filtered near neighbour.
        for kf in kraken_files:
            #print(kf.name)
            if kf.name.endswith("filtered.report"):
                kraken_res = self.read_kraken_results(kf,filtered=True)
            else:
                kraken_res = self.read_kraken_results(kf)
            if not kraken_res:  ## no reads were found at all
                self.print_results(output)
                return
            results = self.parse_result(kraken_res,target=target)
            unclassified = self.parse_result(kraken_res, target = "0")
            allRes.append(results)
            unclassifiedRes.append(unclassified)
        ### The unclassified count is taken from the filtered target report (index 1),
        ### so it reflects the number of unclassified reads after the filter is applied.
        nunclassified = unclassifiedRes[1][0]
        ## Coverage is only reported for the unfiltered run, so it is taken from the unfiltered target report.
        tcov = allRes[0][2]  ## k-mer coverage for the target species (before filtering)

        ttp = allRes[1][0]  ## After filter: target reads assigned to the target taxid (true positives)
        tfp = allRes[3][0]  ## After filter: near-neighbour reads assigned to the target taxid (false positives)

        fn = allRes[1][1]  ## After filter: target reads assigned elsewhere at the same level (false negatives)
        div = float(2*ttp + tfp + fn)  ## denominator: 2*TP + FP + FN
        if div == 0:  ## avoid division by zero
            div = 1
        F1 = float(2*ttp) / div  ## F1 = 2*TP / (2*TP + FP + FN)
        if ttp < 3:  ## too few true positives to be meaningful
            F1 = 0
        tcov = tcov * ttp / allRes[0][0]  ### scale the coverage from the original read count to the read count after filtering
        answer = [
                    ["F1Score", str(F1)]
        ]
        self.print_results(output,answer=answer)


def arguments():
    parser = argparse.ArgumentParser(description='parse kraken results from doepipeline')
    parser.add_argument('kraken_results',type=argparse.FileType("r"), nargs="+", help="name of kraken report file(s)")
    parser.add_argument('--target', type=str, help="set target tax id allows multiple targets separated with comma (263,119857,351581)")
    parser.add_argument('--pipeline', type=str, help="set target tax and print answer formatted for pipeline")
    parser.add_argument('-o', '--output', default=None, type=argparse.FileType("w"), help="output to file instead of stdout")
    parser.add_argument('-v', '--verbose', action='store_true', help="Add extra informative output for debugging!")
    return parser.parse_args()

def main(args):
    PK = ParseKrakenResults(args.kraken_results)
    if args.verbose:
        print(args)
    if not args.pipeline:
        ## Single-report mode: read the first report and print stats for each requested target
        results = PK.read_kraken_results(args.kraken_results[0])
        for target in args.target.split(","):
            PK.parse_results(results, target=target, output=args.output)
        PK.parse_results(max=args.target.split(","), output=args.output)
    else:
        #print(args.kraken_results[0].name)
        PK.pipeline_results(kraken_files = args.kraken_results, target=args.pipeline,output=args.output)

if __name__ == '__main__':
    main(arguments())
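
To make the response calculation in pipeline_results concrete, here is a minimal sketch of the F1 computation with hypothetical counts; the variable names mirror the script, and the numbers are made up for illustration only.

    # Hypothetical counts, mirroring pipeline_results():
    ttp = 40000  # filtered target report: reads assigned to the target taxid (true positives)
    tfp = 120    # filtered near-neighbour report: reads assigned to the target taxid (false positives)
    fn = 900     # filtered target report: reads assigned elsewhere at the same level (false negatives)

    div = float(2 * ttp + tfp + fn)   # 2*TP + FP + FN
    if div == 0:                      # avoid division by zero
        div = 1
    F1 = float(2 * ttp) / div         # F1 = 2*TP / (2*TP + FP + FN)
    if ttp < 3:                       # too few true positives -> report 0
        F1 = 0
    print("F1Score,{}".format(F1))    # the line written to the results file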

YAML configuration file

config.yaml

# Specify the design
design:
    type: ccf
    factors:
        # Each factor to vary is specified here.
        # Min/max: the smallest/largest values allowed
        # Low_init/high_init: the initial design space min and max. Only used when the GSD screening step is not run at the beginning.
        # Type: ordinal (integer), qualitative (categorical), quantitative (float)

        FILTER:
            min: 0
            max: 0.5
            low_init: 0.05
            high_init: 0.1
            type: quantitative
        PRECISION:
            min: 10
            max: 18
            low_init: 10
            high_init: 18
            type: ordinal
        MINHITS:
            min: 1
            max: 100
            low_init: 5
            high_init: 50
            type: ordinal
    responses:
        # Each response to measure is specified here
        # Criterion: minimize/maximize

        F1Score:
            criterion: maximize
            low_limit: 0.3
            target: 1

# All work and output files go here
working_directory: "francisella_nano_grid_final"

# The name of the file containing the results (response values)
# The response in this file should be formatted like "RESPONSE,VALUE". One response per line.
results_file: "results.txt"
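# Example: a run that reaches an F1 score of 0.85 produces a results.txt containing the single line "F1Score,0.85"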

# Name the different pipeline steps to run
## The KrakenUniq database used in the steps below is krakendb_GTDB
pipeline:
    - KRAKEN
    - KRAKEN2
    - KRAKENFILTER
    - KRAKENFILTER2
    - CalculateResponse


KRAKEN:
    script: >
        conda activate krakenuniq;
        krakenuniq \
          --db krakendb_GTDB/ \
          --threads 20 --fastq-input --hll-precision {% PRECISION %} --quick --min-hits {% MINHITS %} \
          --report-file kraken.report --output kraken.output \
          input_data/target.fastq;

    factors:
        PRECISION:
            substitute: true
        # KMERSIZE:
        #     substitute: true
        MINHITS:
            substitute: true

    SLURM:
        A: <proj>
        p: node
        n: 20
        t: 0-16:00:00
        C: mem256GB
        J: run_kraken_optkmers1
        o: slurm_KRAKENHLL1-Francisella.out
        e: slurm_KRAKENHLL1-Francisella.err


KRAKEN2:
    script: >
        conda activate krakenuniq;
        krakenuniq \
          --db krakendb_GTDB/ \
          --threads 20 --fastq-input --hll-precision {% PRECISION %} --quick --min-hits {% MINHITS %} \
          --report-file kraken_near_neighbor.report --output kraken_near_neighbor.output \
          input_data/near_neighbor.fastq;

    # Specify the factors that are used in the script, and how to insert them in the script string
    # Here every factor is simply substituted
    factors:
        PRECISION:
            substitute: true
        # KMERSIZE:
        #     substitute: true
        MINHITS:
            substitute: true

    SLURM:
        A: <proj>
        p: node
        n: 20
        t: 0-16:00:00
        C: mem256GB
        J: run_kraken_optkmers2
        o: slurm_KRAKENHLL2-Francisella.out
        e: slurm_KRAKENHLL2-Francisella.err

KRAKENFILTER:
    script: >
        conda activate krakenuniq;
        krakenuniq-filter --db krakendb_GTDB/ --threshold {% FILTER %} kraken.output > kraken-filtered.output;
        krakenuniq-report --db krakendb_GTDB/ kraken-filtered.output > kraken.filtered.report

    # Specify the factors that are used in the script, and how to insert them in the script string
    # Here every factor is simply substituted
    factors:
        FILTER:
            substitute: true
        # KMERSIZE:
        #     substitute: true
    SLURM:
        A: <proj>
        p: node
        n: 20
        t: 0-16:00:00
        C: mem256GB
        J: run_kraken_filter_optkmers1
        o: slurm_KRAKENHLL_filter1-Francisella.out
        e: slurm_KRAKENHLL_filter1-Francisella.err

KRAKENFILTER2:
    script: >
        conda activate krakenuniq;
        krakenuniq-filter --db krakendb_GTDB/ --threshold {% FILTER %} kraken_near_neighbor.output > kraken-filtered_near_neighbor.output;
        krakenuniq-report --db krakendb_GTDB/ kraken-filtered_near_neighbor.output > kraken_near_neighbor.filtered.report


    # Specify the factors that are used in the script, and how to insert them in the script string
    # Here every factor is simply substituted
    factors:
        FILTER:
            substitute: true
        # KMERSIZE:
        #     substitute: true
    SLURM:
        A: <proj>
        p: node
        n: 20
        t: 0-16:00:00
        C: mem256GB
        J: run_kraken_filter_optkmers2
        o: slurm_KRAKENHLL_filter2-Francisella.out
        e: slurm_KRAKENHLL_filter2-Francisella.err


# Final pipeline step: the response is calculated here
# No factors are substituted; the step is submitted as a short single-core job
# Specify the output file with {% results_file %}
CalculateResponse:
    script: >
        python scripts/parse_pipeline_results.py --pipeline 358 -o {% results_file %} kraken.report kraken.filtered.report kraken_near_neighbor.report kraken_near_neighbor.filtered.report;
    SLURM:
        A: <proj>
        p: core
        n: 1
        t: 0-0:01:00
        #C: mem128GB
        J: run_kraken_calculate_response
        o: slurm_KRAKENHLL-Francisella-CalculateResponse.out
        e: slurm_KRAKENHLL-Francisella.CalculateResponse.err
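
The `{% FACTOR %}` placeholders in the step scripts above are replaced by doepipeline with the factor values of the current design point before each step is executed. The sketch below only illustrates the idea with plain string substitution; it is not doepipeline's actual templating code, and the factor values are arbitrary examples.

    # Illustration only: render a step script by substituting {% FACTOR %} placeholders.
    template = (
        "krakenuniq --db krakendb_GTDB/ --threads 20 --fastq-input "
        "--hll-precision {% PRECISION %} --quick --min-hits {% MINHITS %} "
        "--report-file kraken.report --output kraken.output input_data/target.fastq"
    )
    factors = {"PRECISION": 12, "MINHITS": 10}  # arbitrary example values
    script = template
    for name, value in factors.items():
        script = script.replace("{% " + name + " %}", str(value))
    print(script)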

Executing doepipeline

Submit_script.sh

#!/bin/bash -l

#SBATCH -A PROJECT
#SBATCH -n 1
#SBATCH -t 3-00:00:00
#SBATCH -J optimizeKrakenuniq
#SBATCH -o optimizeKrakenuniq.out
#SBATCH -e optimizeKrakenuniq.error

set -e
set -x

# Changes here
_WORKDIR="francisella"
_OUTDIR="${_WORKDIR}/results"
_DOE="bin/doepipeline"
_YAML="optimizeKrakenKmersFrancisella.yaml"
_LOG="${_OUTDIR}/run.log"
_EXECUTION="slurm"
_ITERS="4"
_MODEL_SELECTION="brute"
_SHRINK="0.9"
_REDUCTION=""

PARAMETERS=$1

# Run doepipeline
echo "Starting analysis $(date)"
if [ ! -d ${_OUTDIR} ]; then
  echo "Creating working directory: ${_OUTDIR}"
  mkdir -p ${_OUTDIR}
fi
${_DOE} -e ${_EXECUTION} -s ${_SHRINK} -m ${_MODEL_SELECTION} ${_REDUCTION} -i ${_ITERS} -l ${_LOG} $PARAMETERS ${_YAML}
echo "Finishing analysis $(date)"