From b40f075d6b98474f60c919332e2e86ec83f642c3 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Fri, 15 May 2020 13:01:35 -0400 Subject: [PATCH] Add --priority-taxids option for database building. All kmers associated with any priority taxids will override other taxids, meaning that the hash value for the kmer will not be LCA'd but set at priority taxid. This is useful for artificial vector/plasmid sequences overlapping bacteria so that their kmers don't get uninformatively LCA'd to a root node. The default value includes 32630 and 81077. Specifying any taxids will clear these two taxids. Multiple taxids can be separated by colon (:), comma (,) or repeated args. --- scripts/build_kraken2_db.sh | 2 +- scripts/kraken2-build | 11 ++++++++ src/build_db.cc | 54 ++++++++++++++++++++++++++++++++++--- src/taxonomy.h | 1 + 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/scripts/build_kraken2_db.sh b/scripts/build_kraken2_db.sh index 9ed1b93..5b5da80 100755 --- a/scripts/build_kraken2_db.sh +++ b/scripts/build_kraken2_db.sh @@ -125,7 +125,7 @@ else list_sequence_files | xargs -0 cat | \ build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \ -H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \ - -c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag + -c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag finalize_file taxo.k2d finalize_file opts.k2d finalize_file hash.k2d diff --git a/scripts/kraken2-build b/scripts/kraken2-build index 2d5909e..7bd393f 100755 --- a/scripts/kraken2-build +++ b/scripts/kraken2-build @@ -44,6 +44,7 @@ my ( $minimizer_len, $kmer_len, $minimizer_spaces, + @priority_taxids, $is_protein, $masking, $max_db_size, @@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR; $kmer_len = $ENV{"KRAKEN2_KMER_LEN"}; $minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"}; $minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"}; +@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || (); $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0; $use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0; $skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0; @@ -92,6 +94,7 @@ GetOptions( "minimizer-len=i" => \$minimizer_len, "kmer-len=i" => \$kmer_len, "minimizer-spaces=i", \$minimizer_spaces, + "priority-taxids=s" => \@priority_taxids, "protein" => \$is_protein, "masking!" => \$masking, "max-db-size=i" => \$max_db_size, @@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len; $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len; $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces; $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template(); +$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = ""; +if (@priority_taxids) { + $ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/, join(',', @priority_taxids))); +} $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : ""; $ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : ""; $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : ""; @@ -230,6 +237,10 @@ Options: ignored in comparisons (build task only; def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa) --protein Build a protein database for translated search + --priority-taxids NUM Taxids whose k-mers override other taxids. By default includes + 32630: synthetic construct and 81077: artifical sequences. Specifying + any taxids will clear these two default taxids. Multiple taxids can be + separated by colon (:), comma (,) or repeated args. --no-masking Used with --standard/--download-library/ --add-to-library to avoid masking low-complexity sequences prior to building; masking requires diff --git a/src/build_db.cc b/src/build_db.cc index 82f2cbb..66d0467 100644 --- a/src/build_db.cc +++ b/src/build_db.cc @@ -27,6 +27,11 @@ using namespace kraken2; #define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024) // 10 MB +// k-mers appearing in contaminant sequences will keep the contaminant +// sequence taxid, even if they also appear in a genome +const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct' +const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences' + struct Options { string ID_to_taxon_map_filename; string ncbi_taxonomy_directory; @@ -36,6 +41,7 @@ struct Options { size_t block_size; int num_threads; bool input_is_protein; + vector priority_taxids; ssize_t k, l; size_t capacity; size_t maximum_capacity; @@ -65,6 +71,8 @@ int main(int argc, char **argv) { opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK; opts.toggle_mask = DEFAULT_TOGGLE_MASK; opts.input_is_protein = false; + opts.priority_taxids.push_back(TID_CONTAMINANT1); + opts.priority_taxids.push_back(TID_CONTAMINANT2); opts.num_threads = 1; opts.block_size = DEFAULT_BLOCK_SIZE; opts.min_clear_hash_value = 0; @@ -82,6 +90,9 @@ int main(int argc, char **argv) { Taxonomy taxonomy(opts.taxonomy_filename.c_str()); taxonomy.GenerateExternalToInternalIDMap(); + for (auto taxid : opts.priority_taxids) { + taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid)); + } size_t bits_needed_for_value = 1; while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count()) bits_needed_for_value++; @@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map &ID_to_taxon_map, std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl; } +// Split string by any of delimiters, convert to long, and insert to iterator. +template +OutIt SplitInsert(char* input, string delimiters, OutIt out) { + std::string line(input); + std::size_t prev = 0, pos; + while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) { + if (pos > prev) { + *out++ = stol(line.substr(prev, pos-prev)); + } + prev = pos+1; + } + if (prev < line.length()) { + *out++ = stol(line.substr(prev, std::string::npos)); + } + return out; +} + // This function exists to deal with NCBI's use of \x01 characters to denote // the start of a new FASTA header in the same line (for non-redundant DBs). // We return all sequence IDs in a header line, not just the first. @@ -213,8 +241,9 @@ vector ExtractNCBISequenceIDs(const string &header) { void ParseCommandLine(int argc, char **argv, Options &opts) { int opt; long long sig; + bool priority_taxids_cleared = false; - while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) { + while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) { switch (opt) { case 'h' : case '?' : usage(0); @@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) { errx(EX_USAGE, "max capacity must be positive integer"); opts.maximum_capacity = sig; break; + case 'P' : + if (!priority_taxids_cleared) { + opts.priority_taxids.clear(); + priority_taxids_cleared = true; + } + SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids)); + break; case 'X' : opts.input_is_protein = true; break; @@ -326,6 +362,7 @@ void usage(int exit_code) { << " -M INT Set maximum capacity of hash table (MiniKraken)\n" << " -S BITSTRING Spaced seed mask\n" << " -T BITSTRING Minimizer toggle mask\n" + << " -A INT(:INT) Priority taxids that don't get LCA'd\n" << " -X Input seqs. are proteins\n" << " -p INT Number of threads\n" << " -B INT Read block size" << endl; @@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid, CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner, uint64_t min_clear_hash_value) { + bool seq_is_priority = tax.priority_taxids.count(taxid); scanner.LoadSequence(seq); uint64_t *minimizer_ptr; bool ambig_flag; @@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid, continue; hvalue_t existing_taxid = 0; hvalue_t new_taxid = taxid; - while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) { - new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid); + if (seq_is_priority) { + hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) || + hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid); + } + else { + while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) { + if (! tax.priority_taxids.count(existing_taxid)) { + new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid); + } else { + new_taxid = existing_taxid; + } + } } } } diff --git a/src/taxonomy.h b/src/taxonomy.h index 8d43604..9b02d16 100644 --- a/src/taxonomy.h +++ b/src/taxonomy.h @@ -64,6 +64,7 @@ class Taxonomy { return 0; return external_to_internal_id_map_[external_id]; } + std::set priority_taxids; private: void Init(const char *filename, bool memory_mapping);