Add --priority-taxids option for database building.

All kmers associated with any priority taxids will override other taxids, meaning that the hash value for the kmer will not be LCA'd but set at priority taxid. This is useful for artificial vector/plasmid sequences overlapping bacteria so that their kmers don't get uninformatively LCA'd to a root node. The default value includes 32630 and 81077. Specifying any taxids will clear these two taxids. Multiple taxids can be separated by colon (:), comma (,) or repeated args.
DerrickWood · May 21, 2020 · 6db7221 · 6db7221
1 parent 9c702c8
commit 6db7221
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 4 deletions.
diff --git a/scripts/build_kraken2_db.sh b/scripts/build_kraken2_db.sh
@@ -125,7 +125,7 @@ else
   list_sequence_files | xargs -0 cat | \
     build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \
              -H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \
-             -c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag
+             -c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag
   finalize_file taxo.k2d
   finalize_file opts.k2d
   finalize_file hash.k2d

diff --git a/scripts/kraken2-build b/scripts/kraken2-build
@@ -44,6 +44,7 @@ my (
   $minimizer_len,
   $kmer_len,
   $minimizer_spaces,
+  @priority_taxids,
   $is_protein,
   $masking,
   $max_db_size,
@@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR;
 $kmer_len = $ENV{"KRAKEN2_KMER_LEN"};
 $minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"};
 $minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"};
+@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || ();
 $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0;
 $use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0;
 $skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0;
@@ -92,6 +94,7 @@ GetOptions(
   "minimizer-len=i" => \$minimizer_len,
   "kmer-len=i" => \$kmer_len,
   "minimizer-spaces=i", \$minimizer_spaces,
+  "priority-taxids=s" => \@priority_taxid,
   "protein" => \$is_protein,
   "masking!" => \$masking,
   "max-db-size=i" => \$max_db_size,
@@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len;
 $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len;
 $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces;
 $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template();
+$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "";
+if (@priority_taxids) {
+    $ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/,join(',',@priority_taxids)));
+}
 $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : "";
 $ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : "";
 $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : "";
@@ -230,6 +237,10 @@ Options:
                              ignored in comparisons (build task only;
                              def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa)
   --protein                  Build a protein database for translated search
+  --priority-taxids NUM      Taxids whose k-mers override other taxids. By default includes
+                             32630: synthetic construct and 81077: artifical sequences. Specifying
+                             any taxids will clear these two default taxids. Multiple taxids can be
+                             separated by colon (:), comma (,) or repeated args.
   --no-masking               Used with --standard/--download-library/
                              --add-to-library to avoid masking low-complexity
                              sequences prior to building; masking requires

diff --git a/src/build_db.cc b/src/build_db.cc
@@ -27,6 +27,11 @@ using namespace kraken2;
 
 #define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024)  // 10 MB
 
+// k-mers appearing in contaminant sequences will keep the contaminant
+//  sequence taxid, even if they also appear in a genome
+const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct'
+const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences'
+
 struct Options {
   string ID_to_taxon_map_filename;
   string ncbi_taxonomy_directory;
@@ -36,6 +41,7 @@ struct Options {
   size_t block_size;
   int num_threads;
   bool input_is_protein;
+  vector<uint64_t> priority_taxids;
   ssize_t k, l;
   size_t capacity;
   size_t maximum_capacity;
@@ -65,6 +71,8 @@ int main(int argc, char **argv) {
   opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
   opts.toggle_mask = DEFAULT_TOGGLE_MASK;
   opts.input_is_protein = false;
+  opts.priority_taxids.push_back(TID_CONTAMINANT1);
+  opts.priority_taxids.push_back(TID_CONTAMINANT2);
   opts.num_threads = 1;
   opts.block_size = DEFAULT_BLOCK_SIZE;
   opts.min_clear_hash_value = 0;
@@ -82,6 +90,9 @@ int main(int argc, char **argv) {
 
   Taxonomy taxonomy(opts.taxonomy_filename.c_str());
   taxonomy.GenerateExternalToInternalIDMap();
+  for (auto taxid : opts.priority_taxids) {
+    taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid));
+  }
   size_t bits_needed_for_value = 1;
   while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count())
     bits_needed_for_value++;
@@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map<string, uint64_t> &ID_to_taxon_map,
   std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl;
 }
 
+// Split string by any of delimiters, convert to long, and insert to iterator.
+template <typename OutIt>
+OutIt SplitInsert(char* input, string delimiters, OutIt out) {
+  std::string line(input);
+  std::size_t prev = 0, pos;
+  while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) {
+    if (pos > prev) {
+      *out++ = stol(line.substr(prev, pos-prev));
+    }
+    prev = pos+1;
+  }
+  if (prev < line.length()) {
+    *out++ = stol(line.substr(prev, std::string::npos));
+  }
+  return out;
+}
+
 // This function exists to deal with NCBI's use of \x01 characters to denote
 // the start of a new FASTA header in the same line (for non-redundant DBs).
 // We return all sequence IDs in a header line, not just the first.
@@ -213,8 +241,9 @@ vector<string> ExtractNCBISequenceIDs(const string &header) {
 void ParseCommandLine(int argc, char **argv, Options &opts) {
   int opt;
   long long sig;
+  bool priority_taxids_cleared = false;
 
-  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) {
+  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) {
     switch (opt) {
       case 'h' : case '?' :
         usage(0);
@@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) {
           errx(EX_USAGE, "max capacity must be positive integer");
         opts.maximum_capacity = sig;
         break;
+      case 'P' :
+        if (!priority_taxids_cleared) {
+          opts.priority_taxids.clear();
+          priority_taxids_cleared = true;
+        }
+        SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids));
+        break;
       case 'X' :
         opts.input_is_protein = true;
         break;
@@ -326,6 +362,7 @@ void usage(int exit_code) {
        << "  -M INT        Set maximum capacity of hash table (MiniKraken)\n"
        << "  -S BITSTRING  Spaced seed mask\n"
        << "  -T BITSTRING  Minimizer toggle mask\n"
+       << "  -A INT(:INT)  Priority taxids that don't get LCA'd\n"
        << "  -X            Input seqs. are proteins\n"
        << "  -p INT        Number of threads\n"
        << "  -B INT        Read block size" << endl;
@@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid,
     CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
     uint64_t min_clear_hash_value)
 {
+  bool seq_is_priority = tax.priority_taxids.count(taxid);
   scanner.LoadSequence(seq);
   uint64_t *minimizer_ptr;
   bool ambig_flag;
@@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid,
       continue;
     hvalue_t existing_taxid = 0;
     hvalue_t new_taxid = taxid;
-    while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
-      new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
+    if (seq_is_priority) {
+      hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) ||
+        hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid);
+    }
+    else {
+      while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
+        if (! tax.priority_taxids.count(existing_taxid)) {
+          new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
+        } else {
+          new_taxid = existing_taxid;
+        }
+      }
     }
   }
 }

diff --git a/src/taxonomy.h b/src/taxonomy.h
@@ -64,6 +64,7 @@ class Taxonomy {
       return 0;
     return external_to_internal_id_map_[external_id];
   }
+  std::set<uint64_t> priority_taxids;
 
   private:
   void Init(const char *filename, bool memory_mapping);