DerrickWood · yesimon · May 15, 2020
diff --git a/scripts/build_kraken2_db.sh b/scripts/build_kraken2_db.sh
@@ -125,7 +125,7 @@ else
   list_sequence_files | xargs -0 cat | \
     build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \
              -H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \
-             -c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag
+             -c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag
   finalize_file taxo.k2d
   finalize_file opts.k2d
   finalize_file hash.k2d

diff --git a/scripts/kraken2-build b/scripts/kraken2-build
@@ -44,6 +44,7 @@ my (
   $minimizer_len,
   $kmer_len,
   $minimizer_spaces,
+  @priority_taxids,
   $is_protein,
   $masking,
   $max_db_size,
@@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR;
 $kmer_len = $ENV{"KRAKEN2_KMER_LEN"};
 $minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"};
 $minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"};
+@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || ();
 $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0;
 $use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0;
 $skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0;
@@ -92,6 +94,7 @@ GetOptions(
   "minimizer-len=i" => \$minimizer_len,
   "kmer-len=i" => \$kmer_len,
   "minimizer-spaces=i", \$minimizer_spaces,
+  "priority-taxids=s" => \@priority_taxids,
   "protein" => \$is_protein,
   "masking!" => \$masking,
   "max-db-size=i" => \$max_db_size,
@@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len;
 $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len;
 $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces;
 $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template();
+$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "";
+if (@priority_taxids) {
+    $ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/, join(',', @priority_taxids)));
+}
 $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : "";
 $ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : "";
 $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : "";
@@ -230,6 +237,10 @@ Options:
                              ignored in comparisons (build task only;
                              def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa)
   --protein                  Build a protein database for translated search
+  --priority-taxids NUM      Taxids whose k-mers override other taxids. By default includes
+                             32630: synthetic construct and 81077: artifical sequences. Specifying
+                             any taxids will clear these two default taxids. Multiple taxids can be
+                             separated by colon (:), comma (,) or repeated args.
   --no-masking               Used with --standard/--download-library/
                              --add-to-library to avoid masking low-complexity
                              sequences prior to building; masking requires

diff --git a/src/build_db.cc b/src/build_db.cc
@@ -27,6 +27,11 @@ using namespace kraken2;
 
 #define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024)  // 10 MB
 
+// k-mers appearing in contaminant sequences will keep the contaminant
+//  sequence taxid, even if they also appear in a genome
+const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct'
+const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences'
+
 struct Options {
   string ID_to_taxon_map_filename;
   string ncbi_taxonomy_directory;
@@ -36,6 +41,7 @@ struct Options {
   size_t block_size;
   int num_threads;
   bool input_is_protein;
+  vector<uint64_t> priority_taxids;
   ssize_t k, l;
   size_t capacity;
   size_t maximum_capacity;
@@ -65,6 +71,8 @@ int main(int argc, char **argv) {
   opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
   opts.toggle_mask = DEFAULT_TOGGLE_MASK;
   opts.input_is_protein = false;
+  opts.priority_taxids.push_back(TID_CONTAMINANT1);
+  opts.priority_taxids.push_back(TID_CONTAMINANT2);
   opts.num_threads = 1;
   opts.block_size = DEFAULT_BLOCK_SIZE;
   opts.min_clear_hash_value = 0;
@@ -82,6 +90,9 @@ int main(int argc, char **argv) {
 
   Taxonomy taxonomy(opts.taxonomy_filename.c_str());
   taxonomy.GenerateExternalToInternalIDMap();
+  for (auto taxid : opts.priority_taxids) {
+    taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid));
+  }
   size_t bits_needed_for_value = 1;
   while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count())
     bits_needed_for_value++;
@@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map<string, uint64_t> &ID_to_taxon_map,
   std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl;
 }
 
+// Split string by any of delimiters, convert to long, and insert to iterator.
+template <typename OutIt>
+OutIt SplitInsert(char* input, string delimiters, OutIt out) {
+  std::string line(input);
+  std::size_t prev = 0, pos;
+  while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) {
+    if (pos > prev) {
+      *out++ = stol(line.substr(prev, pos-prev));
+    }
+    prev = pos+1;
+  }
+  if (prev < line.length()) {
+    *out++ = stol(line.substr(prev, std::string::npos));
+  }
+  return out;
+}
+
 // This function exists to deal with NCBI's use of \x01 characters to denote
 // the start of a new FASTA header in the same line (for non-redundant DBs).
 // We return all sequence IDs in a header line, not just the first.
@@ -213,8 +241,9 @@ vector<string> ExtractNCBISequenceIDs(const string &header) {
 void ParseCommandLine(int argc, char **argv, Options &opts) {
   int opt;
   long long sig;
+  bool priority_taxids_cleared = false;
 
-  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) {
+  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) {
     switch (opt) {
       case 'h' : case '?' :
         usage(0);
@@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) {
           errx(EX_USAGE, "max capacity must be positive integer");
         opts.maximum_capacity = sig;
         break;
+      case 'P' :
+        if (!priority_taxids_cleared) {
+          opts.priority_taxids.clear();
+          priority_taxids_cleared = true;
+        }
+        SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids));
+        break;
       case 'X' :
         opts.input_is_protein = true;
         break;
@@ -326,6 +362,7 @@ void usage(int exit_code) {
        << "  -M INT        Set maximum capacity of hash table (MiniKraken)\n"
        << "  -S BITSTRING  Spaced seed mask\n"
        << "  -T BITSTRING  Minimizer toggle mask\n"
+       << "  -A INT(:INT)  Priority taxids that don't get LCA'd\n"
        << "  -X            Input seqs. are proteins\n"
        << "  -p INT        Number of threads\n"
        << "  -B INT        Read block size" << endl;
@@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid,
     CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
     uint64_t min_clear_hash_value)
 {
+  bool seq_is_priority = tax.priority_taxids.count(taxid);
   scanner.LoadSequence(seq);
   uint64_t *minimizer_ptr;
   bool ambig_flag;
@@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid,
       continue;
     hvalue_t existing_taxid = 0;
     hvalue_t new_taxid = taxid;
-    while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
-      new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
+    if (seq_is_priority) {
+      hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) ||
+        hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid);
+    }
+    else {
+      while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
+        if (! tax.priority_taxids.count(existing_taxid)) {
+          new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
+        } else {
+          new_taxid = existing_taxid;
+        }
+      }
     }
   }
 }

diff --git a/src/taxonomy.h b/src/taxonomy.h
@@ -64,6 +64,7 @@ class Taxonomy {
       return 0;
     return external_to_internal_id_map_[external_id];
   }
+  std::set<uint64_t> priority_taxids;
 
   private:
   void Init(const char *filename, bool memory_mapping);