From b40f075d6b98474f60c919332e2e86ec83f642c3 Mon Sep 17 00:00:00 2001
From: Simon Ye <mail@yesimon.com>
Date: Fri, 15 May 2020 13:01:35 -0400
Subject: [PATCH] Add --priority-taxids option for database building.

All kmers associated with any priority taxids will override other taxids, meaning that the hash
value for the kmer will not be LCA'd but set at priority taxid. This is useful for artificial
vector/plasmid sequences overlapping bacteria so that their kmers don't get uninformatively LCA'd
to a root node.

The default value includes 32630 and 81077. Specifying any taxids will clear these two taxids.
Multiple taxids can be separated by colon (:), comma (,) or repeated args.
---
 scripts/build_kraken2_db.sh |  2 +-
 scripts/kraken2-build       | 11 ++++++++
 src/build_db.cc             | 54 ++++++++++++++++++++++++++++++++++---
 src/taxonomy.h              |  1 +
 4 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/scripts/build_kraken2_db.sh b/scripts/build_kraken2_db.sh
index 9ed1b93..5b5da80 100755
--- a/scripts/build_kraken2_db.sh
+++ b/scripts/build_kraken2_db.sh
@@ -125,7 +125,7 @@ else
   list_sequence_files | xargs -0 cat | \
     build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \
              -H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \
-             -c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag
+             -c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag
   finalize_file taxo.k2d
   finalize_file opts.k2d
   finalize_file hash.k2d
diff --git a/scripts/kraken2-build b/scripts/kraken2-build
index 2d5909e..7bd393f 100755
--- a/scripts/kraken2-build
+++ b/scripts/kraken2-build
@@ -44,6 +44,7 @@ my (
   $minimizer_len,
   $kmer_len,
   $minimizer_spaces,
+  @priority_taxids,
   $is_protein,
   $masking,
   $max_db_size,
@@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR;
 $kmer_len = $ENV{"KRAKEN2_KMER_LEN"};
 $minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"};
 $minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"};
+@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || ();
 $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0;
 $use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0;
 $skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0;
@@ -92,6 +94,7 @@ GetOptions(
   "minimizer-len=i" => \$minimizer_len,
   "kmer-len=i" => \$kmer_len,
   "minimizer-spaces=i", \$minimizer_spaces,
+  "priority-taxids=s" => \@priority_taxids,
   "protein" => \$is_protein,
   "masking!" => \$masking,
   "max-db-size=i" => \$max_db_size,
@@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len;
 $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len;
 $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces;
 $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template();
+$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "";
+if (@priority_taxids) {
+    $ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/, join(',', @priority_taxids)));
+}
 $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : "";
 $ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : "";
 $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : "";
@@ -230,6 +237,10 @@ Options:
                              ignored in comparisons (build task only;
                              def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa)
   --protein                  Build a protein database for translated search
+  --priority-taxids NUM      Taxids whose k-mers override other taxids. By default includes
+                             32630: synthetic construct and 81077: artifical sequences. Specifying
+                             any taxids will clear these two default taxids. Multiple taxids can be
+                             separated by colon (:), comma (,) or repeated args.
   --no-masking               Used with --standard/--download-library/
                              --add-to-library to avoid masking low-complexity
                              sequences prior to building; masking requires
diff --git a/src/build_db.cc b/src/build_db.cc
index 82f2cbb..66d0467 100644
--- a/src/build_db.cc
+++ b/src/build_db.cc
@@ -27,6 +27,11 @@ using namespace kraken2;
 
 #define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024)  // 10 MB
 
+// k-mers appearing in contaminant sequences will keep the contaminant
+//  sequence taxid, even if they also appear in a genome
+const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct'
+const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences'
+
 struct Options {
   string ID_to_taxon_map_filename;
   string ncbi_taxonomy_directory;
@@ -36,6 +41,7 @@ struct Options {
   size_t block_size;
   int num_threads;
   bool input_is_protein;
+  vector<uint64_t> priority_taxids;
   ssize_t k, l;
   size_t capacity;
   size_t maximum_capacity;
@@ -65,6 +71,8 @@ int main(int argc, char **argv) {
   opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
   opts.toggle_mask = DEFAULT_TOGGLE_MASK;
   opts.input_is_protein = false;
+  opts.priority_taxids.push_back(TID_CONTAMINANT1);
+  opts.priority_taxids.push_back(TID_CONTAMINANT2);
   opts.num_threads = 1;
   opts.block_size = DEFAULT_BLOCK_SIZE;
   opts.min_clear_hash_value = 0;
@@ -82,6 +90,9 @@ int main(int argc, char **argv) {
 
   Taxonomy taxonomy(opts.taxonomy_filename.c_str());
   taxonomy.GenerateExternalToInternalIDMap();
+  for (auto taxid : opts.priority_taxids) {
+    taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid));
+  }
   size_t bits_needed_for_value = 1;
   while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count())
     bits_needed_for_value++;
@@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map<string, uint64_t> &ID_to_taxon_map,
   std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl;
 }
 
+// Split string by any of delimiters, convert to long, and insert to iterator.
+template <typename OutIt>
+OutIt SplitInsert(char* input, string delimiters, OutIt out) {
+  std::string line(input);
+  std::size_t prev = 0, pos;
+  while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) {
+    if (pos > prev) {
+      *out++ = stol(line.substr(prev, pos-prev));
+    }
+    prev = pos+1;
+  }
+  if (prev < line.length()) {
+    *out++ = stol(line.substr(prev, std::string::npos));
+  }
+  return out;
+}
+
 // This function exists to deal with NCBI's use of \x01 characters to denote
 // the start of a new FASTA header in the same line (for non-redundant DBs).
 // We return all sequence IDs in a header line, not just the first.
@@ -213,8 +241,9 @@ vector<string> ExtractNCBISequenceIDs(const string &header) {
 void ParseCommandLine(int argc, char **argv, Options &opts) {
   int opt;
   long long sig;
+  bool priority_taxids_cleared = false;
 
-  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) {
+  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) {
     switch (opt) {
       case 'h' : case '?' :
         usage(0);
@@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) {
           errx(EX_USAGE, "max capacity must be positive integer");
         opts.maximum_capacity = sig;
         break;
+      case 'P' :
+        if (!priority_taxids_cleared) {
+          opts.priority_taxids.clear();
+          priority_taxids_cleared = true;
+        }
+        SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids));
+        break;
       case 'X' :
         opts.input_is_protein = true;
         break;
@@ -326,6 +362,7 @@ void usage(int exit_code) {
        << "  -M INT        Set maximum capacity of hash table (MiniKraken)\n"
        << "  -S BITSTRING  Spaced seed mask\n"
        << "  -T BITSTRING  Minimizer toggle mask\n"
+       << "  -A INT(:INT)  Priority taxids that don't get LCA'd\n"
        << "  -X            Input seqs. are proteins\n"
        << "  -p INT        Number of threads\n"
        << "  -B INT        Read block size" << endl;
@@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid,
     CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
     uint64_t min_clear_hash_value)
 {
+  bool seq_is_priority = tax.priority_taxids.count(taxid);
   scanner.LoadSequence(seq);
   uint64_t *minimizer_ptr;
   bool ambig_flag;
@@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid,
       continue;
     hvalue_t existing_taxid = 0;
     hvalue_t new_taxid = taxid;
-    while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
-      new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
+    if (seq_is_priority) {
+      hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) ||
+        hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid);
+    }
+    else {
+      while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
+        if (! tax.priority_taxids.count(existing_taxid)) {
+          new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
+        } else {
+          new_taxid = existing_taxid;
+        }
+      }
     }
   }
 }
diff --git a/src/taxonomy.h b/src/taxonomy.h
index 8d43604..9b02d16 100644
--- a/src/taxonomy.h
+++ b/src/taxonomy.h
@@ -64,6 +64,7 @@ class Taxonomy {
       return 0;
     return external_to_internal_id_map_[external_id];
   }
+  std::set<uint64_t> priority_taxids;
 
   private:
   void Init(const char *filename, bool memory_mapping);