Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --priority-taxids option for database building. #256

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/build_kraken2_db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ else
list_sequence_files | xargs -0 cat | \
build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \
-H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \
-c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag
-c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag
finalize_file taxo.k2d
finalize_file opts.k2d
finalize_file hash.k2d
Expand Down
11 changes: 11 additions & 0 deletions scripts/kraken2-build
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ my (
$minimizer_len,
$kmer_len,
$minimizer_spaces,
@priority_taxids,
$is_protein,
$masking,
$max_db_size,
Expand All @@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR;
$kmer_len = $ENV{"KRAKEN2_KMER_LEN"};
$minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"};
$minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"};
@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || ();
$is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0;
$use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0;
$skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0;
Expand All @@ -92,6 +94,7 @@ GetOptions(
"minimizer-len=i" => \$minimizer_len,
"kmer-len=i" => \$kmer_len,
"minimizer-spaces=i", \$minimizer_spaces,
"priority-taxids=s" => \@priority_taxids,
"protein" => \$is_protein,
"masking!" => \$masking,
"max-db-size=i" => \$max_db_size,
Expand Down Expand Up @@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len;
$ENV{"KRAKEN2_KMER_LEN"} = $kmer_len;
$ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces;
$ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template();
$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "";
if (@priority_taxids) {
$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/, join(',', @priority_taxids)));
}
$ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : "";
$ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : "";
$ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : "";
Expand Down Expand Up @@ -230,6 +237,10 @@ Options:
ignored in comparisons (build task only;
def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa)
--protein Build a protein database for translated search
--priority-taxids NUM Taxids whose k-mers override other taxids. By default includes
32630: synthetic construct and 81077: artifical sequences. Specifying
any taxids will clear these two default taxids. Multiple taxids can be
separated by colon (:), comma (,) or repeated args.
--no-masking Used with --standard/--download-library/
--add-to-library to avoid masking low-complexity
sequences prior to building; masking requires
Expand Down
54 changes: 51 additions & 3 deletions src/build_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ using namespace kraken2;

#define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024) // 10 MB

// k-mers appearing in contaminant sequences will keep the contaminant
// sequence taxid, even if they also appear in a genome
const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct'
const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences'

struct Options {
string ID_to_taxon_map_filename;
string ncbi_taxonomy_directory;
Expand All @@ -36,6 +41,7 @@ struct Options {
size_t block_size;
int num_threads;
bool input_is_protein;
vector<uint64_t> priority_taxids;
ssize_t k, l;
size_t capacity;
size_t maximum_capacity;
Expand Down Expand Up @@ -65,6 +71,8 @@ int main(int argc, char **argv) {
opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
opts.toggle_mask = DEFAULT_TOGGLE_MASK;
opts.input_is_protein = false;
opts.priority_taxids.push_back(TID_CONTAMINANT1);
opts.priority_taxids.push_back(TID_CONTAMINANT2);
opts.num_threads = 1;
opts.block_size = DEFAULT_BLOCK_SIZE;
opts.min_clear_hash_value = 0;
Expand All @@ -82,6 +90,9 @@ int main(int argc, char **argv) {

Taxonomy taxonomy(opts.taxonomy_filename.c_str());
taxonomy.GenerateExternalToInternalIDMap();
for (auto taxid : opts.priority_taxids) {
taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid));
}
size_t bits_needed_for_value = 1;
while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count())
bits_needed_for_value++;
Expand Down Expand Up @@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map<string, uint64_t> &ID_to_taxon_map,
std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl;
}

// Split string by any of delimiters, convert to long, and insert to iterator.
template <typename OutIt>
OutIt SplitInsert(char* input, string delimiters, OutIt out) {
std::string line(input);
std::size_t prev = 0, pos;
while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) {
if (pos > prev) {
*out++ = stol(line.substr(prev, pos-prev));
}
prev = pos+1;
}
if (prev < line.length()) {
*out++ = stol(line.substr(prev, std::string::npos));
}
return out;
}

// This function exists to deal with NCBI's use of \x01 characters to denote
// the start of a new FASTA header in the same line (for non-redundant DBs).
// We return all sequence IDs in a header line, not just the first.
Expand Down Expand Up @@ -213,8 +241,9 @@ vector<string> ExtractNCBISequenceIDs(const string &header) {
void ParseCommandLine(int argc, char **argv, Options &opts) {
int opt;
long long sig;
bool priority_taxids_cleared = false;

while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) {
while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) {
switch (opt) {
case 'h' : case '?' :
usage(0);
Expand Down Expand Up @@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) {
errx(EX_USAGE, "max capacity must be positive integer");
opts.maximum_capacity = sig;
break;
case 'P' :
if (!priority_taxids_cleared) {
opts.priority_taxids.clear();
priority_taxids_cleared = true;
}
SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids));
break;
case 'X' :
opts.input_is_protein = true;
break;
Expand Down Expand Up @@ -326,6 +362,7 @@ void usage(int exit_code) {
<< " -M INT Set maximum capacity of hash table (MiniKraken)\n"
<< " -S BITSTRING Spaced seed mask\n"
<< " -T BITSTRING Minimizer toggle mask\n"
<< " -A INT(:INT) Priority taxids that don't get LCA'd\n"
<< " -X Input seqs. are proteins\n"
<< " -p INT Number of threads\n"
<< " -B INT Read block size" << endl;
Expand All @@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid,
CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
uint64_t min_clear_hash_value)
{
bool seq_is_priority = tax.priority_taxids.count(taxid);
scanner.LoadSequence(seq);
uint64_t *minimizer_ptr;
bool ambig_flag;
Expand All @@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid,
continue;
hvalue_t existing_taxid = 0;
hvalue_t new_taxid = taxid;
while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
if (seq_is_priority) {
hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) ||
hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid);
}
else {
while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
if (! tax.priority_taxids.count(existing_taxid)) {
new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
} else {
new_taxid = existing_taxid;
}
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/taxonomy.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class Taxonomy {
return 0;
return external_to_internal_id_map_[external_id];
}
std::set<uint64_t> priority_taxids;

private:
void Init(const char *filename, bool memory_mapping);
Expand Down