update help messages
Jens-Uwe Ulrich committed May 23, 2023
1 parent 2f5cbdb commit 3a6f313
Showing 4 changed files with 13 additions and 289 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -35,6 +35,8 @@
.settings/
.cproject
.project
.autotools
.vscode/
src/.settings/
src/.cproject
src/.project
188 changes: 1 addition & 187 deletions src/main/main.cpp
@@ -34,8 +34,6 @@

using namespace seqan3::literals;

//std::string gtdb_root = "D:\\gtdb_genomes_reps_r202";

double cputime(void)
{
struct rusage r;
@@ -55,7 +53,7 @@ int main(int argc, char const **argv)

seqan3::argument_parser top_level_parser{"taxor", argc, argv, seqan3::update_notifications::off,
{"build", "search", "profile"}};
top_level_parser.info.version = "1.0.0";
top_level_parser.info.version = "0.1.0";

try
{
@@ -87,189 +85,5 @@ int main(int argc, char const **argv)

return error_code;

// Debug printing of the bulk count method to find the error when building the IXF
/*using sequence_file_t = seqan3::sequence_file_input<hixf::dna4_traits, seqan3::fields<seqan3::field::seq>>;
hixf::build_arguments args{};
std::vector<std::vector<size_t>> bins{};
seqan3::interleaved_xor_filter<> ixf{64, 7000000};
bool success = false;
while (!success)
{
size_t bin_idx{0};
for (int64_t pos = 1160; pos < 1170;++pos)
{
std::vector<size_t> hashes{};
hixf::read_from_temp_hash_file(pos, hashes);
if (hashes.empty())
continue;
//bins.emplace_back(hashes);
success = ixf.add_bin_elements(bin_idx, hashes);
std::cout << bin_idx << "\t" << hashes.size() << std::endl;
bin_idx++;
if (!success)
break;
}
if (!success)
{
ixf.clear();
ixf.set_seed();
std::cout << "Reset seed after bin " << bin_idx << std::endl;
}
}
robin_hood::unordered_flat_set<size_t> read_hashes{};
for (auto && [seq] : sequence_file_t{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/files.renamed/GCF_000839085.1_genomic.fna.gz"})
for (auto hash :
seq
| seqan3::views::minimiser_hash(args.shape,
seqan3::window_size{args.window_size},
seqan3::seed{hixf::adjust_seed(args.shape.count())}))
read_hashes.insert(hash);
std::vector<size_t> c{};
std::ranges::copy(read_hashes, std::back_inserter(c));
typedef seqan3::interleaved_xor_filter<>::counting_agent_type<uint64_t> TIXFAgent;
TIXFAgent ixf_count_agent = ixf.counting_agent<uint64_t>();
auto result = ixf_count_agent.bulk_count(c);
seqan3::debug_stream << "Root result: " << result << "\n";
*/


//create_layout();

/*hixf::search_arguments search_args{};
search_args.index_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/raptor_kmer.hixf"};
search_args.query_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/files.renamed/GCF_000839085.1_genomic.fna.gz"};
search_args.out_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/raptor.hixf_search.out"};
search_args.compute_syncmer = false;
search_args.threshold = 0.2;
hixf::search_hixf(search_args);
*/

return 1;

/*
// multi_interleaved_xor_filter mixf = load_multi_ixf_index("GTDB_r202.ixf");
// std::cout << "Filter loaded successfully" << std::endl;
//std::vector<std::vector<Species*>> bins2vec(mixf.count_single_filter());
//bins_to_species(mixf, bins2vec);
double ratio_sum = 0.0;
uint64_t nr_reads = 0;
std::vector<double> sensitivity(mixf.species_vector.size());
std::fill(sensitivity.begin(), sensitivity.end(), 0.0);
std::vector<uint64_t> classified(mixf.species_vector.size());
std::fill(classified.begin(), classified.end(), 0);
int count = 0;
for ( const auto& [seq, header, qual] : fin)
{
if (seq.size() < 1000)
continue;
// rhv holds hash vectors for forward and reverse complement
std::vector<size_t> rhv = hashing::seq_to_syncmers(kmer_size, seq, sync_size, t_syncmer);
uint64_t strobe_nr = rhv.size();
// TInterval ci = calculateCI(0.1, kmer_size, strobe_nr, 0.95);
// uint64_t threshold = strobe_nr - ci.second;
std::vector<uint64_t> max_found(mixf.species_vector.size());
std::fill(max_found.begin(), max_found.end(), 0);
// std::cout << rhv[0].first.size() << "\t" << seq.size() << std::endl;
std::vector<TIXFAgent::counting_vector> count_vectors = mixf.bulk_count(rhv);
std::vector<uint64_t> result(mixf.species_vector.size());
for (uint64_t i = 0; i < result.size(); ++i)
{
result[i] = 0;
if (mixf.species_vector[i].filter_index == UINT16_MAX)
continue;
for (uint64_t spec_bin = mixf.species_vector[i].first_bin; spec_bin <= mixf.species_vector[i].last_bin; ++spec_bin)
{
result[i] += count_vectors[mixf.species_vector[i].filter_index][spec_bin];
if (i == 355)
{
std::cout << spec_bin << "\t" << count_vectors[mixf.species_vector[i].filter_index][spec_bin] << "\t" << strobe_nr << std::endl;
}
}
}
// find maximum matches between forward and reverse complement matches
for (int i = 0; i < max_found.size(); ++i)
{
if (result[i] > max_found[i])
max_found[i] = result[i];
}
//break;
double max_ratio = 0.0;
std::vector<std::pair<uint64_t, double>> potential_indexes{};
for (int i = 0; i < max_found.size(); ++i)
{
double ratio = (double) max_found[i] / (double) strobe_nr;
sensitivity[i] += ratio;
// Why does 0.05 work so well?
if ( ratio > 0.1)
{
if (ratio > max_ratio)
max_ratio = ratio;
potential_indexes.emplace_back(std::make_pair(i, ratio));
}
}
int spec_count = 0;
for (const auto & p : potential_indexes)
{
if (p.second >= max_ratio)
{
classified[p.first] += 1;
// if (++spec_count > 1)
// std::cout << header << std::endl;
}
}
nr_reads++;
// if (nr_reads == 10)
// break;
}
for (int i=0; i < sensitivity.size(); ++i)
{
sensitivity[i] /= (double) nr_reads;
}
for (uint64_t i = 0; i < mixf.species_vector.size(); ++i)
{
if (classified[i] > 1)
{
std::cout << i << "\t" << mixf.species_vector[i].name << "\t" << classified[i] << "\t" << sensitivity[i] << std::endl;
}
}
// seqan3::debug_stream << sensitivity << "\n";
// seqan3::debug_stream << classified << "\n";
seqan3::debug_stream << nr_reads << "\n";
return 0;
*/
}
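
The debug code removed above exercises the interleaved XOR filter (IXF) build-and-retry pattern: bins are filled with add_bin_elements, and if construction fails the filter is cleared, reseeded, and rebuilt before being queried through a counting agent. The sketch below condenses that pattern. It is not part of the commit; it assumes the project's seqan3::interleaved_xor_filter extension with the interface used in the removed code (add_bin_elements, clear, set_seed, counting_agent), and the include for that filter plus the bin contents are placeholders.

```cpp
// Condensed sketch of the IXF build-and-retry pattern from the removed debug code.
// Assumes the project's seqan3::interleaved_xor_filter with the interface used above;
// its include is omitted here because the header path is not shown in this diff.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

void build_and_count_example()
{
    size_t const bins{8};
    size_t const elements_per_bin{1000};
    seqan3::interleaved_xor_filter<> ixf(bins, elements_per_bin);

    std::vector<size_t> query_keys{};
    bool success{false};
    while (!success) // XOR filter construction can fail; retry with a new seed
    {
        for (size_t bin = 0; bin < bins; ++bin)
        {
            std::vector<size_t> keys{};
            for (size_t i = 0; i < elements_per_bin; ++i)
                keys.push_back(bin * elements_per_bin + i); // placeholder hash values
            success = ixf.add_bin_elements(bin, keys);
            if (!success)
            {
                ixf.clear();    // discard the partially built filter
                ixf.set_seed(); // pick a new seed and start over
                break;
            }
            if (bin == 2)
                query_keys = std::move(keys); // keep one bin's keys for querying
        }
    }

    // Count how many of the kept keys hit each bin; bin 2 should match all of them.
    auto counting_agent = ixf.counting_agent<uint64_t>();
    auto counts = counting_agent.bulk_count(query_keys);
    std::cout << "hits in bin 2: " << counts[2] << " of " << query_keys.size() << '\n';
}
```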

104 changes: 6 additions & 98 deletions src/main/taxor_build.cpp
@@ -41,26 +41,17 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con
parser.info.version = "0.1.0";
parser.info.author = "Jens-Uwe Ulrich";
parser.info.email = "[email protected]";
parser.info.short_description = "Creates and HIXF index of a given set of fasta files";
parser.info.short_description = "Creates an HIXF index of a given set of fasta files";

parser.info.description.emplace_back("Creates an HIXF index using either kmers, syncmers or FracMinHash sketches");
parser.info.description.emplace_back("Creates an HIXF index using either k-mers or syncmers");

parser.add_subsection("Main options:");
// -----------------------------------------------------------------------------------------------------------------
parser.add_option(config.input_file_name,
'\0', "input-file", "",
/* "Provide the prefix you used for the output prefix in the chopper count --output-prefix option. "
"If you have different means of estimating the k-mer counts of your input data, make sure that a "
"file [INPUT-PREFIX].count exists. It needs to be tab-separated and consist of two columns: "
"\"[filepath] [tab] [weight/count]\".",*/
'\0', "input-file", "tab-separated-value file containing taxonomy information and reference file names",
seqan3::option_spec::required);
parser.add_list_item("", "Example file:");
parser.add_list_item("", "```");
parser.add_list_item("", "GCF_000839185.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/839/185/GCF_000839185.1_ViralProj14174 Cowpox virus k__Viruses;p__Nucleocytoviricota;c__Pokkesviricetes;o__Chitovirales;f__Poxviridae;g__Orthopoxvirus;s__Cowpox virus");
parser.add_list_item("", "GCF_000860085.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/860/085/GCF_000860085.1_ViralProj15241 Vaccinia virus k__Viruses;p__Nucleocytoviricota;c__Pokkesviricetes;o__Chitovirales;f__Poxviridae;g__Orthopoxvirus;s__Vaccinia virus");
parser.add_list_item("", "```");

parser.add_option(config.input_sequence_folder, '\0', "input-sequence_dir", "directory containing the fasta reference files");
parser.add_option(config.input_sequence_folder, '\0', "input-sequence-dir", "directory containing the fasta reference files");

parser.add_option(config.output_file_name, '\0', "output-filename", "A file name for the resulting index.");

@@ -74,18 +65,15 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con

parser.add_option(config.threads,
'\0', "threads",
"The number of threads to use. Currently, only merging of sketches is parallelized, so if option "
"--rearrange-user-bins is not set, --threads will have no effect.",
"The number of threads to use.",
seqan3::option_spec::standard,
seqan3::arithmetic_range_validator{static_cast<size_t>(1), static_cast<size_t>(32)});

parser.add_flag(config.use_syncmer,'\0', "use-syncmer", "enable using syncmers for smaller index size");

parser.add_flag(config.output_verbose_statistics,
'\0', "output-verbose-statistics",
"Enable verbose statistics to be "
"printed to std::cout. If the flag --determine-best-tmax is not set, this flag is ignored "
"and has no effect.",
"Enable verbose statistics to be printed",
seqan3::option_spec::hidden);

parser.add_flag(config.debug,
@@ -369,86 +357,6 @@ void build_hixf(taxor::build::configuration const config,
store_index(args.out_path, index);
}

uint64_t construct_and_query_ixf(size_t bins, size_t elements_per_bin)
{


//ulrich::interleaved_xor_filter<> ixf(elems);
seqan3::interleaved_xor_filter<> ixf(bins, elements_per_bin);
std::vector<size_t> elems{};
while (true)
{
bool success = true;
for (int e = 0; e < bins ; ++e)
{

//if (e % 7 == 0 || e % 15 == 0)
// continue;

std::vector<size_t> tmp{};
for (size_t i = 0; i < elements_per_bin; ++i)
{
size_t key = (e*elements_per_bin) + i;
tmp.emplace_back(key);
}
success = ixf.add_bin_elements(e, tmp);
if (!success)
{
ixf.clear();
ixf.set_seed();
std::cout << e << std::endl;
break;
}
if (e == 2)
elems=std::move(tmp);
}
if (success)
break;
}

//std::cout << "Built IXF in " << ixf_construct.elapsed() << " seconds" << std::endl;

std::cout << ixf.bin_count() << std::endl;
std::cout << (double) ixf.bit_size() / (double) ixf.bin_count() << std::endl;

typedef seqan3::interleaved_xor_filter<>::counting_agent_type< uint64_t > TIXFAgent;

auto agent = ixf.membership_agent();
auto & result = agent.bulk_contains(elems[0]);
seqan3::debug_stream << elems[0] << "\t" << result << '\n';
TIXFAgent ixf_count_agent = ixf.counting_agent< uint64_t >();
//auto result = count_agent.bulk_count(readHs);
auto ixf_result = ixf_count_agent.bulk_count(elems);

seqan3::debug_stream << ixf_result << "\n";
/* double fpr {0.0};
for (int i =0; i < ixf_result.size(); ++i)
{
if (i == 2)
{
std::cout << ixf_result[i] << "/" << elems.size() << std::endl;
return ixf_result[i];
continue;
}
fpr += (double) ixf_result[i] / (double) elements_per_bin;
}
fpr /= (double) (bins - 1);
std::cout << "FPR of the IXF: " << fpr << std::endl;
double mbytes = (double) ixf.bit_size() / (double) 8388608;
std::cout << "Size of the IXF: " << mbytes << " MBytes" << std::endl;
std::cout << "Queried " << elems.size() <<" keys in IXF in " << ixf_query.elapsed() << " seconds" << std::endl;
StopClock ixf_store{};
ixf_store.start();
std::string filter_file = "test.ixf";
std::ofstream os( filter_file, std::ios::binary );
cereal::BinaryOutputArchive archive( os );
archive( ixf );
ixf_store.stop();
std::cout << "Stored IXF in " << ixf_store.elapsed() << " seconds" << std::endl;
*/
}

int execute(seqan3::argument_parser & parser)
{
taxor::build::configuration config;
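
Not part of the commit: the updated --input-file help text above describes a tab-separated file, and its example lines carry an accession, a reference URL/path, an organism name, and a taxonomy string. The helper below is a hypothetical illustration of reading that layout; the struct and field names are made up, and the four-column interpretation of the example lines is an assumption, not taxor's actual parser.

```cpp
// Hypothetical reader for the --input-file layout described by the new help text:
// one reference per line with tab-separated fields
//   <accession> \t <reference path/URL> \t <organism name> \t <taxonomy string>
// The record type and column semantics are assumptions for this sketch.
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

struct input_record
{
    std::string accession;      // e.g. "GCF_000839185.1"
    std::string reference_path; // e.g. FTP path to the assembly
    std::string organism_name;  // e.g. "Cowpox virus"
    std::string taxonomy;       // e.g. "k__Viruses;...;s__Cowpox virus"
};

std::vector<input_record> read_input_file(std::string const & path)
{
    std::vector<input_record> records{};
    std::ifstream in{path};
    std::string line{};
    while (std::getline(in, line))
    {
        if (line.empty())
            continue;
        std::istringstream fields{line};
        input_record rec{};
        std::getline(fields, rec.accession, '\t');
        std::getline(fields, rec.reference_path, '\t');
        std::getline(fields, rec.organism_name, '\t');
        std::getline(fields, rec.taxonomy, '\t');
        records.push_back(rec);
    }
    return records;
}
```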
8 changes: 4 additions & 4 deletions src/main/taxor_profile.cpp
@@ -29,14 +29,14 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::profile::c
seqan3::option_spec::required);

parser.add_option(config.report_file, '\0', "cami-report-file",
"output file reporting genomic abundances in CAMI profiling format"
"This is the relative genome abundance in terms of the genome copy number for the respective TAXID in the overall sample."
"output file reporting genomic abundances in CAMI profiling format. "
"This is the relative genome abundance in terms of the genome copy number for the respective TAXID in the overall sample. "
"Note that this is not identical to the relative abundance in terms of assigned base pairs.",
seqan3::option_spec::required);

parser.add_option(config.sequence_abundance_file, '\0', "seq-abundance-file",
"output file reporting sequence abundance in CAMI profiling format (including unclassified reads)"
"This is the relative sequence abundance in terms of sequenced base pairs for the respective TAXID in the overall sample."
"output file reporting sequence abundance in CAMI profiling format (including unclassified reads). "
"This is the relative sequence abundance in terms of sequenced base pairs for the respective TAXID in the overall sample. "
"Note that this is not identical to the genomic abundance in terms of genome copy number for the respective TAXID.");

parser.add_option(config.binning_file, '\0', "binning-file",
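
Not part of the commit: the reworded help text above distinguishes genomic abundance (genome copy number) from sequence abundance (assigned base pairs). The sketch below shows one common way to turn per-taxon assigned base pairs and genome lengths into those two relative abundances; it only illustrates the arithmetic behind the wording, is not taxor's actual estimator, and all names are made up. It also ignores unclassified reads, which the seq-abundance report above explicitly includes.

```cpp
// Hypothetical illustration of the two abundance notions in the help text:
// sequence abundance weights each taxon by assigned base pairs, while genomic
// abundance first normalises by genome length (a proxy for genome copy number).
#include <cstdint>
#include <vector>

struct taxon_stats
{
    uint64_t assigned_bases; // base pairs of reads assigned to this taxon
    uint64_t genome_length;  // reference genome length in base pairs
};

void relative_abundances(std::vector<taxon_stats> const & taxa,
                         std::vector<double> & seq_abundance,
                         std::vector<double> & genomic_abundance)
{
    double total_bases{0.0};
    double total_copies{0.0};
    for (auto const & t : taxa)
    {
        total_bases += static_cast<double>(t.assigned_bases);
        total_copies += static_cast<double>(t.assigned_bases) / static_cast<double>(t.genome_length);
    }
    for (auto const & t : taxa)
    {
        double const copies = static_cast<double>(t.assigned_bases) / static_cast<double>(t.genome_length);
        seq_abundance.push_back(static_cast<double>(t.assigned_bases) / total_bases);
        genomic_abundance.push_back(copies / total_copies);
    }
}
```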
