Skip to content

Commit

Permalink
Merge pull request #52 from JensUweUlrich/bug_51
Browse files Browse the repository at this point in the history
Bug 51
  • Loading branch information
JensUweUlrich authored Sep 8, 2022
2 parents fd5b57f + 9a4780e commit 5ad5603
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 51 deletions.
4 changes: 2 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ set(CPACK_NSIS_EXECUTION_LEVEL "user")
set(CPACK_PACKAGE_NAME "ReadBouncer")
set(CPACK_PACKAGE_VENDOR "Jens-Uwe Ulrich")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "ReadBouncer - Installer")
set(CPACK_PACKAGE_VERSION "1.2.0")
set(CPACK_PACKAGE_VERSION "1.2.1")
set(CPACK_PACKAGE_VERSION_MAJOR "1")
set(CPACK_PACKAGE_VERSION_MINOR "2")
set(CPACK_PACKAGE_VERSION_PATCH "0")
set(CPACK_PACKAGE_VERSION_PATCH "1")
set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE.txt")
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${PROJECT_NAME})

Expand Down
9 changes: 5 additions & 4 deletions src/IBF/IBF.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ namespace interleave

class IBF
{
friend class IBFTest;
//friend class IBFTest;

private:
TIbf filter{};
Expand Down Expand Up @@ -142,6 +142,7 @@ namespace interleave
return filter;
}

/*
std::future< void > test_read_task;// for gtest
std::string cutOutNNNsTest; // for gtest
TIbf filter_{}; // for gtest
Expand All @@ -153,7 +154,7 @@ namespace interleave
int64_t fragstart;
uint64_t fragend;
} test_func1;

*/

};

Expand Down Expand Up @@ -211,7 +212,7 @@ namespace interleave
int classify(std::vector< IBFMeta >& filters, ClassifyConfig& config);
std::pair<int, int> classify(std::vector< IBFMeta >& filt1, std::vector< IBFMeta >& filt2, ClassifyConfig& config);

// gtest
/* // gtest
bool select_matches_test(std::vector< uint16_t >& selectedBins,
std::vector< uint16_t >& selectedBinsRev,
TIbf& filter,
Expand All @@ -221,7 +222,7 @@ namespace interleave
uint64_t max_matches_test(std::vector< uint16_t >& selectedBins, std::vector< uint16_t >& selectedBinsRev,
TIbf& filter, uint16_t threshold);
bool find_matches_test(std::vector<interleave::TIbf> &filters, interleave::ClassifyConfig &config);

*/
};

typedef std::vector<Read> TReads;
Expand Down
16 changes: 9 additions & 7 deletions src/IBF/IBFBuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ namespace interleave
buf << seq;
}
std::string newseq = buf.str();
interleave::IBF::cutOutNNNsTest = buf.str();// for getest
//interleave::IBF::cutOutNNNsTest = buf.str();// for getest
queue_refs.push(Seqs{ seqid, ((seqan::Dna5String) newseq) });
// calculate bins needed for that sequence
stats.totalBinsBinId += ( newseq.length() / config.fragment_length) + 1;
Expand Down Expand Up @@ -152,7 +152,7 @@ namespace interleave
this->ibf_logger->flush();
for ( uint16_t taskNo = 0; taskNo < config.threads_build; ++taskNo )
{
interleave::IBF::test_func1.threads_build = config.threads_build;// g-test
//interleave::IBF::test_func1.threads_build = config.threads_build;// g-test
tasks.emplace_back( std::async( std::launch::async, [=, &queue_refs, &binid] {
std::shared_ptr<spdlog::logger> logger = spdlog::get("IbfLog");
while ( true )
Expand Down Expand Up @@ -200,10 +200,10 @@ namespace interleave
}
fragIdx++;
fragstart = fragIdx * config.fragment_length - config.kmer_size + 1;
interleave::IBF::test_func1.fragend = fragend;// g-test
//interleave::IBF::test_func1.fragend = fragend;// g-test
}
interleave::IBF::test_func1.fragIdx = fragIdx; // g-test
interleave::IBF::test_func1.fragstart = fragstart;// g-test
//interleave::IBF::test_func1.fragIdx = fragIdx; // g-test
//interleave::IBF::test_func1.fragstart = fragstart;// g-test
}
else
{
Expand Down Expand Up @@ -440,7 +440,8 @@ namespace interleave
try{
stats.timeLoadSeq.start();
std::future< void > read_task = parse_ref_seqs(queue_refs, config, stats);
interleave::IBF::test_read_task = parse_ref_seqs(queue_refs, config, stats); // for gtest
// DON't call test functions in production code
//interleave::IBF::test_read_task = parse_ref_seqs(queue_refs, config, stats); // for gtest
read_task.get();
stats.timeLoadSeq.stop();
}
Expand All @@ -463,7 +464,8 @@ namespace interleave
{
this->filter = TIbf(stats.totalBinsBinId, config.hash_functions, config.kmer_size, config.filter_size_bits);
stats.totalBinsFile = seqan::getNumberOfBins(this->filter);
this->filter_ = TIbf(stats.totalBinsBinId, config.hash_functions, config.kmer_size, config.filter_size_bits); // for gtest
// DON'T create test objects for production use => this doubles the size of RAM used
//this->filter_ = TIbf(stats.totalBinsBinId, config.hash_functions, config.kmer_size, config.filter_size_bits); // for gtest
}
catch (seqan::Exception const& e)
{
Expand Down
4 changes: 2 additions & 2 deletions src/IBF/IBFClassify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ namespace interleave
return result;
}


/*
//gtest
bool Read::select_matches_test(std::vector< uint16_t >& selectedBins,
Expand Down Expand Up @@ -395,5 +395,5 @@ uint64_t Read::max_matches_test(std::vector< uint16_t >& selectedBins, std::vec
}

*/
} // end namespace interleave
78 changes: 43 additions & 35 deletions src/main/classify.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,12 @@ std::filesystem::path writeUnclassifed (ConfigReader config){
* @param id read's id
* @param seq read's sequence
* @param UnclassifiedOut output file of unclassifed reads
* @return true if read was classified, false otherwise
*/

void classify_deplete_target(std::vector<interleave::IBFMeta>& DepletionFilters, std::vector<interleave::IBFMeta>& TargetFilters,
interleave::ClassifyConfig Conf, interleave::Read r, int best_filter_index, bool classified,
std::vector< std::ofstream>& targetFastas, seqan::CharString id, seqan::CharString seq, seqan::SeqFileOut &UnclassifiedOut){
bool classify_deplete_target(std::vector<interleave::IBFMeta>& DepletionFilters, std::vector<interleave::IBFMeta>& TargetFilters,
interleave::ClassifyConfig& Conf, interleave::Read& r, int best_filter_index,
std::vector< std::ofstream>& targetFastas, seqan::CharString& id, seqan::CharString& seq, seqan::SeqFileOut &UnclassifiedOut){

std::pair<uint64_t, uint64_t> p = r.classify(TargetFilters, DepletionFilters, Conf);
if (p.first > 0)
Expand All @@ -68,48 +69,44 @@ void classify_deplete_target(std::vector<interleave::IBFMeta>& DepletionFilters,
Conf.error_rate += 0.02;

if (p.first > 0 && p.second > 0)
{} // do nothing
{
return false;
}
else if (p.first > 0)
{
classified = true;
best_filter_index = r.classify(TargetFilters, Conf);
if (best_filter_index != -1)
{
TargetFilters[best_filter_index].classified += 1;

//seqan::writeRecord(outfiles[best_filter_index], id, (seqan::Dna5String) seq);
targetFastas[best_filter_index] << ">" << id << std::endl;
targetFastas[best_filter_index] << seq << std::endl;

return true;
}
}
else
{
classified = false;
seqan::writeRecord(UnclassifiedOut, id, (seqan::Dna5String)seq);
return;
return false;
}
}
else
{
classified = true;
best_filter_index = r.classify(TargetFilters, Conf);
if (best_filter_index != -1)
{
TargetFilters[best_filter_index].classified += 1;
//seqan::writeRecord(outfiles[best_filter_index], id, seq);

targetFastas[best_filter_index] << ">" << id << std::endl;
targetFastas[best_filter_index] << seq << std::endl;
return true;
}
}
}
else
{
classified = false;
seqan::writeRecord(UnclassifiedOut, id, (seqan::Dna5String)seq);
return false;
}


return false;

}

Expand Down Expand Up @@ -142,7 +139,7 @@ struct ClassificationResults{
* @parser : toml input parameters
*/

void classify_reads(ConfigReader config, std::vector<interleave::IBFMeta> DepletionFilters, std::vector<interleave::IBFMeta> TargetFilters)
void classify_reads(ConfigReader& config, std::vector<interleave::IBFMeta> DepletionFilters, std::vector<interleave::IBFMeta> TargetFilters)
{
std::shared_ptr<spdlog::logger> readbouncer_logger = spdlog::get("ReadBouncerLog");
// create classification config
Expand Down Expand Up @@ -185,38 +182,50 @@ void classify_reads(ConfigReader config, std::vector<interleave::IBFMeta> Deplet
StopClock::TimePoint begin = StopClock::Clock::now();


seqan::SeqFileIn seqFileIn;
if (!seqan::open(seqFileIn, seqan::toCString(read_file.string())))
{
std::cerr << "ERROR: Unable to open the file: " << read_file.string() << std::endl;
return;
}


// initialize classification output files
seqan::SeqFileOut UnclassifiedOut;

// only print classified reads to file if option given via command line
std::vector< std::ofstream> targetFastas{};
/*std::vector< std::ofstream> targetFastas{};
for (interleave::IBFMeta f : TargetFilters)
{
std::ofstream outf = writeClassifed(config, f);
targetFastas.emplace_back(std::move(outf));
}
*/

std::vector< std::ofstream> targetFastas{};
for (interleave::IBFMeta f : TargetFilters)
{
std::filesystem::path outfile(config.output_dir);
outfile /= f.name + ".fasta";
std::ofstream outf;
outf.open(outfile, std::ios::out);

std::filesystem::path outfile = writeUnclassifed(config);
targetFastas.emplace_back(std::move(outf));
}

std::filesystem::path outfile(config.output_dir);
outfile /= "unclassified.fasta";
if (!seqan::open(UnclassifiedOut, seqan::toCString(outfile.string())))
{
std::cerr << "ERROR: Unable to open the file: " << outfile.string() << std::endl;
return;
}

seqan::SeqFileIn seqFileIn;
if (!seqan::open(seqFileIn, seqan::toCString(read_file.string())))
{
std::cerr << "ERROR: Unable to open the file: " << read_file.string() << std::endl;
return;
}

std::cout << '\n';
std::cout << "Classification results of: " << read_file.string() << '\n';
std::cout << '\n';


while (!seqan::atEnd(seqFileIn))
{
interleave::Read r;
Expand Down Expand Up @@ -256,25 +265,26 @@ void classify_reads(ConfigReader config, std::vector<interleave::IBFMeta> Deplet
uint64_t fragstart = fragment_start(config.IBF_Parsed.chunk_length, i);

// make sure that last fragment ends at last position of the reference sequence
if (fragend > length(seq)) fragend = length(seq);
if (fragend > length(seq))
fragend = length(seq);

seqan::Infix< seqan::CharString >::Type fragment = seqan::infix(seq, fragstart, fragend);
seqan::Dna5String fr = (seqan::Dna5String) fragment;
r = interleave::Read(id, fr);

if (deplete && target)
{
classify_deplete_target(DepletionFilters, TargetFilters, Conf, r, best_filter_index,classified,targetFastas, id, seq, UnclassifiedOut);
classified = classify_deplete_target(DepletionFilters, TargetFilters, Conf, r, best_filter_index,targetFastas, id, seq, UnclassifiedOut);

}
else if (deplete)
classified = r.classify(DepletionFilters, Conf) > -1;
else
{
classified = true;
best_filter_index = r.classify(TargetFilters, Conf);
if (best_filter_index != -1)
{
classified = true;
TargetFilters[best_filter_index].classified += 1;
targetFastas[best_filter_index] << ">" << id << std::endl;
targetFastas[best_filter_index] << seq << std::endl;
Expand All @@ -285,13 +295,11 @@ void classify_reads(ConfigReader config, std::vector<interleave::IBFMeta> Deplet
found++;
break;
}
/*
else if (parser.unclassified_file.length() > 0)
{
seqan::writeRecord(UnclassifiedOut, id, seq);
}*/
i++;
}
if (!classified)
seqan::writeRecord(UnclassifiedOut, id, (seqan::Dna5String)seq);

classifyRead.stop();

}
Expand Down
5 changes: 4 additions & 1 deletion src/main/ibfbuild.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ interleave::TIbf buildIBF(ConfigReader config_reader, const std::string referenc
* @return vector of loaded/constructed IBF's
*/

std::vector<interleave::IBFMeta> getIBF (ConfigReader config, bool depleteFilter, bool targetFilter){
std::vector<interleave::IBFMeta> getIBF (ConfigReader config, bool depleteFilter, bool targetFilter)
{

std::shared_ptr<spdlog::logger> readbouncer_logger = spdlog::get("ReadBouncerLog");

std::vector<interleave::IBFMeta> DepletionFilters{};
std::vector<interleave::IBFMeta> TargetFilters{};
Expand Down
Loading

0 comments on commit 5ad5603

Please sign in to comment.