Skip to content

Commit

Permalink
Add population support to GRG samples
Browse files Browse the repository at this point in the history
* Add new argument to "grg construct", add unit tests
* "grg construct --population-ids <file:field>" now attaches populations
to the data.
* Add population ID map support to grgl executable
* Set populationID on the sample nodes based on the individual IDs.
* Unit tests for the population ID handling in GRG construction and the
associated helper functions.
* Add individual ID support to MutationIterator
* Add individual filtering to gconverter
  • Loading branch information
dcdehaas committed Jul 22, 2024
1 parent cd282eb commit 6973169
Show file tree
Hide file tree
Showing 12 changed files with 354 additions and 18 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@ endif()

set(GRGL_TEST_SOURCES
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_common.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_construct.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_bloom_filter.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_grg.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_main.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_mutation.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_serialize.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_util.cpp
${CMAKE_CURRENT_LIST_DIR}/test/unit/test_visitor.cpp
)

Expand Down Expand Up @@ -201,6 +203,6 @@ if(ENABLE_TESTS)

# Now simply link against gtest or gtest_main as needed. Eg
add_executable(grgl_test ${GRGL_TEST_SOURCES})
target_link_libraries(grgl_test gtest_main tskit grgl)
target_link_libraries(grgl_test gtest_main tskit grgl z ${BGEN_LIBS})
add_test(NAME grgl_test COMMAND grgl_test)
endif()
7 changes: 5 additions & 2 deletions include/grgl/mut_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ class MutationIterator {

virtual void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) = 0;

virtual std::vector<std::string> getIndividualIds() = 0;

virtual size_t countMutations() const = 0;

bool next(MutationAndSamples& mutAndSamples, size_t& totalSamples);
Expand Down Expand Up @@ -114,6 +116,7 @@ class VCFMutationIterator : public MutationIterator {

void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override;
size_t countMutations() const override;
std::vector<std::string> getIndividualIds() override;

protected:
void buffer_next(size_t& totalSamples) override;
Expand All @@ -132,8 +135,8 @@ class IGDMutationIterator : public MutationIterator {
const char* filename, FloatRange genomeRange, bool binaryMutations, bool emitMissingData, bool flipRefMajor);

void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override;

size_t countMutations() const override;
std::vector<std::string> getIndividualIds() override;

protected:
void buffer_next(size_t& totalSamples) override;
Expand All @@ -157,8 +160,8 @@ class BGENMutationIterator : public MutationIterator {
~BGENMutationIterator() override;

void getMetadata(size_t& ploidy, size_t& numIndividuals, bool& isPhased) override;

size_t countMutations() const override;
std::vector<std::string> getIndividualIds() override;

protected:
void buffer_next(size_t& totalSamples) override;
Expand Down
21 changes: 13 additions & 8 deletions pygrgl/clicmd/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,23 @@ def add_options(subparser):
subparser.add_argument("--parts", "-p", type=int, default=8,
help="The number of parts to split the sequence into; defaults to 8")
subparser.add_argument("--jobs", "-j", type=int, default=1,
help="Number of jobs (threads/cores) to use. Defaults to 1.")
help="Number of jobs (threads/cores) to use. Defaults to 1.")
subparser.add_argument("--trees", "-t", type=int, default=1,
help="Number of trees to use during shape construction. Defaults to 1.")
help="Number of trees to use during shape construction. Defaults to 1.")
subparser.add_argument("--binary-muts", "-b", action="store_true",
help="Use binary mutations (don't track specific alternate alleles).")
help="Use binary mutations (don't track specific alternate alleles).")
subparser.add_argument("--no-file-cleanup", "-c", action="store_true",
help="Do not cleanup intermediate files (for debugging, e.g.).")
help="Do not cleanup intermediate files (for debugging, e.g.).")
subparser.add_argument("--no-maf-flip", action="store_true",
help="Do not switch the reference allele with the major allele")
help="Do not switch the reference allele with the major allele")
subparser.add_argument("--shape-lf-filter", "-f", type=float, default=10.0,
help="During shape construction ignore mutations with counts less than this."
"If value is <1.0 then it is treated as a frequency. Defaults to 10 (count).")
help="During shape construction ignore mutations with counts less than this."
"If value is <1.0 then it is treated as a frequency. Defaults to 10 (count).")
subparser.add_argument("--population-ids", default=None,
help="Format: \"filename:fieldname\". Read population ids from the given "
"tab-separate file, using the given fieldname.")
subparser.add_argument("--verbose", "-v", action="store_true",
help="Verbose output, including timing information.")
help="Verbose output, including timing information.")

grgl_exe = which("grgl")
grg_merge_exe = which("grg-merge")
Expand Down Expand Up @@ -56,6 +59,8 @@ def build_shape(range_triple, args, input_file):
command = [grgl_exe, input_file]
if args.no_maf_flip:
command.append("--no-maf-flip")
if args.population_ids:
command.extend(["--population-ids", args.population_ids])
command.extend(["--lf-filter", str(args.shape_lf_filter)])
command.extend(["-l", "-s", "-r", f"{base}:{base+pspans}",
"-o", out_filename_tree(input_file, part, tnum)])
Expand Down
37 changes: 36 additions & 1 deletion src/build_shape.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <chrono>
#include <deque>
#include <limits>
#include <stdexcept>
#include <unordered_map>

#include "grgl/grg.h"
Expand Down Expand Up @@ -232,7 +233,8 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile,
const bool useBinaryMuts,
const bool emitMissingData,
const bool flipRefMajor,
const double dropBelowThreshold) {
const double dropBelowThreshold,
const std::map<std::string, std::string>& indivIdToPop) {
MutableGRGPtr result;
NodeToHapVect hashIndex;
std::cout << "Building genotype hash index..." << std::endl;
Expand All @@ -252,6 +254,39 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile,

std::cout << "Done" << std::endl;
result = std::make_shared<MutableGRG>(hashIndex.size());
if (!indivIdToPop.empty()) {
size_t ploidy = 0;
size_t numIndividuals = 0;
bool isPhased = false;
mutationIterator->getMetadata(ploidy, numIndividuals, isPhased);
std::vector<std::string> indivIds = mutationIterator->getIndividualIds();
release_assert(indivIds.size() == numIndividuals);
std::map<std::string, size_t> popDescriptionMap;
for (NodeID individual = 0; individual < indivIds.size(); individual++) {
const auto& stringId = indivIds[individual];
const auto& findIt = indivIdToPop.find(stringId);
if (findIt == indivIdToPop.end()) {
std::stringstream ssErr;
ssErr << "Could not find population mapping for individual " << stringId;
throw std::runtime_error(ssErr.str());
}
const auto& popDescription = findIt->second;
const size_t nextPopId = popDescriptionMap.size();
const auto& findPopIt = popDescriptionMap.emplace(popDescription, nextPopId);
const auto popId = findPopIt.first->second;
if (findPopIt.second) {
release_assert(popId == nextPopId);
result->addPopulation(popDescription);
} else {
release_assert(popId != nextPopId);
}
for (NodeID offset = 0; offset < ploidy; offset++) {
const NodeID sampleId = (individual * ploidy) + offset;
release_assert(sampleId < result->numSamples());
result->getNodeData(sampleId).populationId = popId;
}
}
}
std::cout << "Adding GRG shape from genotype hashes..." << std::endl;
const bool buildBinaryTrees = true;
addGrgShapeFromHashing(result, hashIndex, result->getSampleNodes(), buildBinaryTrees);
Expand Down
3 changes: 2 additions & 1 deletion src/build_shape.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ MutableGRGPtr createEmptyGRGFromSamples(const std::string& sampleFile,
bool useBinaryMuts,
bool emitMissingData,
bool flipRefMajor,
double dropBelowThreshold);
double dropBelowThreshold,
const std::map<std::string, std::string>& indivIdToPop);

}

Expand Down
66 changes: 63 additions & 3 deletions src/gconverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,26 @@
#include "picovcf.hpp"
#include "util.h"

static grgl::NodeIDList trimIndividuals(grgl::NodeIDList sampleList,
const size_t ploidy,
const std::unordered_map<grgl::NodeID, grgl::NodeID>& keepIndividuals) {
if (!keepIndividuals.empty()) {
grgl::NodeIDList newList;
newList.reserve(keepIndividuals.size() * ploidy);
for (auto sampleId : sampleList) {
const auto extra = sampleId % ploidy;
const auto indivId = sampleId / ploidy;
const auto& findIt = keepIndividuals.find(indivId);
if (findIt != keepIndividuals.end()) {
grgl::NodeID newID = (findIt->second * ploidy) + extra;
newList.emplace_back(newID);
}
}
return std::move(newList);
}
return std::move(sampleList);
}

static void trim(grgl::NodeIDList& sampleList, size_t newNumSamples) {
if (0 != newNumSamples) {
size_t trimTo = sampleList.size();
Expand Down Expand Up @@ -90,6 +110,7 @@ static void mutationIteratorToIGD(const std::string& inFilename,
const double fpPerVariant = 0,
const double fnPerVariant = 0,
const size_t trimToSamples = 0,
std::string keepIndividualFile = "",
bool verbose = true) {
constexpr size_t EMIT_EVERY = 10000;

Expand All @@ -114,7 +135,39 @@ static void mutationIteratorToIGD(const std::string& inFilename,

const size_t numSamples = numIndividuals * ploidy;
release_assert(trimToSamples == 0 || (trimToSamples % ploidy == 0));
const size_t effectiveSamples = (0 == trimToSamples) ? numSamples : trimToSamples;
size_t effectiveSamples = (0 == trimToSamples) ? numSamples : trimToSamples;

std::unordered_map<grgl::NodeID, grgl::NodeID> keepIndividuals;
auto individualIds = iterator->getIndividualIds();
if (!keepIndividualFile.empty()) {
if (individualIds.empty()) {
throw grgl::BadInputFileFailure("Individual filtering requires the input file to contain individual IDs");
}
std::map<std::string, grgl::NodeID> idToNodeId;
for (size_t i = 0; i < individualIds.size(); i++) {
idToNodeId.emplace(individualIds[i], i);
}
std::vector<std::string> newIndividualIds;
std::ifstream filterFile(keepIndividualFile);
std::string indivId;
size_t count = 0;
while (std::getline(filterFile, indivId)) {
release_assert(!indivId.empty());
const auto& findIt = idToNodeId.find(indivId);
if (findIt == idToNodeId.end()) {
std::stringstream ssErr;
ssErr << "Could not find individual with id " << indivId;
throw grgl::BadInputFileFailure(ssErr.str().c_str());
}
keepIndividuals.emplace(findIt->second, count++);
newIndividualIds.push_back(indivId);
}
individualIds = std::move(newIndividualIds);
size_t newSamples = individualIds.size() * ploidy;
if (newSamples < effectiveSamples) {
effectiveSamples = newSamples;
}
}

std::ofstream outFile(outFilename, std::ios::binary);
picovcf::IGDWriter writer(ploidy, effectiveSamples / ploidy, isPhased);
Expand All @@ -134,6 +187,7 @@ static void mutationIteratorToIGD(const std::string& inFilename,
missingCount += mutAndSamples.samples.size();
}
trim(mutAndSamples.samples, trimToSamples);
mutAndSamples.samples = trimIndividuals(std::move(mutAndSamples.samples), ploidy, keepIndividuals);
if (fpPerVariant + fnPerVariant > 0) {
const double fp = fpPerVariant + fpLeftovers;
const size_t fpThisVariant = (size_t)fp;
Expand All @@ -156,7 +210,7 @@ static void mutationIteratorToIGD(const std::string& inFilename,
}
writer.writeIndex(outFile);
writer.writeVariantInfo(outFile);
writer.writeIndividualIds(outFile, {});
writer.writeIndividualIds(outFile, individualIds);
outFile.seekp(0);
writer.writeHeader(outFile, inFilename, "");
}
Expand All @@ -179,6 +233,11 @@ int main(int argc, char* argv[]) {
"genomeRange",
"Only convert for the given genome range: 'x:y' means [x, y) (x inclusive, y exclusive)",
{'r', "range"});
args::ValueFlag<std::string> keepIndivs(
parser,
"keepIndivs",
"Only retain the individuals with the IDs given in this file (one ID per line).",
{'i', "keep-indivs"});
try {
parser.ParseCLI(argc, argv);
} catch (args::Help&) {
Expand Down Expand Up @@ -231,6 +290,7 @@ int main(int argc, char* argv[]) {
restrictRange,
falseNegPerVariant ? *falseNegPerVariant : 0,
falsePosPerVariant ? *falsePosPerVariant : 0,
trimTo ? *trimTo : 0);
trimTo ? *trimTo : 0,
keepIndivs ? *keepIndivs : "");
return 0;
}
19 changes: 18 additions & 1 deletion src/grgl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ int main(int argc, char** argv) {
"ts-node-times",
"When converting tree-seq, use node times instead of mutation times",
{"ts-node-times"});
args::ValueFlag<std::string> populationIds(parser,
"population-ids",
"Format: \"filename:fieldname\". Read population ids from the given "
"tab-separate file, using the given fieldname.",
{"population-ids"});
try {
parser.ParseCLI(argc, argv);
} catch (args::Help&) {
Expand Down Expand Up @@ -155,6 +160,17 @@ int main(int argc, char** argv) {
restrictRange = grgl::FloatRange(gStart, gEnd);
}

std::map<std::string, std::string> indivIdToPop;
if (populationIds) {
std::vector<std::string> parts = split(*populationIds, ':');
if (parts.size() != 2) {
std::cerr << "Must specify \"filename:fieldname\" for --population-ids" << std::endl;
return 1;
}
indivIdToPop = loadMapFromTSV(parts[0], "sample", parts[1]);
}
std::cout << "loaded " << indivIdToPop.size() << " id->pops\n";

grgl::GRGPtr theGRG;
START_TIMING_OPERATION();
if (ends_with(*infile, ".trees")) {
Expand Down Expand Up @@ -185,7 +201,8 @@ int main(int argc, char** argv) {
binaryMutations,
missingDataHandling == MDH_ADD_TO_GRG,
!noMAFFlip,
lfFilter ? *lfFilter : 0.0);
lfFilter ? *lfFilter : 0.0,
indivIdToPop);
dumpStats(theGRG);
} else {
std::cerr << "Unsupported/undetected filetype for " << *infile << std::endl;
Expand Down
20 changes: 20 additions & 0 deletions src/mut_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ size_t VCFMutationIterator::countMutations() const {
return mutations;
}

std::vector<std::string> VCFMutationIterator::getIndividualIds() { return m_vcf->getIndividualLabels(); }

void VCFMutationIterator::buffer_next(size_t& totalSamples) {
bool foundMutations = false;
while (!foundMutations && m_vcf->hasNextVariant() && m_alreadyLoaded.empty()) {
Expand Down Expand Up @@ -265,6 +267,8 @@ size_t IGDMutationIterator::countMutations() const {
return mutations;
}

std::vector<std::string> IGDMutationIterator::getIndividualIds() { return m_igd->getIndividualIds(); }

void IGDMutationIterator::buffer_next(size_t& totalSamples) {
totalSamples = m_igd->numSamples();

Expand Down Expand Up @@ -398,6 +402,22 @@ size_t BGENMutationIterator::countMutations() const {
return mutations;
}

std::vector<std::string> BGENMutationIterator::getIndividualIds() {
if (bgen_file_contain_samples(m_file)) {
std::vector<std::string> result;
struct bgen_samples* samples = bgen_file_read_samples(m_file);
release_assert(samples != nullptr);
for (size_t i = 0; i < bgen_file_nsamples(m_file); i++) {
const bgen_string* bgStr = bgen_samples_get(samples, i);
release_assert(bgStr != nullptr);
result.emplace_back(bgen_string_data(bgStr));
}
bgen_samples_destroy(samples);
return std::move(result);
}
return {};
}

void BGENMutationIterator::buffer_next(size_t& totalSamples) {
bool foundMutations = false;
while (m_currentVariant < bgen_partition_nvariants(m_partition) && m_alreadyLoaded.empty()) {
Expand Down
Loading

0 comments on commit 6973169

Please sign in to comment.