Skip to content

Commit

Permalink
Order mutation IDs, down edges optional, gperf support (#13)
Browse files Browse the repository at this point in the history
Serialize mutations using getMutationsToNodeOrdered() which ensures
that the MutationID order matches the (position, allele) sort order
upon deserialization. If you have an "old" GRG that does not have
sorted MutationIDs, just reserialize it:
`./grgl old.grg -o new.grg`

Make down edges optional on deserialization, just like up edges are.

Compile-time support for gperf CPU profiling in the CMakeList. Just
download the appropriate release of gperf source, build it, and
then build grgl with -DENABLE_CPU_PROF=ON

A GRG can be checked for mutation ID ordering via
mutationsAreOrdered() (C++) and mutations_are_ordered (Python).
  • Loading branch information
dcdehaas committed Sep 9, 2024
1 parent 2d5c3e7 commit 3ad2a3f
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 22 deletions.
9 changes: 9 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ option(ENABLE_CHECKERS "Enable external tools like clang-tidy" OFF)
option(ENABLE_TESTS "Enable automated test execution" ON)
option(ENABLE_BGEN "Enable BGEN support" OFF)
option(ENABLE_GSL "Enable GSL support for stat calculations" OFF)
option(ENABLE_CPU_PROF "Enable gperftools CPU profiler" OFF)

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release")
Expand Down Expand Up @@ -112,6 +113,14 @@ if (${ENABLE_BGEN})
add_compile_options(-DBGEN_ENABLED=1)
endif()

# In order to use this, grab https://github.com/gperftools/gperftools/releases/download/gperftools-2.15/gperftools-2.15.tar.gz
# and unzip it to the top-level grgl repo directory, and do "./configure && make"
if (${ENABLE_CPU_PROF})
include_directories(${CMAKE_CURRENT_LIST_DIR}/gperftools-2.15/src/)
add_compile_options(-g) # Turn on symbols (but not debug mode)
set(GRGP_LIBS ${CMAKE_CURRENT_LIST_DIR}/gperftools-2.15/.libs/libprofiler.a unwind)
endif()

# We don't need AMPL bindings from gsl.
if (${ENABLE_GSL})
ExternalProject_Add(
Expand Down
44 changes: 40 additions & 4 deletions include/grgl/grg.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,36 @@ class GRG : public std::enable_shared_from_this<GRG> {
virtual NodeListIterator getUpEdges(NodeID nodeId) const = 0;
virtual NodeData& getNodeData(NodeID nodeId) = 0;

bool mutationsAreOrdered() const { return m_mutsAreOrdered; }

/**
* Get the base-pair range this GRG covers as a pair {first, last+1} where first is
* the base-pair position of the first mutation, and last is similarly defined.
* In general we treat all ranges as left-inclusive and right-exlusive, hence the
* +1 on the last.
*/
std::pair<BpPosition, BpPosition> getBPRange() const {
if (mutationsAreOrdered() && !m_mutations.empty()) {
const size_t lastMutId = m_mutations.size() - 1;
return {m_mutations[0].getPosition(), m_mutations[lastMutId].getPosition()};
}
BpPosition firstPos = std::numeric_limits<BpPosition>::max();
BpPosition lastPos = 0;
for (MutationId i = 0; i < numMutations(); i++) {
const auto position = m_mutations[i].getPosition();
if (position < firstPos) {
firstPos = position;
}
if (position > lastPos) {
lastPos = position;
}
}
return {firstPos, lastPos + 1};
}

/**
* Get the NodeIDList for all samples in the graph.
*/
NodeIDList getSampleNodes() const {
NodeIDList result;
for (grgl::NodeID i = 0; i < this->numSamples(); i++) {
Expand All @@ -102,6 +132,9 @@ class GRG : public std::enable_shared_from_this<GRG> {
return result;
}

/**
* Get the NodeIDList for all roots in the graph.
*/
NodeIDList getRootNodes() const {
NodeIDList result;
for (grgl::NodeID nodeId = 0; nodeId < this->numNodes(); nodeId++) {
Expand Down Expand Up @@ -215,10 +248,14 @@ class GRG : public std::enable_shared_from_this<GRG> {

const size_t m_numSamples;
const uint16_t m_ploidy;
// True if the mutationId order matches the (position, allele) order.
bool m_mutsAreOrdered;
};

using GRGPtr = std::shared_ptr<GRG>;
using ConstGRGPtr = std::shared_ptr<const GRG>;
using MutableGRGPtr = std::shared_ptr<MutableGRG>;
using ConstMutableGRGPtr = std::shared_ptr<const MutableGRG>;

class MutableGRG : public GRG {
public:
Expand Down Expand Up @@ -342,10 +379,9 @@ class MutableGRG : public GRG {
private:
// The list of nodes. The node's position in this vector must match its ID.
std::vector<GRGNodePtr> m_nodes;
};

using MutableGRGPtr = std::shared_ptr<MutableGRG>;
using ConstMutableGRGPtr = std::shared_ptr<const MutableGRG>;
friend MutableGRGPtr readMutableGrg(std::istream& inStream);
};

class CSRGRG : public GRG {
public:
Expand Down Expand Up @@ -403,7 +439,7 @@ class CSRGRG : public GRG {
NodeIDList m_upEdges;
std::vector<NodeData> m_nodeData;

friend GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges);
friend GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges, bool loadDownEdges);
};

using CSRGRGPtr = std::shared_ptr<CSRGRG>;
Expand Down
20 changes: 13 additions & 7 deletions include/grgl/mutation.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ constexpr EXTERNAL_ID NO_ORIGINAL_ID = -1;
class Mutation;

using MutationId = uint32_t;
constexpr MutationId INVALID_MUTATION_ID = std::numeric_limits<MutationId>::max();
constexpr MutationId MAX_MUTATION_ID = INVALID_MUTATION_ID - 1;

using BpPosition = uint64_t;
constexpr BpPosition INVALID_POSITION = std::numeric_limits<BpPosition>::max();
constexpr BpPosition MAX_POSITION = INVALID_POSITION - 1;

/**
* A mutation in the GRG.
Expand All @@ -49,12 +55,12 @@ class Mutation {

Mutation()
: m_alleleStorage({}),
m_position(std::numeric_limits<uint64_t>::max()),
m_position(INVALID_POSITION),
m_time(-1.0),
m_originalId(NO_ORIGINAL_ID),
m_refPosition(0) {}

Mutation(uint64_t position, std::string allele)
Mutation(BpPosition position, std::string allele)
: m_alleleStorage({}),
m_position(position),
m_time(-1.0),
Expand All @@ -66,7 +72,7 @@ class Mutation {
setAlleleValues(std::move(allele), "");
}

Mutation(uint64_t position,
Mutation(BpPosition position,
std::string allele,
const std::string& refAllele,
float time = -1.0,
Expand Down Expand Up @@ -134,9 +140,9 @@ class Mutation {
setAlleleValues(std::move(rhs.getAllele()), rhs.getRefAllele());
}

bool isEmpty() const { return m_position == std::numeric_limits<uint64_t>::max(); }
bool isEmpty() const { return m_position == INVALID_POSITION; }

uint64_t getPosition() const { return m_position; }
BpPosition getPosition() const { return m_position; }

float getTime() const { return m_time; }

Expand Down Expand Up @@ -220,7 +226,7 @@ class Mutation {
char _b[8];
} m_alleleStorage; // 8
static_assert(sizeof(m_alleleStorage) == 8, "Allele storage size changed");
uint64_t m_position; // 8
BpPosition m_position; // 8
float m_time; // 4
EXTERNAL_ID m_originalId; // 4
uint16_t m_refPosition; // 2
Expand Down Expand Up @@ -249,7 +255,7 @@ struct MutationLtPosAllele {
*/
template <> struct std::hash<grgl::Mutation> {
std::size_t operator()(const grgl::Mutation& mut) const noexcept {
std::size_t h1 = std::hash<uint64_t>{}(mut.getPosition());
std::size_t h1 = std::hash<grgl::BpPosition>{}(mut.getPosition());
std::size_t h2 = std::hash<std::string>{}(mut.getAllele());
return grgl::hash_combine(h1, h2);
}
Expand Down
2 changes: 1 addition & 1 deletion include/grgl/serialize.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void writeGrg(const GRGPtr& grg, std::ostream& out, bool useVarInt = true, bool
* @param[in] inStream The (binary) input stream.
*/
MutableGRGPtr readMutableGrg(std::istream& inStream);
GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges = true);
GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges = true, bool loadDownEdges = true);

}; // namespace grgl

Expand Down
15 changes: 13 additions & 2 deletions src/grg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ MutationId GRG::addMutation(const Mutation& mutation, const NodeID nodeId) {
release_assert(nodeId < this->numNodes());
}
this->m_nodeToMutations.emplace(nodeId, mutId);
if (m_mutsAreOrdered) {
m_mutsAreOrdered = false;
}
return mutId;
}

Expand All @@ -60,13 +63,21 @@ struct MutIdAndNodeLt {
ConstGRGPtr grg;
};

struct MutIdAndNodeLtFast {
bool operator()(const MutIdAndNode& lhs, const MutIdAndNode& rhs) const { return lhs.first < rhs.first; }
};

std::vector<MutIdAndNode> GRG::getMutationsToNodeOrdered() const {
std::vector<std::pair<MutationId, NodeID>> result;
for (const auto& nodeAndMutId : m_nodeToMutations) {
result.emplace_back(nodeAndMutId.second, nodeAndMutId.first);
}
ConstGRGPtr sharedThis = shared_from_this();
std::sort(result.begin(), result.end(), MutIdAndNodeLt(sharedThis));
if (mutationsAreOrdered()) {
std::sort(result.begin(), result.end(), MutIdAndNodeLtFast());
} else {
std::sort(result.begin(), result.end(), MutIdAndNodeLt(sharedThis));
}
return std::move(result);
}

Expand Down Expand Up @@ -260,8 +271,8 @@ void CSRGRG::visitTopo(GRGVisitor& visitor,
GRGPtr sharedThis = shared_from_this();
NodeIDSet alreadySeen;
std::vector<NodeID> heap;
heap.reserve(seedList.size() * 2);
for (const auto& seedId : seedList) {
release_assert(seedId < numNodes());
heap.emplace_back(seedId);
}
const bool isDown = direction == TraversalDirection::DIRECTION_DOWN;
Expand Down
4 changes: 2 additions & 2 deletions src/grg_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,15 @@ static inline MutableGRGPtr loadMutableGRG(const std::string& filename) {
return result;
}

static inline GRGPtr loadImmutableGRG(const std::string& filename, bool loadUpEdges = true) {
static inline GRGPtr loadImmutableGRG(const std::string& filename, bool loadUpEdges = true, bool loadDownEdges = true) {
GRGPtr result;
std::ifstream inStream(filename, std::ios::binary);
if (!inStream.good()) {
std::cerr << "Could not read " << filename << std::endl;
return result;
}
try {
result = readImmutableGrg(inStream, loadUpEdges);
result = readImmutableGrg(inStream, loadUpEdges, loadDownEdges);
} catch (SerializationFailure& e) {
std::cerr << "Failed to load GRG: " << e.what() << std::endl;
return result;
Expand Down
17 changes: 16 additions & 1 deletion src/python/_grgl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,12 @@ PYBIND11_MODULE(_grgl, m) {
num_nodes instead of performing actual graph traversals, depending
on what you are trying to accomplish.
)^")
.def_property_readonly("mutations_are_ordered", &grgl::GRG::mutationsAreOrdered, R"^(
Returns true if the MutationID order matches the (position, allele)
sorted order. That is, MutationID of 0 is the lowest position value
and MutationID of num_mutations-1 is the highest. Ties are broken
by the lexicographic order of the allele.
)^")
.def_property_readonly("num_nodes", &grgl::GRG::numNodes, R"^(
Get the total number of nodes (including sample and mutation nodes)
in the GRG.
Expand Down Expand Up @@ -338,13 +344,22 @@ PYBIND11_MODULE(_grgl, m) {
:rtype: pygrgl.MutableGRG
)^");

m.def("load_immutable_grg", &grgl::loadImmutableGRG, py::arg("filename"), py::arg("load_up_edges") = true, R"^(
m.def("load_immutable_grg",
&grgl::loadImmutableGRG,
py::arg("filename"),
py::arg("load_up_edges") = true,
py::arg("load_down_edges") = true,
R"^(
Load a GRG file from disk. Immutable GRGs are much faster to traverse than mutable
GRGs and take up less RAM, so this is the preferred method if you are using a GRG
for calculation or annotation, and not modifying the graph structure itself.
:param filename: The file to load.
:type filename: str
:param load_up_edges: If False, do not load the graph "up" edges (saves RAM).
:type load_up_edges: bool
:param load_down_edges: If False, do not load the graph "down" edges (saves RAM).
:type load_down_edges: bool
:return: The GRG.
:rtype: pygrgl.GRG
)^");
Expand Down
23 changes: 18 additions & 5 deletions src/serialize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ constexpr uint32_t GRG_MINOR_VERSION = 1;
namespace grgl {

constexpr uint64_t HAS_INDIVIDUAL_COALS = 0x2;
constexpr uint64_t MUTATION_IDS_ARE_ORDERED = 0x4;

#pragma pack(push, 1)
struct GRGFileHeader {
Expand Down Expand Up @@ -267,7 +268,7 @@ void writeGrg(const GRGPtr& grg, std::ostream& outStream, bool useVarInt, bool a
static_cast<uint16_t>(grg->getPopulations().size()),
grg->getPloidy(),
0, /* Unused */
HAS_INDIVIDUAL_COALS,
HAS_INDIVIDUAL_COALS | MUTATION_IDS_ARE_ORDERED,
0,
0,
0,
Expand Down Expand Up @@ -297,10 +298,10 @@ void writeGrg(const GRGPtr& grg, std::ostream& outStream, bool useVarInt, bool a
std::cout << " Edges: " << header.edgeCount << std::endl;

// Mutations
for (const auto& nodeAndMutId : grg->getNodeMutationPairs()) {
writeMutation(grg->getMutationById(nodeAndMutId.second), header.idSize, outStream);
for (const auto& nodeAndMutId : grg->getMutationsToNodeOrdered()) {
writeMutation(grg->getMutationById(nodeAndMutId.first), header.idSize, outStream);
assert_deserialization(outStream.good(), "Writing GRG failed");
writeNodeID(visitor.getNewID(nodeAndMutId.first), header.idSize, outStream);
writeNodeID(visitor.getNewID(nodeAndMutId.second), header.idSize, outStream);
assert_deserialization(outStream.good(), "Writing GRG failed");
}
// Populations
Expand Down Expand Up @@ -460,10 +461,13 @@ MutableGRGPtr readMutableGrg(std::istream& inStream) {
grg->compact();

readGrgCommon(header, grg, inStream);
if ((bool)(header.flags & MUTATION_IDS_ARE_ORDERED)) {
grg->m_mutsAreOrdered = true;
}
return grg;
}

GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges) {
GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges, bool loadDownEdges) {
assert_deserialization(inStream.good(), "Bad input stream");
// Read header.
GRGFileHeader header = {};
Expand Down Expand Up @@ -517,8 +521,17 @@ GRGPtr readImmutableGrg(std::istream& inStream, bool loadUpEdges) {
}
}
}
if (!loadDownEdges) {
std::fill(grg->m_downPositions.begin(), grg->m_downPositions.end(), 0);
grg->m_downEdges.clear();
grg->m_downEdges.shrink_to_fit();
grg->finalize();
}

readGrgCommon(header, grg, inStream);
if ((bool)(header.flags & MUTATION_IDS_ARE_ORDERED)) {
grg->m_mutsAreOrdered = true;
}
return grg;
}

Expand Down

0 comments on commit 3ad2a3f

Please sign in to comment.