Skip to content

Commit

Permalink
Stop using std::map<std::string, Prefilter*>.
Browse files Browse the repository at this point in the history
Use `absl::flat_hash_set<Prefilter*>` as foretold by the
prophecy to avoid string churn, heap pressure et cetera.

While I'm here, switch the `Regexp` reference count overflow map from
`std::map<Regexp*, int>` to `absl::flat_hash_map<Regexp*, int>`.

Change-Id: I57fcfec2931ba6694daee337d6c488118d404fef
Reviewed-on: https://code-review.googlesource.com/c/re2/+/61530
Reviewed-by: Alex Chernyakhovsky <[email protected]>
Reviewed-by: Paul Wankadia <[email protected]>
  • Loading branch information
junyer committed Jul 7, 2023
1 parent 1d465f6 commit a57a1d6
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 48 deletions.
38 changes: 38 additions & 0 deletions re2/prefilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,44 @@ class Prefilter {
std::string DebugString() const;

private:
template <typename H>
friend H AbslHashValue(H h, const Prefilter& a) {
h = H::combine(std::move(h), a.op_);
if (a.op_ == ATOM) {
h = H::combine(std::move(h), a.atom_);
} else if (a.op_ == AND || a.op_ == OR) {
h = H::combine(std::move(h), a.subs_->size());
for (size_t i = 0; i < a.subs_->size(); ++i) {
h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_);
}
}
return h;
}

// Shallow structural equality between two Prefilter nodes. Nodes are equal
// when they have the same op_ and, for ATOM, the same atom_; for AND/OR,
// the same number of subs whose unique_id_ values match position by
// position. Nodes with any other op are equal whenever their ops match.
friend bool operator==(const Prefilter& a, const Prefilter& b) {
  // The very same object is trivially equal to itself.
  if (&a == &b)
    return true;
  if (a.op_ != b.op_)
    return false;
  if (a.op_ == ATOM)
    return a.atom_ == b.atom_;
  if (a.op_ != AND && a.op_ != OR)
    return true;
  // AND/OR: compare the children by id, never by pointer.
  const auto& lhs_subs = *a.subs_;
  const auto& rhs_subs = *b.subs_;
  if (lhs_subs.size() != rhs_subs.size())
    return false;
  for (size_t i = 0; i < lhs_subs.size(); ++i) {
    if (lhs_subs[i]->unique_id_ != rhs_subs[i]->unique_id_)
      return false;
  }
  return true;
}

// A comparator used to store exact strings. We compare by length,
// then lexicographically. This ordering makes it easier to reduce the
// set of strings in SimplifyStringSet.
Expand Down
47 changes: 15 additions & 32 deletions re2/prefilter_tree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <stddef.h>
#include <algorithm>
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <utility>
Expand Down Expand Up @@ -63,33 +62,18 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {

compiled_ = true;

NodeMap nodes;
NodeSet nodes;
AssignUniqueIds(&nodes, atom_vec);
if (ExtraDebug)
PrintDebugInfo(&nodes);
}

Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
std::string node_string = NodeString(node);
NodeMap::iterator iter = nodes->find(node_string);
if (iter == nodes->end())
return NULL;
return (*iter).second;
}

std::string PrefilterTree::NodeString(Prefilter* node) const {
// Adding the operation disambiguates AND/OR/atom nodes.
std::string s = absl::StrFormat("%d", node->op()) + ":";
if (node->op() == Prefilter::ATOM) {
s += node->atom();
} else {
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
s += ',';
s += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
}
Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
NodeSet::const_iterator iter = nodes->find(node);
if (iter != nodes->end()) {
return *iter;
}
return s;
return NULL;
}

bool PrefilterTree::KeepNode(Prefilter* node) const {
Expand Down Expand Up @@ -129,7 +113,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const {
}
}

void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
std::vector<std::string>* atom_vec) {
atom_vec->clear();

Expand Down Expand Up @@ -169,9 +153,9 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// Any further nodes that have the same atom/subs
// will find this node as the canonical node.
nodes->emplace(NodeString(node), node);
nodes->emplace(node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
Expand Down Expand Up @@ -300,7 +284,7 @@ void PrefilterTree::RegexpsGivenStrings(
for (size_t j = 0; j < matched_atoms.size(); j++)
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
for (IntMap::const_iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
Expand All @@ -316,7 +300,7 @@ void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
Expand Down Expand Up @@ -348,7 +332,7 @@ void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}

void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
LOG(ERROR) << "#Unique Nodes: " << entries_.size();

Expand All @@ -360,11 +344,10 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
for (int parent : parents)
LOG(ERROR) << parent;
}
LOG(ERROR) << "Map:";
for (NodeMap::const_iterator iter = nodes->begin();
LOG(ERROR) << "Set:";
for (NodeSet::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
}

std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
Expand Down
41 changes: 27 additions & 14 deletions re2/prefilter_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
// atoms) that the user of this class should use to do the string
// matching.

#include <map>
#include <string>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "re2/prefilter.h"
#include "re2/sparse_array.h"
#include "util/logging.h"

namespace re2 {

Expand Down Expand Up @@ -57,10 +58,25 @@ class PrefilterTree {
void PrintPrefilter(int regexpid);

private:
typedef SparseArray<int> IntMap;
// TODO(junyer): Use absl::flat_hash_set<Prefilter*> instead?
// It should be trivial to get rid of the stringification...
typedef std::map<std::string, Prefilter*> NodeMap;
using IntMap = SparseArray<int>;

struct PrefilterHash {
size_t operator()(const Prefilter* a) const {
DCHECK(a != NULL);
return absl::Hash<Prefilter>()(*a);
}
};

struct PrefilterEqual {
bool operator()(const Prefilter* a, const Prefilter* b) const {
DCHECK(a != NULL);
DCHECK(b != NULL);
return *a == *b;
}
};

// Set of Prefilter nodes stored by pointer but hashed and compared through
// the pointee (PrefilterHash / PrefilterEqual), so looking up a node finds
// any previously inserted node that compares equal to it.
using NodeSet =
absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>;

// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
Expand Down Expand Up @@ -90,25 +106,22 @@ class PrefilterTree {
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec);

// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;

// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);

// A string that uniquely identifies the node. Assumes that the
// children of node has already been assigned unique ids.
std::string NodeString(Prefilter* node) const;
// Returns the prefilter node that has the same atom/subs as this
// node. For the canonical node, returns node. Assumes that the
// children of node have already been assigned unique ids.
Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node);

// Recursively constructs a readable prefilter string.
std::string DebugNodeString(Prefilter* node) const;

// Used for debugging.
void PrintDebugInfo(NodeMap* nodes);
void PrintDebugInfo(NodeSet* nodes);

// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
Expand Down
5 changes: 3 additions & 2 deletions re2/regexp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "absl/base/call_once.h"
#include "absl/base/macros.h"
#include "absl/container/flat_hash_map.h"
#include "absl/synchronization/mutex.h"
#include "util/logging.h"
#include "util/utf.h"
Expand Down Expand Up @@ -76,15 +77,15 @@ bool Regexp::QuickDestroy() {
// Similar to EmptyStorage in re2.cc.
struct RefStorage {
absl::Mutex ref_mutex;
std::map<Regexp*, int> ref_map;
absl::flat_hash_map<Regexp*, int> ref_map;
};
alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];

// Returns the address of the ref_mutex member of the RefStorage object
// that is overlaid on the ref_storage buffer.
// NOTE(review): presumably this mutex guards ref_map -- confirm at the
// call sites.
static inline absl::Mutex* ref_mutex() {
  RefStorage* const storage = reinterpret_cast<RefStorage*>(ref_storage);
  return &storage->ref_mutex;
}

static inline std::map<Regexp*, int>* ref_map() {
static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
}

Expand Down

0 comments on commit a57a1d6

Please sign in to comment.