Skip to content

Commit

Permalink
Avoid expanding counted repetitions of empty-width ops.
Browse files Browse the repository at this point in the history
Credit to Andrew Gallant for doing this for Rust first.

Change-Id: I1587d80ad3f0ec470bcdf338d6fce6786432d23f
Reviewed-on: https://code-review.googlesource.com/c/re2/+/61550
Reviewed-by: Alex Chernyakhovsky <[email protected]>
Reviewed-by: Paul Wankadia <[email protected]>
  • Loading branch information
junyer committed Jul 10, 2023
1 parent a57a1d6 commit e664633
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 0 deletions.
21 changes: 21 additions & 0 deletions re2/simplify.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
// to use simple extended regular expression features.
// Also sort and simplify character classes.

#include <algorithm>
#include <string>

#include "util/logging.h"
Expand Down Expand Up @@ -579,6 +580,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
return re;
}

// Reports whether re's top-level op is one of the empty-width ops:
// begin/end of line, begin/end of text, or a word-boundary /
// non-word-boundary assertion. Only re itself is inspected; its
// subexpressions (if any) are not examined.
static bool IsEmptyOp(Regexp* re) {
  switch (re->op()) {
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpBeginText:
    case kRegexpEndText:
      return true;
    default:
      return false;
  }
}

// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
Expand All @@ -587,6 +598,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags f) {
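// For an empty-width op, or a concatenation or alternation of empty-width
// ops, cap the repetition count at 1: such an expression consumes no input,
// so matching it more than once is equivalent to matching it once.
// An unbounded max (-1) is left as is, since std::min(-1, 1) == -1.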
if (IsEmptyOp(re) ||
((re->op() == kRegexpConcat ||
re->op() == kRegexpAlternate) &&
std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
min = std::min(min, 1);
max = std::min(max, 1);
}

// x{n,} means at least n matches of x.
if (max == -1) {
// Special case: x{0,} is x*
Expand Down
16 changes: 16 additions & 0 deletions re2/testing/simplify_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,22 @@ static Test tests[] = {
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },

// For an empty-width op OR a concatenation or alternation of empty-width
// ops, test that the repetition count is capped at 1.
{ "(?:^){0,}", "^*" }, // x{0,} -> x*
{ "(?:$){28,}", "$+" }, // x{N,} -> x{1,} -> x+
{ "(?-m:^){0,30}", "(?-m:^)?" }, // x{0,N} -> x{0,1} -> x?
{ "(?-m:$){28,30}", "(?-m:$)" }, // x{N,M} -> x{1,1} -> x
{ "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" },
{ "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" },
// NonGreedy should also be handled.
{ "(?:^){0,}?", "^*?" },
{ "(?:$){28,}?", "$+?" },
{ "(?-m:^){0,30}?", "(?-m:^)??" },
{ "(?-m:$){28,30}?", "(?-m:$)" },
{ "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" },
{ "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" },

// Test that coalescing occurs and that the resulting repeats are simplified.
// Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
{ "a*a*", "a*" },
Expand Down

0 comments on commit e664633

Please sign in to comment.