Cap the repetition count of empty-width operators at 1 when simplifying.

SimplifyRepeat now clamps min and max to at most 1 when the repeated
expression is an empty-width op (begin/end of line, begin/end of text,
word boundary, non-word boundary) or a concatenation or alternation made up
only of such ops. An unbounded max of -1 is left as-is by the clamp, so
x{N,} still simplifies to x+. Tests cover greedy and non-greedy forms.

NOTE(review): this patch was restored from a copy whose line breaks had been
collapsed and whose system header names had been stripped. <algorithm> is
required by the added std::all_of/std::min calls; <string> in the context
line is presumed - confirm against the target tree. Runs of spaces inside
lines could not be recovered, so apply with whitespace tolerance
(git apply --ignore-whitespace) if a context line fails to match.

diff --git a/re2/simplify.cc b/re2/simplify.cc
index 8cd10cfbb..cea100b08 100644
--- a/re2/simplify.cc
+++ b/re2/simplify.cc
@@ -6,6 +6,7 @@
 // to use simple extended regular expression features.
 // Also sort and simplify character classes.
 
+#include <algorithm>
 #include <string>
 
 #include "util/logging.h"
@@ -579,6 +580,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
   return re;
 }
 
+// Returns true if re is an empty-width op.
+static bool IsEmptyOp(Regexp* re) {
+  return (re->op() == kRegexpBeginLine ||
+          re->op() == kRegexpEndLine ||
+          re->op() == kRegexpWordBoundary ||
+          re->op() == kRegexpNoWordBoundary ||
+          re->op() == kRegexpBeginText ||
+          re->op() == kRegexpEndText);
+}
+
 // Simplifies the expression re{min,max} in terms of *, +, and ?.
 // Returns a new regexp. Does not edit re. Does not consume reference to re.
 // Caller must Decref return value when done with it.
@@ -587,6 +598,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
 // but in the Regexp* representation, both (x) are marked as $1.
 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                        Regexp::ParseFlags f) {
+  // For an empty-width op OR a concatenation or alternation of empty-width
+  // ops, cap the repetition count at 1.
+  if (IsEmptyOp(re) ||
+      ((re->op() == kRegexpConcat ||
+        re->op() == kRegexpAlternate) &&
+       std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
+    min = std::min(min, 1);
+    max = std::min(max, 1);
+  }
+
   // x{n,} means at least n matches of x.
   if (max == -1) {
     // Special case: x{0,} is x*
diff --git a/re2/testing/simplify_test.cc b/re2/testing/simplify_test.cc
index d2c136a3c..5b683f580 100644
--- a/re2/testing/simplify_test.cc
+++ b/re2/testing/simplify_test.cc
@@ -140,6 +140,22 @@ static Test tests[] = {
   { "(){1,}", "()+" },
   { "(){0,2}", "(?:()()?)?" },
 
+  // For an empty-width op OR a concatenation or alternation of empty-width
+  // ops, test that the repetition count is capped at 1.
+  { "(?:^){0,}", "^*" },  // x{0,} -> x*
+  { "(?:$){28,}", "$+" },  // x{N,} -> x{1,} -> x+
+  { "(?-m:^){0,30}", "(?-m:^)?" },  // x{0,N} -> x{0,1} -> x?
+  { "(?-m:$){28,30}", "(?-m:$)" },  // x{N,M} -> x{1,1} -> x
+  { "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" },
+  { "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" },
+  // NonGreedy should also be handled.
+  { "(?:^){0,}?", "^*?" },
+  { "(?:$){28,}?", "$+?" },
+  { "(?-m:^){0,30}?", "(?-m:^)??" },
+  { "(?-m:$){28,30}?", "(?-m:$)" },
+  { "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" },
+  { "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" },
+
   // Test that coalescing occurs and that the resulting repeats are simplified.
   // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
   { "a*a*", "a*" },