From 6148386f0c8f03da1ab0bf982a4dbe080b4ea7bc Mon Sep 17 00:00:00 2001 From: Paul Wankadia Date: Fri, 11 Aug 2023 18:40:51 +0000 Subject: [PATCH] Add support for `(?<name>expr)`. This follows https://github.com/golang/go/commit/ee61186 to some extent. I took the opportunity to simplify the parsing logic and also fixed a bug in `Regexp::Equal()` that had gone unnoticed... Change-Id: I90abec942d39b02a1c6d1ac95cd3b1cc66ec7b2a Reviewed-on: https://code-review.googlesource.com/c/re2/+/61690 Reviewed-by: Alex Chernyakhovsky Reviewed-by: Paul Wankadia --- doc/syntax.html | 2 +- doc/syntax.txt | 2 +- re2/parse.cc | 26 +++++++++++++------------- re2/regexp.cc | 8 +++++++- re2/testing/parse_test.cc | 18 ++++++++++++++++++ 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/doc/syntax.html b/doc/syntax.html index eed4fd2a7..6cbda140e 100644 --- a/doc/syntax.html +++ b/doc/syntax.html @@ -62,7 +62,7 @@

RE2 regular expression syntax reference

Grouping: (re)numbered capturing group (submatch) (?P<name>re)named & numbered capturing group (submatch) -(?<name>re)named & numbered capturing group (submatch) +(?<name>re)named & numbered capturing group (submatch) (?'name're)named & numbered capturing group (submatch) (?:re)non-capturing group (?flags)set flags within current group; non-capturing diff --git a/doc/syntax.txt b/doc/syntax.txt index 5bb2067f9..6070efd96 100644 --- a/doc/syntax.txt +++ b/doc/syntax.txt @@ -51,7 +51,7 @@ x{n}+ exactly «n» «x», possessive NOT SUPPORTED Grouping: (re) numbered capturing group (submatch) (?P<name>re) named & numbered capturing group (submatch) -(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED +(?<name>re) named & numbered capturing group (submatch) (?'name're) named & numbered capturing group (submatch) NOT SUPPORTED (?:re) non-capturing group (?flags) set flags within current group; non-capturing diff --git a/re2/parse.cc b/re2/parse.cc index 67a485791..7b1510dda 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -2059,8 +2059,6 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) { return false; } - t.remove_prefix(2); // "(?" - // Check for named captures, first introduced in Python's regexp library. // As usual, there are three slightly different syntaxes: // @@ -2074,22 +2072,23 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) { // support all three as well. EcmaScript 4 uses only the Python form. // // In both the open source world (via Code Search) and the - // Google source tree, (?P<expr>name) is the dominant form, - // so that's the one we implement. One is enough. - if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { + // Google source tree, (?P<name>expr) and (?<name>expr) are the + // dominant forms of named captures and both are supported. + if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') || + (t.size() > 3 && t[2] == '<')) { // Pull out name. - size_t end = t.find('>', 2); + size_t begin = t[2] == 'P' ? 
4 : 3; + size_t end = t.find('>', begin); if (end == absl::string_view::npos) { - if (!IsValidUTF8(*s, status_)) + if (!IsValidUTF8(t, status_)) return false; status_->set_code(kRegexpBadNamedCapture); - status_->set_error_arg(*s); + status_->set_error_arg(t); return false; } - // t is "P...", t[end] == '>' - absl::string_view capture(t.data()-2, end+3); // "(?P" - absl::string_view name(t.data()+2, end-2); // "name" + absl::string_view capture(t.data(), end+1); + absl::string_view name(t.data()+begin, end-begin); if (!IsValidUTF8(name, status_)) return false; if (!IsValidCaptureName(name)) { @@ -2103,11 +2102,12 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) { return false; } - s->remove_prefix( - static_cast(capture.data() + capture.size() - s->data())); + s->remove_prefix(capture.size()); return true; } + t.remove_prefix(2); // "(?" + bool negated = false; bool sawflags = false; int nflags = flags_; diff --git a/re2/regexp.cc b/re2/regexp.cc index 1614bb0fe..4ea81cfcd 100644 --- a/re2/regexp.cc +++ b/re2/regexp.cc @@ -400,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) { a->max() == b->max(); case kRegexpCapture: - return a->cap() == b->cap() && a->name() == b->name(); + if (a->name() == NULL || b->name() == NULL) { + // One pointer is null, so the other pointer should also be null. + return a->cap() == b->cap() && a->name() == b->name(); + } else { + // Neither pointer is null, so compare the pointees for equality. 
+ return a->cap() == b->cap() && *a->name() == *b->name(); + } case kRegexpHaveMatch: return a->match_id() == b->match_id(); diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index 9d3954424..0ee5561e9 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -166,6 +166,8 @@ static Test tests[] = { // Test named captures { "(?Pa)", "cap{name:lit{a}}" }, { "(?P<中文>a)", "cap{中文:lit{a}}" }, + { "(?a)", "cap{name:lit{a}}" }, + { "(?<中文>a)", "cap{中文:lit{a}}" }, // Case-folded literals { "[Aa]", "litfold{a}" }, @@ -396,6 +398,11 @@ const char* badtests[] = { "(?Pa)", "(?P<>a)", + "(?a", + "(?", + "(?a)", + "(?<>a)", "[a-Z]", "(?i)[a-Z]", "a{100000}", @@ -416,6 +423,7 @@ const char* only_perl[] = { "\\Q\\\\\\\\\\E", "(?:a)", "(?Pa)", + "(?a)", }; // Valid in POSIX, bad in Perl. @@ -505,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) { EXPECT_TRUE(re == NULL); EXPECT_EQ(status.code(), kRegexpBadNamedCapture); EXPECT_EQ(status.error_arg(), "(?P"); + + re = Regexp::Parse("test(?z)", Regexp::LikePerl, &status); + EXPECT_TRUE(re == NULL); + EXPECT_EQ(status.code(), kRegexpBadNamedCapture); + EXPECT_EQ(status.error_arg(), "(?"); } } // namespace re2