From 6148386f0c8f03da1ab0bf982a4dbe080b4ea7bc Mon Sep 17 00:00:00 2001
From: Paul Wankadia <junyer@google.com>
Date: Fri, 11 Aug 2023 18:40:51 +0000
Subject: [PATCH] Add support for `(?<name>expr)`.

This follows https://github.com/golang/go/commit/ee61186 to some
extent. I took the opportunity to simplify the parsing logic and
also fixed a bug in `Regexp::Equal()` that had gone unnoticed:
capture names were compared as pointers rather than by value.

Change-Id: I90abec942d39b02a1c6d1ac95cd3b1cc66ec7b2a
Reviewed-on: https://code-review.googlesource.com/c/re2/+/61690
Reviewed-by: Alex Chernyakhovsky <achernya@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
---
 doc/syntax.html           |  2 +-
 doc/syntax.txt            |  2 +-
 re2/parse.cc              | 26 +++++++++++++-------------
 re2/regexp.cc             |  8 +++++++-
 re2/testing/parse_test.cc | 18 ++++++++++++++++++
 5 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/doc/syntax.html b/doc/syntax.html
index eed4fd2a7..6cbda140e 100644
--- a/doc/syntax.html
+++ b/doc/syntax.html
@@ -62,7 +62,7 @@ <h1>RE2 regular expression syntax reference</h1>
 <tr><td colspan=2><b>Grouping:</b></td></tr>
 <tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
 <tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
-<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
+<tr><td><code>(?&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
 <tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
 <tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
 <tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
diff --git a/doc/syntax.txt b/doc/syntax.txt
index 5bb2067f9..6070efd96 100644
--- a/doc/syntax.txt
+++ b/doc/syntax.txt
@@ -51,7 +51,7 @@ x{n}+	exactly «n» «x», possessive NOT SUPPORTED
 Grouping:
 (re)	numbered capturing group (submatch)
 (?P<name>re)	named & numbered capturing group (submatch)
-(?<name>re)	named & numbered capturing group (submatch) NOT SUPPORTED
+(?<name>re)	named & numbered capturing group (submatch)
 (?'name're)	named & numbered capturing group (submatch) NOT SUPPORTED
 (?:re)	non-capturing group
 (?flags)	set flags within current group; non-capturing
diff --git a/re2/parse.cc b/re2/parse.cc
index 67a485791..7b1510dda 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -2059,8 +2059,6 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
     return false;
   }
 
-  t.remove_prefix(2);  // "(?"
-
   // Check for named captures, first introduced in Python's regexp library.
   // As usual, there are three slightly different syntaxes:
   //
@@ -2074,22 +2072,23 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
   // support all three as well.  EcmaScript 4 uses only the Python form.
   //
   // In both the open source world (via Code Search) and the
-  // Google source tree, (?P<expr>name) is the dominant form,
-  // so that's the one we implement.  One is enough.
-  if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
+  // Google source tree, (?P<name>expr) and (?<name>expr) are the
+  // dominant forms of named captures and both are supported.
+  if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
+      (t.size() > 3 && t[2] == '<')) {
     // Pull out name.
-    size_t end = t.find('>', 2);
+    size_t begin = t[2] == 'P' ? 4 : 3;
+    size_t end = t.find('>', begin);
     if (end == absl::string_view::npos) {
-      if (!IsValidUTF8(*s, status_))
+      if (!IsValidUTF8(t, status_))
         return false;
       status_->set_code(kRegexpBadNamedCapture);
-      status_->set_error_arg(*s);
+      status_->set_error_arg(t);
       return false;
     }
 
-    // t is "P<name>...", t[end] == '>'
-    absl::string_view capture(t.data()-2, end+3);  // "(?P<name>"
-    absl::string_view name(t.data()+2, end-2);     // "name"
+    absl::string_view capture(t.data(), end+1);
+    absl::string_view name(t.data()+begin, end-begin);
     if (!IsValidUTF8(name, status_))
       return false;
     if (!IsValidCaptureName(name)) {
@@ -2103,11 +2102,12 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
       return false;
     }
 
-    s->remove_prefix(
-        static_cast<size_t>(capture.data() + capture.size() - s->data()));
+    s->remove_prefix(capture.size());
     return true;
   }
 
+  t.remove_prefix(2);  // "(?"
+
   bool negated = false;
   bool sawflags = false;
   int nflags = flags_;
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 1614bb0fe..4ea81cfcd 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -400,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) {
              a->max() == b->max();
 
     case kRegexpCapture:
-      return a->cap() == b->cap() && a->name() == b->name();
+      if (a->name() == NULL || b->name() == NULL) {
+        // One pointer is null, so the other pointer should also be null.
+        return a->cap() == b->cap() && a->name() == b->name();
+      } else {
+        // Neither pointer is null, so compare the pointees for equality.
+        return a->cap() == b->cap() && *a->name() == *b->name();
+      }
 
     case kRegexpHaveMatch:
       return a->match_id() == b->match_id();
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 9d3954424..0ee5561e9 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -166,6 +166,8 @@ static Test tests[] = {
   // Test named captures
   { "(?P<name>a)", "cap{name:lit{a}}" },
   { "(?P<中文>a)", "cap{中文:lit{a}}" },
+  { "(?<name>a)", "cap{name:lit{a}}" },
+  { "(?<中文>a)", "cap{中文:lit{a}}" },
 
   // Case-folded literals
   { "[Aa]", "litfold{a}" },
@@ -396,6 +398,11 @@ const char* badtests[] = {
   "(?P<name",
   "(?P<x y>a)",
   "(?P<>a)",
+  "(?<name>a",
+  "(?<name>",
+  "(?<name",
+  "(?<x y>a)",
+  "(?<>a)",
   "[a-Z]",
   "(?i)[a-Z]",
   "a{100000}",
@@ -416,6 +423,7 @@ const char* only_perl[] = {
  "\\Q\\\\\\\\\\E",
  "(?:a)",
  "(?P<name>a)",
+ "(?<name>a)",
 };
 
 // Valid in POSIX, bad in Perl.
@@ -505,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) {
   EXPECT_TRUE(re == NULL);
   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
   EXPECT_EQ(status.error_arg(), "(?P<space bar>");
+
+  re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
+  EXPECT_TRUE(re == NULL);
+  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+  EXPECT_EQ(status.error_arg(), "(?<name");
+
+  re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
+  EXPECT_TRUE(re == NULL);
+  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+  EXPECT_EQ(status.error_arg(), "(?<space bar>");
 }
 
 }  // namespace re2