ENH: added regex argument to Series.str.split (pandas-dev#44185)

me-kbs · Nov 4, 2021 · 669acb4 · 669acb4
1 parent 4d507b0
commit 669acb4
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 22 deletions.
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -180,6 +180,7 @@ Other enhancements
 - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`)
 - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`)
 - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
+- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 -
 

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -659,11 +659,11 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     Split strings around given separator/delimiter.
 
     Splits the string in the Series/Index from the %(side)s,
-    at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
+    at the specified delimiter string.
 
     Parameters
     ----------
-    pat : str, optional
+    pat : str or compiled regex, optional
         String or regular expression to split on.
         If not specified, split on whitespace.
     n : int, default -1 (all)
@@ -672,14 +672,30 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     expand : bool, default False
         Expand the split strings into separate columns.
 
-        * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
-        * If ``False``, return Series/Index, containing lists of strings.
+        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
+        - If ``False``, return Series/Index, containing lists of strings.
+
+    regex : bool, default None
+        Determines if the passed-in pattern is a regular expression:
+
+        - If ``True``, assumes the passed-in pattern is a regular expression
+        - If ``False``, treats the pattern as a literal string.
+        - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
+        - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
+        - Cannot be set to False if `pat` is a compiled regex
+
+        .. versionadded:: 1.4.0
 
     Returns
     -------
     Series, Index, DataFrame or MultiIndex
         Type matches caller unless ``expand=True`` (see Notes).
 
+    Raises
+    ------
+    ValueError
+        * if `regex` is False and `pat` is a compiled regex
+
     See Also
     --------
     Series.str.split : Split strings around given separator/delimiter.
@@ -702,6 +718,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     If using ``expand=True``, Series and Index callers return DataFrame and
     MultiIndex objects, respectively.
 
+    Use of `regex=False` with a `pat` as a compiled regex will raise
+    an error.
+
     Examples
     --------
     >>> s = pd.Series(
@@ -776,22 +795,63 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
     1  https://docs.python.org/3/tutorial  index.html
     2                                 NaN         NaN
 
-    Remember to escape special characters when explicitly using regular
-    expressions.
+    Remember to escape special characters when explicitly using regular expressions.
 
-    >>> s = pd.Series(["1+1=2"])
-    >>> s
-    0    1+1=2
-    dtype: object
-    >>> s.str.split(r"\+|=", expand=True)
-         0    1    2
-    0    1    1    2
+    >>> s = pd.Series(["foo and bar plus baz"])
+    >>> s.str.split(r"and|plus", expand=True)
+        0   1   2
+    0 foo bar baz
+
+    Regular expressions can be used to handle urls or file names.
+    When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
+    as a regex only if ``len(pat) != 1``.
+
+    >>> s = pd.Series(['foojpgbar.jpg'])
+    >>> s.str.split(r".", expand=True)
+               0    1
+    0  foojpgbar  jpg
+
+    >>> s.str.split(r"\.jpg", expand=True)
+               0 1
+    0  foojpgbar
+
+    When ``regex=True``, `pat` is interpreted as a regex
+
+    >>> s.str.split(r"\.jpg", regex=True, expand=True)
+               0 1
+    0  foojpgbar
+
+    A compiled regex can be passed as `pat`
+
+    >>> import re
+    >>> s.str.split(re.compile(r"\.jpg"), expand=True)
+               0 1
+    0  foojpgbar
+
+    When ``regex=False``, `pat` is interpreted as the string itself
+
+    >>> s.str.split(r"\.jpg", regex=False, expand=True)
+                   0
+    0  foojpgbar.jpg
     """
 
     @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
     @forbid_nonstring_types(["bytes"])
-    def split(self, pat=None, n=-1, expand=False):
-        result = self._data.array._str_split(pat, n, expand)
+    def split(
+        self,
+        pat: str | re.Pattern | None = None,
+        n=-1,
+        expand=False,
+        *,
+        regex: bool | None = None,
+    ):
+        if regex is False and is_re(pat):
+            raise ValueError(
+                "Cannot use a compiled regex as replacement pattern with regex=False"
+            )
+        if is_re(pat):
+            regex = True
+        result = self._data.array._str_split(pat, n, expand, regex)
         return self._wrap_result(result, returns_string=expand, expand=expand)
 
     @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})

diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -308,21 +308,38 @@ def f(x):
 
         return self._str_map(f)
 
-    def _str_split(self, pat=None, n=-1, expand=False):
+    def _str_split(
+        self,
+        pat: str | re.Pattern | None = None,
+        n=-1,
+        expand=False,
+        regex: bool | None = None,
+    ):
         if pat is None:
             if n is None or n == 0:
                 n = -1
             f = lambda x: x.split(pat, n)
         else:
-            if len(pat) == 1:
-                if n is None or n == 0:
-                    n = -1
-                f = lambda x: x.split(pat, n)
+            new_pat: str | re.Pattern
+            if regex is True or isinstance(pat, re.Pattern):
+                new_pat = re.compile(pat)
+            elif regex is False:
+                new_pat = pat
+            # regex is None so link to old behavior #43563
             else:
+                if len(pat) == 1:
+                    new_pat = pat
+                else:
+                    new_pat = re.compile(pat)
+
+            if isinstance(new_pat, re.Pattern):
                 if n is None or n == -1:
                     n = 0
-                regex = re.compile(pat)
-                f = lambda x: regex.split(x, maxsplit=n)
+                f = lambda x: new_pat.split(x, maxsplit=n)
+            else:
+                if n is None or n == 0:
+                    n = -1
+                f = lambda x: x.split(pat, n)
         return self._str_map(f, dtype=object)
 
     def _str_rsplit(self, pat=None, n=-1):

diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import re
 
 import numpy as np
 import pytest
@@ -35,6 +36,44 @@ def test_split(any_string_dtype):
     tm.assert_series_equal(result, exp)
 
 
+def test_split_regex(any_string_dtype):
+    # GH 43563
+    # explicit regex = True split
+    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
+    result = values.str.split(r"\.jpg", regex=True)
+    exp = Series([["xxxjpgzzz", ""]])
+    tm.assert_series_equal(result, exp)
+
+    # explicit regex = True split with compiled regex
+    regex_pat = re.compile(r".jpg")
+    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
+    result = values.str.split(regex_pat)
+    exp = Series([["xx", "zzz", ""]])
+    tm.assert_series_equal(result, exp)
+
+    # explicit regex = False split
+    result = values.str.split(r"\.jpg", regex=False)
+    exp = Series([["xxxjpgzzz.jpg"]])
+    tm.assert_series_equal(result, exp)
+
+    # non explicit regex split, pattern length == 1
+    result = values.str.split(r".")
+    exp = Series([["xxxjpgzzz", "jpg"]])
+    tm.assert_series_equal(result, exp)
+
+    # non explicit regex split, pattern length != 1
+    result = values.str.split(r".jpg")
+    exp = Series([["xx", "zzz", ""]])
+    tm.assert_series_equal(result, exp)
+
+    # regex=False with pattern compiled regex raises error
+    with pytest.raises(
+        ValueError,
+        match="Cannot use a compiled regex as replacement pattern with regex=False",
+    ):
+        values.str.split(regex_pat, regex=False)
+
+
 def test_split_object_mixed():
     mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
     result = mixed.str.split("_")