diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 733f11a588117..4d0dee01f05c1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -180,6 +180,7 @@ Other enhancements - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) +- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a62d701413bf1..9f163f77a2ae8 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -659,11 +659,11 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Split strings around given separator/delimiter. Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. + at the specified delimiter string. Parameters ---------- - pat : str, optional + pat : str or compiled regex, optional String or regular expression to split on. If not specified, split on whitespace. n : int, default -1 (all) @@ -672,14 +672,30 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): expand : bool, default False Expand the split strings into separate columns. - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. + - If ``True``, return DataFrame/MultiIndex expanding dimensionality. + - If ``False``, return Series/Index, containing lists of strings. + + regex : bool, default None + Determines if the passed-in pattern is a regular expression: + + - If ``True``, assumes the passed-in pattern is a regular expression + - If ``False``, treats the pattern as a literal string. + - If ``None`` and `pat` length is 1, treats `pat` as a literal string. + - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. + - Cannot be set to False if `pat` is a compiled regex + + .. versionadded:: 1.4.0 Returns ------- Series, Index, DataFrame or MultiIndex Type matches caller unless ``expand=True`` (see Notes). + Raises + ------ + ValueError + * if `regex` is False and `pat` is a compiled regex + See Also -------- Series.str.split : Split strings around given separator/delimiter. @@ -702,6 +718,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): If using ``expand=True``, Series and Index callers return DataFrame and MultiIndex objects, respectively. + Use of `regex=False` with a `pat` as a compiled regex will raise + an error. + Examples -------- >>> s = pd.Series( @@ -776,22 +795,63 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): 1 https://docs.python.org/3/tutorial index.html 2 NaN NaN - Remember to escape special characters when explicitly using regular - expressions. + Remember to escape special characters when explicitly using regular expressions. - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 + >>> s = pd.Series(["foo and bar plus baz"]) + >>> s.str.split(r"and|plus", expand=True) + 0 1 2 + 0 foo bar baz + + Regular expressions can be used to handle urls or file names. + When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled + as a regex only if ``len(pat) != 1``. + + >>> s = pd.Series(['foojpgbar.jpg']) + >>> s.str.split(r".", expand=True) + 0 1 + 0 foojpgbar jpg + + >>> s.str.split(r"\.jpg", expand=True) + 0 1 + 0 foojpgbar + + When ``regex=True``, `pat` is interpreted as a regex + + >>> s.str.split(r"\.jpg", regex=True, expand=True) + 0 1 + 0 foojpgbar + + A compiled regex can be passed as `pat` + + >>> import re + >>> s.str.split(re.compile(r"\.jpg"), expand=True) + 0 1 + 0 foojpgbar + + When ``regex=False``, `pat` is interpreted as the string itself + + >>> s.str.split(r"\.jpg", regex=False, expand=True) + 0 + 0 foojpgbar.jpg """ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = self._data.array._str_split(pat, n, expand) + def split( + self, + pat: str | re.Pattern | None = None, + n=-1, + expand=False, + *, + regex: bool | None = None, + ): + if regex is False and is_re(pat): + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + if is_re(pat): + regex = True + result = self._data.array._str_split(pat, n, expand, regex) return self._wrap_result(result, returns_string=expand, expand=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 76ee55ef5f9ad..3081575f50700 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -308,21 +308,38 @@ def f(x): return self._str_map(f) - def _str_split(self, pat=None, n=-1, expand=False): + def _str_split( + self, + pat: str | re.Pattern | None = None, + n=-1, + expand=False, + regex: bool | None = None, + ): if pat is None: if n is None or n == 0: n = -1 f = lambda x: x.split(pat, n) else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) + new_pat: str | re.Pattern + if regex is True or isinstance(pat, re.Pattern): + new_pat = re.compile(pat) + elif regex is False: + new_pat = pat + # regex is None so link to old behavior #43563 else: + if len(pat) == 1: + new_pat = pat + else: + new_pat = re.compile(pat) + + if isinstance(new_pat, re.Pattern): if n is None or n == -1: n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) + f = lambda x: new_pat.split(x, maxsplit=n) + else: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) return self._str_map(f, dtype=object) def _str_rsplit(self, pat=None, n=-1): diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index f3f5acd0d2f1c..01a397938db52 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -1,4 +1,5 @@ from datetime import datetime +import re import numpy as np import pytest @@ -35,6 +36,44 @@ def test_split(any_string_dtype): tm.assert_series_equal(result, exp) +def test_split_regex(any_string_dtype): + # GH 43563 + # explicit regex = True split + values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) + result = values.str.split(r"\.jpg", regex=True) + exp = Series([["xxxjpgzzz", ""]]) + tm.assert_series_equal(result, exp) + + # explicit regex = True split with compiled regex + regex_pat = re.compile(r".jpg") + values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype) + result = values.str.split(regex_pat) + exp = Series([["xx", "zzz", ""]]) + tm.assert_series_equal(result, exp) + + # explicit regex = False split + result = values.str.split(r"\.jpg", regex=False) + exp = Series([["xxxjpgzzz.jpg"]]) + tm.assert_series_equal(result, exp) + + # non explicit regex split, pattern length == 1 + result = values.str.split(r".") + exp = Series([["xxxjpgzzz", "jpg"]]) + tm.assert_series_equal(result, exp) + + # non explicit regex split, pattern length != 1 + result = values.str.split(r".jpg") + exp = Series([["xx", "zzz", ""]]) + tm.assert_series_equal(result, exp) + + # regex=False with pattern compiled regex raises error + with pytest.raises( + ValueError, + match="Cannot use a compiled regex as replacement pattern with regex=False", + ): + values.str.split(regex_pat, regex=False) + + def test_split_object_mixed(): mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.split("_")