Skip to content

Commit

Permalink
ENH: added regex argument to Series.str.split (pandas-dev#44185)
Browse files Browse the repository at this point in the history
  • Loading branch information
saehuihwang authored Nov 4, 2021
1 parent 4d507b0 commit 669acb4
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 22 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ Other enhancements
- :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`)
- The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`)
- Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
-

Expand Down
90 changes: 75 additions & 15 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,11 +659,11 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
Split strings around given separator/delimiter.
Splits the string in the Series/Index from the %(side)s,
at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
at the specified delimiter string.
Parameters
----------
pat : str, optional
pat : str or compiled regex, optional
String or regular expression to split on.
If not specified, split on whitespace.
n : int, default -1 (all)
Expand All @@ -672,14 +672,30 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
expand : bool, default False
Expand the split strings into separate columns.
* If ``True``, return DataFrame/MultiIndex expanding dimensionality.
* If ``False``, return Series/Index, containing lists of strings.
- If ``True``, return DataFrame/MultiIndex expanding dimensionality.
- If ``False``, return Series/Index, containing lists of strings.
regex : bool, default None
Determines if the passed-in pattern is a regular expression:
- If ``True``, assumes the passed-in pattern is a regular expression.
- If ``False``, treats the pattern as a literal string.
- If ``None`` and `pat` length is 1, treats `pat` as a literal string.
- If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
- Cannot be set to ``False`` if `pat` is a compiled regex.
.. versionadded:: 1.4.0
Returns
-------
Series, Index, DataFrame or MultiIndex
Type matches caller unless ``expand=True`` (see Notes).
Raises
------
ValueError
* if `regex` is False and `pat` is a compiled regex
See Also
--------
Series.str.split : Split strings around given separator/delimiter.
Expand All @@ -702,6 +718,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
If using ``expand=True``, Series and Index callers return DataFrame and
MultiIndex objects, respectively.
Use of `regex=False` with a `pat` as a compiled regex will raise
an error.
Examples
--------
>>> s = pd.Series(
Expand Down Expand Up @@ -776,22 +795,63 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"):
1 https://docs.python.org/3/tutorial index.html
2 NaN NaN
Remember to escape special characters when explicitly using regular
expressions.
Remember to escape special characters when explicitly using regular expressions.
>>> s = pd.Series(["1+1=2"])
>>> s
0 1+1=2
dtype: object
>>> s.str.split(r"\+|=", expand=True)
0 1 2
0 1 1 2
>>> s = pd.Series(["foo and bar plus baz"])
>>> s.str.split(r"and|plus", expand=True)
0 1 2
0 foo bar baz
Regular expressions can be used to handle urls or file names.
When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
as a regex only if ``len(pat) != 1``.
>>> s = pd.Series(['foojpgbar.jpg'])
>>> s.str.split(r".", expand=True)
0 1
0 foojpgbar jpg
>>> s.str.split(r"\.jpg", expand=True)
0 1
0 foojpgbar
When ``regex=True``, `pat` is interpreted as a regex.
>>> s.str.split(r"\.jpg", regex=True, expand=True)
0 1
0 foojpgbar
A compiled regex can be passed as `pat`.
>>> import re
>>> s.str.split(re.compile(r"\.jpg"), expand=True)
0 1
0 foojpgbar
When ``regex=False``, `pat` is interpreted as a literal string.
>>> s.str.split(r"\.jpg", regex=False, expand=True)
0
0 foojpgbar.jpg
"""

@Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
@forbid_nonstring_types(["bytes"])
def split(self, pat=None, n=-1, expand=False):
result = self._data.array._str_split(pat, n, expand)
def split(
self,
pat: str | re.Pattern | None = None,
n=-1,
expand=False,
*,
regex: bool | None = None,
):
if regex is False and is_re(pat):
raise ValueError(
"Cannot use a compiled regex as replacement pattern with regex=False"
)
if is_re(pat):
regex = True
result = self._data.array._str_split(pat, n, expand, regex)
return self._wrap_result(result, returns_string=expand, expand=expand)

@Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
Expand Down
31 changes: 24 additions & 7 deletions pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,21 +308,38 @@ def f(x):

return self._str_map(f)

def _str_split(self, pat=None, n=-1, expand=False):
def _str_split(
self,
pat: str | re.Pattern | None = None,
n=-1,
expand=False,
regex: bool | None = None,
):
if pat is None:
if n is None or n == 0:
n = -1
f = lambda x: x.split(pat, n)
else:
if len(pat) == 1:
if n is None or n == 0:
n = -1
f = lambda x: x.split(pat, n)
new_pat: str | re.Pattern
if regex is True or isinstance(pat, re.Pattern):
new_pat = re.compile(pat)
elif regex is False:
new_pat = pat
# regex is None so link to old behavior #43563
else:
if len(pat) == 1:
new_pat = pat
else:
new_pat = re.compile(pat)

if isinstance(new_pat, re.Pattern):
if n is None or n == -1:
n = 0
regex = re.compile(pat)
f = lambda x: regex.split(x, maxsplit=n)
f = lambda x: new_pat.split(x, maxsplit=n)
else:
if n is None or n == 0:
n = -1
f = lambda x: x.split(pat, n)
return self._str_map(f, dtype=object)

def _str_rsplit(self, pat=None, n=-1):
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/strings/test_split_partition.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
import re

import numpy as np
import pytest
Expand Down Expand Up @@ -35,6 +36,44 @@ def test_split(any_string_dtype):
tm.assert_series_equal(result, exp)


def test_split_regex(any_string_dtype):
    # GH 43563
    # Exercises every ``regex`` mode of Series.str.split on one input string.

    # explicit regex = True split
    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
    result = values.str.split(r"\.jpg", regex=True)
    exp = Series([["xxxjpgzzz", ""]])
    tm.assert_series_equal(result, exp)

    # compiled regex passed as pat, ``regex`` left at its default
    regex_pat = re.compile(r".jpg")
    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
    result = values.str.split(regex_pat)
    exp = Series([["xx", "zzz", ""]])
    tm.assert_series_equal(result, exp)

    # explicit regex = False split
    result = values.str.split(r"\.jpg", regex=False)
    exp = Series([["xxxjpgzzz.jpg"]])
    tm.assert_series_equal(result, exp)

    # non explicit regex split, pattern length == 1
    result = values.str.split(r".")
    exp = Series([["xxxjpgzzz", "jpg"]])
    tm.assert_series_equal(result, exp)

    # non explicit regex split, pattern length != 1
    result = values.str.split(r".jpg")
    exp = Series([["xx", "zzz", ""]])
    tm.assert_series_equal(result, exp)

    # regex=False with pattern compiled regex raises error
    with pytest.raises(
        ValueError,
        match="Cannot use a compiled regex as replacement pattern with regex=False",
    ):
        values.str.split(regex_pat, regex=False)


def test_split_object_mixed():
mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
result = mixed.str.split("_")
Expand Down

0 comments on commit 669acb4

Please sign in to comment.