From b90bcb59ec154a8ddb310759e47bd1c0b7a80657 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 18 Feb 2019 05:29:55 -0800 Subject: [PATCH 001/110] DOC: Correct doc mistake in combiner func (#25360) Closes gh-25359. --- doc/source/getting_started/basics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 02cbc7e2c3b6d..bbec7b5de1d2e 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -505,7 +505,7 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: .. ipython:: python def combiner(x, y): - np.where(pd.isna(x), y, x) + return np.where(pd.isna(x), y, x) df1.combine(df2, combiner) .. _basics.stats: From f74aba614554d1de4d7986fef6296a4eda951ac9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 18 Feb 2019 15:37:54 +0100 Subject: [PATCH 002/110] DOC/BLD: fix --no-api option (#25209) --- doc/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 776b1bfa7bdd7..c59d28a6dc3ea 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -98,9 +98,9 @@ if (fname == 'index.rst' and os.path.abspath(dirname) == source_path): continue - elif pattern == '-api' and dirname == 'api': + elif pattern == '-api' and dirname == 'reference': exclude_patterns.append(fname) - elif fname != pattern: + elif pattern != '-api' and fname != pattern: exclude_patterns.append(fname) with open(os.path.join(source_path, 'index.rst.template')) as f: From 3d3093a1d923b02da79854918a6d43f92186e72b Mon Sep 17 00:00:00 2001 From: Alyssa Fu Ward Date: Tue, 19 Feb 2019 00:45:51 -0800 Subject: [PATCH 003/110] DOC: modify typos in Contributing section (#25365) --- doc/source/development/contributing.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/development/contributing.rst 
b/doc/source/development/contributing.rst index c9d6845107dfc..511936467641e 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -54,7 +54,7 @@ Bug reports must: ... ``` -#. Include the full version string of *pandas* and its dependencies. You can use the built in function:: +#. Include the full version string of *pandas* and its dependencies. You can use the built-in function:: >>> import pandas as pd >>> pd.show_versions() @@ -211,7 +211,7 @@ See the full conda docs `here `__. Creating a Python Environment (pip) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you aren't using conda for you development environment, follow these instructions. +If you aren't using conda for your development environment, follow these instructions. You'll need to have at least python3.5 installed on your system. .. code-block:: none @@ -484,7 +484,7 @@ contributing them to the project:: ./ci/code_checks.sh -The script verify the linting of code files, it looks for common mistake patterns +The script verifies the linting of code files, it looks for common mistake patterns (like missing spaces around sphinx directives that make the documentation not being rendered properly) and it also validates the doctests. It is possible to run the checks independently by using the parameters ``lint``, ``patterns`` and @@ -675,7 +675,7 @@ Otherwise, you need to do it manually: You'll also need to -1. write a new test that asserts a warning is issued when calling with the deprecated argument +1. Write a new test that asserts a warning is issued when calling with the deprecated argument 2. Update all of pandas existing tests and code to use the new argument See :ref:`contributing.warnings` for more. 
From 590cb547d5ca19b7fbbfb26d6bfd44b00061fab3 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 19 Feb 2019 05:12:34 -0800 Subject: [PATCH 004/110] Remove spurious MultiIndex creation in `_set_axis_name` (#25371) * Resovles #25370 * Introduced by #22969 --- pandas/core/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3a73861086bed..6e79c02d7dbdd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1333,7 +1333,6 @@ def _set_axis_name(self, name, axis=0, inplace=False): cat 4 monkey 2 """ - pd.MultiIndex.from_product([["mammal"], ['dog', 'cat', 'monkey']]) axis = self._get_axis_number(axis) idx = self._get_axis(axis).set_names(name) From 9561b9621fd41735b59c48a61a860b4ec9a3a1a3 Mon Sep 17 00:00:00 2001 From: Shivam Rana Date: Tue, 19 Feb 2019 18:44:24 +0530 Subject: [PATCH 005/110] #23049: test for Fatal Stack Overflow stemming From Misuse of astype('category') (#25366) --- pandas/tests/frame/test_combine_concat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 59497153c8524..c2364dc135a9a 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -504,6 +504,16 @@ def test_concat_numerical_names(self): names=[1, 2])) tm.assert_frame_equal(result, expected) + def test_concat_astype_dup_col(self): + # gh 23049 + df = pd.DataFrame([{'a': 'b'}]) + df = pd.concat([df, df], axis=1) + + result = df.astype('category') + expected = pd.DataFrame(np.array(["b", "b"]).reshape(1, 2), + columns=["a", "a"]).astype("category") + tm.assert_frame_equal(result, expected) + class TestDataFrameCombineFirst(TestData): From dbe357933fc7536ddf4fe8f62492a4d6a38bd0db Mon Sep 17 00:00:00 2001 From: Shivam Rana Date: Tue, 19 Feb 2019 19:02:41 +0530 Subject: [PATCH 006/110] 9236: test for the DataFrame.groupby with MultiIndex having pd.NaT (#25310) --- 
pandas/tests/groupby/test_groupby.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1ae8efd2f6867..12a5d494648fc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1698,3 +1698,19 @@ def test_groupby_agg_ohlc_non_first(): result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) tm.assert_frame_equal(result, expected) + + +def test_groupby_multiindex_nat(): + # GH 9236 + values = [ + (pd.NaT, 'a'), + (datetime(2012, 1, 2), 'a'), + (datetime(2012, 1, 2), 'b'), + (datetime(2012, 1, 3), 'a') + ] + mi = pd.MultiIndex.from_tuples(values, names=['date', None]) + ser = pd.Series([3, 2, 2.5, 4], index=mi) + + result = ser.groupby(level=1).mean() + expected = pd.Series([3., 2.5], index=["a", "b"]) + assert_series_equal(result, expected) From c400bd3158bb125dbfdb19f35a5cbf9da80acbde Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Tue, 19 Feb 2019 21:46:03 +0800 Subject: [PATCH 007/110] [BUG] exception handling of MultiIndex.__contains__ too narrow (#25268) --- doc/source/whatsnew/v0.25.0.rst | 3 +-- pandas/core/indexes/multi.py | 2 +- pandas/tests/indexing/multiindex/test_multiindex.py | 8 ++++++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 686c5ad0165e7..9d33c651ef283 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -149,10 +149,9 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in which incorrect exception raised by :meth:`pd.Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) - - -- - I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index efb77b5d155a1..c19b6f61f2caa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -840,7 +840,7 @@ def __contains__(self, key): try: self.get_loc(key) return True - except 
(LookupError, TypeError): + except (LookupError, TypeError, ValueError): return False contains = __contains__ diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 4f5517f89e852..ccf017489e046 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -84,3 +84,11 @@ def test_multi_nan_indexing(self): name='a'), Index(['C1', 'C2', 'C3', 'C4'], name='b')]) tm.assert_frame_equal(result, expected) + + def test_contains(self): + # GH 24570 + tx = pd.timedelta_range('09:30:00', '16:00:00', freq='30 min') + idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) + assert tx[0] in idx + assert 'element_not_exit' not in idx + assert '0 day 09:30:00' in idx From f9cb58148c85b99b3829a496c3b8f6fc25c99a95 Mon Sep 17 00:00:00 2001 From: Shivam Rana Date: Tue, 19 Feb 2019 19:41:41 +0530 Subject: [PATCH 008/110] 14873: test for groupby.agg coercing booleans (#25327) --- .../tests/groupby/aggregate/test_aggregate.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 9de8a08809009..0c2e74c0b735f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -286,3 +286,20 @@ def test_multi_function_flexible_mix(df): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_coercing_bools(): + # issue 14873 + dat = pd.DataFrame( + {'a': [1, 1, 2, 2], 'b': [0, 1, 2, 3], 'c': [None, None, 1, 1]}) + gp = dat.groupby('a') + + index = Index([1, 2], name='a') + + result = gp['b'].aggregate(lambda x: (x != 0).all()) + expected = Series([False, True], index=index, name='b') + tm.assert_series_equal(result, expected) + + result = gp['c'].aggregate(lambda x: 
x.isnull().all()) + expected = Series([True, False], index=index, name='c') + tm.assert_series_equal(result, expected) From b2c751985ebd09b72d917d08dc06193dc0922018 Mon Sep 17 00:00:00 2001 From: Saurav Chakravorty Date: Tue, 19 Feb 2019 21:44:22 +0530 Subject: [PATCH 009/110] BUG/ENH: Timestamp.strptime (#25124) * BUG: constructor Timestamp.strptime() does not support %z. * Add doc string to NaT and Timestamp * updated the error message * Updated whatsnew entry. --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/_libs/tslibs/nattype.pyx | 9 ++++++++- pandas/_libs/tslibs/timestamps.pyx | 11 +++++++++++ pandas/tests/scalar/timestamp/test_timestamp.py | 8 ++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 9d33c651ef283..afde665407d18 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -28,6 +28,8 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- :meth:`Timestamp.strptime` will now rise a NotImplementedError (:issue:`21257`) + .. _whatsnew_0250.api.other: Other API Changes diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index a13fcfdc855d5..79e2e256c501d 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -374,7 +374,6 @@ class NaTType(_NaT): utctimetuple = _make_error_func('utctimetuple', datetime) timetz = _make_error_func('timetz', datetime) timetuple = _make_error_func('timetuple', datetime) - strptime = _make_error_func('strptime', datetime) strftime = _make_error_func('strftime', datetime) isocalendar = _make_error_func('isocalendar', datetime) dst = _make_error_func('dst', datetime) @@ -388,6 +387,14 @@ class NaTType(_NaT): # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. 
+ strptime = _make_error_func('strptime', # noqa:E128 + """ + Timestamp.strptime(string, format) + + Function is not implemented. Use pd.to_datetime(). + """ + ) + utcfromtimestamp = _make_error_func('utcfromtimestamp', # noqa:E128 """ Timestamp.utcfromtimestamp(ts) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index a2929dbeb471f..8d825e0a6179e 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -697,6 +697,17 @@ class Timestamp(_Timestamp): """ return cls(datetime.fromtimestamp(ts)) + # Issue 25016. + @classmethod + def strptime(cls, date_string, format): + """ + Timestamp.strptime(string, format) + + Function is not implemented. Use pd.to_datetime(). + """ + raise NotImplementedError("Timestamp.strptime() is not implmented." + "Use to_datetime() to parse date strings.") + @classmethod def combine(cls, date, time): """ diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index f42fad4c925f0..7d81d905eac4f 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -355,6 +355,14 @@ def test_constructor_invalid_tz(self): # interpreted as a `freq` Timestamp('2012-01-01', 'US/Pacific') + def test_constructor_strptime(self): + # GH25016 + # Test support for Timestamp.strptime + fmt = '%Y%m%d-%H%M%S-%f%z' + ts = '20190129-235348-000001+0000' + with pytest.raises(NotImplementedError): + Timestamp.strptime(ts, fmt) + def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 stamps = [Timestamp(year=2017, month=10, day=22, tz='UTC'), From 2909b830fa21c6bc2e9797aae25b13f9a060653a Mon Sep 17 00:00:00 2001 From: Zach Angell <42625717+zangell44@users.noreply.github.com> Date: Tue, 19 Feb 2019 22:51:04 -0500 Subject: [PATCH 010/110] Interval dtype fix (#25338) --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/dtypes/dtypes.py | 19 ++++++++++++------- 
pandas/tests/dtypes/test_dtypes.py | 14 +++++++++----- pandas/tests/series/test_operators.py | 7 +++++++ 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8e59c2300e7ca..f528c058d2868 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -26,6 +26,7 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) +- Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) .. _whatsnew_0242.enhancements: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 640d43f3b0e03..11a132c4d14ee 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -931,13 +931,18 @@ def construct_from_string(cls, string): attempt to construct this type from a string, raise a TypeError if its not possible """ - if (isinstance(string, compat.string_types) and - (string.startswith('interval') or - string.startswith('Interval'))): - return cls(string) + if not isinstance(string, compat.string_types): + msg = "a string needs to be passed, got type {typ}" + raise TypeError(msg.format(typ=type(string))) + + if (string.lower() == 'interval' or + cls._match.search(string) is not None): + return cls(string) - msg = "a string needs to be passed, got type {typ}" - raise TypeError(msg.format(typ=type(string))) + msg = ('Incorrectly formatted string passed to constructor. 
' + 'Valid formats include Interval or Interval[dtype] ' + 'where dtype is numeric, datetime, or timedelta') + raise TypeError(msg) @property def type(self): @@ -978,7 +983,7 @@ def is_dtype(cls, dtype): return True else: return False - except ValueError: + except (ValueError, TypeError): return False else: return False diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 710f215686eab..1c1442d6f2f23 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -511,10 +511,11 @@ def test_construction_not_supported(self, subtype): with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) - def test_construction_errors(self): + @pytest.mark.parametrize('subtype', ['xx', 'IntervalA', 'Interval[foo]']) + def test_construction_errors(self, subtype): msg = 'could not construct IntervalDtype' with pytest.raises(TypeError, match=msg): - IntervalDtype('xx') + IntervalDtype(subtype) def test_construction_from_string(self): result = IntervalDtype('interval[int64]') @@ -523,7 +524,7 @@ def test_construction_from_string(self): assert is_dtype_equal(self.dtype, result) @pytest.mark.parametrize('string', [ - 'foo', 'foo[int64]', 0, 3.14, ('a', 'b'), None]) + 0, 3.14, ('a', 'b'), None]) def test_construction_from_string_errors(self, string): # these are invalid entirely msg = 'a string needs to be passed, got type' @@ -532,10 +533,12 @@ def test_construction_from_string_errors(self, string): IntervalDtype.construct_from_string(string) @pytest.mark.parametrize('string', [ - 'interval[foo]']) + 'foo', 'foo[int64]', 'IntervalA']) def test_construction_from_string_error_subtype(self, string): # this is an invalid subtype - msg = 'could not construct IntervalDtype' + msg = ("Incorrectly formatted string passed to constructor. 
" + r"Valid formats include Interval or Interval\[dtype\] " + "where dtype is numeric, datetime, or timedelta") with pytest.raises(TypeError, match=msg): IntervalDtype.construct_from_string(string) @@ -559,6 +562,7 @@ def test_is_dtype(self): assert not IntervalDtype.is_dtype('U') assert not IntervalDtype.is_dtype('S') assert not IntervalDtype.is_dtype('foo') + assert not IntervalDtype.is_dtype('IntervalA') assert not IntervalDtype.is_dtype(np.object_) assert not IntervalDtype.is_dtype(np.int64) assert not IntervalDtype.is_dtype(np.float64) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 4d3c9926fc5ae..b2aac441db195 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -563,6 +563,13 @@ def test_comp_ops_df_compat(self): with pytest.raises(ValueError, match=msg): left.to_frame() < right.to_frame() + def test_compare_series_interval_keyword(self): + # GH 25338 + s = Series(['IntervalA', 'IntervalB', 'IntervalC']) + result = s == 'IntervalA' + expected = Series([True, False, False]) + assert_series_equal(result, expected) + class TestSeriesFlexComparisonOps(object): From f4568fd76e864d8aee3d23f5a81302262d6e0dcb Mon Sep 17 00:00:00 2001 From: Thijs Damsma Date: Wed, 20 Feb 2019 09:04:31 +0100 Subject: [PATCH 011/110] [CLN] Excel Module Cleanups (#25275) Closes gh-25153 Authored-By: tdamsma --- pandas/io/excel/_base.py | 5 ++--- pandas/io/excel/_util.py | 39 ++++++++++++++++++++--------------- pandas/tests/io/test_excel.py | 7 ++----- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ed5943e9a1698..8f7bf8e0466f9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -590,9 +590,8 @@ def __new__(cls, path, engine=None, **kwargs): if engine == 'auto': engine = _get_default_writer(ext) except KeyError: - error = ValueError("No engine for filetype: '{ext}'" - .format(ext=ext)) - raise error + 
raise ValueError("No engine for filetype: '{ext}'" + .format(ext=ext)) cls = get_writer(engine) return object.__new__(cls) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 1aeaf70f0832e..49255d83d1cd3 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -5,32 +5,39 @@ from pandas.core.dtypes.common import is_integer, is_list_like -from pandas.core import config - -_writer_extensions = ["xlsx", "xls", "xlsm"] - - _writers = {} def register_writer(klass): - """Adds engine to the excel writer registry. You must use this method to - integrate with ``to_excel``. Also adds config options for any new - ``supported_extensions`` defined on the writer.""" + """ + Add engine to the excel writer registry.io.excel. + + You must use this method to integrate with ``to_excel``. + + Parameters + ---------- + klass : ExcelWriter + """ if not callable(klass): raise ValueError("Can only register callables as engines") engine_name = klass.engine _writers[engine_name] = klass - for ext in klass.supported_extensions: - if ext.startswith('.'): - ext = ext[1:] - if ext not in _writer_extensions: - config.register_option("io.excel.{ext}.writer".format(ext=ext), - engine_name, validator=str) - _writer_extensions.append(ext) def _get_default_writer(ext): + """ + Return the default writer for the given extension. + + Parameters + ---------- + ext : str + The excel file extension for which to get the default engine. + + Returns + ------- + str + The default engine for the extension. 
+ """ _default_writers = {'xlsx': 'openpyxl', 'xlsm': 'openpyxl', 'xls': 'xlwt'} try: import xlsxwriter # noqa @@ -230,8 +237,6 @@ def _fill_mi_header(row, control_row): return _maybe_convert_to_string(row), control_row -# fill blank if index_col not None - def _pop_header_name(row, index_col): """ diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 09b2d86bde3d3..04c9c58a326a4 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -2359,7 +2359,7 @@ def test_register_writer(self): class DummyClass(ExcelWriter): called_save = False called_write_cells = False - supported_extensions = ['test', 'xlsx', 'xls'] + supported_extensions = ['xlsx', 'xls'] engine = 'dummy' def save(self): @@ -2377,12 +2377,9 @@ def check_called(func): with pd.option_context('io.excel.xlsx.writer', 'dummy'): register_writer(DummyClass) - writer = ExcelWriter('something.test') + writer = ExcelWriter('something.xlsx') assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) - - func = lambda: df.to_excel('something.test') - check_called(func) check_called(lambda: df.to_excel('something.xlsx')) check_called( lambda: df.to_excel( From 66d486ef2cb622475c0d48a3faeafcf927fc1a9c Mon Sep 17 00:00:00 2001 From: tamuhey Date: Wed, 20 Feb 2019 19:28:43 +0900 Subject: [PATCH 012/110] ENH: indexing and __getitem__ of dataframe and series accept zerodim integer np.array as int (#24924) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/frame.py | 1 + pandas/core/indexing.py | 3 +++ pandas/tests/indexing/test_iloc.py | 13 +++++++++++++ pandas/tests/indexing/test_loc.py | 13 +++++++++++++ pandas/tests/indexing/test_scalar.py | 13 +++++++++++++ 6 files changed, 44 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index afde665407d18..ef004af0ea6f7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -19,6 +19,7 @@ including other versions of pandas. 
Other Enhancements ^^^^^^^^^^^^^^^^^^ +- Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a239ff4b4d5db..79f209f9ebc0a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2838,6 +2838,7 @@ def _ixs(self, i, axis=0): return result def __getitem__(self, key): + key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) # shortcut if the key is in columns diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 539da0beaefb4..623a48acdd48b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -5,6 +5,7 @@ import numpy as np from pandas._libs.indexing import _NDFrameIndexerBase +from pandas._libs.lib import item_from_zerodim import pandas.compat as compat from pandas.compat import range, zip from pandas.errors import AbstractMethodError @@ -1856,6 +1857,7 @@ def _getitem_axis(self, key, axis=None): if axis is None: axis = self.axis or 0 + key = item_from_zerodim(key) if is_iterator(key): key = list(key) @@ -2222,6 +2224,7 @@ def _getitem_axis(self, key, axis=None): # a single integer else: + key = item_from_zerodim(key) if not is_integer(key): raise TypeError("Cannot index by location index with a " "non-integer key") diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 5c87d553daba3..69ec6454e952a 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -697,3 +697,16 @@ def test_identity_slice_returns_new_object(self): # should also be a shallow copy original_series[:3] = [7, 8, 9] assert all(sliced_series[:3] == [7, 8, 9]) + + def test_indexing_zerodim_np_array(self): + # GH24919 + df = 
DataFrame([[1, 2], [3, 4]]) + result = df.iloc[np.array(0)] + s = pd.Series([1, 2], name=0) + tm.assert_series_equal(result, s) + + def test_series_indexing_zerodim_np_array(self): + # GH24919 + s = Series([1, 2]) + result = s.iloc[np.array(0)] + assert result == 1 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3bf4a6bee4af9..29f70929624fc 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -778,3 +778,16 @@ def test_loc_setitem_empty_append_raises(self): msg = "cannot copy sequence with size 2 to array axis with dimension 0" with pytest.raises(ValueError, match=msg): df.loc[0:2, 'x'] = data + + def test_indexing_zerodim_np_array(self): + # GH24924 + df = DataFrame([[1, 2], [3, 4]]) + result = df.loc[np.array(0)] + s = pd.Series([1, 2], name=0) + tm.assert_series_equal(result, s) + + def test_series_indexing_zerodim_np_array(self): + # GH24924 + s = Series([1, 2]) + result = s.loc[np.array(0)] + assert result == 1 diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 6d607ce86c08e..0cd41562541d1 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -221,3 +221,16 @@ def test_iat_setter_incompatible_assignment(self): result.iat[0, 0] = None expected = DataFrame({"a": [None, 1], "b": [4, 5]}) tm.assert_frame_equal(result, expected) + + def test_getitem_zerodim_np_array(self): + # GH24924 + # dataframe __getitem__ + df = DataFrame([[1, 2], [3, 4]]) + result = df[np.array(0)] + expected = Series([1, 3], name=0) + tm.assert_series_equal(result, expected) + + # series __getitem__ + s = Series([1, 2]) + result = s[np.array(0)] + assert result == 1 From def8b962e50d88efdc99d78c807a14519b19cb36 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Feb 2019 02:52:01 -0800 Subject: [PATCH 013/110] REGR: fix TimedeltaIndex sum and datetime subtraction with NaT (#25282, #25317) (#25329) --- 
doc/source/whatsnew/v0.24.2.rst | 2 ++ pandas/core/arrays/datetimes.py | 6 +++--- pandas/core/arrays/timedeltas.py | 5 +++++ pandas/core/indexes/base.py | 3 ++- pandas/tests/arithmetic/test_datetime64.py | 14 ++++++++++++++ pandas/tests/arrays/test_timedeltas.py | 22 ++++++++++++++++++++++ 6 files changed, 48 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index f528c058d2868..a7e522d27f8e2 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -26,6 +26,8 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) +- Fixed regression in subtraction between :class:`Series` objects with ``datetime64[ns]`` dtype incorrectly raising ``OverflowError`` when the `Series` on the right contains null values (:issue:`25317`) +- Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) .. 
_whatsnew_0242.enhancements: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cd8e8ed520ddc..75cf658423210 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -720,11 +720,11 @@ def _sub_datetime_arraylike(self, other): self_i8 = self.asi8 other_i8 = other.asi8 + arr_mask = self._isnan | other._isnan new_values = checked_add_with_arr(self_i8, -other_i8, - arr_mask=self._isnan) + arr_mask=arr_mask) if self._hasnans or other._hasnans: - mask = (self._isnan) | (other._isnan) - new_values[mask] = iNaT + new_values[arr_mask] = iNaT return new_values.view('timedelta64[ns]') def _add_offset(self, offset): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 06e2bf76fcf96..74fe8072e6924 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -190,6 +190,8 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): "ndarray, or Series or Index containing one of those." ) raise ValueError(msg.format(type(values).__name__)) + if values.ndim != 1: + raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == 'i8': # for compat with datetime/timedelta/period shared methods, @@ -945,6 +947,9 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): .format(dtype=data.dtype)) data = np.array(data, copy=copy) + if data.ndim != 1: + raise ValueError("Only 1-dimensional input arrays are supported.") + assert data.dtype == 'm8[ns]', data return data, inferred_freq diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f2c8ac6e9b413..b5f3c929a7f36 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -665,7 +665,8 @@ def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. 
""" - if is_bool_dtype(result): + result = lib.item_from_zerodim(result) + if is_bool_dtype(result) or lib.is_scalar(result): return result attrs = self._get_attributes_dict() diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 405dc0805a285..c81a371f37dc1 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1440,6 +1440,20 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, class TestDatetime64OverflowHandling(object): # TODO: box + de-duplicate + def test_dt64_overflow_masking(self, box_with_array): + # GH#25317 + left = Series([Timestamp('1969-12-31')]) + right = Series([NaT]) + + left = tm.box_expected(left, box_with_array) + right = tm.box_expected(right, box_with_array) + + expected = TimedeltaIndex([NaT]) + expected = tm.box_expected(expected, box_with_array) + + result = left - right + tm.assert_equal(result, expected) + def test_dt64_series_arith_overflow(self): # GH#12534, fixed by GH#19024 dt = pd.Timestamp('1700-01-31') diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 6b4662ca02e80..1fec533a14a6f 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -9,6 +9,18 @@ class TestTimedeltaArrayConstructor(object): + def test_only_1dim_accepted(self): + # GH#25282 + arr = np.array([0, 1, 2, 3], dtype='m8[h]').astype('m8[ns]') + + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 2-dim + TimedeltaArray(arr.reshape(2, 2)) + + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + TimedeltaArray(arr[[0]].squeeze()) + def test_freq_validation(self): # ensure that the public constructor cannot create an invalid instance arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9 @@ -51,6 +63,16 @@ def test_copy(self): class TestTimedeltaArray(object): + def test_np_sum(self): + # GH#25282 + vals = np.arange(5, 
dtype=np.int64).view('m8[h]').astype('m8[ns]') + arr = TimedeltaArray(vals) + result = np.sum(arr) + assert result == vals.sum() + + result = np.sum(pd.TimedeltaIndex(arr)) + assert result == vals.sum() + def test_from_sequence_dtype(self): msg = "dtype .*object.* cannot be converted to timedelta64" with pytest.raises(ValueError, match=msg): From 13a505d5541a8fab878c2175f60328f5816b4d25 Mon Sep 17 00:00:00 2001 From: Saurav Chakravorty Date: Wed, 20 Feb 2019 19:36:49 +0530 Subject: [PATCH 014/110] edited whatsnew typo (#25381) --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ef004af0ea6f7..6e225185ecf84 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -29,7 +29,7 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :meth:`Timestamp.strptime` will now rise a NotImplementedError (:issue:`21257`) +- :meth:`Timestamp.strptime` will now raise a NotImplementedError (:issue:`25016`) .. _whatsnew_0250.api.other: From 6c4cb6f08a83b20a82e0ea8cc0f2474b0fab4350 Mon Sep 17 00:00:00 2001 From: knuu Date: Thu, 21 Feb 2019 00:50:58 +0900 Subject: [PATCH 015/110] fix typo of see also in DataFrame stat funcs (#25388) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6e79c02d7dbdd..3647565123523 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10866,7 +10866,7 @@ def _doc_parms(cls): Series.max : Return the maximum. Series.idxmin : Return the index of the minimum. Series.idxmax : Return the index of the maximum. -DataFrame.min : Return the sum over the requested axis. +DataFrame.sum : Return the sum over the requested axis. DataFrame.min : Return the minimum over the requested axis. DataFrame.max : Return the maximum over the requested axis. 
DataFrame.idxmin : Return the index of the minimum over the requested axis. From 54492791116108199c24734a0220560974eb3372 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 20 Feb 2019 16:47:35 +0000 Subject: [PATCH 016/110] API: more consistent error message for MultiIndex.from_arrays (#25189) --- pandas/core/indexes/multi.py | 8 +++++++- .../tests/indexes/multi/test_constructor.py | 19 +++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c19b6f61f2caa..492d28476e1f0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -324,11 +324,17 @@ def from_arrays(cls, arrays, sortorder=None, names=None): codes=[[0, 0, 1, 1], [1, 0, 1, 0]], names=['number', 'color']) """ + error_msg = "Input must be a list / sequence of array-likes." if not is_list_like(arrays): - raise TypeError("Input must be a list / sequence of array-likes.") + raise TypeError(error_msg) elif is_iterator(arrays): arrays = list(arrays) + # Check if elements of array are list-like + for array in arrays: + if not is_list_like(array): + raise TypeError(error_msg) + # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 055d54c613260..fe90e85cf93c8 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -142,6 +142,15 @@ def test_from_arrays_iterator(idx): MultiIndex.from_arrays(0) +def test_from_arrays_tuples(idx): + arrays = tuple(tuple(np.asarray(lev).take(level_codes)) + for lev, level_codes in zip(idx.levels, idx.codes)) + + # tuple of tuples as input + result = MultiIndex.from_arrays(arrays, names=idx.names) + tm.assert_index_equal(result, idx) + + def test_from_arrays_index_series_datetimetz(): idx1 = pd.date_range('2015-01-01 10:00', freq='D', 
periods=3, tz='US/Eastern') @@ -254,11 +263,13 @@ def test_from_arrays_empty(): @pytest.mark.parametrize('invalid_sequence_of_arrays', [ - 1, [1], [1, 2], [[1], 2], 'a', ['a'], ['a', 'b'], [['a'], 'b']]) + 1, [1], [1, 2], [[1], 2], [1, [2]], 'a', ['a'], ['a', 'b'], [['a'], 'b'], + (1,), (1, 2), ([1], 2), (1, [2]), 'a', ('a',), ('a', 'b'), (['a'], 'b'), + [(1,), 2], [1, (2,)], [('a',), 'b'], + ((1,), 2), (1, (2,)), (('a',), 'b') +]) def test_from_arrays_invalid_input(invalid_sequence_of_arrays): - msg = (r"Input must be a list / sequence of array-likes|" - r"Input must be list-like|" - r"object of type 'int' has no len\(\)") + msg = "Input must be a list / sequence of array-likes" with pytest.raises(TypeError, match=msg): MultiIndex.from_arrays(arrays=invalid_sequence_of_arrays) From 9c0f6a8d703b6bee48918f2c5d16418a7ff736e3 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Thu, 21 Feb 2019 14:53:42 +0100 Subject: [PATCH 017/110] CLN: (re-)enable infer_dtype to catch complex (#25382) --- pandas/_libs/lib.pyx | 4 ++++ pandas/tests/dtypes/test_inference.py | 31 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1f0f0a408aee8..34ceeb20e260e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -939,6 +939,7 @@ _TYPE_MAP = { 'float32': 'floating', 'float64': 'floating', 'f': 'floating', + 'complex64': 'complex', 'complex128': 'complex', 'c': 'complex', 'string': 'string' if PY2 else 'bytes', @@ -1305,6 +1306,9 @@ def infer_dtype(value: object, skipna: object=None) -> str: elif is_decimal(val): return 'decimal' + elif is_complex(val): + return 'complex' + elif util.is_float_object(val): if is_float_array(values): return 'floating' diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 49a66efaffc11..187b37d4f788e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ 
b/pandas/tests/dtypes/test_inference.py @@ -618,6 +618,37 @@ def test_decimals(self): result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' + # complex is compatible with nan, so skipna has no effect + @pytest.mark.parametrize('skipna', [True, False]) + def test_complex(self, skipna): + # gets cast to complex on array construction + arr = np.array([1.0, 2.0, 1 + 1j]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == 'complex' + + arr = np.array([1.0, 2.0, 1 + 1j], dtype='O') + result = lib.infer_dtype(arr, skipna=skipna) + assert result == 'mixed' + + # gets cast to complex on array construction + arr = np.array([1, np.nan, 1 + 1j]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == 'complex' + + arr = np.array([1.0, np.nan, 1 + 1j], dtype='O') + result = lib.infer_dtype(arr, skipna=skipna) + assert result == 'mixed' + + # complex with nans stays complex + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype='O') + result = lib.infer_dtype(arr, skipna=skipna) + assert result == 'complex' + + # test smaller complex dtype; will pass through _try_infer_map fastpath + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == 'complex' + def test_string(self): pass From b6731886db957b3e667449f67b6c95a638f2ac81 Mon Sep 17 00:00:00 2001 From: Wouter De Coster Date: Fri, 22 Feb 2019 16:13:30 +0100 Subject: [PATCH 018/110] DOC: Edited docstring of Interval (#25410) The docstring contained a repeated segment, which I removed. --- pandas/_libs/interval.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index eb511b1adb28a..e86b692e9915e 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -150,9 +150,6 @@ cdef class Interval(IntervalMixin): Left bound for the interval. right : orderable scalar Right bound for the interval. 
- closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the interval is closed on the left-side, right-side, both or - neither. closed : {'right', 'left', 'both', 'neither'}, default 'right' Whether the interval is closed on the left-side, right-side, both or neither. See the Notes for more detailed explanation. From fbe67d5ce06d6d3766beec1c38dceca89e6ca942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Chv=C3=A1tal?= Date: Fri, 22 Feb 2019 20:04:25 +0100 Subject: [PATCH 019/110] Mark test_pct_max_many_rows as high memory (#25400) Fixes issue #25384 --- pandas/tests/frame/test_rank.py | 1 + pandas/tests/series/test_rank.py | 1 + pandas/tests/test_algos.py | 1 + 3 files changed, 3 insertions(+) diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index 10c42e0d1a1cf..6bb9dea15d1ce 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -310,6 +310,7 @@ def test_rank_pct_true(self, method, exp): tm.assert_frame_equal(result, expected) @pytest.mark.single + @pytest.mark.high_memory def test_pct_max_many_rows(self): # GH 18271 df = DataFrame({'A': np.arange(2**24 + 1), diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 510a51e002918..dfcda889269ee 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -499,6 +499,7 @@ def test_rank_first_pct(dtype, ser, exp): @pytest.mark.single +@pytest.mark.high_memory def test_pct_max_many_rows(): # GH 18271 s = Series(np.arange(2**24 + 1)) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 888cf78a1c66a..cb7426ce2f7c9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1484,6 +1484,7 @@ def test_too_many_ndims(self): algos.rank(arr) @pytest.mark.single + @pytest.mark.high_memory @pytest.mark.parametrize('values', [ np.arange(2**24 + 1), np.arange(2**25 + 2).reshape(2**24 + 1, 2)], From f2fbebdf7b8da3d354d3957ba5e87d736849aa15 Mon 
Sep 17 00:00:00 2001 From: willweil <32082133+willweil@users.noreply.github.com> Date: Fri, 22 Feb 2019 19:16:37 -0500 Subject: [PATCH 020/110] Correct a typo of version number for interpolate() (#25418) --- doc/source/user_guide/missing_data.rst | 2 +- pandas/core/generic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index a462f01dcd14f..7883814e91c94 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -335,7 +335,7 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -.. versionadded:: 0.21.0 +.. versionadded:: 0.23.0 The ``limit_area`` keyword argument was added. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3647565123523..eb84a9a5810f4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6648,7 +6648,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, (interpolate). * 'outside': Only fill NaNs outside valid values (extrapolate). - .. versionadded:: 0.21.0 + .. versionadded:: 0.23.0 downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. 
From 7408c9be6631697b8d15a3746bbb9eb6261aef22 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sat, 23 Feb 2019 19:36:37 +0100 Subject: [PATCH 021/110] DEP: add pytest-mock to environment.yml (#25417) --- environment.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 47fe8e4c2a640..ce68dccca0c07 100644 --- a/environment.yml +++ b/environment.yml @@ -20,6 +20,7 @@ dependencies: - isort - moto - pytest>=4.0 + - pytest-mock - sphinx - numpydoc diff --git a/requirements-dev.txt b/requirements-dev.txt index 76aaeefa648f4..22c01ebcef7f0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ hypothesis>=3.82 isort moto pytest>=4.0 +pytest-mock sphinx numpydoc beautifulsoup4>=4.2.1 From 15d8178a3f81febe4d8ddb033d07e57359eb6167 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Sat, 23 Feb 2019 20:12:32 +0100 Subject: [PATCH 022/110] BUG: Fix type coercion in read_json orient='table' (#21345) (#25219) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/json/json.py | 20 +++++++++++++++---- .../tests/io/json/test_json_table_schema.py | 4 ++-- pandas/tests/io/json/test_pandas.py | 15 ++++++++++++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6e225185ecf84..c0e00c7bf6f54 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -160,6 +160,7 @@ I/O ^^^ - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) +- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - - - diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 4bbccc8339d7c..725e2d28ffd67 100644 --- a/pandas/io/json/json.py +++ 
b/pandas/io/json/json.py @@ -226,7 +226,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii, return serialized -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, lines=False, chunksize=None, compression='infer'): @@ -278,8 +278,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, typ : type of object to recover (series or frame), default 'frame' dtype : boolean or dict, default True - If True, infer dtypes, if a dict of column to dtype, then use those, + If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. + + Not applicable with ``orient='table'``. + + .. versionchanged:: 0.25 + + Not applicable with ``orient='table'``. + convert_axes : boolean, default True Try to convert the axes to the proper dtypes. 
convert_dates : boolean, default True @@ -408,6 +415,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ + if orient == 'table' and dtype: + raise ValueError("cannot pass both dtype and orient='table'") + + dtype = orient != 'table' if dtype is None else dtype + compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -600,15 +612,15 @@ class Parser(object): 'us': long(31536000000000), 'ns': long(31536000000000000)} - def __init__(self, json, orient, dtype=True, convert_axes=True, + def __init__(self, json, orient, dtype=None, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False, precise_float=False, date_unit=None): self.json = json if orient is None: orient = self._default_orient - self.orient = orient + self.dtype = dtype if orient == "split": diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 6fa3b5b3b2ed4..3002d1dfb5f8a 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -502,12 +502,12 @@ class TestTableOrientReader(object): @pytest.mark.parametrize("vals", [ {'ints': [1, 2, 3, 4]}, {'objects': ['a', 'b', 'c', 'd']}, + {'objects': ['1', '2', '3', '4']}, {'date_ranges': pd.date_range('2016-01-01', freq='d', periods=4)}, {'categoricals': pd.Series(pd.Categorical(['a', 'b', 'c', 'c']))}, {'ordered_cats': pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], ordered=True))}, - pytest.param({'floats': [1., 2., 3., 4.]}, - marks=pytest.mark.xfail), + {'floats': [1., 2., 3., 4.]}, {'floats': [1.1, 2.2, 3.3, 4.4]}, {'bools': [True, False, False, True]}]) def test_read_json_table_orient(self, index_nm, vals, recwarn): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 
0ffc8c978a228..fecd0f0572757 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1202,6 +1202,21 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + def test_from_json_to_json_table_dtypes(self): + # GH21345 + expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = expected.to_json(orient='table') + result = pd.read_json(dfjson, orient='table') + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('dtype', [True, {'b': int, 'c': int}]) + def test_read_json_table_dtype_raises(self, dtype): + # GH21345 + df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) + dfjson = df.to_json(orient='table') + with pytest.raises(ValueError): + pd.read_json(dfjson, orient='table', dtype=dtype) + @pytest.mark.parametrize('data, expected', [ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']), {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}), From 5557e3627df67211f5b6406d07995dccb02196f9 Mon Sep 17 00:00:00 2001 From: sudhir mohanraj Date: Sat, 23 Feb 2019 14:47:46 -0500 Subject: [PATCH 023/110] ERR: doc update for ParsingError (#25414) Closes gh-22881 --- pandas/errors/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index c57d27ff03ac6..493ee65f63c6a 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -32,6 +32,8 @@ class UnsortedIndexError(KeyError): class ParserError(ValueError): """ Exception that is raised by an error encountered in `pd.read_csv`. + + e.g. HTML Parsing will raise this error. 
""" From 3855a27be4f04d15e7ba7aee12f0220c93148d3d Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 23 Feb 2019 21:43:20 +0000 Subject: [PATCH 024/110] ENH: Add in sort keyword to DatetimeIndex.union (#25110) --- doc/source/styled.xlsx | Bin 0 -> 5682 bytes doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/datetimes.py | 38 ++++- pandas/tests/indexes/datetimes/test_setops.py | 138 ++++++++++++------ 4 files changed, 123 insertions(+), 54 deletions(-) create mode 100644 doc/source/styled.xlsx diff --git a/doc/source/styled.xlsx b/doc/source/styled.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1233ff2b8692bad1cfcd54f59d26eb315cfa728c GIT binary patch literal 5682 zcmZ`-1z3}9_aEI#3Ze)|3o^QEv>+i}0uqkcMt6@=I!2e2sC0wWMu&ia0@58SjWqI~ z@B95fU%B3IyYA=Nu4})ubD!s&^E>xXQxyxF5&!_;13p(uKBi|H#zLaMQD6c9MCiMj zvo*v8$o=bDnxGNcE<@barjm9}tB>liko4Mw8eU*Jv0loFCo)o0`ul{pKoS{&O~q2K$v?!mX##}swl>kxCc&A6ncxA^jty9ve{7U(Bm4#& zy}?TKB1JDN2TM(72S*p+Ge<{mPbfq!MvbVQhgAMbWpftBu>rD>sZc^C9>>8T41HmP zgr}#k;q>)h&Ch7uEc<(xDqRJ`bkr{&B*X%`%4pPXr>hT&EZGz=N(W{h+yYwVkR>7o zV(^d;Qg4~KpNdb~yA6qY+IMP?>N}(nif*pi@v@(!7$3x#)Jc$>*I!h?=O0~S{^_s7 z3&@upw4a>l2SfHxe=QxHt$#T@nD9ikgNL-IEn=mmg_SNZ_gkc;1|b})t3Ga~&O9+} zrC_#RxYRg&182xpQ*ve-w)C>98d|}(? z8>b#x!*{ z-xA=>>@Ex9G>4NZrnC~-+*&fZq!9NA%u2MZQnjNAbAGdKG8fb_J+?g1414Bxe^qeJ ztWQM@$E12-W#a5v^>me<&PCZ9PsVKzS>@J*;xj+z%S8_=_m)bZ^Vn!09E#Ppop(Jt zX(pLWq__Y;CL20+(b4gQ09|dYq1Jz2d4I)c%gESeN`%sXr~IQQ_y|hWEfiE&L*ZiU zK3$U*vZYQRN@frZ@v@2!320LiA{|I3*-B7*pCN(GwRt|IptQar;&rgiZLtyd0a=$v?ZDD{_B|t`%7O4cKLpPEM@RW%RQP-(;$m3_ zL+-vuom)9k{Zp<2P}+s%Jx`Ikfze3EyVwg)nzGxo4k2v?AcC5F+mBY0M^k0-uir&$ zYm6QH&40_V$s~+6))=$*m>r9G*C7W`bxglu3&^671Om+>nMPHKr?C!!Iw;jEmETHI|6^C;J zAly0fSkt#0R2Q+(o-k?>H$Kc!t4WKW-W&8o3C`FxQ-1j)rKzug-#g1sY{(uFU|Wn) zwAGX`oOt1~v654iIH4vx)gIqW)^i6hAo&r+#LR$?PJ4 z%x5a`v9{GrMXs!a8~n5=#qv9D;WncZ-am0pGX)9n#^0gC1mCGvWDK!w+PzoK9Ftz! 
zPR1!KWpP`G*17gpP@U7;5OX8)?h#M*(HgP~!xF(nDf8a%gYj-d_|_xK2L_8RX=t$KSB3MY3rYnH)%Fs??XgxiZx;7HK`ZolEeoyJ4S+|hbr zCOntF>B!n@@&~VuubhBDoGQo<&EqO62>c=<+-j>T*Idl zX6rSBMEr8~&Z=MQ5UsVWA}WWK9~{;14G@ciB@*cBJYLZWo*zARUtucb(N2+Tpp|XK zg1*n15fjNyjc+6z(~5KGINj{>q+JsIg5Btsst;YC92q%ZM?I;?-;qeB>v^%C$GDvv zmO9)a7w+X_mI_Yl))b(GLQ0sq6zk0!Cib`}7p%-5zQPMGe}cKMrEAV>+>ON&bkt%OE;`*ykF63Cs{+KKK#?L zv^R&CO#K!Sxw0jK`n5{l!ENodPZiM_kH)X6c3d7{-tA|(#=)zb+f~#a%KKUv&(0& zL~-TEnOBKqWaG2vFP?*^ux5w+|SH*!!7l}d>Q zC~yUbshf_nkW)rf5aTdXJ1_6-B-(jIO`LHBK|^%-tUgos%e_QN@;l2->aZM?O|cS% zPA};UHqo}SnyYqr!?GT-T}_*QlB}VYR6NK}Tew#p?vml%(5_H&WiWp4ly6?taawE+ z%XE!`ORm5l(3vy5MMbf!Oy`u~DHFCz-S=o-cg@ zu4R<*p@3N|rA6uM%F^yBUSh96g9GB+=OXjr3hA``pCd~|qX`{ApJi61r$3R{9FYjz zjmI(NO{){iUr{HNnEAOwQqkTSv$z%6hVdu8>adM%$zTHj3O4`%n!gFe!@=3!#m3s& z)dl$1?JpK_OEGg;79vKrsl*+3-Tx8$U=&d6gF51Q)v)M33@a8wu{aI6Pj5}fsyG|U zzGaV_;^Rn=jz0sXNmXC;E5Ww&=b6i-)Qu%IlqkOPXd2VuAHz;thuFphbqgV=;eX>fuDne#?nV zWnh5Byt?i_J8Wqz^|j6EybomQdE#>mY5#22tZXQ2*TA}2HSi|Ng(r54Gy`_vcPUW@ zD-y_(=t(((^80T|`NtLoe*c0xcZX0Pw4SFP_VgY@F^Pp^=zLUy-Do9!EeC^?=Be#W zTdHqru)SnW3ABHCZh~3P(z-%^W@M43np!R2MV8rt)S~h5%x&cieL~N&D=Ql^LdpD` z77G)u1jdY`2r{!{F1osTa-v}N;YUWgt)sHa^a3d{E;)XL7$ZAoVV`I|JeZH-sZ}07 z>-{AoFj-`%z3Vo`2Lw3huF*HmT(ZXzw2Fy%(Nb|jk7cZsw!_j5!Nej}=8PhxIH%T+ zaDso}%g-^|spVAgDd@3UvM7IIt!8hxdCcM5la^0>o)2;o!PJxD2**H)#iUE*FBxE$ zBt#s@_j?Xe)%4p|G7scOG^#-I0`IlpV%;JbD1pQ$4H^)T*ADtpatk(M7rnJIcF7HQUkY8ldzPG3Z16-B-t0nuln;?~D$ zrZibbHELA#Q-e;45lGGR53e=|qk*~%!m+PJK_zU&Ds{WQqn~`@?gqM>dfB^@mN-)h zk&A$c3-_6rGIUW71((Gn1?9&VrOm<=6buk3av}Tv%tK5)Zj@@wLIXmJF&)8%d5emc zz&UoQuH#Y&yVppZN>0DmZNP_t^Oc9*%OumavO>kLlER*rseDcPuc;zxWEE8_-pD=I zwDL(*-Svk>FLL*%g{E!=H)5u|I!eY`b5k<)>w7>sOpNf(z|lKEi5P~k)Q`?#6d?Ak z1m0;JfixBL!WZog+MoN8HwrvZDG+^W0*+wni4HjiTQ(N=9{Fp|y)i#gP#u(5(1B?}4yCl-}iy zC*u48?Ob>y8$4<&bC4W^a%2vRrBj#Xq^)WeNk{NMice9t;Jm%${`GWU;eqE4zO|Yj zwMU#Q>a2RpY@+Ge{$#Xey(Fp6=6l}#3rK*N;a=AEMb%57rT5{*3_Vb&C#7;aha39& zbgkj!=AT_>KM8Bu1>J3`(CgP33_Z5G*gSK#w$gTWwuOORe$A~{Tfqftx_kTF(b;iB=Bo5U%<4Xi-A@ENp 
zpalEeh_AQk&{vKFwXd+xwaIT(uoY&Y1W1(E^D?fkUj1px#DPP-;a{ITRy26&1f#tUkflh3lR>ePbV zhT7VCz8_Qb{v5F^iF4lC4Z~&75XVK9&gCz=NO;!DKR@^STG7>A#i72vkwM|jq_E1I z(Q{z*v^M*B;`@X%JUvqyW2dI^?2kD8+I{v&SI+ctNtA26F*Xljv`R&{&DtGfzPhx| zVO_=Cu0{hJZqQ0|2pq|BmOceBr(j9hUm*AYY|qB)&Nt57>03O*PCjv9u@spx$Ww)K zx-hOow&b?>P(R%496@#jmfG%I-6P&*8?fi~4k*$0vjRV5D=OC#FxvY#pvdE|%i~{U zo?WF7Ueg^xz2Z_-C?9C75;Fs5r-SeGI}97N+2SQ!O$7y1&ihLE>@(IX9W+D2_94vr z2+H6%hC{BAh<8gtv$K)6FE`7Ky$0#n_ckgp%eVObKa_e7;u2_-Ey#(;vYj`H!}l&} ze>7`Z9a>#2B++~V13k}#RssNjCXCDKlEVr*Q9>~QfSZ3O%r7praB#5ymC(^~APp== z($FiyMZ8{EJ}c~f%SPeL5mwrrYMzDyBqXrzr!QW5w4^ufT1$$9>7*$;9a~=h34zqw zlc7ScU@gO88ix51cs!jdhs>g&UvV=tb|Scwym`uXHlJ`IZw9JA&iHmZlb zFg)dAeL@$2H*`WU#xC9`>P7)c8TTp`Ex4j&vCrzJ9xV#2G|6B+)NES$^3X5>A4BQ% zNnZ=)R{LC3RVxu?-4J(EUp}R+Bcz~Z)3~h`0s3}7U3krET+q``IWMOB%Z&7)hSp@_ z@o-5?k3PJ+Jn9FYd)uD@xk9qyxBLnSx{7H34#?lS%0I*MAB_b>7|cToXuIzvqYvH8 zeN$xGTz4t+@$-xHknhP~cQU-6fO7GG1kXA?$RtQqB+k&Ny{n`o_ABC`iuf`Ic%wO~ zDt0?^O5?$@ci+zqyNW6s>uA#OMXH$u#wx2m;(b0UZXgp`b3t*uHijG?{j!nPi^n|B zkv^h}q>&|QZ*5(*AP?+%!?_Bo-q&1N6~T)vqJDv77_U)f(n%rsL&k?eg?j zCe-SvX>{-qUvg@f+YBxsF`!KV{Z#A#24>hDL9?oskxQQ$n4HA}3!7<{mj~aEP02YA zj-j~et|%oVN)i`;&4|ZXAni0hdC7y{1R?mli$_KKX3t;U`?{8k(E6 z>wU|xwjxtXG4M{*^OU?Pc^`SPS&C$Q|K8kg>-X^?7W*>!BlMVKH_l8BwanoooPz=BFYx7yWrs*)AgJjel znmE-5)ipR#k4TWPW=YlEk0h*&kNF8BzAMJCCF{WxzES|a@}ozv1=hg7_Dy=DLCF!})Im zC-{FE_}5YWx|Qo>``Zc(O>6%~z1J;VXTskWDA31-Ul#t$i`Svod*W}XJX(E3L;u+d yu7j`lkKbTjv=oR2|Br5R-Olxr{B1{t{9g-HQxykI`TzhS`W=XVR++yF8SsDP;o9B+ literal 0 HcmV?d00001 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c0e00c7bf6f54..83ca93bdfa703 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -22,6 +22,7 @@ Other Enhancements - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and 
:meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) +- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1037e2d9a3bd6..a6697e8879b08 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -460,7 +460,7 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def union(self, other): + def union(self, other, sort=None): """ Specialized union for DatetimeIndex objects. If combine overlapping ranges with the same DateOffset, will be much @@ -469,15 +469,29 @@ def union(self, other): Parameters ---------- other : DatetimeIndex or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result + + .. 
versionadded:: 0.25.0 Returns ------- y : Index or DatetimeIndex """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other) or len(self) == 0: - return super(DatetimeIndex, self).union(other) + return super(DatetimeIndex, self).union(other, sort=sort) if not isinstance(other, DatetimeIndex): try: @@ -488,9 +502,9 @@ def union(self, other): this, other = self._maybe_utc_convert(other) if this._can_fast_union(other): - return this._fast_union(other) + return this._fast_union(other, sort=sort) else: - result = Index.union(this, other) + result = Index.union(this, other, sort=sort) if isinstance(result, DatetimeIndex): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds @@ -563,16 +577,28 @@ def _can_fast_union(self, other): # this will raise return False - def _fast_union(self, other): + def _fast_union(self, other, sort=None): if len(other) == 0: return self.view(type(self)) if len(self) == 0: return other.view(type(self)) - # to make our life easier, "sort" the two ranges + # Both DTIs are monotonic. 
Check if they are already + # in the "correct" order if self[0] <= other[0]: left, right = self, other + # DTIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + elif sort is False: + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side='left') + right_chunk = right.values[:loc] + dates = _concat._concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + # DTIs are not in the "correct" order and we want + # to sort else: left, right = other, self diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 19009e45ee83a..cf1f75234ec62 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -21,83 +21,107 @@ class TestDatetimeIndexSetOps(object): 'dateutil/US/Pacific'] # TODO: moved from test_datetimelike; dedup with version below - def test_union2(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union2(self, sort): everything = tm.makeDateIndex(10) first = everything[:5] second = everything[5:] - union = first.union(second) - assert tm.equalContents(union, everything) + union = first.union(second, sort=sort) + tm.assert_index_equal(union, everything) # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - result = first.union(case) - assert tm.equalContents(result, everything) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, everything) @pytest.mark.parametrize("tz", tz) - def test_union(self, tz): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, tz, sort): rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + expected1_notsorted = pd.DatetimeIndex(list(other1) + list(rng1)) rng2 = pd.date_range('1/1/2000', 
freq='D', periods=5, tz=tz) other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + expected2_notsorted = pd.DatetimeIndex(list(other2) + list(rng2[:3])) rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + expected3_notsorted = rng3 - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: + for rng, other, exp, exp_notsorted in [(rng1, other1, expected1, + expected1_notsorted), + (rng2, other2, expected2, + expected2_notsorted), + (rng3, other3, expected3, + expected3_notsorted)]: - result_union = rng.union(other) - tm.assert_index_equal(result_union, expected) + result_union = rng.union(other, sort=sort) + tm.assert_index_equal(result_union, exp) - def test_union_coverage(self): + result_union = other.union(rng, sort=sort) + if sort is None: + tm.assert_index_equal(result_union, exp) + else: + tm.assert_index_equal(result_union, exp_notsorted) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_coverage(self, sort): idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) ordered = DatetimeIndex(idx.sort_values(), freq='infer') - result = ordered.union(idx) + result = ordered.union(idx, sort=sort) tm.assert_index_equal(result, ordered) - result = ordered[:0].union(ordered) + result = ordered[:0].union(ordered, sort=sort) tm.assert_index_equal(result, ordered) assert result.freq == ordered.freq - def test_union_bug_1730(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1730(self, sort): rng_a = date_range('1/1/2012', periods=4, freq='3H') rng_b = date_range('1/1/2012', periods=4, freq='4H') - result = rng_a.union(rng_b) + result = rng_a.union(rng_b, sort=sort) exp = DatetimeIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) tm.assert_index_equal(result, exp) - def 
test_union_bug_1745(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1745(self, sort): left = DatetimeIndex(['2012-05-11 15:19:49.695000']) right = DatetimeIndex(['2012-05-29 13:04:21.322000', '2012-05-11 15:27:24.873000', '2012-05-11 15:31:05.350000']) - result = left.union(right) - exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + result = left.union(right, sort=sort) + exp = DatetimeIndex(['2012-05-11 15:19:49.695000', + '2012-05-29 13:04:21.322000', + '2012-05-11 15:27:24.873000', + '2012-05-11 15:31:05.350000']) + if sort is None: + exp = exp.sort_values() tm.assert_index_equal(result, exp) - def test_union_bug_4564(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_4564(self, sort): from pandas import DateOffset left = date_range("2013-01-01", "2013-02-01") right = left + DateOffset(minutes=15) - result = left.union(right) + result = left.union(right, sort=sort) exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) tm.assert_index_equal(result, exp) - def test_union_freq_both_none(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_freq_both_none(self, sort): # GH11086 expected = bdate_range('20150101', periods=10) expected.freq = None - result = expected.union(expected) + result = expected.union(expected, sort=sort) tm.assert_index_equal(result, expected) assert result.freq is None @@ -112,11 +136,14 @@ def test_union_dataframe_index(self): exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS') tm.assert_index_equal(df.index, exp) - def test_union_with_DatetimeIndex(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_with_DatetimeIndex(self, sort): i1 = Int64Index(np.arange(0, 20, 2)) i2 = date_range(start='2012-01-03 00:00:00', periods=10, freq='D') - i1.union(i2) # Works - i2.union(i1) # Fails with "AttributeError: can't set attribute" + # Works + i1.union(i2, sort=sort) + # Fails with "AttributeError: can't set attribute" + 
i2.union(i1, sort=sort) # TODO: moved from test_datetimelike; de-duplicate with version below def test_intersection2(self): @@ -262,11 +289,12 @@ def test_datetimeindex_diff(self, sort): periods=98) assert len(dti1.difference(dti2, sort)) == 2 - def test_datetimeindex_union_join_empty(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_datetimeindex_union_join_empty(self, sort): dti = date_range(start='1/1/2001', end='2/1/2001', freq='D') empty = Index([]) - result = dti.union(empty) + result = dti.union(empty, sort=sort) assert isinstance(result, DatetimeIndex) assert result is result @@ -287,35 +315,40 @@ class TestBusinessDatetimeIndex(object): def setup_method(self, method): self.rng = bdate_range(START, END) - def test_union(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): # overlapping left = self.rng[:10] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, Index) # non-overlapping, no gap left = self.rng[:5] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # order does not matter - tm.assert_index_equal(right.union(left), the_union) + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + else: + expected = pd.DatetimeIndex(list(right) + list(left)) + tm.assert_index_equal(right.union(left, sort=sort), expected) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_union = self.rng.union(rng) + the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) def test_outer_join(self): @@ -350,16 +383,21 @@ def test_outer_join(self): assert isinstance(the_join, 
DatetimeIndex) assert the_join.freq is None - def test_union_not_cacheable(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_not_cacheable(self, sort): rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] - the_union = rng1.union(rng2) - tm.assert_index_equal(the_union, rng) + the_union = rng1.union(rng2, sort=sort) + if sort is None: + tm.assert_index_equal(the_union, rng) + else: + expected = pd.DatetimeIndex(list(rng[10:]) + list(rng[:10])) + tm.assert_index_equal(the_union, expected) rng1 = rng[10:] rng2 = rng[15:35] - the_union = rng1.union(rng2) + the_union = rng1.union(rng2, sort=sort) expected = rng[10:] tm.assert_index_equal(the_union, expected) @@ -388,7 +426,8 @@ def test_intersection_bug(self): result = a.intersection(b) tm.assert_index_equal(result, b) - def test_month_range_union_tz_pytz(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_pytz(self, sort): from pytz import timezone tz = timezone('US/Eastern') @@ -403,10 +442,11 @@ def test_month_range_union_tz_pytz(self): late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) - early_dr.union(late_dr) + early_dr.union(late_dr, sort=sort) @td.skip_if_windows_python_3 - def test_month_range_union_tz_dateutil(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_dateutil(self, sort): from pandas._libs.tslibs.timezones import dateutil_gettz tz = dateutil_gettz('US/Eastern') @@ -421,7 +461,7 @@ def test_month_range_union_tz_dateutil(self): late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) - early_dr.union(late_dr) + early_dr.union(late_dr, sort=sort) class TestCustomDatetimeIndex(object): @@ -429,35 +469,37 @@ class TestCustomDatetimeIndex(object): def setup_method(self, method): self.rng = bdate_range(START, END, freq='C') - def test_union(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): # 
overlapping left = self.rng[:10] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_union = left.union(right) + the_union = left.union(right, sort) assert isinstance(the_union, Index) # non-overlapping, no gap left = self.rng[:5] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # order does not matter - tm.assert_index_equal(right.union(left), the_union) + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_union = self.rng.union(rng) + the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) def test_outer_join(self): From 183dc02e3538f559c675b82c84fc282a6bb95741 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 23 Feb 2019 19:27:12 -0800 Subject: [PATCH 025/110] DOC: Rewriting of ParserError doc + minor spacing (#25421) Follow-up to gh-25414. --- pandas/errors/__init__.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 493ee65f63c6a..7d5a7f1a99e41 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -9,10 +9,10 @@ class PerformanceWarning(Warning): """ - Warning raised when there is a possible - performance impact. + Warning raised when there is a possible performance impact. """ + class UnsupportedFunctionCall(ValueError): """ Exception raised when attempting to call a numpy function @@ -20,6 +20,7 @@ class UnsupportedFunctionCall(ValueError): the object e.g. ``np.cumsum(groupby_object)``. 
""" + class UnsortedIndexError(KeyError): """ Error raised when attempting to get a slice of a MultiIndex, @@ -31,9 +32,15 @@ class UnsortedIndexError(KeyError): class ParserError(ValueError): """ - Exception that is raised by an error encountered in `pd.read_csv`. + Exception that is raised by an error encountered in parsing file contents. + + This is a generic error raised for errors encountered when functions like + `read_csv` or `read_html` are parsing contents of a file. - e.g. HTML Parsing will raise this error. + See Also + -------- + read_csv : Read CSV (comma-separated) file into a DataFrame. + read_html : Read HTML table into a DataFrame. """ @@ -182,4 +189,4 @@ def __str__(self): else: name = self.class_instance.__class__.__name__ msg = "This {methodtype} must be defined in the concrete class {name}" - return (msg.format(methodtype=self.methodtype, name=name)) + return msg.format(methodtype=self.methodtype, name=name) From 5ae9b48eee11fbb05fd930f97f232120a7bc4713 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sun, 24 Feb 2019 04:34:18 +0100 Subject: [PATCH 026/110] API/ERR: allow iterators in df.set_index & improve errors (#24984) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/compat/__init__.py | 2 ++ pandas/core/frame.py | 43 ++++++++++++++++++++++++-- pandas/tests/frame/test_alter_axes.py | 44 +++++++++++++++++++++------ 4 files changed, 79 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 83ca93bdfa703..4ea5d935a6920 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -22,6 +22,7 @@ Other Enhancements - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` 
objects with timezones (:issue:`24043`) +- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index d7ca7f8963f70..4036af85b7212 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -137,6 +137,7 @@ def lfilter(*args, **kwargs): reload = reload Hashable = collections.abc.Hashable Iterable = collections.abc.Iterable + Iterator = collections.abc.Iterator Mapping = collections.abc.Mapping MutableMapping = collections.abc.MutableMapping Sequence = collections.abc.Sequence @@ -199,6 +200,7 @@ def get_range_parameters(data): Hashable = collections.Hashable Iterable = collections.Iterable + Iterator = collections.Iterator Mapping = collections.Mapping MutableMapping = collections.MutableMapping Sequence = collections.Sequence diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79f209f9ebc0a..608e5c53ec094 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -33,7 +33,7 @@ from pandas import compat from pandas.compat import (range, map, zip, lmap, lzip, StringIO, u, - PY36, raise_with_traceback, + PY36, raise_with_traceback, Iterator, string_and_binary_types) from pandas.compat.numpy import function as nv from pandas.core.dtypes.cast import ( @@ -4025,7 +4025,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, This parameter can be either a single column key, a single array of the same length as the calling DataFrame, or a list containing an arbitrary combination of column keys and arrays. Here, "array" - encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. + encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and + instances of :class:`abc.Iterator`. 
drop : bool, default True Delete columns to be used as the new index. append : bool, default False @@ -4104,6 +4105,32 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not isinstance(keys, list): keys = [keys] + err_msg = ('The parameter "keys" may be a column key, one-dimensional ' + 'array, or a list containing only valid column keys and ' + 'one-dimensional arrays.') + + missing = [] + for col in keys: + if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray, + list, Iterator)): + # arrays are fine as long as they are one-dimensional + # iterators get converted to list below + if getattr(col, 'ndim', 1) != 1: + raise ValueError(err_msg) + else: + # everything else gets tried as a key; see GH 24969 + try: + found = col in self.columns + except TypeError: + raise TypeError(err_msg + ' Received column of ' + 'type {}'.format(type(col))) + else: + if not found: + missing.append(col) + + if missing: + raise KeyError('None of {} are in the columns'.format(missing)) + if inplace: frame = self else: @@ -4132,6 +4159,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) + elif isinstance(col, Iterator): + arrays.append(list(col)) + names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) @@ -4139,6 +4169,15 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if drop: to_remove.append(col) + if len(arrays[-1]) != len(self): + # check newest element against length of calling frame, since + # ensure_index_from_sequences would not raise for append=False. 
+ raise ValueError('Length mismatch: Expected {len_self} rows, ' + 'received array of length {len_col}'.format( + len_self=len(self), + len_col=len(arrays[-1]) + )) + index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index cc3687f856b4e..a25e893e08900 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -178,10 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, list, + @pytest.mark.parametrize('box2', [Series, Index, np.array, list, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, list, + @pytest.mark.parametrize('box1', [Series, Index, np.array, list, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -195,6 +195,9 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] result = df.set_index(keys, drop=drop, append=append) + # if either box is iter, it has been consumed; re-read + keys = [box1(df['A']), box2(df['A'])] + # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers @@ -253,25 +256,48 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df.set_index(['A', df['A'], tuple(df['A'])], drop=drop, append=append) - @pytest.mark.xfail(reason='broken due to revert, see GH 25085') @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, 
False]) - @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)], - ids=['set', 'iter', 'generator']) + @pytest.mark.parametrize('box', [set], ids=['set']) def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' - # forbidden type, e.g. set/iter/generator + # forbidden type, e.g. set with pytest.raises(TypeError, match=msg): df.set_index(box(df['A']), drop=drop, append=append) - # forbidden type in list, e.g. set/iter/generator + # forbidden type in list, e.g. set with pytest.raises(TypeError, match=msg): df.set_index(['A', df['A'], box(df['A'])], drop=drop, append=append) + # MultiIndex constructor does not work directly on Series -> lambda + @pytest.mark.parametrize('box', [Series, Index, np.array, iter, + lambda x: MultiIndex.from_arrays([x])], + ids=['Series', 'Index', 'np.array', + 'iter', 'MultiIndex']) + @pytest.mark.parametrize('length', [4, 6], ids=['too_short', 'too_long']) + @pytest.mark.parametrize('append', [True, False]) + @pytest.mark.parametrize('drop', [True, False]) + def test_set_index_raise_on_len(self, frame_of_index_cols, box, length, + drop, append): + # GH 24984 + df = frame_of_index_cols # has length 5 + + values = np.random.randint(0, 10, (length,)) + + msg = 'Length mismatch: Expected 5 rows, received array of length.*' + + # wrong length directly + with pytest.raises(ValueError, match=msg): + df.set_index(box(values), drop=drop, append=append) + + # wrong length in list + with pytest.raises(ValueError, match=msg): + df.set_index(['A', df.A, box(values)], drop=drop, append=append) + def test_set_index_custom_label_type(self): # GH 24969 @@ -341,7 +367,7 @@ def __repr__(self): # missing key thing3 = Thing(['Three', 'pink']) - msg = '.*' # due to revert, see GH 25085 + msg = r"frozenset\(\{'Three', 'pink'\}\)" with pytest.raises(KeyError, match=msg): # missing label directly df.set_index(thing3) @@ -366,7 +392,7 @@ def 
__str__(self): thing2 = Thing('Two', 'blue') df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) - msg = 'unhashable type.*' + msg = 'The parameter "keys" may be a column key, .*' with pytest.raises(TypeError, match=msg): # use custom label directly From fc1fe838a1976c38762b2632d6bac2bea76cd039 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 23 Feb 2019 19:37:39 -0800 Subject: [PATCH 027/110] BUG: Indexing with UTC offset string no longer ignored (#25263) --- doc/source/user_guide/timeseries.rst | 10 +++ doc/source/whatsnew/v0.25.0.rst | 34 +++++++- pandas/core/indexes/base.py | 21 ++++- pandas/core/indexes/datetimes.py | 80 ++++++++++--------- .../tests/indexes/datetimes/test_datetime.py | 2 +- .../indexes/datetimes/test_partial_slicing.py | 27 +++++++ 6 files changed, 129 insertions(+), 45 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 23f1aabd69ff3..4e2c428415926 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -633,6 +633,16 @@ We are stopping on the included end-point as it is part of the index: dft2 = dft2.swaplevel(0, 1).sort_index() dft2.loc[idx[:, '2013-01-05'], :] +.. versionadded:: 0.25.0 + +Slicing with string indexing also honors UTC offset. + +.. ipython:: python + + df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df + df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + .. _timeseries.slice_vs_exact_match: Slice vs. Exact Match diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4ea5d935a6920..a1734532668b8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -31,7 +31,37 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :meth:`Timestamp.strptime` will now raise a NotImplementedError (:issue:`25016`) +.. 
_whatsnew_0250.api_breaking.utc_offset_indexing: + +Indexing with date strings with UTC offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Indexing a :class:`DataFrame` or :class:`Series` with a :class:`DatetimeIndex` with a +date string with a UTC offset would previously ignore the UTC offset. Now, the UTC offset +is respected in indexing. (:issue:`24076`, :issue:`16785`) + +*Previous Behavior*: + +.. code-block:: ipython + + In [1]: df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + + In [2]: df + Out[2]: + 0 + 2019-01-01 00:00:00-08:00 0 + + In [3]: df['2019-01-01 00:00:00+04:00':'2019-01-01 01:00:00+04:00'] + Out[3]: + 0 + 2019-01-01 00:00:00-08:00 0 + +*New Behavior*: + +.. ipython:: python + + df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] .. _whatsnew_0250.api.other: @@ -40,7 +70,7 @@ Other API Changes - :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`) - ``Timestamp`` and ``Timedelta`` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`) -- +- :meth:`Timestamp.strptime` will now raise a ``NotImplementedError`` (:issue:`25016`) - .. 
_whatsnew_0250.deprecations: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b5f3c929a7f36..1cdacc908b663 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6,9 +6,10 @@ import numpy as np from pandas._libs import ( - Timedelta, algos as libalgos, index as libindex, join as libjoin, lib, - tslibs) + algos as libalgos, index as libindex, join as libjoin, lib) from pandas._libs.lib import is_datetime_array +from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp +from pandas._libs.tslibs.timezones import tz_compare import pandas.compat as compat from pandas.compat import range, set_function_name, u from pandas.compat.numpy import function as nv @@ -447,7 +448,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except tslibs.OutOfBoundsDatetime: + except OutOfBoundsDatetime: pass elif inferred.startswith('timedelta'): @@ -4867,6 +4868,20 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): # If it's a reverse slice, temporarily swap bounds. 
start, end = end, start + # GH 16785: If start and end happen to be date strings with UTC offsets + # attempt to parse and check that the offsets are the same + if (isinstance(start, (compat.string_types, datetime)) + and isinstance(end, (compat.string_types, datetime))): + try: + ts_start = Timestamp(start) + ts_end = Timestamp(end) + except (ValueError, TypeError): + pass + else: + if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): + raise ValueError("Both dates must have the " + "same UTC offset") + start_slice = None if start is not None: start_slice = self.get_slice_bound(start, 'left', kind) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index a6697e8879b08..b8d052ce7be04 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -32,9 +32,8 @@ from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools -from pandas.tseries import offsets from pandas.tseries.frequencies import Resolution, to_offset -from pandas.tseries.offsets import CDay, prefix_mapping +from pandas.tseries.offsets import CDay, Nano, prefix_mapping def _new_DatetimeIndex(cls, d): @@ -852,54 +851,57 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ + valid_resos = {'year', 'month', 'quarter', 'day', 'hour', 'minute', + 'second', 'minute', 'second', 'microsecond'} + if reso not in valid_resos: + raise KeyError if reso == 'year': - return (Timestamp(datetime(parsed.year, 1, 1), tz=self.tz), - Timestamp(datetime(parsed.year, 12, 31, 23, - 59, 59, 999999), tz=self.tz)) + start = Timestamp(parsed.year, 1, 1) + end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) elif reso == 'month': d = ccalendar.get_days_in_month(parsed.year, parsed.month) - return (Timestamp(datetime(parsed.year, parsed.month, 1), - tz=self.tz), - Timestamp(datetime(parsed.year, parsed.month, d, 23, - 59, 59, 999999), tz=self.tz)) + start = Timestamp(parsed.year, parsed.month, 1) + end = 
Timestamp(parsed.year, parsed.month, d, 23, 59, 59, 999999) elif reso == 'quarter': qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month - return (Timestamp(datetime(parsed.year, parsed.month, 1), - tz=self.tz), - Timestamp(datetime(parsed.year, qe, d, 23, 59, - 59, 999999), tz=self.tz)) + start = Timestamp(parsed.year, parsed.month, 1) + end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) elif reso == 'day': - st = datetime(parsed.year, parsed.month, parsed.day) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Day(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day) + end = start + timedelta(days=1) - Nano(1) elif reso == 'hour': - st = datetime(parsed.year, parsed.month, parsed.day, - hour=parsed.hour) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Hour(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour) + end = start + timedelta(hours=1) - Nano(1) elif reso == 'minute': - st = datetime(parsed.year, parsed.month, parsed.day, - hour=parsed.hour, minute=parsed.minute) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Minute(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute) + end = start + timedelta(minutes=1) - Nano(1) elif reso == 'second': - st = datetime(parsed.year, parsed.month, parsed.day, - hour=parsed.hour, minute=parsed.minute, - second=parsed.second) - return (Timestamp(st, tz=self.tz), - Timestamp(Timestamp(st + offsets.Second(), - tz=self.tz).value - 1)) + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute, parsed.second) + end = start + timedelta(seconds=1) - Nano(1) elif reso == 'microsecond': - st = datetime(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute, parsed.second, - parsed.microsecond) - 
return (Timestamp(st, tz=self.tz), Timestamp(st, tz=self.tz)) - else: - raise KeyError + start = Timestamp(parsed.year, parsed.month, parsed.day, + parsed.hour, parsed.minute, parsed.second, + parsed.microsecond) + end = start + timedelta(microseconds=1) - Nano(1) + # GH 24076 + # If an incoming date string contained a UTC offset, need to localize + # the parsed date to this offset first before aligning with the index's + # timezone + if parsed.tzinfo is not None: + if self.tz is None: + raise ValueError("The index must be timezone aware " + "when indexing with a date string with a " + "UTC offset") + start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) + end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) + elif self.tz is not None: + start = start.tz_localize(self.tz) + end = end.tz_localize(self.tz) + return start, end def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): is_monotonic = self.is_monotonic diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index e1ba0e1708442..a3ee5fe39769f 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -102,7 +102,7 @@ def test_stringified_slice_with_tz(self): # GH#2658 import datetime start = datetime.datetime.now() - idx = date_range(start=start, freq="1d", periods=10) + idx = date_range(start=start, freq="1d", periods=10, tz='US/Eastern') df = DataFrame(lrange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index a0c9d9f02385c..64693324521b3 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -396,3 +396,30 @@ def test_selection_by_datetimelike(self, datetimelike, op, expected): result = op(df.A, datetimelike) expected = Series(expected, 
name='A') tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('start', [ + '2018-12-02 21:50:00+00:00', pd.Timestamp('2018-12-02 21:50:00+00:00'), + pd.Timestamp('2018-12-02 21:50:00+00:00').to_pydatetime() + ]) + @pytest.mark.parametrize('end', [ + '2018-12-02 21:52:00+00:00', pd.Timestamp('2018-12-02 21:52:00+00:00'), + pd.Timestamp('2018-12-02 21:52:00+00:00').to_pydatetime() + ]) + def test_getitem_with_datestring_with_UTC_offset(self, start, end): + # GH 24076 + idx = pd.date_range(start='2018-12-02 14:50:00-07:00', + end='2018-12-02 14:50:00-07:00', freq='1min') + df = pd.DataFrame(1, index=idx, columns=['A']) + result = df[start:end] + expected = df.iloc[0:3, :] + tm.assert_frame_equal(result, expected) + + # GH 16785 + start = str(start) + end = str(end) + with pytest.raises(ValueError, match="Both dates must"): + df[start:end[:-4] + '1:00'] + + with pytest.raises(ValueError, match="The index must be timezone"): + df = df.tz_localize(None) + df[start:end] From df039bfe57aad0419378f4fe9476e07e35ba204a Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 24 Feb 2019 03:40:07 +0000 Subject: [PATCH 028/110] PERF/REF: improve performance of Series.searchsorted, PandasArray.searchsorted, collect functionality (#22034) --- asv_bench/benchmarks/series_methods.py | 19 ++++++ doc/source/whatsnew/v0.25.0.rst | 3 +- pandas/core/algorithms.py | 85 +++++++++++++++++++++++++- pandas/core/arrays/base.py | 18 +++--- pandas/core/arrays/numpy_.py | 7 +++ pandas/core/base.py | 6 +- pandas/core/series.py | 8 +-- pandas/tests/arrays/test_array.py | 49 +++++++++++++++ 8 files changed, 175 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index f7d0083b86a01..3303483c50e20 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -124,6 +124,25 @@ def time_dropna(self, dtype): self.s.dropna() +class SearchSorted(object): + + goal_time = 0.2 + params 
= ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'str'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**5 + data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) + self.s = Series(data) + + def time_searchsorted(self, dtype): + key = '2' if dtype == 'str' else 2 + self.s.searchsorted(key) + + class Map(object): params = ['dict', 'Series'] diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a1734532668b8..170e7f14da397 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -96,7 +96,8 @@ Performance Improvements - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) -- +- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is + int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) .. 
_whatsnew_0250.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c5c8f47ad6dba..b056a357d0a51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ ensure_float64, ensure_int64, ensure_object, ensure_platform_int, ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, + is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -1724,6 +1724,89 @@ def func(arr, indexer, out, fill_value=np.nan): return out +# ------------ # +# searchsorted # +# ------------ # + +def searchsorted(arr, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `arr` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `arr` would be preserved. + + Assuming that `arr` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``arr[i-1] < value <= self[i]`` + right ``arr[i-1] <= value < self[i]`` + ====== ================================ + + Parameters + ---------- + arr: array-like + Input array. If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. + value : array_like + Values to insert into `arr`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). 
+ sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( + is_integer(value) or is_integer_dtype(value)): + from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be + # recast by numpy, causing a slow search. + # Before searching below, we therefore try to give `value` the + # same dtype as `arr`, while guarding against integer overflows. + iinfo = np.iinfo(arr.dtype.type) + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + # value within bounds, so no overflow, so can convert value dtype + # to dtype of arr + dtype = arr.dtype + else: + dtype = value_arr.dtype + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) + elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or + is_categorical_dtype(arr)): + from pandas.core.series import Series + # E.g. if `arr` is an array with dtype='datetime64[ns]' + # and `value` is a pd.Timestamp, we may need to convert value + value_ser = Series(value)._values + value = value_ser[0] if is_scalar(value) else value_ser + + result = arr.searchsorted(value, side=side, sorter=sorter) + return result + + # ---- # # diff # # ---- # diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7aaefef3d03e5..e770281596134 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -555,17 +555,17 @@ def searchsorted(self, value, side="left", sorter=None): .. 
versionadded:: 0.24.0 Find the indices into a sorted array `self` (a) such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. - Assuming that `a` is sorted: + Assuming that `self` is sorted: - ====== ============================ + ====== ================================ `side` returned index `i` satisfies - ====== ============================ - left ``self[i-1] < v <= self[i]`` - right ``self[i-1] <= v < self[i]`` - ====== ============================ + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ Parameters ---------- @@ -581,7 +581,7 @@ def searchsorted(self, value, side="left", sorter=None): Returns ------- - indices : array of ints + array of ints Array of insertion points with the same shape as `value`. 
See Also diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 791ff44303e96..8e2ab586cacb6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -4,6 +4,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -12,6 +13,7 @@ from pandas import compat from pandas.core import nanops +from pandas.core.algorithms import searchsorted from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -423,6 +425,11 @@ def to_numpy(self, dtype=None, copy=False): return result + @Appender(ExtensionArray.searchsorted.__doc__) + def searchsorted(self, value, side='left', sorter=None): + return searchsorted(self.to_numpy(), value, + side=side, sorter=sorter) + # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/base.py b/pandas/core/base.py index 7fdc64a8d9f85..f896596dd5216 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1522,11 +1522,11 @@ def factorize(self, sort=False, na_sentinel=-1): array([3]) """) - @Substitution(klass='IndexOpsMixin') + @Substitution(klass='Index') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - # needs coercion on the key (DatetimeIndex does already) - return self._values.searchsorted(value, side=side, sorter=sorter) + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/core/series.py b/pandas/core/series.py index a5dfe8d43c336..ad7c6af21f637 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2392,12 +2392,8 @@ def __rmatmul__(self, other): @Substitution(klass='Series') 
@Appender(base._shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - if sorter is not None: - sorter = ensure_platform_int(sorter) - result = self._values.searchsorted(Series(value)._values, - side=side, sorter=sorter) - - return result[0] if is_scalar(value) else result + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 9fea1989e46df..b68ec2bf348b4 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -9,6 +9,7 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype +from pandas.api.types import is_scalar from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import ( DecimalArray, DecimalDtype, to_decimal) @@ -254,3 +255,51 @@ def test_array_not_registered(registry_without_decimal): result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) tm.assert_equal(result, expected) + + +class TestArrayAnalytics(object): + def test_searchsorted(self, string_dtype): + arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) + + result = arr.searchsorted('a', side='left') + assert is_scalar(result) + assert result == 0 + + result = arr.searchsorted('a', side='right') + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted(30) + assert is_scalar(result) + assert result == 2 + + result = arr.searchsorted([30]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted([2, 30]) + expected 
= np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('arr, val', [ + [pd.date_range('20120101', periods=10, freq='2D'), + pd.Timestamp('20120102')], + [pd.date_range('20120101', periods=10, freq='2D', tz='Asia/Hong_Kong'), + pd.Timestamp('20120102', tz='Asia/Hong_Kong')], + [pd.timedelta_range(start='1 day', end='10 days', periods=10), + pd.Timedelta('2 days')]]) + def test_search_sorted_datetime64_scalar(self, arr, val): + arr = pd.array(arr) + result = arr.searchsorted(val) + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_sorter(self, any_real_dtype): + arr = pd.array([3, 1, 2], dtype=any_real_dtype) + result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) + expected = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) From f59a6ab2e993f0e2f78babd02e39297adfb4333a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 23 Feb 2019 19:45:28 -0800 Subject: [PATCH 029/110] TST: remove never-used singleton fixtures (#24885) --- pandas/tests/frame/conftest.py | 54 ---------------------------------- 1 file changed, 54 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 377e737a53158..69ee614ab8d2a 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -29,16 +29,6 @@ def float_frame_with_na(): return df -@pytest.fixture -def float_frame2(): - """ - Fixture for DataFrame of floats with index of unique strings - - Columns are ['D', 'C', 'B', 'A'] - """ - return DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A']) - - @pytest.fixture def bool_frame_with_na(): """ @@ -104,21 +94,6 @@ def mixed_float_frame(): return df -@pytest.fixture -def mixed_float_frame2(): - """ - Fixture for DataFrame of different float types with index of unique strings - - Columns are ['A', 'B', 'C', 'D']. 
- """ - df = DataFrame(tm.getSeriesData()) - df.D = df.D.astype('float32') - df.C = df.C.astype('float32') - df.B = df.B.astype('float16') - df.D = df.D.astype('float64') - return df - - @pytest.fixture def mixed_int_frame(): """ @@ -135,19 +110,6 @@ def mixed_int_frame(): return df -@pytest.fixture -def mixed_type_frame(): - """ - Fixture for DataFrame of float/int/string columns with RangeIndex - - Columns are ['a', 'b', 'c', 'float32', 'int32']. - """ - return DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'float32': np.array([1.] * 10, dtype='float32'), - 'int32': np.array([1] * 10, dtype='int32')}, - index=np.arange(10)) - - @pytest.fixture def timezone_frame(): """ @@ -173,22 +135,6 @@ def empty_frame(): return DataFrame({}) -@pytest.fixture -def datetime_series(): - """ - Fixture for Series of floats with DatetimeIndex - """ - return tm.makeTimeSeries(nper=30) - - -@pytest.fixture -def datetime_series_short(): - """ - Fixture for Series of floats with DatetimeIndex - """ - return tm.makeTimeSeries(nper=30)[5:] - - @pytest.fixture def simple_frame(): """ From 85572de5e7bb188cfecc575ee56786406e79dc79 Mon Sep 17 00:00:00 2001 From: Josh Date: Sat, 23 Feb 2019 22:47:23 -0500 Subject: [PATCH 030/110] BUG: fixed merging with empty frame containing an Int64 column (#25183) (#25289) --- doc/source/whatsnew/v0.24.2.rst | 2 +- pandas/core/internals/concat.py | 2 + pandas/tests/reshape/merge/test_merge.py | 78 ++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index a7e522d27f8e2..8f4beb3f484a4 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -96,7 +96,7 @@ Bug Fixes **Other** - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) -- +- Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is 
all ``NaN`` (:issue:`25183`) - .. _whatsnew_0.242.contributors: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 640587b7f9f31..cb98274962656 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -190,6 +190,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, 'is_sparse', False): pass + elif getattr(self.block, 'is_extension', False): + pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 25487ccc76e62..7a97368504fd6 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -39,6 +39,54 @@ def get_test_data(ngroups=NGROUPS, n=N): return arr +def get_series(): + return [ + pd.Series([1], dtype='int64'), + pd.Series([1], dtype='Int64'), + pd.Series([1.23]), + pd.Series(['foo']), + pd.Series([True]), + pd.Series([pd.Timestamp('2018-01-01')]), + pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + ] + + +def get_series_na(): + return [ + pd.Series([np.nan], dtype='Int64'), + pd.Series([np.nan], dtype='float'), + pd.Series([np.nan], dtype='object'), + pd.Series([pd.NaT]), + ] + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def series_of_dtype(request): + """ + A parametrized fixture returning a variety of Series of different + dtypes + """ + return request.param + + +@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name) +def series_of_dtype2(request): + """ + A duplicate of the series_of_dtype fixture, so that it can be used + twice by a single function + """ + return request.param + + +@pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name) +def series_of_dtype_all_na(request): + """ + A parametrized fixture returning a variety of Series with all NA + values + """ + return request.param + + class TestMerge(object): def 
setup_method(self, method): @@ -428,6 +476,36 @@ def check2(exp, kwarg): check1(exp_in, kwarg) check2(exp_out, kwarg) + def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): + # GH 25183 + df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2}, + columns=['key', 'value']) + df_empty = df[:0] + expected = pd.DataFrame({ + 'value_x': pd.Series(dtype=df.dtypes['value']), + 'key': pd.Series(dtype=df.dtypes['key']), + 'value_y': pd.Series(dtype=df.dtypes['value']), + }, columns=['value_x', 'key', 'value_y']) + actual = df_empty.merge(df, on='key') + assert_frame_equal(actual, expected) + + def test_merge_all_na_column(self, series_of_dtype, + series_of_dtype_all_na): + # GH 25183 + df_left = pd.DataFrame( + {'key': series_of_dtype, 'value': series_of_dtype_all_na}, + columns=['key', 'value']) + df_right = pd.DataFrame( + {'key': series_of_dtype, 'value': series_of_dtype_all_na}, + columns=['key', 'value']) + expected = pd.DataFrame({ + 'key': series_of_dtype, + 'value_x': series_of_dtype_all_na, + 'value_y': series_of_dtype_all_na, + }, columns=['key', 'value_x', 'value_y']) + actual = df_left.merge(df_right, on='key') + assert_frame_equal(actual, expected) + def test_merge_nosort(self): # #2098, anything to do? From aa084162bcaa7ce0efdc044bc8077f6bfca70674 Mon Sep 17 00:00:00 2001 From: ThibTrip <40694343+ThibTrip@users.noreply.github.com> Date: Mon, 25 Feb 2019 23:35:45 +0100 Subject: [PATCH 031/110] DOC: fixed geo accessor example in extending.rst (#25420) I realised "lon" and "lat" had just been switched with "longitude" and "latitude" in the following code block. So I used those names here as well. 
--- doc/source/development/extending.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e6928d9efde06..9e5034f6d3db0 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -33,8 +33,9 @@ decorate a class, providing the name of attribute to add. The class's @staticmethod def _validate(obj): - if 'lat' not in obj.columns or 'lon' not in obj.columns: - raise AttributeError("Must have 'lat' and 'lon'.") + # verify there is a column latitude and a column longitude + if 'latitude' not in obj.columns or 'longitude' not in obj.columns: + raise AttributeError("Must have 'latitude' and 'longitude'.") @property def center(self): From fe1654faa86836a0007bb513504e57c5c9935b8b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 27 Feb 2019 14:40:32 +0000 Subject: [PATCH 032/110] TST: numpy RuntimeWarning with Series.round() (#25432) --- pandas/tests/frame/test_analytics.py | 13 ++++++++++++- pandas/tests/series/test_analytics.py | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 2e690ebbfa121..43a45bb915819 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas.compat import PY35, lrange +from pandas.compat import PY2, PY35, is_platform_windows, lrange import pandas.util._test_decorators as td import pandas as pd @@ -1842,6 +1842,17 @@ def test_numpy_round(self): with pytest.raises(ValueError, match=msg): np.round(df, decimals=0, out=df) + @pytest.mark.xfail( + PY2 and is_platform_windows(), reason="numpy/numpy#7882", + raises=AssertionError, strict=True) + def test_numpy_round_nan(self): + # See gh-14197 + df = Series([1.53, np.nan, 0.06]).to_frame() + with tm.assert_produces_warning(None): + result = df.round() + 
expected = Series([2., np.nan, 0.]).to_frame() + tm.assert_frame_equal(result, expected) + def test_round_mixed_type(self): # GH 11885 df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4], diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6811e370726b2..1f265d574da15 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -9,7 +9,7 @@ from numpy import nan import pytest -from pandas.compat import PY35, lrange, range +from pandas.compat import PY2, PY35, is_platform_windows, lrange, range import pandas.util._test_decorators as td import pandas as pd @@ -285,6 +285,17 @@ def test_numpy_round(self): with pytest.raises(ValueError, match=msg): np.round(s, decimals=0, out=s) + @pytest.mark.xfail( + PY2 and is_platform_windows(), reason="numpy/numpy#7882", + raises=AssertionError, strict=True) + def test_numpy_round_nan(self): + # See gh-14197 + s = Series([1.53, np.nan, 0.06]) + with tm.assert_produces_warning(None): + result = s.round() + expected = Series([2., np.nan, 0.]) + assert_series_equal(result, expected) + def test_built_in_round(self): if not compat.PY3: pytest.skip( From 1490d0c790ba974b2e85c3c46ff721aee679d54a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 27 Feb 2019 22:26:53 +0000 Subject: [PATCH 033/110] CI: add __init__.py to isort skip list (#25455) --- ci/deps/azure-27-compat.yaml | 1 + ci/deps/azure-27-locale.yaml | 1 + ci/deps/azure-36-locale_slow.yaml | 1 + ci/deps/azure-37-locale.yaml | 1 + ci/deps/azure-37-numpydev.yaml | 1 + ci/deps/azure-macos-35.yaml | 1 + ci/deps/azure-windows-27.yaml | 1 + ci/deps/azure-windows-36.yaml | 1 + ci/deps/travis-27.yaml | 1 + ci/deps/travis-36-doc.yaml | 1 + ci/deps/travis-36-locale.yaml | 1 + ci/deps/travis-36-slow.yaml | 1 + ci/deps/travis-36.yaml | 1 + ci/deps/travis-37.yaml | 1 + setup.cfg | 20 ++++++++++++++++++++ 15 files changed, 34 insertions(+) diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml 
index 986855c464852..c68b51fbd6644 100644 --- a/ci/deps/azure-27-compat.yaml +++ b/ci/deps/azure-27-compat.yaml @@ -21,6 +21,7 @@ dependencies: - pytest - pytest-xdist - pytest-mock + - isort - pip: - html5lib==1.0b2 - beautifulsoup4==4.2.1 diff --git a/ci/deps/azure-27-locale.yaml b/ci/deps/azure-27-locale.yaml index f73079ecbe3d2..5679c503caddc 100644 --- a/ci/deps/azure-27-locale.yaml +++ b/ci/deps/azure-27-locale.yaml @@ -24,6 +24,7 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - isort - pip: - html5lib==1.0b2 - beautifulsoup4==4.2.1 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 6b8d38fd25082..de1f4ad0e9a76 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -30,5 +30,6 @@ dependencies: - pytest-xdist - pytest-mock - moto + - isort - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 569b71dae003b..a89e63a2b7d3a 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -28,6 +28,7 @@ dependencies: - pytest - pytest-xdist - pytest-mock + - isort - pip: - hypothesis>=3.58.0 - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index a37be124cc546..3132de891299c 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -10,6 +10,7 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - isort - pip: - "git+git://github.com/dateutil/dateutil.git" - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index d1fe926744ecd..9710bcb5bf43d 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -25,6 +25,7 @@ dependencies: - pytest - pytest-xdist - pytest-mock + - isort - pip: - python-dateutil==2.5.3 - 
hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml index 74faeed83c387..093c055e69553 100644 --- a/ci/deps/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - isort diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 94d67b3d37788..e9db271a75d9d 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -27,3 +27,4 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - isort diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml index 4915c003bce4e..71b224b2c68c2 100644 --- a/ci/deps/travis-27.yaml +++ b/ci/deps/travis-27.yaml @@ -44,6 +44,7 @@ dependencies: - pytest-mock - moto==1.3.4 - hypothesis>=3.58.0 + - isort - pip: - backports.lzma - pandas-gbq diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml index 26f3a17432ab2..1a65d292ef085 100644 --- a/ci/deps/travis-36-doc.yaml +++ b/ci/deps/travis-36-doc.yaml @@ -43,3 +43,4 @@ dependencies: # universal - pytest - pytest-xdist + - isort diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 2a7692f10752c..36dbb8013104a 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -32,5 +32,6 @@ dependencies: - pytest-xdist - pytest-mock - moto + - isort - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 7934d179c8618..f4b9091c4300b 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - isort diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 857c3fadfdaeb..e22529784b5ec 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -38,6 +38,7 @@ dependencies: - pytest-cov - pytest-mock - hypothesis>=3.58.0 + - isort - pip: - brotlipy - coverage diff --git a/ci/deps/travis-37.yaml 
b/ci/deps/travis-37.yaml index 125750191de7d..a8a5df5894ba5 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -17,5 +17,6 @@ dependencies: - pytest-mock - hypothesis>=3.58.0 - s3fs + - isort - pip: - moto diff --git a/setup.cfg b/setup.cfg index b15c3ce8a110a..956aa23839e73 100644 --- a/setup.cfg +++ b/setup.cfg @@ -152,3 +152,23 @@ skip= asv_bench/benchmarks/dtypes.py asv_bench/benchmarks/strings.py asv_bench/benchmarks/period.py + pandas/__init__.py + pandas/plotting/__init__.py + pandas/tests/extension/decimal/__init__.py + pandas/tests/extension/base/__init__.py + pandas/io/msgpack/__init__.py + pandas/io/json/__init__.py + pandas/io/clipboard/__init__.py + pandas/io/excel/__init__.py + pandas/compat/__init__.py + pandas/compat/numpy/__init__.py + pandas/core/arrays/__init__.py + pandas/core/groupby/__init__.py + pandas/core/internals/__init__.py + pandas/api/__init__.py + pandas/api/extensions/__init__.py + pandas/api/types/__init__.py + pandas/_libs/__init__.py + pandas/_libs/tslibs/__init__.py + pandas/util/__init__.py + pandas/arrays/__init__.py From c9863865c217867583e8f6592ba88d9200601992 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 28 Feb 2019 00:30:33 +0000 Subject: [PATCH 034/110] DOC: CategoricalIndex doc string (#24852) --- pandas/core/indexes/category.py | 75 +++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c6d31339f950d..b494c41c3b58c 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -42,20 +42,35 @@ typ='method', overwrite=True) class CategoricalIndex(Index, accessor.PandasDelegate): """ - Immutable Index implementing an ordered, sliceable set. CategoricalIndex - represents a sparsely populated Index with an underlying Categorical. + Index based on an underlying :class:`Categorical`. 
+ + CategoricalIndex, like Categorical, can only take on a limited, + and usually fixed, number of possible values (`categories`). Also, + like Categorical, it might have an order, but numerical operations + (additions, divisions, ...) are not possible. Parameters ---------- - data : array-like or Categorical, (1-dimensional) - categories : optional, array-like - categories for the CategoricalIndex - ordered : boolean, - designating if the categories are ordered - copy : bool - Make a copy of input ndarray - name : object - Name to be stored in the index + data : array-like (1-dimensional) + The values of the categorical. If `categories` are given, values not in + `categories` will be replaced with NaN. + categories : index-like, optional + The categories for the categorical. Items need to be unique. + If the categories are not given here (and also not in `dtype`), they + will be inferred from the `data`. + ordered : bool, optional + Whether or not this categorical is treated as an ordered + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. + dtype : CategoricalDtype or the string "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. + + .. versionadded:: 0.21.0 + copy : bool, default False + Make a copy of input ndarray. + name : object, optional + Name to be stored in the index. Attributes ---------- @@ -75,9 +90,45 @@ class CategoricalIndex(Index, accessor.PandasDelegate): as_unordered map + Raises + ------ + ValueError + If the categories do not validate. + TypeError + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. + See Also -------- - Categorical, Index + Index : The base pandas Index type. + Categorical : A categorical array. + CategoricalDtype : Type for categorical data. + + Notes + ----- + See the `user guide + `_ + for more. 
+ + Examples + -------- + >>> pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + + ``CategoricalIndex`` can also be instantiated from a ``Categorical``: + + >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + >>> pd.CategoricalIndex(c) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + + Ordered ``CategoricalIndex`` can have a min and max value. + + >>> ci = pd.CategoricalIndex(['a','b','c','a','b','c'], ordered=True, + ... categories=['c', 'b', 'a']) + >>> ci + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['c', 'b', 'a'], ordered=True, dtype='category') # noqa + >>> ci.min() + 'c' """ _typ = 'categoricalindex' From 70802c2a4bdd5e377b36b5dff0203cf844460fd6 Mon Sep 17 00:00:00 2001 From: Max van Deursen Date: Thu, 28 Feb 2019 13:41:07 +0100 Subject: [PATCH 035/110] DataFrame.drop Raises KeyError definition (#25474) --- pandas/core/frame.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 608e5c53ec094..a40733b7076b0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3797,7 +3797,12 @@ def drop(self, labels=None, axis=0, index=None, columns=None, axis : {0 or 'index', 1 or 'columns'}, default 0 Whether to drop labels from the index (0 or 'index') or columns (1 or 'columns'). - index, columns : single label or list-like + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + + .. versionadded:: 0.21.0 + columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). @@ -3813,11 +3818,12 @@ def drop(self, labels=None, axis=0, index=None, columns=None, Returns ------- DataFrame + DataFrame without the removed index or column labels. 
Raises ------ KeyError - If none of the labels are found in the selected axis + If any of the labels is not found in the selected axis. See Also -------- @@ -3830,7 +3836,7 @@ def drop(self, labels=None, axis=0, index=None, columns=None, Examples -------- - >>> df = pd.DataFrame(np.arange(12).reshape(3,4), + >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), ... columns=['A', 'B', 'C', 'D']) >>> df A B C D @@ -3867,7 +3873,7 @@ def drop(self, labels=None, axis=0, index=None, columns=None, >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], ... [250, 150], [1.5, 0.8], [320, 250], - ... [1, 0.8], [0.3,0.2]]) + ... [1, 0.8], [0.3, 0.2]]) >>> df big small lama speed 45.0 30.0 From 3b570e33e2047e8261b3002955dcddd578f4c251 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 28 Feb 2019 07:42:54 -0500 Subject: [PATCH 036/110] BUG: Keep column level name in resample nunique (#25469) Closes gh-23222 xref gh-23645 --- doc/source/reference/groupby.rst | 1 + doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/test_function.py | 9 +++++++++ pandas/tests/resample/test_datetime_index.py | 9 +++++++++ 5 files changed, 21 insertions(+) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 6ed85ff2fac43..c7f9113b53c22 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -99,6 +99,7 @@ application to columns of a specific data type. 
DataFrameGroupBy.idxmax DataFrameGroupBy.idxmin DataFrameGroupBy.mad + DataFrameGroupBy.nunique DataFrameGroupBy.pct_change DataFrameGroupBy.plot DataFrameGroupBy.quantile diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 170e7f14da397..ee16246a1421d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -210,6 +210,7 @@ Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) +- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) - - diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 52056a6842ed9..683c21f7bd47a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1579,6 +1579,7 @@ def groupby_series(obj, col=None): from pandas.core.reshape.concat import concat results = [groupby_series(obj[col], col) for col in obj.columns] results = concat(results, axis=1) + results.columns.names = obj.columns.names if not self.as_index: results.index = ibase.default_index(len(results)) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a884a37840f8a..1788b29a11082 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -897,6 +897,15 @@ def test_nunique_with_timegrouper(): tm.assert_series_equal(result, expected) +def test_nunique_preserves_column_level_names(): + # GH 23222 + test = pd.DataFrame([1, 2, 2], + columns=pd.Index(['A'], name="level_0")) + result = test.groupby([0, 0, 0]).nunique() + expected = pd.DataFrame([2], columns=test.columns) + tm.assert_frame_equal(result, expected) + + # count # -------------------------------- diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py 
index 71b100401ec21..ce675893d9907 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1135,6 +1135,15 @@ def test_resample_nunique(): assert_series_equal(result, expected) +def test_resample_nunique_preserves_column_level_names(): + # see gh-23222 + df = tm.makeTimeDataFrame(freq="1D").abs() + df.columns = pd.MultiIndex.from_arrays([df.columns.tolist()] * 2, + names=["lev0", "lev1"]) + result = df.resample("1h").nunique() + tm.assert_index_equal(df.columns, result.columns) + + def test_resample_nunique_with_date_gap(): # GH 13453 index = pd.date_range('1-1-2000', '2-15-2000', freq='h') From 84875c1e35014179213bf2556bb712337938c3e5 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 28 Feb 2019 07:44:43 -0500 Subject: [PATCH 037/110] ERR: Correct error message in to_datetime (#25467) * ERR: Correct error message in to_datetime Closes gh-23830 xref gh-23969 --- doc/source/whatsnew/v0.25.0.rst | 3 ++- pandas/_libs/tslib.pyx | 8 +++++--- pandas/tests/indexes/datetimes/test_tools.py | 9 +++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ee16246a1421d..496a7c91f3ce9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -104,7 +104,8 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - +- Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) +- - Categorical diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f932e236b5218..624872c1c56c6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -670,9 +670,11 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # dateutil parser will return incorrect result because # it will ignore nanoseconds if is_raise: - raise ValueError("time data 
{val} doesn't " - "match format specified" - .format(val=val)) + + # Still raise OutOfBoundsDatetime, + # as error message is informative. + raise + assert is_ignore return values, tz_out raise diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index b94935d2521eb..dd914d8a79837 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1868,6 +1868,15 @@ def test_invalid_origins_tzinfo(self): pd.to_datetime(1, unit='D', origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + @pytest.mark.parametrize("format", [ + None, "%Y-%m-%d %H:%M:%S" + ]) + def test_to_datetime_out_of_bounds_with_format_arg(self, format): + # see gh-23830 + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime("2417-10-27 00:00:00", format=format) + def test_processing_order(self): # make sure we handle out-of-bounds *before* # constructing the dates From ece6074694daa787237c5b221e0eb3254cbf62be Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Thu, 28 Feb 2019 13:45:15 +0100 Subject: [PATCH 038/110] Fix minor typo (#25458) Signed-off-by: Philippe Ombredanne --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb84a9a5810f4..523543ada235c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1815,7 +1815,7 @@ def __hash__(self): ' hashed'.format(self.__class__.__name__)) def __iter__(self): - """Iterate over infor axis""" + """Iterate over info axis""" return iter(self._info_axis) # can we get a better explanation of this? 
From 778affc64bca3cb6c298c095d5562df7d1e75276 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 28 Feb 2019 12:59:32 +0000 Subject: [PATCH 039/110] CI: Set pytest minversion to 4.0.2 (#25402) * CI: Set pytest minversion to 4.0.2 --- ci/deps/azure-27-compat.yaml | 2 +- ci/deps/azure-27-locale.yaml | 2 +- ci/deps/azure-36-locale_slow.yaml | 2 +- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 2 +- ci/deps/azure-macos-35.yaml | 8 ++++---- ci/deps/azure-windows-27.yaml | 2 +- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/travis-27.yaml | 2 +- ci/deps/travis-36-doc.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- ci/deps/travis-36-slow.yaml | 2 +- ci/deps/travis-36.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- doc/source/development/contributing.rst | 2 +- doc/source/install.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 14 ++++++++++++++ environment.yml | 2 +- pandas/util/_tester.py | 2 +- requirements-dev.txt | 2 +- setup.cfg | 1 + 21 files changed, 37 insertions(+), 22 deletions(-) diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml index c68b51fbd6644..a7784f17d1956 100644 --- a/ci/deps/azure-27-compat.yaml +++ b/ci/deps/azure-27-compat.yaml @@ -18,7 +18,7 @@ dependencies: - xlsxwriter=0.5.2 - xlwt=0.7.5 # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - isort diff --git a/ci/deps/azure-27-locale.yaml b/ci/deps/azure-27-locale.yaml index 5679c503caddc..8636a63d02fed 100644 --- a/ci/deps/azure-27-locale.yaml +++ b/ci/deps/azure-27-locale.yaml @@ -20,7 +20,7 @@ dependencies: - xlsxwriter=0.5.2 - xlwt=0.7.5 # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index de1f4ad0e9a76..3f788e5ddcf39 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -26,7 +26,7 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - 
pytest-mock - moto diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index a89e63a2b7d3a..9d598cddce91a 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -25,7 +25,7 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - isort diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 3132de891299c..e58c1f599279c 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -6,7 +6,7 @@ dependencies: - pytz - Cython>=0.28.2 # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 9710bcb5bf43d..2326e8092cc85 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -21,11 +21,11 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest - - pytest-xdist - - pytest-mock - isort - pip: - python-dateutil==2.5.3 + # universal + - pytest>=4.0.2 + - pytest-xdist + - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml index 093c055e69553..f40efdfca3cbd 100644 --- a/ci/deps/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -25,7 +25,7 @@ dependencies: - xlwt # universal - cython>=0.28.2 - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - moto diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index e9db271a75d9d..8517d340f2ba8 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -23,7 +23,7 @@ dependencies: - xlwt # universal - cython>=0.28.2 - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml index 71b224b2c68c2..a910af36a6b10 100644 --- a/ci/deps/travis-27.yaml +++ b/ci/deps/travis-27.yaml @@ -39,7 +39,7 @@ dependencies: - xlsxwriter=0.5.2 - xlwt=0.7.5 # universal - - 
pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - moto==1.3.4 diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml index 1a65d292ef085..6f33bc58a8b21 100644 --- a/ci/deps/travis-36-doc.yaml +++ b/ci/deps/travis-36-doc.yaml @@ -41,6 +41,6 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - isort diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 36dbb8013104a..34b289e6c0c2f 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -28,7 +28,7 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - moto diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index f4b9091c4300b..46875d59411d9 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -25,7 +25,7 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - moto diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index e22529784b5ec..06fc0d76a3d16 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -33,7 +33,7 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-cov - pytest-mock diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index a8a5df5894ba5..f71d29fe13378 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -12,7 +12,7 @@ dependencies: - nomkl - pyarrow - pytz - - pytest + - pytest>=4.0.2 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 511936467641e..1270bfec098e8 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -731,7 +731,7 @@ extensions in `numpy.testing .. note:: - The earliest supported pytest version is 3.6.0. + The earliest supported pytest version is 4.0.2. 
Writing tests ~~~~~~~~~~~~~ diff --git a/doc/source/install.rst b/doc/source/install.rst index 92364fcc9ebd2..5310667c403e5 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -202,7 +202,7 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of the code base as of this writing. To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest -`__ >= 3.6 and `Hypothesis +`__ >= 4.0.2 and `Hypothesis `__ >= 3.58, then run: :: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 496a7c91f3ce9..d1fffbc9e2225 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -63,6 +63,20 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] +.. _whatsnew_0250.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`23519`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| pytest (dev) | 4.0.2 | | ++-----------------+-----------------+----------+ + .. 
_whatsnew_0250.api.other: Other API Changes diff --git a/environment.yml b/environment.yml index ce68dccca0c07..c1669c9f49017 100644 --- a/environment.yml +++ b/environment.yml @@ -19,7 +19,7 @@ dependencies: - hypothesis>=3.82 - isort - moto - - pytest>=4.0 + - pytest>=4.0.2 - pytest-mock - sphinx - numpydoc diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 18e8d415459fd..19b1cc700261c 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -11,7 +11,7 @@ def test(extra_args=None): try: import pytest except ImportError: - raise ImportError("Need pytest>=3.0 to run tests") + raise ImportError("Need pytest>=4.0.2 to run tests") try: import hypothesis # noqa except ImportError: diff --git a/requirements-dev.txt b/requirements-dev.txt index 22c01ebcef7f0..be84c6f29fdeb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,7 +10,7 @@ gitpython hypothesis>=3.82 isort moto -pytest>=4.0 +pytest>=4.0.2 pytest-mock sphinx numpydoc diff --git a/setup.cfg b/setup.cfg index 956aa23839e73..84b8f69a83f16 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ split_penalty_after_opening_bracket = 1000000 split_penalty_logical_operator = 30 [tool:pytest] +minversion = 4.0.2 testpaths = pandas markers = single: mark a test as single cpu only From 72367b7d2eed54e5063c17360a1506b853f9df30 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 28 Feb 2019 13:33:31 +0000 Subject: [PATCH 040/110] STY: use pytest.raises context manager (indexes) (#25447) --- .../tests/indexes/interval/test_interval.py | 38 ++++++--- pandas/tests/indexes/test_base.py | 47 ++++++----- pandas/tests/indexes/test_category.py | 77 +++++++++++-------- pandas/tests/indexes/test_common.py | 20 ++++- pandas/tests/indexes/test_numeric.py | 71 ++++++++++++----- .../indexes/timedeltas/test_arithmetic.py | 26 +++++-- .../indexes/timedeltas/test_construction.py | 11 ++- pandas/tests/indexes/timedeltas/test_ops.py | 4 +- .../timedeltas/test_partial_slicing.py | 4 +- 
.../indexes/timedeltas/test_timedelta.py | 10 ++- pandas/tests/indexes/timedeltas/test_tools.py | 16 +++- 11 files changed, 222 insertions(+), 102 deletions(-) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index e4f25ff143273..ba451da10573a 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -403,13 +403,16 @@ def test_get_item(self, closed): # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_get_loc_value(self): - pytest.raises(KeyError, self.index.get_loc, 0) + with pytest.raises(KeyError, match="^0$"): + self.index.get_loc(0) assert self.index.get_loc(0.5) == 0 assert self.index.get_loc(1) == 0 assert self.index.get_loc(1.5) == 1 assert self.index.get_loc(2) == 1 - pytest.raises(KeyError, self.index.get_loc, -1) - pytest.raises(KeyError, self.index.get_loc, 3) + with pytest.raises(KeyError, match="^-1$"): + self.index.get_loc(-1) + with pytest.raises(KeyError, match="^3$"): + self.index.get_loc(3) idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) assert idx.get_loc(0.5) == 0 @@ -419,10 +422,12 @@ def test_get_loc_value(self): tm.assert_numpy_array_equal(np.sort(idx.get_loc(2)), np.array([0, 1], dtype='intp')) assert idx.get_loc(3) == 1 - pytest.raises(KeyError, idx.get_loc, 3.5) + with pytest.raises(KeyError, match=r"^3\.5$"): + idx.get_loc(3.5) idx = IntervalIndex.from_arrays([0, 2], [1, 3]) - pytest.raises(KeyError, idx.get_loc, 1.5) + with pytest.raises(KeyError, match=r"^1\.5$"): + idx.get_loc(1.5) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def slice_locs_cases(self, breaks): @@ -486,7 +491,9 @@ def test_slice_locs_decreasing_float64(self): # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_slice_locs_fails(self): index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) - with pytest.raises(KeyError): + msg = ("'can only get slices from an 
IntervalIndex if bounds are" + " non-overlapping and all monotonic increasing or decreasing'") + with pytest.raises(KeyError, match=msg): index.slice_locs(1, 2) # To be removed, replaced by test_interval_new.py (see #16316, #16386) @@ -494,9 +501,12 @@ def test_get_loc_interval(self): assert self.index.get_loc(Interval(0, 1)) == 0 assert self.index.get_loc(Interval(0, 0.5)) == 0 assert self.index.get_loc(Interval(0, 1, 'left')) == 0 - pytest.raises(KeyError, self.index.get_loc, Interval(2, 3)) - pytest.raises(KeyError, self.index.get_loc, - Interval(-1, 0, 'left')) + msg = r"Interval\(2, 3, closed='right'\)" + with pytest.raises(KeyError, match=msg): + self.index.get_loc(Interval(2, 3)) + msg = r"Interval\(-1, 0, closed='left'\)" + with pytest.raises(KeyError, match=msg): + self.index.get_loc(Interval(-1, 0, 'left')) # Make consistent with test_interval_new.py (see #16316, #16386) @pytest.mark.parametrize('item', [3, Interval(1, 4)]) @@ -981,9 +991,11 @@ def test_comparison(self): self.index > 0 with pytest.raises(TypeError, match='unorderable types'): self.index <= 0 - with pytest.raises(TypeError): + msg = r"unorderable types: Interval\(\) > int\(\)" + with pytest.raises(TypeError, match=msg): self.index > np.arange(2) - with pytest.raises(ValueError): + msg = "Lengths must match to compare" + with pytest.raises(ValueError, match=msg): self.index > np.arange(3) def test_missing_values(self, closed): @@ -993,7 +1005,9 @@ def test_missing_values(self, closed): [np.nan, 0, 1], [np.nan, 1, 2], closed=closed) assert idx.equals(idx2) - with pytest.raises(ValueError): + msg = ("missing values must be missing in the same location both left" + " and right sides") + with pytest.raises(ValueError, match=msg): IntervalIndex.from_arrays( [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8415bab802239..26dcf7d6bc234 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta import math import operator +import re import sys import numpy as np @@ -107,7 +108,10 @@ def test_constructor_copy(self): def test_constructor_corner(self): # corner case - pytest.raises(TypeError, Index, 0) + msg = (r"Index\(\.\.\.\) must be called with a collection of some" + " kind, 0 was passed") + with pytest.raises(TypeError, match=msg): + Index(0) @pytest.mark.parametrize("index_vals", [ [('A', 1), 'B'], ['B', ('A', 1)]]) @@ -488,21 +492,22 @@ def test_constructor_cast(self): Index(["a", "b", "c"], dtype=float) def test_view_with_args(self): - restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', 'empty'] - - for i in restricted: - ind = self.indices[i] - - # with arguments - pytest.raises(TypeError, lambda: ind.view('i8')) - - # these are ok for i in list(set(self.indices.keys()) - set(restricted)): ind = self.indices[i] + ind.view('i8') - # with arguments + @pytest.mark.parametrize('index_type', [ + 'unicodeIndex', + 'strIndex', + pytest.param('catIndex', marks=pytest.mark.xfail(reason="gh-25464")), + 'boolIndex', + 'empty']) + def test_view_with_args_object_array_raises(self, index_type): + ind = self.indices[index_type] + msg = "Cannot change data-type for object array" + with pytest.raises(TypeError, match=msg): ind.view('i8') def test_astype(self): @@ -565,8 +570,8 @@ def test_delete(self, pos, expected): def test_delete_raises(self): index = Index(['a', 'b', 'c', 'd'], name='index') - with pytest.raises((IndexError, ValueError)): - # either depending on numpy version + msg = "index 5 is out of bounds for axis 0 with size 4" + with pytest.raises(IndexError, match=msg): index.delete(5) def test_identical(self): @@ -683,7 +688,9 @@ def test_empty_fancy_raises(self, attr): assert index[[]].identical(empty_index) # np.ndarray only accepts ndarray of int & bool dtypes, so should Index - pytest.raises(IndexError, index.__getitem__, empty_farr) + msg = 
r"arrays used as indices must be of integer \(or boolean\) type" + with pytest.raises(IndexError, match=msg): + index[empty_farr] @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): @@ -1426,13 +1433,14 @@ def test_get_indexer_strings(self, method, expected): def test_get_indexer_strings_raises(self): index = pd.Index(['b', 'c']) - with pytest.raises(TypeError): + msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" + with pytest.raises(TypeError, match=msg): index.get_indexer(['a', 'b', 'c', 'd'], method='nearest') - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): index.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): index.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=[2, 2, 2, 2]) @@ -1685,8 +1693,11 @@ def test_drop_tuple(self, values, to_drop): tm.assert_index_equal(result, expected) removed = index.drop(to_drop[1]) + msg = r"\"\[{}\] not found in axis\"".format( + re.escape(to_drop[1].__repr__())) for drop_me in to_drop[1], [to_drop[1]]: - pytest.raises(KeyError, removed.drop, drop_me) + with pytest.raises(KeyError, match=msg): + removed.drop(drop_me) @pytest.mark.parametrize("method,expected,sort", [ ('intersection', np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d889135160ae2..95fac2f6ae05b 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -181,18 +181,21 @@ def test_create_categorical(self): expected = Categorical(['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) - def test_disallow_set_ops(self): - + @pytest.mark.parametrize('func,op_name', [ + (lambda idx: idx - idx, '__sub__'), + (lambda idx: idx + idx, '__add__'), + (lambda idx: idx - ['a', 'b'], '__sub__'), + (lambda idx: idx + ['a', 'b'], '__add__'), + (lambda idx: ['a', 'b'] 
- idx, '__rsub__'), + (lambda idx: ['a', 'b'] + idx, '__radd__'), + ]) + def test_disallow_set_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(['a', 'b'])) - - pytest.raises(TypeError, lambda: idx - idx) - pytest.raises(TypeError, lambda: idx + idx) - pytest.raises(TypeError, lambda: idx - ['a', 'b']) - pytest.raises(TypeError, lambda: idx + ['a', 'b']) - pytest.raises(TypeError, lambda: ['a', 'b'] - idx) - pytest.raises(TypeError, lambda: ['a', 'b'] + idx) + msg = "cannot perform {} with this index type: CategoricalIndex" + with pytest.raises(TypeError, match=msg.format(op_name)): + func(idx) def test_method_delegation(self): @@ -231,8 +234,9 @@ def test_method_delegation(self): list('aabbca'), categories=list('cabdef'), ordered=True)) # invalid - pytest.raises(ValueError, lambda: ci.set_categories( - list('cab'), inplace=True)) + msg = "cannot use inplace with CategoricalIndex" + with pytest.raises(ValueError, match=msg): + ci.set_categories(list('cab'), inplace=True) def test_contains(self): @@ -357,12 +361,11 @@ def test_append(self): tm.assert_index_equal(result, ci, exact=True) # appending with different categories or reordered is not ok - pytest.raises( - TypeError, - lambda: ci.append(ci.values.set_categories(list('abcd')))) - pytest.raises( - TypeError, - lambda: ci.append(ci.values.reorder_categories(list('abc')))) + msg = "all inputs must be Index" + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.set_categories(list('abcd'))) + with pytest.raises(TypeError, match=msg): + ci.append(ci.values.reorder_categories(list('abc'))) # with objects result = ci.append(Index(['c', 'a'])) @@ -370,7 +373,9 @@ def test_append(self): tm.assert_index_equal(result, expected, exact=True) # invalid objects - pytest.raises(TypeError, lambda: ci.append(Index(['a', 'd']))) + msg = "cannot append a non-category item to a CategoricalIndex" + with pytest.raises(TypeError, match=msg): + ci.append(Index(['a', 'd'])) 
# GH14298 - if base object is not categorical -> coerce to object result = Index(['c', 'a']).append(ci) @@ -406,7 +411,10 @@ def test_insert(self): tm.assert_index_equal(result, expected, exact=True) # invalid - pytest.raises(TypeError, lambda: ci.insert(0, 'd')) + msg = ("cannot insert an item into a CategoricalIndex that is not" + " already an existing category") + with pytest.raises(TypeError, match=msg): + ci.insert(0, 'd') # GH 18295 (test missing) expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b']) @@ -633,12 +641,16 @@ def test_get_indexer(self): r1 = idx1.get_indexer(idx2) assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) - pytest.raises(NotImplementedError, - lambda: idx2.get_indexer(idx1, method='pad')) - pytest.raises(NotImplementedError, - lambda: idx2.get_indexer(idx1, method='backfill')) - pytest.raises(NotImplementedError, - lambda: idx2.get_indexer(idx1, method='nearest')) + msg = ("method='pad' and method='backfill' not implemented yet for" + " CategoricalIndex") + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method='pad') + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method='backfill') + + msg = "method='nearest' not implemented yet for CategoricalIndex" + with pytest.raises(NotImplementedError, match=msg): + idx2.get_indexer(idx1, method='nearest') def test_get_loc(self): # GH 12531 @@ -776,12 +788,15 @@ def test_equals_categorical(self): # invalid comparisons with pytest.raises(ValueError, match="Lengths must match"): ci1 == Index(['a', 'b', 'c']) - pytest.raises(TypeError, lambda: ci1 == ci2) - pytest.raises( - TypeError, lambda: ci1 == Categorical(ci1.values, ordered=False)) - pytest.raises( - TypeError, - lambda: ci1 == Categorical(ci1.values, categories=list('abc'))) + + msg = ("categorical index comparisons must have the same categories" + " and ordered attributes") + with pytest.raises(TypeError, match=msg): + ci1 == ci2 + with 
pytest.raises(TypeError, match=msg): + ci1 == Categorical(ci1.values, ordered=False) + with pytest.raises(TypeError, match=msg): + ci1 == Categorical(ci1.values, categories=list('abc')) # tests # make sure that we are testing for category inclusion properly diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index fd356202a8ce5..03448129a48fc 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -3,6 +3,8 @@ any index subclass. Makes use of the `indices` fixture defined in pandas/tests/indexes/conftest.py. """ +import re + import numpy as np import pytest @@ -189,8 +191,14 @@ def test_unique(self, indices): result = indices.unique(level=level) tm.assert_index_equal(result, expected) - for level in 3, 'wrong': - pytest.raises((IndexError, KeyError), indices.unique, level=level) + msg = "Too many levels: Index has only 1 level, not 4" + with pytest.raises(IndexError, match=msg): + indices.unique(level=3) + + msg = r"Level wrong must be same as name \({}\)".format( + re.escape(indices.name.__repr__())) + with pytest.raises(KeyError, match=msg): + indices.unique(level='wrong') def test_get_unique_index(self, indices): # MultiIndex tested separately @@ -239,12 +247,16 @@ def test_get_unique_index(self, indices): tm.assert_index_equal(result, expected) def test_sort(self, indices): - pytest.raises(TypeError, indices.sort) + msg = "cannot sort an Index object in-place, use sort_values instead" + with pytest.raises(TypeError, match=msg): + indices.sort() def test_mutability(self, indices): if not len(indices): pytest.skip('Skip check for empty Index') - pytest.raises(TypeError, indices.__setitem__, 0, indices[0]) + msg = "Index does not support mutable operations" + with pytest.raises(TypeError, match=msg): + indices[0] = indices[0] def test_view(self, indices): assert indices.view().name == indices.name diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 
a64340c02cd22..26413f4519eff 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- from datetime import datetime +import re import numpy as np import pytest from pandas._libs.tslibs import Timestamp -from pandas.compat import range +from pandas.compat import PY2, range import pandas as pd from pandas import Float64Index, Index, Int64Index, Series, UInt64Index +from pandas.api.types import pandas_dtype from pandas.tests.indexes.common import Base import pandas.util.testing as tm @@ -153,12 +155,22 @@ def test_constructor(self): result = Index(np.array([np.nan])) assert pd.isna(result.values).all() + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_constructor_invalid(self): # invalid - pytest.raises(TypeError, Float64Index, 0.) - pytest.raises(TypeError, Float64Index, ['a', 'b', 0.]) - pytest.raises(TypeError, Float64Index, [Timestamp('20130101')]) + msg = (r"Float64Index\(\.\.\.\) must be called with a collection of" + r" some kind, 0\.0 was passed") + with pytest.raises(TypeError, match=msg): + Float64Index(0.) 
+ msg = ("String dtype not supported, you may need to explicitly cast to" + " a numeric type") + with pytest.raises(TypeError, match=msg): + Float64Index(['a', 'b', 0.]) + msg = (r"float\(\) argument must be a string or a number, not" + " 'Timestamp'") + with pytest.raises(TypeError, match=msg): + Float64Index([Timestamp('20130101')]) def test_constructor_coerce(self): @@ -216,12 +228,17 @@ def test_astype(self): # invalid for dtype in ['M8[ns]', 'm8[ns]']: - pytest.raises(TypeError, lambda: i.astype(dtype)) + msg = ("Cannot convert Float64Index to dtype {}; integer values" + " are required for conversion").format(pandas_dtype(dtype)) + with pytest.raises(TypeError, match=re.escape(msg)): + i.astype(dtype) # GH 13149 for dtype in ['int16', 'int32', 'int64']: i = Float64Index([0, 1.1, np.NAN]) - pytest.raises(ValueError, lambda: i.astype(dtype)) + msg = "Cannot convert NA to integer" + with pytest.raises(ValueError, match=msg): + i.astype(dtype) def test_type_coercion_fail(self, any_int_dtype): # see gh-15832 @@ -275,12 +292,16 @@ def test_get_loc(self): assert idx.get_loc(1.1, method) == loc assert idx.get_loc(1.1, method, tolerance=0.9) == loc - pytest.raises(KeyError, idx.get_loc, 'foo') - pytest.raises(KeyError, idx.get_loc, 1.5) - pytest.raises(KeyError, idx.get_loc, 1.5, method='pad', - tolerance=0.1) - pytest.raises(KeyError, idx.get_loc, True) - pytest.raises(KeyError, idx.get_loc, False) + with pytest.raises(KeyError, match="^'foo'$"): + idx.get_loc('foo') + with pytest.raises(KeyError, match=r"^1\.5$"): + idx.get_loc(1.5) + with pytest.raises(KeyError, match=r"^1\.5$"): + idx.get_loc(1.5, method='pad', tolerance=0.1) + with pytest.raises(KeyError, match="^True$"): + idx.get_loc(True) + with pytest.raises(KeyError, match="^False$"): + idx.get_loc(False) with pytest.raises(ValueError, match='must be numeric'): idx.get_loc(1.4, method='nearest', tolerance='foo') @@ -310,15 +331,20 @@ def test_get_loc_na(self): # not representable by slice idx = 
Float64Index([np.nan, 1, np.nan, np.nan]) assert idx.get_loc(1) == 1 - pytest.raises(KeyError, idx.slice_locs, np.nan) + msg = "'Cannot get left slice bound for non-unique label: nan" + with pytest.raises(KeyError, match=msg): + idx.slice_locs(np.nan) def test_get_loc_missing_nan(self): # GH 8569 idx = Float64Index([1, 2]) assert idx.get_loc(1) == 0 - pytest.raises(KeyError, idx.get_loc, 3) - pytest.raises(KeyError, idx.get_loc, np.nan) - pytest.raises(KeyError, idx.get_loc, [np.nan]) + with pytest.raises(KeyError, match=r"^3\.0$"): + idx.get_loc(3) + with pytest.raises(KeyError, match="^nan$"): + idx.get_loc(np.nan) + with pytest.raises(KeyError, match=r"^\[nan\]$"): + idx.get_loc([np.nan]) def test_contains_nans(self): i = Float64Index([1.0, 2.0, np.nan]) @@ -499,13 +525,17 @@ def test_union_noncomparable(self): tm.assert_index_equal(result, expected) def test_cant_or_shouldnt_cast(self): + msg = ("String dtype not supported, you may need to explicitly cast to" + " a numeric type") # can't data = ['foo', 'bar', 'baz'] - pytest.raises(TypeError, self._holder, data) + with pytest.raises(TypeError, match=msg): + self._holder(data) # shouldn't data = ['0', '1', '2'] - pytest.raises(TypeError, self._holder, data) + with pytest.raises(TypeError, match=msg): + self._holder(data) def test_view_index(self): self.index.view(Index) @@ -576,7 +606,10 @@ def test_constructor(self): tm.assert_index_equal(index, expected) # scalar raise Exception - pytest.raises(TypeError, Int64Index, 5) + msg = (r"Int64Index\(\.\.\.\) must be called with a collection of some" + " kind, 5 was passed") + with pytest.raises(TypeError, match=msg): + Int64Index(5) # copy arr = self.index.values diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 04977023d7c62..3173252e174ab 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -198,20 +198,34 @@ def 
test_ops_ndarray(self): expected = pd.to_timedelta(['2 days']).values tm.assert_numpy_array_equal(td + other, expected) tm.assert_numpy_array_equal(other + td, expected) - pytest.raises(TypeError, lambda: td + np.array([1])) - pytest.raises(TypeError, lambda: np.array([1]) + td) + msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" + with pytest.raises(TypeError, match=msg): + td + np.array([1]) + msg = (r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and" + " 'Timedelta'") + with pytest.raises(TypeError, match=msg): + np.array([1]) + td expected = pd.to_timedelta(['0 days']).values tm.assert_numpy_array_equal(td - other, expected) tm.assert_numpy_array_equal(-other + td, expected) - pytest.raises(TypeError, lambda: td - np.array([1])) - pytest.raises(TypeError, lambda: np.array([1]) - td) + msg = r"unsupported operand type\(s\) for -: 'Timedelta' and 'int'" + with pytest.raises(TypeError, match=msg): + td - np.array([1]) + msg = (r"unsupported operand type\(s\) for -: 'numpy.ndarray' and" + " 'Timedelta'") + with pytest.raises(TypeError, match=msg): + np.array([1]) - td expected = pd.to_timedelta(['2 days']).values tm.assert_numpy_array_equal(td * np.array([2]), expected) tm.assert_numpy_array_equal(np.array([2]) * td, expected) - pytest.raises(TypeError, lambda: td * other) - pytest.raises(TypeError, lambda: other * td) + msg = ("ufunc multiply cannot use operands with types" + r" dtype\(' Date: Thu, 28 Feb 2019 13:34:21 +0000 Subject: [PATCH 041/110] STY: use pytest.raises context manager (tests/test_*) (#25452) * STY: use pytest.raises context manager (tests/test_*) * fix ci failures * skip py2 ci failure --- pandas/tests/test_algos.py | 25 +++++--- pandas/tests/test_config.py | 104 +++++++++++++++++++++----------- pandas/tests/test_multilevel.py | 10 ++- pandas/tests/test_nanops.py | 18 ++++-- pandas/tests/test_sorting.py | 12 +++- pandas/tests/test_strings.py | 50 +++++++++------ pandas/tests/test_window.py | 66 ++++++++++++-------- 7 
files changed, 193 insertions(+), 92 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cb7426ce2f7c9..c56bf944699e2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -11,7 +11,7 @@ from pandas._libs import ( algos as libalgos, groupby as libgroupby, hashtable as ht) -from pandas.compat import lrange, range +from pandas.compat import PY2, lrange, range from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -224,11 +224,16 @@ def test_factorize_tuple_list(self, data, expected_label, expected_level): dtype=object) tm.assert_numpy_array_equal(result[1], expected_level_array) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_complex_sorting(self): # gh 12666 - check no segfault x17 = np.array([complex(i) for i in range(17)], dtype=object) - pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True) + msg = ("'<' not supported between instances of 'complex' and" + r" 'complex'|" + r"unorderable types: complex\(\) > complex\(\)") + with pytest.raises(TypeError, match=msg): + algos.factorize(x17[::-1], sort=True) def test_float64_factorize(self, writable): data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) @@ -589,9 +594,14 @@ class TestIsin(object): def test_invalid(self): - pytest.raises(TypeError, lambda: algos.isin(1, 1)) - pytest.raises(TypeError, lambda: algos.isin(1, [1])) - pytest.raises(TypeError, lambda: algos.isin([1], 1)) + msg = (r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[int\]") + with pytest.raises(TypeError, match=msg): + algos.isin(1, 1) + with pytest.raises(TypeError, match=msg): + algos.isin(1, [1]) + with pytest.raises(TypeError, match=msg): + algos.isin([1], 1) def test_basic(self): @@ -819,8 +829,9 @@ def test_value_counts_dtypes(self): result = algos.value_counts(Series([1, 1., '1'])) # object assert len(result) == 2 - pytest.raises(TypeError, lambda 
s: algos.value_counts(s, bins=1), - ['1', 1]) + msg = "bins argument only works with numeric data" + with pytest.raises(TypeError, match=msg): + algos.value_counts(['1', 1], bins=1) def test_value_counts_nat(self): td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]') diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index 54db3887850ea..baca66e0361ad 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -3,7 +3,10 @@ import pytest +from pandas.compat import PY2 + import pandas as pd +from pandas.core.config import OptionError class TestConfig(object): @@ -48,26 +51,35 @@ def test_is_one_of_factory(self): v(12) v(None) - pytest.raises(ValueError, v, 1.1) + msg = r"Value must be one of None\|12" + with pytest.raises(ValueError, match=msg): + v(1.1) def test_register_option(self): self.cf.register_option('a', 1, 'doc') # can't register an already registered option - pytest.raises(KeyError, self.cf.register_option, 'a', 1, 'doc') + msg = "Option 'a' has already been registered" + with pytest.raises(OptionError, match=msg): + self.cf.register_option('a', 1, 'doc') # can't register an already registered option - pytest.raises(KeyError, self.cf.register_option, 'a.b.c.d1', 1, - 'doc') - pytest.raises(KeyError, self.cf.register_option, 'a.b.c.d2', 1, - 'doc') + msg = "Path prefix to option 'a' is already an option" + with pytest.raises(OptionError, match=msg): + self.cf.register_option('a.b.c.d1', 1, 'doc') + with pytest.raises(OptionError, match=msg): + self.cf.register_option('a.b.c.d2', 1, 'doc') # no python keywords - pytest.raises(ValueError, self.cf.register_option, 'for', 0) - pytest.raises(ValueError, self.cf.register_option, 'a.for.b', 0) + msg = "for is a python keyword" + with pytest.raises(ValueError, match=msg): + self.cf.register_option('for', 0) + with pytest.raises(ValueError, match=msg): + self.cf.register_option('a.for.b', 0) # must be valid identifier (ensure attribute access works) - 
pytest.raises(ValueError, self.cf.register_option, - 'Oh my Goddess!', 0) + msg = "oh my goddess! is not a valid identifier" + with pytest.raises(ValueError, match=msg): + self.cf.register_option('Oh my Goddess!', 0) # we can register options several levels deep # without predefining the intermediate steps @@ -90,7 +102,9 @@ def test_describe_option(self): self.cf.register_option('l', "foo") # non-existent keys raise KeyError - pytest.raises(KeyError, self.cf.describe_option, 'no.such.key') + msg = r"No such keys\(s\)" + with pytest.raises(OptionError, match=msg): + self.cf.describe_option('no.such.key') # we can get the description for any key we registered assert 'doc' in self.cf.describe_option('a', _print_desc=False) @@ -122,7 +136,9 @@ def test_case_insensitive(self): assert self.cf.get_option('kAnBaN') == 2 # gets of non-existent keys fail - pytest.raises(KeyError, self.cf.get_option, 'no_such_option') + msg = r"No such keys\(s\): 'no_such_option'" + with pytest.raises(OptionError, match=msg): + self.cf.get_option('no_such_option') self.cf.deprecate_option('KanBan') assert self.cf._is_deprecated('kAnBaN') @@ -138,7 +154,9 @@ def test_get_option(self): assert self.cf.get_option('b.b') is None # gets of non-existent keys fail - pytest.raises(KeyError, self.cf.get_option, 'no_such_option') + msg = r"No such keys\(s\): 'no_such_option'" + with pytest.raises(OptionError, match=msg): + self.cf.get_option('no_such_option') def test_set_option(self): self.cf.register_option('a', 1, 'doc') @@ -157,16 +175,24 @@ def test_set_option(self): assert self.cf.get_option('b.c') == 'wurld' assert self.cf.get_option('b.b') == 1.1 - pytest.raises(KeyError, self.cf.set_option, 'no.such.key', None) + msg = r"No such keys\(s\): 'no.such.key'" + with pytest.raises(OptionError, match=msg): + self.cf.set_option('no.such.key', None) def test_set_option_empty_args(self): - pytest.raises(ValueError, self.cf.set_option) + msg = "Must provide an even number of non-keyword arguments" + with 
pytest.raises(ValueError, match=msg): + self.cf.set_option() def test_set_option_uneven_args(self): - pytest.raises(ValueError, self.cf.set_option, 'a.b', 2, 'b.c') + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + self.cf.set_option('a.b', 2, 'b.c') def test_set_option_invalid_single_argument_type(self): - pytest.raises(ValueError, self.cf.set_option, 2) + msg = "Must provide an even number of non-keyword arguments" + with pytest.raises(ValueError, match=msg): + self.cf.set_option(2) def test_set_option_multiple(self): self.cf.register_option('a', 1, 'doc') @@ -183,27 +209,36 @@ def test_set_option_multiple(self): assert self.cf.get_option('b.c') is None assert self.cf.get_option('b.b') == 10.0 + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_validation(self): self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) self.cf.register_option('b.c', 'hullo', 'doc2', validator=self.cf.is_text) - pytest.raises(ValueError, self.cf.register_option, 'a.b.c.d2', - 'NO', 'doc', validator=self.cf.is_int) + msg = "Value must have type '<class 'int'>'" + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + 'a.b.c.d2', 'NO', 'doc', validator=self.cf.is_int) self.cf.set_option('a', 2) # int is_int self.cf.set_option('b.c', 'wurld') # str is_str - pytest.raises( - ValueError, self.cf.set_option, 'a', None) # None not is_int - pytest.raises(ValueError, self.cf.set_option, 'a', 'ab') - pytest.raises(ValueError, self.cf.set_option, 'b.c', 1) + # None not is_int + with pytest.raises(ValueError, match=msg): + self.cf.set_option('a', None) + with pytest.raises(ValueError, match=msg): + self.cf.set_option('a', 'ab') + + msg = r"Value must be an instance of <class 'str'>\|<class 'bytes'>" + with pytest.raises(ValueError, match=msg): + self.cf.set_option('b.c', 1) validator = self.cf.is_one_of_factory([None, self.cf.is_callable]) self.cf.register_option('b', lambda: None, 'doc', validator=validator) 
self.cf.set_option('b', '%.1f'.format) # Formatter is callable self.cf.set_option('b', None) # Formatter is none (default) - pytest.raises(ValueError, self.cf.set_option, 'b', '%.1f') + with pytest.raises(ValueError, match="Value must be a callable"): + self.cf.set_option('b', '%.1f') def test_reset_option(self): self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) @@ -267,8 +302,9 @@ def test_deprecate_option(self): assert 'eprecated' in str(w[-1]) # we get the default message assert 'nifty_ver' in str(w[-1]) # with the removal_ver quoted - pytest.raises( - KeyError, self.cf.deprecate_option, 'a') # can't depr. twice + msg = "Option 'a' has already been defined as deprecated" + with pytest.raises(OptionError, match=msg): + self.cf.deprecate_option('a') self.cf.deprecate_option('b.c', 'zounds!') with warnings.catch_warnings(record=True) as w: @@ -374,12 +410,6 @@ def eq(val): def test_attribute_access(self): holder = [] - def f(): - options.b = 1 - - def f2(): - options.display = 1 - def f3(key): holder.append(True) @@ -397,8 +427,11 @@ def f3(key): self.cf.reset_option("a") assert options.a == self.cf.get_option("a", 0) - pytest.raises(KeyError, f) - pytest.raises(KeyError, f2) + msg = "You can only set the value of existing options" + with pytest.raises(OptionError, match=msg): + options.b = 1 + with pytest.raises(OptionError, match=msg): + options.display = 1 # make sure callback kicks when using this form of setting options.c = 1 @@ -429,5 +462,6 @@ def test_option_context_scope(self): def test_dictwrapper_getattr(self): options = self.cf.options # GH 19789 - pytest.raises(self.cf.OptionError, getattr, options, 'bananas') + with pytest.raises(OptionError, match="No such option"): + options.bananas assert not hasattr(options, 'bananas') diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 4ea7e9b8ec9a4..a9a59c6d95373 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -886,8 +886,11 @@ 
def test_count(self): tm.assert_series_equal(result, expect, check_names=False) assert result.index.name == 'a' - pytest.raises(KeyError, series.count, 'x') - pytest.raises(KeyError, frame.count, level='x') + msg = "Level x not found" + with pytest.raises(KeyError, match=msg): + series.count('x') + with pytest.raises(KeyError, match=msg): + frame.count(level='x') @pytest.mark.parametrize('op', AGG_FUNCTIONS) @pytest.mark.parametrize('level', [0, 1]) @@ -1119,7 +1122,8 @@ def test_level_with_tuples(self): tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) - pytest.raises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2)) + with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"): + series[('foo', 'bar', 0), 2] result = frame.loc[('foo', 'bar', 0)] result2 = frame.xs(('foo', 'bar', 0)) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index cf5ef6cf15eca..d1893b7efbc41 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -7,6 +7,7 @@ import numpy as np import pytest +from pandas.compat import PY2 from pandas.compat.numpy import _np_version_under1p13 import pandas.util._test_decorators as td @@ -728,6 +729,7 @@ def test_numeric_values(self): # Test complex assert nanops._ensure_numeric(1 + 2j) == 1 + 2j + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_ndarray(self): # Test numeric ndarray values = np.array([1, 2, 3]) @@ -743,7 +745,9 @@ def test_ndarray(self): # Test non-convertible string ndarray s_values = np.array(['foo', 'bar', 'baz'], dtype=object) - pytest.raises(ValueError, lambda: nanops._ensure_numeric(s_values)) + msg = r"could not convert string to float: '(foo|baz)'" + with pytest.raises(ValueError, match=msg): + nanops._ensure_numeric(s_values) def test_convertable_values(self): assert np.allclose(nanops._ensure_numeric('1'), 1.0) @@ -751,9 +755,15 @@ def test_convertable_values(self): assert np.allclose(nanops._ensure_numeric('1+1j'), 
1 + 1j) def test_non_convertable_values(self): - pytest.raises(TypeError, lambda: nanops._ensure_numeric('foo')) - pytest.raises(TypeError, lambda: nanops._ensure_numeric({})) - pytest.raises(TypeError, lambda: nanops._ensure_numeric([])) + msg = "Could not convert foo to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric('foo') + msg = "Could not convert {} to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric({}) + msg = r"Could not convert \[\] to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric([]) class TestNanvarFixedValues(object): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 7500cbb3cfc3a..e83bdb1af9121 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -7,6 +7,8 @@ from numpy import nan import pytest +from pandas.compat import PY2 + from pandas import DataFrame, MultiIndex, Series, compat, concat, merge from pandas.core import common as com from pandas.core.sorting import ( @@ -403,15 +405,21 @@ def test_mixed_integer_from_list(self): expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) + msg = ("'<' not supported between instances of 'datetime.datetime'" + r" and 'int'|" + r"unorderable types: int\(\) > datetime.datetime\(\)") if compat.PY2: # RuntimeWarning: tp_compare didn't return -1 or -2 for exception with warnings.catch_warnings(): - pytest.raises(TypeError, safe_sort, arr) + with pytest.raises(TypeError, match=msg): + safe_sort(arr) else: - pytest.raises(TypeError, safe_sort, arr) + with pytest.raises(TypeError, match=msg): + safe_sort(arr) def test_exceptions(self): with pytest.raises(TypeError, diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 
7cea3be03d1a7..bbcdc24f58f9b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -10,7 +10,7 @@ import pytest import pandas.compat as compat -from pandas.compat import PY3, range, u +from pandas.compat import PY2, PY3, range, u from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna import pandas.core.strings as strings @@ -1002,11 +1002,13 @@ def test_replace(self): tm.assert_series_equal(result, exp) # GH 13438 + msg = "repl must be a string or callable" for klass in (Series, Index): for repl in (None, 3, {'a': 'b'}): for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']): values = klass(data) - pytest.raises(TypeError, values.str.replace, 'a', repl) + with pytest.raises(TypeError, match=msg): + values.str.replace('a', repl) def test_replace_callable(self): # GH 15055 @@ -1123,10 +1125,14 @@ def test_replace_literal(self): callable_repl = lambda m: m.group(0).swapcase() compiled_pat = re.compile('[a-z][A-Z]{2}') - pytest.raises(ValueError, values.str.replace, 'abc', callable_repl, - regex=False) - pytest.raises(ValueError, values.str.replace, compiled_pat, '', - regex=False) + msg = "Cannot use a callable replacement when regex=False" + with pytest.raises(ValueError, match=msg): + values.str.replace('abc', callable_repl, regex=False) + + msg = ("Cannot use a compiled regex as replacement pattern with" + " regex=False") + with pytest.raises(ValueError, match=msg): + values.str.replace(compiled_pat, '', regex=False) def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) @@ -1242,12 +1248,13 @@ def test_extract_expand_False(self): for klass in [Series, Index]: # no groups s_or_idx = klass(['A1', 'B2', 'C3']) - f = lambda: s_or_idx.str.extract('[ABC][123]', expand=False) - pytest.raises(ValueError, f) + msg = "pattern contains no capture groups" + with pytest.raises(ValueError, match=msg): + s_or_idx.str.extract('[ABC][123]', expand=False) # only non-capturing groups - f = lambda: 
s_or_idx.str.extract('(?:[AB]).*', expand=False) - pytest.raises(ValueError, f) + with pytest.raises(ValueError, match=msg): + s_or_idx.str.extract('(?:[AB]).*', expand=False) # single group renames series/index properly s_or_idx = klass(['A1', 'A2']) @@ -1387,12 +1394,13 @@ def test_extract_expand_True(self): for klass in [Series, Index]: # no groups s_or_idx = klass(['A1', 'B2', 'C3']) - f = lambda: s_or_idx.str.extract('[ABC][123]', expand=True) - pytest.raises(ValueError, f) + msg = "pattern contains no capture groups" + with pytest.raises(ValueError, match=msg): + s_or_idx.str.extract('[ABC][123]', expand=True) # only non-capturing groups - f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=True) - pytest.raises(ValueError, f) + with pytest.raises(ValueError, match=msg): + s_or_idx.str.extract('(?:[AB]).*', expand=True) # single group renames series/index properly s_or_idx = klass(['A1', 'A2']) @@ -3315,10 +3323,14 @@ def test_encode_decode(self): tm.assert_series_equal(result, exp) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_encode_decode_errors(self): encodeBase = Series([u('a'), u('b'), u('a\x9d')]) - pytest.raises(UnicodeEncodeError, encodeBase.str.encode, 'cp1252') + msg = (r"'charmap' codec can't encode character '\\x9d' in position 1:" + " character maps to <undefined>") + with pytest.raises(UnicodeEncodeError, match=msg): + encodeBase.str.encode('cp1252') f = lambda x: x.encode('cp1252', 'ignore') result = encodeBase.str.encode('cp1252', 'ignore') @@ -3327,7 +3339,10 @@ def test_encode_decode_errors(self): decodeBase = Series([b'a', b'b', b'a\x9d']) - pytest.raises(UnicodeDecodeError, decodeBase.str.decode, 'cp1252') + msg = ("'charmap' codec can't decode byte 0x9d in position 1:" + " character maps to <undefined>") + with pytest.raises(UnicodeDecodeError, match=msg): + decodeBase.str.decode('cp1252') f = lambda x: x.decode('cp1252', 'ignore') result = decodeBase.str.decode('cp1252', 'ignore') @@ -3418,7 +3433,8 @@ def 
test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: - pytest.raises(TypeError, lhs.str.cat, rhs) + with pytest.raises(TypeError, match="can't concat str to bytes"): + lhs.str.cat(rhs) else: result = lhs.str.cat(rhs) expected = Series(np.array( diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index e816d4c04344a..ce9d1888b8e96 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -89,9 +89,8 @@ def test_getitem(self): def test_select_bad_cols(self): df = DataFrame([[1, 2]], columns=['A', 'B']) g = df.rolling(window=5) - pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']] - - pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']] + with pytest.raises(KeyError, match="Columns not found: 'C'"): + g[['C']] with pytest.raises(KeyError, match='^[^A]+$'): # A should not be referenced as a bad column... # will have to rethink regex if you change message! 
@@ -102,7 +101,9 @@ def test_attribute_access(self): df = DataFrame([[1, 2]], columns=['A', 'B']) r = df.rolling(window=5) tm.assert_series_equal(r.A.sum(), r['A'].sum()) - pytest.raises(AttributeError, lambda: r.F) + msg = "'Rolling' object has no attribute 'F'" + with pytest.raises(AttributeError, match=msg): + r.F def tests_skip_nuisance(self): @@ -217,12 +218,11 @@ def test_agg_nested_dicts(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) r = df.rolling(window=3) - def f(): + msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" + with pytest.raises(SpecificationError, match=msg): r.aggregate({'r1': {'A': ['mean', 'sum']}, 'r2': {'B': ['mean', 'sum']}}) - pytest.raises(SpecificationError, f) - expected = concat([r['A'].mean(), r['A'].std(), r['B'].mean(), r['B'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( @@ -1806,26 +1806,38 @@ def test_ewm_alpha_arg(self): def test_ewm_domain_checks(self): # GH 12492 s = Series(self.arr) - # com must satisfy: com >= 0 - pytest.raises(ValueError, s.ewm, com=-0.1) + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): + s.ewm(com=-0.1) s.ewm(com=0.0) s.ewm(com=0.1) - # span must satisfy: span >= 1 - pytest.raises(ValueError, s.ewm, span=-0.1) - pytest.raises(ValueError, s.ewm, span=0.0) - pytest.raises(ValueError, s.ewm, span=0.9) + + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(span=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.0) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.9) s.ewm(span=1.0) s.ewm(span=1.1) - # halflife must satisfy: halflife > 0 - pytest.raises(ValueError, s.ewm, halflife=-0.1) - pytest.raises(ValueError, s.ewm, halflife=0.0) + + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=0.0) s.ewm(halflife=0.1) - # 
alpha must satisfy: 0 < alpha <= 1 - pytest.raises(ValueError, s.ewm, alpha=-0.1) - pytest.raises(ValueError, s.ewm, alpha=0.0) + + msg = "alpha must satisfy: 0 < alpha <= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=0.0) s.ewm(alpha=0.1) s.ewm(alpha=1.0) - pytest.raises(ValueError, s.ewm, alpha=1.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=1.1) @pytest.mark.parametrize('method', ['mean', 'vol', 'var']) def test_ew_empty_series(self, method): @@ -2598,7 +2610,10 @@ def get_result(obj, obj2=None): def test_flex_binary_moment(self): # GH3155 # don't blow the stack - pytest.raises(TypeError, rwindow._flex_binary_moment, 5, 6, None) + msg = ("arguments to moment function must be of type" + " np.ndarray/Series/DataFrame") + with pytest.raises(TypeError, match=msg): + rwindow._flex_binary_moment(5, 6, None) def test_corr_sanity(self): # GH 3155 @@ -2682,7 +2697,10 @@ def func(A, B, com, **kwargs): Series([1.]), Series([1.]), 50, min_periods=min_periods) tm.assert_series_equal(result, Series([np.NaN])) - pytest.raises(Exception, func, A, randn(50), 20, min_periods=5) + msg = "Input arrays must be of the same type!" 
+ # exception raised is Exception + with pytest.raises(Exception, match=msg): + func(A, randn(50), 20, min_periods=5) def test_expanding_apply_args_kwargs(self, raw): @@ -3266,9 +3284,9 @@ def setup_method(self, method): def test_mutated(self): - def f(): + msg = r"group\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=msg): self.frame.groupby('A', foo=1) - pytest.raises(TypeError, f) g = self.frame.groupby('A') assert not g.mutated From e52f06394253b4e0bac56fe3e146fd0f20fc620c Mon Sep 17 00:00:00 2001 From: Nicholas Musolino Date: Thu, 28 Feb 2019 08:35:01 -0500 Subject: [PATCH 042/110] Fix minor error in dynamic load function (#25256) --- scripts/tests/test_validate_docstrings.py | 28 +++++++++++++++++++++++ scripts/validate_docstrings.py | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index bb58449843096..09fb5a30cbc3b 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -4,6 +4,8 @@ import textwrap import pytest import numpy as np +import pandas as pd + import validate_docstrings validate_one = validate_docstrings.validate_one @@ -1004,6 +1006,32 @@ def test_item_subsection(self, idx, subsection): assert result[idx][3] == subsection +class TestDocstringClass(object): + @pytest.mark.parametrize('name, expected_obj', + [('pandas.isnull', pd.isnull), + ('pandas.DataFrame', pd.DataFrame), + ('pandas.Series.sum', pd.Series.sum)]) + def test_resolves_class_name(self, name, expected_obj): + d = validate_docstrings.Docstring(name) + assert d.obj is expected_obj + + @pytest.mark.parametrize('invalid_name', ['panda', 'panda.DataFrame']) + def test_raises_for_invalid_module_name(self, invalid_name): + msg = 'No module can be imported from "{}"'.format(invalid_name) + with pytest.raises(ImportError, match=msg): + validate_docstrings.Docstring(invalid_name) + + 
@pytest.mark.parametrize('invalid_name', + ['pandas.BadClassName', + 'pandas.Series.bad_method_name']) + def test_raises_for_invalid_attribute_name(self, invalid_name): + name_components = invalid_name.split('.') + obj_name, invalid_attr_name = name_components[-2], name_components[-1] + msg = "'{}' has no attribute '{}'".format(obj_name, invalid_attr_name) + with pytest.raises(AttributeError, match=msg): + validate_docstrings.Docstring(invalid_name) + + class TestMainFunction(object): def test_exit_status_for_validate_one(self, monkeypatch): monkeypatch.setattr( diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index bce33f7e78daa..20f32124a2532 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -267,7 +267,7 @@ def _load_obj(name): else: continue - if 'module' not in locals(): + if 'obj' not in locals(): raise ImportError('No module can be imported ' 'from "{}"'.format(name)) From 64e5612238225d65fd28ffc72ee91c3eddcc9449 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 28 Feb 2019 05:36:57 -0800 Subject: [PATCH 043/110] Cythonized GroupBy Quantile (#20405) --- asv_bench/benchmarks/groupby.py | 7 +- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/groupby.pxd | 6 ++ pandas/_libs/groupby.pyx | 101 +++++++++++++++++++++++++ pandas/core/groupby/groupby.py | 103 +++++++++++++++++++++++--- pandas/tests/groupby/test_function.py | 49 ++++++++++++ pandas/tests/groupby/test_groupby.py | 10 +-- 7 files changed, 258 insertions(+), 19 deletions(-) create mode 100644 pandas/_libs/groupby.pxd diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 59e43ee22afde..27d279bb90a31 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -14,7 +14,7 @@ method_blacklist = { 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', - 'var', 'mad', 'describe', 'std'}, + 'var', 'mad', 
'describe', 'std', 'quantile'}, 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', 'std'} @@ -316,8 +316,9 @@ class GroupByMethods(object): ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', - 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', - 'std', 'sum', 'tail', 'unique', 'value_counts', 'var'], + 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift', + 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts', + 'var'], ['direct', 'transformation']] def setup(self, dtype, method, application): diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d1fffbc9e2225..a591c498d00c3 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -112,6 +112,7 @@ Performance Improvements - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) +- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) .. 
_whatsnew_0250.bug_fixes: diff --git a/pandas/_libs/groupby.pxd b/pandas/_libs/groupby.pxd new file mode 100644 index 0000000000000..70ad8a62871e9 --- /dev/null +++ b/pandas/_libs/groupby.pxd @@ -0,0 +1,6 @@ +cdef enum InterpolationEnumType: + INTERPOLATION_LINEAR, + INTERPOLATION_LOWER, + INTERPOLATION_HIGHER, + INTERPOLATION_NEAREST, + INTERPOLATION_MIDPOINT diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e6b6e2c8a0055..71e25c3955a6d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -644,5 +644,106 @@ def _group_ohlc(floating[:, :] out, group_ohlc_float32 = _group_ohlc['float'] group_ohlc_float64 = _group_ohlc['double'] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_quantile(ndarray[float64_t] out, + ndarray[int64_t] labels, + numeric[:] values, + ndarray[uint8_t] mask, + float64_t q, + object interpolation): + """ + Calculate the quantile per group. + + Parameters + ---------- + out : ndarray + Array of aggregated values that will be written to. + labels : ndarray + Array containing the unique group labels. + values : ndarray + Array containing the values to apply the function against. + q : float + The quantile value to search for. + + Notes + ----- + Rather than explicitly returning a value, this function modifies the + provided `out` parameter. 
+ """ + cdef: + Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz + Py_ssize_t grp_start=0, idx=0 + int64_t lab + uint8_t interp + float64_t q_idx, frac, val, next_val + ndarray[int64_t] counts, non_na_counts, sort_arr + + assert values.shape[0] == N + inter_methods = { + 'linear': INTERPOLATION_LINEAR, + 'lower': INTERPOLATION_LOWER, + 'higher': INTERPOLATION_HIGHER, + 'nearest': INTERPOLATION_NEAREST, + 'midpoint': INTERPOLATION_MIDPOINT, + } + interp = inter_methods[interpolation] + + counts = np.zeros_like(out, dtype=np.int64) + non_na_counts = np.zeros_like(out, dtype=np.int64) + ngroups = len(counts) + + # First figure out the size of every group + with nogil: + for i in range(N): + lab = labels[i] + counts[lab] += 1 + if not mask[i]: + non_na_counts[lab] += 1 + + # Get an index of values sorted by labels and then values + order = (values, labels) + sort_arr = np.lexsort(order).astype(np.int64, copy=False) + + with nogil: + for i in range(ngroups): + # Figure out how many group elements there are + grp_sz = counts[i] + non_na_sz = non_na_counts[i] + + if non_na_sz == 0: + out[i] = NaN + else: + # Calculate where to retrieve the desired value + # Casting to int will intentionally truncate result + idx = grp_start + <int64_t>(q * <float64_t>(non_na_sz - 1)) + + val = values[sort_arr[idx]] + # If requested quantile falls evenly on a particular index + # then write that index's value out. Otherwise interpolate + q_idx = q * (non_na_sz - 1) + frac = q_idx % 1 + + if frac == 0.0 or interp == INTERPOLATION_LOWER: + out[i] = val + else: + next_val = values[sort_arr[idx + 1]] + if interp == INTERPOLATION_LINEAR: + out[i] = val + (next_val - val) * frac + elif interp == INTERPOLATION_HIGHER: + out[i] = next_val + elif interp == INTERPOLATION_MIDPOINT: + out[i] = (val + next_val) / 2.0 + elif interp == INTERPOLATION_NEAREST: + if frac > .5 or (frac == .5 and q > .5): # Always OK? 
+ out[i] = next_val + else: + out[i] = val + + # Increment the index reference in sorted_arr for the next group + grp_start += grp_sz + + # generated from template include "groupby_helper.pxi" diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c63bc5164e25b..c364f069bf53d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -29,6 +29,8 @@ class providing the base-class of operations. ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna +from pandas.api.types import ( + is_datetime64_dtype, is_integer_dtype, is_object_dtype) import pandas.core.algorithms as algorithms from pandas.core.base import ( DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) @@ -1024,15 +1026,17 @@ def _bool_agg(self, val_test, skipna): """ def objs_to_bool(vals): - try: - vals = vals.astype(np.bool) - except ValueError: # for objects + # type: np.ndarray -> (np.ndarray, typing.Type) + if is_object_dtype(vals): vals = np.array([bool(x) for x in vals]) + else: + vals = vals.astype(np.bool) - return vals.view(np.uint8) + return vals.view(np.uint8), np.bool - def result_to_bool(result): - return result.astype(np.bool, copy=False) + def result_to_bool(result, inference): + # type: (np.ndarray, typing.Type) -> np.ndarray + return result.astype(inference, copy=False) return self._get_cythonized_result('group_any_all', self.grouper, aggregate=True, @@ -1688,6 +1692,75 @@ def nth(self, n, dropna=None): return result + def quantile(self, q=0.5, interpolation='linear'): + """ + Return group values at the given quantile, a la numpy.percentile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value(s) between 0 and 1 providing the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + Method to use when the desired quantile falls between two points. 
+ + Returns + ------- + Series or DataFrame + Return type determined by caller of GroupBy object. + + See Also + -------- + Series.quantile : Similar method for Series. + DataFrame.quantile : Similar method for DataFrame. + numpy.percentile : NumPy method to compute qth percentile. + + Examples + -------- + >>> df = pd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + """ + + def pre_processor(vals): + # type: np.ndarray -> (np.ndarray, Optional[typing.Type]) + if is_object_dtype(vals): + raise TypeError("'quantile' cannot be performed against " + "'object' dtypes!") + + inference = None + if is_integer_dtype(vals): + inference = np.int64 + elif is_datetime64_dtype(vals): + inference = 'datetime64[ns]' + vals = vals.astype(np.float) + + return vals, inference + + def post_processor(vals, inference): + # type: (np.ndarray, Optional[typing.Type]) -> np.ndarray + if inference: + # Check for edge case + if not (is_integer_dtype(inference) and + interpolation in {'linear', 'midpoint'}): + vals = vals.astype(inference) + + return vals + + return self._get_cythonized_result('group_quantile', self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, interpolation=interpolation) + @Substitution(name='groupby') def ngroup(self, ascending=True): """ @@ -1924,10 +1997,16 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, Whether the result of the Cython operation is an index of values to be retrieved, instead of the actual values themselves pre_processing : function, default None - Function to be applied to `values` prior to passing to Cython - Raises if `needs_values` is False + Function to be applied to `values` prior to passing to Cython. 
+ Function should return a tuple where the first element is the + values to be passed to Cython and the second element is an optional + type which the values should be converted to after being returned + by the Cython operation. Raises if `needs_values` is False. post_processing : function, default None - Function to be applied to result of Cython function + Function to be applied to result of Cython function. Should accept + an array of values as the first argument and type inferences as its + second argument, i.e. the signature should be + (ndarray, typing.Type). **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -1963,10 +2042,12 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, result = np.zeros(result_sz, dtype=cython_dtype) func = partial(base_func, result, labels) + inferences = None + if needs_values: vals = obj.values if pre_processing: - vals = pre_processing(vals) + vals, inferences = pre_processing(vals) func = partial(func, vals) if needs_mask: @@ -1982,7 +2063,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, result = algorithms.take_nd(obj.values, result) if post_processing: - result = post_processing(result) + result = post_processing(result, inferences) output[name] = result diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1788b29a11082..b5e328ef64424 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1069,6 +1069,55 @@ def test_size(df): tm.assert_series_equal(df.groupby('A').size(), out) +# quantile +# -------------------------------- +@pytest.mark.parametrize("interpolation", [ + "linear", "lower", "higher", "nearest", "midpoint"]) +@pytest.mark.parametrize("a_vals,b_vals", [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1., 2., 3., 4., 5.], [5., 4., 3., 2., 1.]), + # Missing data + ([1., np.nan, 3., np.nan, 5.], 
[5., np.nan, 3., np.nan, 1.]), + ([np.nan, 4., np.nan, 2., np.nan], [np.nan, 4., np.nan, 2., np.nan]), + # Timestamps + ([x for x in pd.date_range('1/1/18', freq='D', periods=5)], + [x for x in pd.date_range('1/1/18', freq='D', periods=5)][::-1]), + # All NA + ([np.nan] * 5, [np.nan] * 5), +]) +@pytest.mark.parametrize('q', [0, .25, .5, .75, 1]) +def test_quantile(interpolation, a_vals, b_vals, q): + if interpolation == 'nearest' and q == 0.5 and b_vals == [4, 3, 2, 1]: + pytest.skip("Unclear numpy expectation for nearest result with " + "equidistant data") + + a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) + b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) + + df = DataFrame({ + 'key': ['a'] * len(a_vals) + ['b'] * len(b_vals), + 'val': a_vals + b_vals}) + + expected = DataFrame([a_expected, b_expected], columns=['val'], + index=Index(['a', 'b'], name='key')) + result = df.groupby('key').quantile(q, interpolation=interpolation) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_raises(): + df = pd.DataFrame([ + ['foo', 'a'], ['foo', 'b'], ['foo', 'c']], columns=['key', 'val']) + + with pytest.raises(TypeError, match="cannot be performed against " + "'object' dtypes"): + df.groupby('key').quantile() + + # pipe # -------------------------------- diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 12a5d494648fc..6a11f0ae9b44a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -208,7 +208,7 @@ def f(x, q=None, axis=0): trans_expected = ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected, check_names=False) + assert_series_equal(agg_result, agg_expected) assert_series_equal(trans_result, trans_expected) agg_result = ts_grouped.agg(f, q=80) @@ -223,13 +223,13 @@ def f(x, q=None, axis=0): agg_result = df_grouped.agg(np.percentile, 80, axis=0) apply_result 
= df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) - assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected, check_names=False) + assert_frame_equal(apply_result, expected, check_names=False) + assert_frame_equal(agg_result, expected) agg_result = df_grouped.agg(f, q=80) apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected, check_names=False) - assert_frame_equal(apply_result, expected) + assert_frame_equal(agg_result, expected) + assert_frame_equal(apply_result, expected, check_names=False) def test_len(): From 50c40ff1afa4a4a6772225e02c320294c422ed1a Mon Sep 17 00:00:00 2001 From: Flavien Lambert Date: Thu, 28 Feb 2019 21:55:33 +0800 Subject: [PATCH 044/110] BUG: Fix regression on DataFrame.replace for regex (#25266) * BUG: Fix regression on DataFrame.replace for regex The commit ensures that the replacement for regex is not confined to the beginning of the string but spans all the characters within. The behaviour is then consistent with versions prior to 0.24.0. One test has been added to account for character replacement when the character is not at the beginning of the string. --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/internals/managers.py | 12 ++++++------ pandas/tests/frame/test_replace.py | 7 +++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8f4beb3f484a4..4fcde7769b362 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -23,6 +23,7 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`) - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. 
(:issue:`25196`) +- Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`) - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) - Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 38b719db1709f..407db772d73e8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -552,9 +552,9 @@ def comp(s, regex=False): if isna(s): return isna(values) if hasattr(s, 'asm8'): - return _compare_or_regex_match(maybe_convert_objects(values), - getattr(s, 'asm8'), regex) - return _compare_or_regex_match(values, s, regex) + return _compare_or_regex_search(maybe_convert_objects(values), + getattr(s, 'asm8'), regex) + return _compare_or_regex_search(values, s, regex) masks = [comp(s, regex) for i, s in enumerate(src_list)] @@ -1897,11 +1897,11 @@ def _consolidate(blocks): return new_blocks -def _compare_or_regex_match(a, b, regex=False): +def _compare_or_regex_search(a, b, regex=False): """ Compare two array_like inputs of the same shape or two scalar values - Calls operator.eq or re.match, depending on regex argument. If regex is + Calls operator.eq or re.search, depending on regex argument. If regex is True, perform an element-wise regex matching. 
Parameters @@ -1917,7 +1917,7 @@ def _compare_or_regex_match(a, b, regex=False): if not regex: op = lambda x: operator.eq(x, b) else: - op = np.vectorize(lambda x: bool(re.match(b, x)) if isinstance(x, str) + op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str) else False) is_a_array = isinstance(a, np.ndarray) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 219f7a1585fc2..127a64da38ba3 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -466,6 +466,13 @@ def test_regex_replace_dict_nested(self): assert_frame_equal(res3, expec) assert_frame_equal(res4, expec) + def test_regex_replace_dict_nested_non_first_character(self): + # GH 25259 + df = pd.DataFrame({'first': ['abc', 'bca', 'cab']}) + expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']}) + result = df.replace({'a': '.'}, regex=True) + assert_frame_equal(result, expected) + def test_regex_replace_dict_nested_gh4115(self): df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2}) expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2}) From 28abbee762972a7ecafbfcdd88d7984c0afccd50 Mon Sep 17 00:00:00 2001 From: Max van Deursen Date: Thu, 28 Feb 2019 16:17:28 +0100 Subject: [PATCH 045/110] Correct contribution guide docbuild instruction (#25479) --- doc/source/development/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 1270bfec098e8..027f2d90bbb73 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -435,7 +435,7 @@ reducing the turn-around time for checking your changes. 
# compile the reference docs for a single function python make.py clean - python make.py --single DataFrame.join + python make.py --single pandas.DataFrame.join For comparison, a full documentation build may take 15 minutes, but a single section may take 15 seconds. Subsequent builds, which only process portions From f04342ae8e3afec2b30db45d4792209f9e21c1cb Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 28 Feb 2019 10:20:11 -0500 Subject: [PATCH 046/110] TST/REF: Add pytest idiom to test_frequencies.py (#25430) --- pandas/tests/tseries/frequencies/__init__.py | 0 .../tseries/frequencies/test_freq_code.py | 149 ++++ .../tseries/frequencies/test_inference.py | 406 +++++++++ .../tseries/frequencies/test_to_offset.py | 146 ++++ pandas/tests/tseries/test_frequencies.py | 793 ------------------ 5 files changed, 701 insertions(+), 793 deletions(-) create mode 100644 pandas/tests/tseries/frequencies/__init__.py create mode 100644 pandas/tests/tseries/frequencies/test_freq_code.py create mode 100644 pandas/tests/tseries/frequencies/test_inference.py create mode 100644 pandas/tests/tseries/frequencies/test_to_offset.py delete mode 100644 pandas/tests/tseries/test_frequencies.py diff --git a/pandas/tests/tseries/frequencies/__init__.py b/pandas/tests/tseries/frequencies/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py new file mode 100644 index 0000000000000..0aa29e451b1ba --- /dev/null +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -0,0 +1,149 @@ +import pytest + +from pandas._libs.tslibs import frequencies as libfrequencies, resolution +from pandas._libs.tslibs.frequencies import ( + FreqGroup, _period_code_map, get_freq, get_freq_code) +import pandas.compat as compat + +import pandas.tseries.offsets as offsets + + +@pytest.fixture(params=list(compat.iteritems(_period_code_map))) +def period_code_item(request): + return 
request.param + + +@pytest.mark.parametrize("freqstr,expected", [ + ("A", 1000), ("3A", 1000), ("-1A", 1000), + ("Y", 1000), ("3Y", 1000), ("-1Y", 1000), + ("W", 4000), ("W-MON", 4001), ("W-FRI", 4005) +]) +def test_freq_code(freqstr, expected): + assert get_freq(freqstr) == expected + + +def test_freq_code_match(period_code_item): + freqstr, code = period_code_item + assert get_freq(freqstr) == code + + +@pytest.mark.parametrize("freqstr,expected", [ + ("A", 1000), ("3A", 1000), ("-1A", 1000), ("A-JAN", 1000), + ("A-MAY", 1000), ("Y", 1000), ("3Y", 1000), ("-1Y", 1000), + ("Y-JAN", 1000), ("Y-MAY", 1000), (offsets.YearEnd(), 1000), + (offsets.YearEnd(month=1), 1000), (offsets.YearEnd(month=5), 1000), + ("W", 4000), ("W-MON", 4000), ("W-FRI", 4000), (offsets.Week(), 4000), + (offsets.Week(weekday=1), 4000), (offsets.Week(weekday=5), 4000), + ("T", FreqGroup.FR_MIN), +]) +def test_freq_group(freqstr, expected): + assert resolution.get_freq_group(freqstr) == expected + + +def test_freq_group_match(period_code_item): + freqstr, code = period_code_item + + str_group = resolution.get_freq_group(freqstr) + code_group = resolution.get_freq_group(code) + + assert str_group == code_group == code // 1000 * 1000 + + +@pytest.mark.parametrize("freqstr,exp_freqstr", [ + ("D", "D"), ("W", "D"), ("M", "D"), + ("S", "S"), ("T", "S"), ("H", "S") +]) +def test_get_to_timestamp_base(freqstr, exp_freqstr): + tsb = libfrequencies.get_to_timestamp_base + + assert tsb(get_freq_code(freqstr)[0]) == get_freq_code(exp_freqstr)[0] + + +_reso = resolution.Resolution + + +@pytest.mark.parametrize("freqstr,expected", [ + ("A", "year"), ("Q", "quarter"), ("M", "month"), + ("D", "day"), ("H", "hour"), ("T", "minute"), + ("S", "second"), ("L", "millisecond"), + ("U", "microsecond"), ("N", "nanosecond") +]) +def test_get_str_from_freq(freqstr, expected): + assert _reso.get_str_from_freq(freqstr) == expected + + +@pytest.mark.parametrize("freq", ["A", "Q", "M", "D", "H", + "T", "S", "L", "U", "N"]) 
+def test_get_freq_roundtrip(freq): + result = _reso.get_freq(_reso.get_str_from_freq(freq)) + assert freq == result + + +@pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U"]) +def test_get_freq_roundtrip2(freq): + result = _reso.get_freq(_reso.get_str(_reso.get_reso_from_freq(freq))) + assert freq == result + + +@pytest.mark.parametrize("args,expected", [ + ((1.5, "T"), (90, "S")), ((62.4, "T"), (3744, "S")), + ((1.04, "H"), (3744, "S")), ((1, "D"), (1, "D")), + ((0.342931, "H"), (1234551600, "U")), ((1.2345, "D"), (106660800, "L")) +]) +def test_resolution_bumping(args, expected): + # see gh-14378 + assert _reso.get_stride_from_decimal(*args) == expected + + +@pytest.mark.parametrize("args", [ + (0.5, "N"), + + # Too much precision in the input can prevent. + (0.3429324798798269273987982, "H") +]) +def test_cat(args): + msg = "Could not convert to integer offset at any resolution" + + with pytest.raises(ValueError, match=msg): + _reso.get_stride_from_decimal(*args) + + +@pytest.mark.parametrize("freq_input,expected", [ + # Frequency string. + ("A", (get_freq("A"), 1)), + ("3D", (get_freq("D"), 3)), + ("-2M", (get_freq("M"), -2)), + + # Tuple. + (("D", 1), (get_freq("D"), 1)), + (("A", 3), (get_freq("A"), 3)), + (("M", -2), (get_freq("M"), -2)), + ((5, "T"), (FreqGroup.FR_MIN, 5)), + + # Numeric Tuple. + ((1000, 1), (1000, 1)), + + # Offsets. + (offsets.Day(), (get_freq("D"), 1)), + (offsets.Day(3), (get_freq("D"), 3)), + (offsets.Day(-2), (get_freq("D"), -2)), + (offsets.MonthEnd(), (get_freq("M"), 1)), + (offsets.MonthEnd(3), (get_freq("M"), 3)), + (offsets.MonthEnd(-2), (get_freq("M"), -2)), + (offsets.Week(), (get_freq("W"), 1)), + (offsets.Week(3), (get_freq("W"), 3)), + (offsets.Week(-2), (get_freq("W"), -2)), + (offsets.Hour(), (FreqGroup.FR_HR, 1)), + + # Monday is weekday=0. 
+ (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), + (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), + (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), +]) +def test_get_freq_code(freq_input, expected): + assert get_freq_code(freq_input) == expected + + +def test_get_code_invalid(): + with pytest.raises(ValueError, match="Invalid frequency"): + get_freq_code((5, "baz")) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py new file mode 100644 index 0000000000000..9e7ddbc45bba8 --- /dev/null +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -0,0 +1,406 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas._libs.tslibs.ccalendar import DAYS, MONTHS +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +import pandas.compat as compat +from pandas.compat import is_platform_windows, range + +from pandas import ( + DatetimeIndex, Index, Series, Timestamp, date_range, period_range) +from pandas.core.tools.datetimes import to_datetime +import pandas.util.testing as tm + +import pandas.tseries.frequencies as frequencies +import pandas.tseries.offsets as offsets + + +def _check_generated_range(start, periods, freq): + """ + Check the range generated from a given start, frequency, and period count. + + Parameters + ---------- + start : str + The start date. + periods : int + The number of periods. + freq : str + The frequency of the range. 
+ """ + freq = freq.upper() + + gen = date_range(start, periods=periods, freq=freq) + index = DatetimeIndex(gen.values) + + if not freq.startswith("Q-"): + assert frequencies.infer_freq(index) == gen.freqstr + else: + inf_freq = frequencies.infer_freq(index) + is_dec_range = inf_freq == "Q-DEC" and gen.freqstr in ( + "Q", "Q-DEC", "Q-SEP", "Q-JUN", "Q-MAR") + is_nov_range = inf_freq == "Q-NOV" and gen.freqstr in ( + "Q-NOV", "Q-AUG", "Q-MAY", "Q-FEB") + is_oct_range = inf_freq == "Q-OCT" and gen.freqstr in ( + "Q-OCT", "Q-JUL", "Q-APR", "Q-JAN") + assert is_dec_range or is_nov_range or is_oct_range + + +@pytest.fixture(params=[(timedelta(1), "D"), + (timedelta(hours=1), "H"), + (timedelta(minutes=1), "T"), + (timedelta(seconds=1), "S"), + (np.timedelta64(1, "ns"), "N"), + (timedelta(microseconds=1), "U"), + (timedelta(microseconds=1000), "L")]) +def base_delta_code_pair(request): + return request.param + + +@pytest.fixture(params=[1, 2, 3, 4]) +def count(request): + return request.param + + +@pytest.fixture(params=DAYS) +def day(request): + return request.param + + +@pytest.fixture(params=MONTHS) +def month(request): + return request.param + + +@pytest.fixture(params=[5, 7]) +def periods(request): + return request.param + + +def test_raise_if_period_index(): + index = period_range(start="1/1/1990", periods=20, freq="M") + msg = "Check the `freq` attribute instead of using infer_freq" + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(index) + + +def test_raise_if_too_few(): + index = DatetimeIndex(["12/31/1998", "1/3/1999"]) + msg = "Need at least 3 dates to infer frequency" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(index) + + +def test_business_daily(): + index = DatetimeIndex(["01/01/1999", "1/4/1999", "1/5/1999"]) + assert frequencies.infer_freq(index) == "B" + + +def test_business_daily_look_alike(): + # see gh-16624 + # + # Do not infer "B when "weekend" (2-day gap) in wrong place. 
+ index = DatetimeIndex(["12/31/1998", "1/3/1999", "1/4/1999"]) + assert frequencies.infer_freq(index) is None + + +def test_day_corner(): + index = DatetimeIndex(["1/1/2000", "1/2/2000", "1/3/2000"]) + assert frequencies.infer_freq(index) == "D" + + +def test_non_datetime_index(): + dates = to_datetime(["1/1/2000", "1/2/2000", "1/3/2000"]) + assert frequencies.infer_freq(dates) == "D" + + +def test_fifth_week_of_month_infer(): + # see gh-9425 + # + # Only attempt to infer up to WOM-4. + index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"]) + assert frequencies.infer_freq(index) is None + + +def test_week_of_month_fake(): + # All of these dates are on same day + # of week and are 4 or 5 weeks apart. + index = DatetimeIndex(["2013-08-27", "2013-10-01", + "2013-10-29", "2013-11-26"]) + assert frequencies.infer_freq(index) != "WOM-4TUE" + + +def test_fifth_week_of_month(): + # see gh-9425 + # + # Only supports freq up to WOM-4. + msg = ("Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified") + + with pytest.raises(ValueError, match=msg): + date_range("2014-01-01", freq="WOM-5MON") + + +def test_monthly_ambiguous(): + rng = DatetimeIndex(["1/31/2000", "2/29/2000", "3/31/2000"]) + assert rng.inferred_freq == "M" + + +def test_annual_ambiguous(): + rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) + assert rng.inferred_freq == "A-JAN" + + +def test_infer_freq_delta(base_delta_code_pair, count): + b = Timestamp(datetime.now()) + base_delta, code = base_delta_code_pair + + inc = base_delta * count + index = DatetimeIndex([b + inc * j for j in range(3)]) + + exp_freq = "%d%s" % (count, code) if count > 1 else code + assert frequencies.infer_freq(index) == exp_freq + + +@pytest.mark.parametrize("constructor", [ + lambda now, delta: DatetimeIndex([now + delta * 7] + + [now + delta * j for j in range(3)]), + lambda now, delta: DatetimeIndex([now + delta * j for j in range(3)] + + [now + delta * 7]) +]) +def 
test_infer_freq_custom(base_delta_code_pair, constructor): + b = Timestamp(datetime.now()) + base_delta, _ = base_delta_code_pair + + index = constructor(b, base_delta) + assert frequencies.infer_freq(index) is None + + +def test_weekly_infer(periods, day): + _check_generated_range("1/1/2000", periods, "W-{day}".format(day=day)) + + +def test_week_of_month_infer(periods, day, count): + _check_generated_range("1/1/2000", periods, + "WOM-{count}{day}".format(count=count, day=day)) + + +@pytest.mark.parametrize("freq", ["M", "BM", "BMS"]) +def test_monthly_infer(periods, freq): + _check_generated_range("1/1/2000", periods, "M") + + +def test_quarterly_infer(month, periods): + _check_generated_range("1/1/2000", periods, + "Q-{month}".format(month=month)) + + +@pytest.mark.parametrize("annual", ["A", "BA"]) +def test_annually_infer(month, periods, annual): + _check_generated_range("1/1/2000", periods, + "{annual}-{month}".format(annual=annual, + month=month)) + + +@pytest.mark.parametrize("freq,expected", [ + ("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT") +]) +def test_infer_freq_index(freq, expected): + rng = period_range("1959Q2", "2009Q3", freq=freq) + rng = Index(rng.to_timestamp("D", how="e").astype(object)) + + assert rng.inferred_freq == expected + + +@pytest.mark.parametrize( + "expected,dates", + list(compat.iteritems( + {"AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], + "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], + "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], + "H": ["2011-12-31 22:00", "2011-12-31 23:00", + "2012-01-01 00:00", "2012-01-01 01:00"]})) +) +def test_infer_freq_tz(tz_naive_fixture, expected, dates): + # see gh-7310 + tz = tz_naive_fixture + idx = DatetimeIndex(dates, tz=tz) + assert idx.inferred_freq == expected + + 
+@pytest.mark.parametrize("date_pair", [ + ["2013-11-02", "2013-11-5"], # Fall DST + ["2014-03-08", "2014-03-11"], # Spring DST + ["2014-01-01", "2014-01-03"] # Regular Time +]) +@pytest.mark.parametrize("freq", [ + "3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N" +]) +def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): + # see gh-8772 + tz = tz_naive_fixture + idx = date_range(date_pair[0], date_pair[1], freq=freq, tz=tz) + assert idx.inferred_freq == freq + + +def test_infer_freq_tz_transition_custom(): + index = date_range("2013-11-03", periods=5, + freq="3H").tz_localize("America/Chicago") + assert index.inferred_freq is None + + +@pytest.mark.parametrize("data,expected", [ + # Hourly freq in a day must result in "H" + (["2014-07-01 09:00", "2014-07-01 10:00", "2014-07-01 11:00", + "2014-07-01 12:00", "2014-07-01 13:00", "2014-07-01 14:00"], "H"), + + (["2014-07-01 09:00", "2014-07-01 10:00", "2014-07-01 11:00", + "2014-07-01 12:00", "2014-07-01 13:00", "2014-07-01 14:00", + "2014-07-01 15:00", "2014-07-01 16:00", "2014-07-02 09:00", + "2014-07-02 10:00", "2014-07-02 11:00"], "BH"), + (["2014-07-04 09:00", "2014-07-04 10:00", "2014-07-04 11:00", + "2014-07-04 12:00", "2014-07-04 13:00", "2014-07-04 14:00", + "2014-07-04 15:00", "2014-07-04 16:00", "2014-07-07 09:00", + "2014-07-07 10:00", "2014-07-07 11:00"], "BH"), + (["2014-07-04 09:00", "2014-07-04 10:00", "2014-07-04 11:00", + "2014-07-04 12:00", "2014-07-04 13:00", "2014-07-04 14:00", + "2014-07-04 15:00", "2014-07-04 16:00", "2014-07-07 09:00", + "2014-07-07 10:00", "2014-07-07 11:00", "2014-07-07 12:00", + "2014-07-07 13:00", "2014-07-07 14:00", "2014-07-07 15:00", + "2014-07-07 16:00", "2014-07-08 09:00", "2014-07-08 10:00", + "2014-07-08 11:00", "2014-07-08 12:00", "2014-07-08 13:00", + "2014-07-08 14:00", "2014-07-08 15:00", "2014-07-08 16:00"], "BH"), +]) +def test_infer_freq_business_hour(data, expected): + # see gh-7905 + idx = DatetimeIndex(data) + assert 
idx.inferred_freq == expected + + +def test_not_monotonic(): + rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) + rng = rng[::-1] + + assert rng.inferred_freq == "-1A-JAN" + + +def test_non_datetime_index2(): + rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) + vals = rng.to_pydatetime() + + result = frequencies.infer_freq(vals) + assert result == rng.inferred_freq + + +@pytest.mark.parametrize("idx", [ + tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10) +]) +def test_invalid_index_types(idx): + msg = ("(cannot infer freq from a non-convertible)|" + "(Check the `freq` attribute instead of using infer_freq)") + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(idx) + + +@pytest.mark.skipif(is_platform_windows(), + reason="see gh-10822: Windows issue") +@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), + tm.makeUnicodeIndex(10)]) +def test_invalid_index_types_unicode(idx): + # see gh-10822 + # + # Odd error message on conversions to datetime for unicode. 
+ msg = "Unknown string format" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(idx) + + +def test_string_datetime_like_compat(): + # see gh-6463 + data = ["2004-01", "2004-02", "2004-03", "2004-04"] + + expected = frequencies.infer_freq(data) + result = frequencies.infer_freq(Index(data)) + + assert result == expected + + +def test_series(): + # see gh-6407 + s = Series(date_range("20130101", "20130110")) + inferred = frequencies.infer_freq(s) + assert inferred == "D" + + +@pytest.mark.parametrize("end", [10, 10.]) +def test_series_invalid_type(end): + # see gh-6407 + msg = "cannot infer freq from a non-convertible dtype on a Series" + s = Series(np.arange(end)) + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(s) + + +def test_series_inconvertible_string(): + # see gh-6407 + msg = "Unknown string format" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(Series(["foo", "bar"])) + + +@pytest.mark.parametrize("freq", [None, "L"]) +def test_series_period_index(freq): + # see gh-6407 + # + # Cannot infer on PeriodIndex + msg = "cannot infer freq from a non-convertible dtype on a Series" + s = Series(period_range("2013", periods=10, freq=freq)) + + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(s) + + +@pytest.mark.parametrize("freq", ["M", "L", "S"]) +def test_series_datetime_index(freq): + s = Series(date_range("20130101", periods=10, freq=freq)) + inferred = frequencies.infer_freq(s) + assert inferred == freq + + +@pytest.mark.parametrize("offset_func", [ + frequencies.get_offset, + lambda freq: date_range("2011-01-01", periods=5, freq=freq) +]) +@pytest.mark.parametrize("freq", [ + "WEEKDAY", "EOM", "W@MON", "W@TUE", "W@WED", "W@THU", + "W@FRI", "W@SAT", "W@SUN", "Q@JAN", "Q@FEB", "Q@MAR", + "A@JAN", "A@FEB", "A@MAR", "A@APR", "A@MAY", "A@JUN", + "A@JUL", "A@AUG", "A@SEP", "A@OCT", "A@NOV", "A@DEC", + "Y@JAN", "WOM@1MON", "WOM@2MON", "WOM@3MON", + "WOM@4MON", "WOM@1TUE", 
"WOM@2TUE", "WOM@3TUE", + "WOM@4TUE", "WOM@1WED", "WOM@2WED", "WOM@3WED", + "WOM@4WED", "WOM@1THU", "WOM@2THU", "WOM@3THU", + "WOM@4THU", "WOM@1FRI", "WOM@2FRI", "WOM@3FRI", + "WOM@4FRI" +]) +def test_legacy_offset_warnings(offset_func, freq): + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + offset_func(freq) + + +def test_ms_vs_capital_ms(): + left = frequencies.get_offset("ms") + right = frequencies.get_offset("MS") + + assert left == offsets.Milli() + assert right == offsets.MonthBegin() diff --git a/pandas/tests/tseries/frequencies/test_to_offset.py b/pandas/tests/tseries/frequencies/test_to_offset.py new file mode 100644 index 0000000000000..c9c35b47f3475 --- /dev/null +++ b/pandas/tests/tseries/frequencies/test_to_offset.py @@ -0,0 +1,146 @@ +import re + +import pytest + +from pandas import Timedelta + +import pandas.tseries.frequencies as frequencies +import pandas.tseries.offsets as offsets + + +@pytest.mark.parametrize("freq_input,expected", [ + (frequencies.to_offset("10us"), offsets.Micro(10)), + (offsets.Hour(), offsets.Hour()), + ((5, "T"), offsets.Minute(5)), + ("2h30min", offsets.Minute(150)), + ("2h 30min", offsets.Minute(150)), + ("2h30min15s", offsets.Second(150 * 60 + 15)), + ("2h 60min", offsets.Hour(3)), + ("2h 20.5min", offsets.Second(8430)), + ("1.5min", offsets.Second(90)), + ("0.5S", offsets.Milli(500)), + ("15l500u", offsets.Micro(15500)), + ("10s75L", offsets.Milli(10075)), + ("1s0.25ms", offsets.Micro(1000250)), + ("1s0.25L", offsets.Micro(1000250)), + ("2800N", offsets.Nano(2800)), + ("2SM", offsets.SemiMonthEnd(2)), + ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), + ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), + ("2SMS-15", offsets.SemiMonthBegin(2)), +]) +def test_to_offset(freq_input, expected): + result = frequencies.to_offset(freq_input) + assert result == expected + + +@pytest.mark.parametrize("freqstr,expected", [ + ("-1S", -1), + ("-2SM", -2), + ("-1SMS", -1), + ("-5min10s", -310), +]) +def 
test_to_offset_negative(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result.n == expected + + +@pytest.mark.parametrize("freqstr", [ + "2h20m", "U1", "-U", "3U1", "-2-3U", "-2D:3H", + "1.5.0S", "2SMS-15-15", "2SMS-15D", "100foo", + + # Invalid leading +/- signs. + "+-1d", "-+1h", "+1", "-7", "+d", "-m", + + # Invalid shortcut anchors. + "SM-0", "SM-28", "SM-29", "SM-FOO", "BSM", "SM--1", "SMS-1", + "SMS-28", "SMS-30", "SMS-BAR", "SMS-BYR", "BSMS", "SMS--2" +]) +def test_to_offset_invalid(freqstr): + # see gh-13930 + + # We escape string because some of our + # inputs contain regex special characters. + msg = re.escape("Invalid frequency: {freqstr}".format(freqstr=freqstr)) + with pytest.raises(ValueError, match=msg): + frequencies.to_offset(freqstr) + + +def test_to_offset_no_evaluate(): + with pytest.raises(ValueError, match="Could not evaluate"): + frequencies.to_offset(("", "")) + + +@pytest.mark.parametrize("freqstr,expected", [ + ("2D 3H", offsets.Hour(51)), + ("2 D3 H", offsets.Hour(51)), + ("2 D 3 H", offsets.Hour(51)), + (" 2 D 3 H ", offsets.Hour(51)), + (" H ", offsets.Hour()), + (" 3 H ", offsets.Hour(3)), +]) +def test_to_offset_whitespace(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result == expected + + +@pytest.mark.parametrize("freqstr,expected", [ + ("00H 00T 01S", 1), + ("-00H 03T 14S", -194), +]) +def test_to_offset_leading_zero(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result.n == expected + + +@pytest.mark.parametrize("freqstr,expected", [ + ("+1d", 1), + ("+2h30min", 150), +]) +def test_to_offset_leading_plus(freqstr, expected): + result = frequencies.to_offset(freqstr) + assert result.n == expected + + +@pytest.mark.parametrize("kwargs,expected", [ + (dict(days=1, seconds=1), offsets.Second(86401)), + (dict(days=-1, seconds=1), offsets.Second(-86399)), + (dict(hours=1, minutes=10), offsets.Minute(70)), + (dict(hours=1, minutes=-10), offsets.Minute(50)), + 
(dict(weeks=1), offsets.Day(7)), + (dict(hours=1), offsets.Hour(1)), + (dict(hours=1), frequencies.to_offset("60min")), + (dict(microseconds=1), offsets.Micro(1)) +]) +def test_to_offset_pd_timedelta(kwargs, expected): + # see gh-9064 + td = Timedelta(**kwargs) + result = frequencies.to_offset(td) + assert result == expected + + +def test_to_offset_pd_timedelta_invalid(): + # see gh-9064 + msg = "Invalid frequency: 0 days 00:00:00" + td = Timedelta(microseconds=0) + + with pytest.raises(ValueError, match=msg): + frequencies.to_offset(td) + + +@pytest.mark.parametrize("shortcut,expected", [ + ("W", offsets.Week(weekday=6)), + ("W-SUN", offsets.Week(weekday=6)), + ("Q", offsets.QuarterEnd(startingMonth=12)), + ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), + ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), + ("SM", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), + ("SM-27", offsets.SemiMonthEnd(day_of_month=27)), + ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), + ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), +]) +def test_anchored_shortcuts(shortcut, expected): + result = frequencies.to_offset(shortcut) + assert result == expected diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py deleted file mode 100644 index eb4e63654b47b..0000000000000 --- a/pandas/tests/tseries/test_frequencies.py +++ /dev/null @@ -1,793 +0,0 @@ -from datetime import datetime, timedelta - -import numpy as np -import pytest - -from pandas._libs.tslibs import frequencies as libfrequencies, resolution -from pandas._libs.tslibs.ccalendar import MONTHS -from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, FreqGroup, _period_code_map, get_freq, get_freq_code) -import pandas.compat as compat -from pandas.compat import is_platform_windows, range - -from pandas import ( - DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range, - 
period_range) -from pandas.core.tools.datetimes import to_datetime -import pandas.util.testing as tm - -import pandas.tseries.frequencies as frequencies -import pandas.tseries.offsets as offsets - - -class TestToOffset(object): - - def test_to_offset_multiple(self): - freqstr = '2h30min' - freqstr2 = '2h 30min' - - result = frequencies.to_offset(freqstr) - assert (result == frequencies.to_offset(freqstr2)) - expected = offsets.Minute(150) - assert (result == expected) - - freqstr = '2h30min15s' - result = frequencies.to_offset(freqstr) - expected = offsets.Second(150 * 60 + 15) - assert (result == expected) - - freqstr = '2h 60min' - result = frequencies.to_offset(freqstr) - expected = offsets.Hour(3) - assert (result == expected) - - freqstr = '2h 20.5min' - result = frequencies.to_offset(freqstr) - expected = offsets.Second(8430) - assert (result == expected) - - freqstr = '1.5min' - result = frequencies.to_offset(freqstr) - expected = offsets.Second(90) - assert (result == expected) - - freqstr = '0.5S' - result = frequencies.to_offset(freqstr) - expected = offsets.Milli(500) - assert (result == expected) - - freqstr = '15l500u' - result = frequencies.to_offset(freqstr) - expected = offsets.Micro(15500) - assert (result == expected) - - freqstr = '10s75L' - result = frequencies.to_offset(freqstr) - expected = offsets.Milli(10075) - assert (result == expected) - - freqstr = '1s0.25ms' - result = frequencies.to_offset(freqstr) - expected = offsets.Micro(1000250) - assert (result == expected) - - freqstr = '1s0.25L' - result = frequencies.to_offset(freqstr) - expected = offsets.Micro(1000250) - assert (result == expected) - - freqstr = '2800N' - result = frequencies.to_offset(freqstr) - expected = offsets.Nano(2800) - assert (result == expected) - - freqstr = '2SM' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthEnd(2) - assert (result == expected) - - freqstr = '2SM-16' - result = frequencies.to_offset(freqstr) - expected = 
offsets.SemiMonthEnd(2, day_of_month=16) - assert (result == expected) - - freqstr = '2SMS-14' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthBegin(2, day_of_month=14) - assert (result == expected) - - freqstr = '2SMS-15' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthBegin(2) - assert (result == expected) - - # malformed - with pytest.raises(ValueError, match='Invalid frequency: 2h20m'): - frequencies.to_offset('2h20m') - - def test_to_offset_negative(self): - freqstr = '-1S' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - freqstr = '-5min10s' - result = frequencies.to_offset(freqstr) - assert (result.n == -310) - - freqstr = '-2SM' - result = frequencies.to_offset(freqstr) - assert (result.n == -2) - - freqstr = '-1SMS' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - def test_to_offset_invalid(self): - # GH 13930 - with pytest.raises(ValueError, match='Invalid frequency: U1'): - frequencies.to_offset('U1') - with pytest.raises(ValueError, match='Invalid frequency: -U'): - frequencies.to_offset('-U') - with pytest.raises(ValueError, match='Invalid frequency: 3U1'): - frequencies.to_offset('3U1') - with pytest.raises(ValueError, match='Invalid frequency: -2-3U'): - frequencies.to_offset('-2-3U') - with pytest.raises(ValueError, match='Invalid frequency: -2D:3H'): - frequencies.to_offset('-2D:3H') - with pytest.raises(ValueError, match='Invalid frequency: 1.5.0S'): - frequencies.to_offset('1.5.0S') - - # split offsets with spaces are valid - assert frequencies.to_offset('2D 3H') == offsets.Hour(51) - assert frequencies.to_offset('2 D3 H') == offsets.Hour(51) - assert frequencies.to_offset('2 D 3 H') == offsets.Hour(51) - assert frequencies.to_offset(' 2 D 3 H ') == offsets.Hour(51) - assert frequencies.to_offset(' H ') == offsets.Hour() - assert frequencies.to_offset(' 3 H ') == offsets.Hour(3) - - # special cases - assert frequencies.to_offset('2SMS-15') == 
offsets.SemiMonthBegin(2) - with pytest.raises(ValueError, match='Invalid frequency: 2SMS-15-15'): - frequencies.to_offset('2SMS-15-15') - with pytest.raises(ValueError, match='Invalid frequency: 2SMS-15D'): - frequencies.to_offset('2SMS-15D') - - def test_to_offset_leading_zero(self): - freqstr = '00H 00T 01S' - result = frequencies.to_offset(freqstr) - assert (result.n == 1) - - freqstr = '-00H 03T 14S' - result = frequencies.to_offset(freqstr) - assert (result.n == -194) - - def test_to_offset_leading_plus(self): - freqstr = '+1d' - result = frequencies.to_offset(freqstr) - assert (result.n == 1) - - freqstr = '+2h30min' - result = frequencies.to_offset(freqstr) - assert (result.n == 150) - - for bad_freq in ['+-1d', '-+1h', '+1', '-7', '+d', '-m']: - with pytest.raises(ValueError, match='Invalid frequency:'): - frequencies.to_offset(bad_freq) - - def test_to_offset_pd_timedelta(self): - # Tests for #9064 - td = Timedelta(days=1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(86401) - assert (expected == result) - - td = Timedelta(days=-1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(-86399) - assert (expected == result) - - td = Timedelta(hours=1, minutes=10) - result = frequencies.to_offset(td) - expected = offsets.Minute(70) - assert (expected == result) - - td = Timedelta(hours=1, minutes=-10) - result = frequencies.to_offset(td) - expected = offsets.Minute(50) - assert (expected == result) - - td = Timedelta(weeks=1) - result = frequencies.to_offset(td) - expected = offsets.Day(7) - assert (expected == result) - - td1 = Timedelta(hours=1) - result1 = frequencies.to_offset(td1) - result2 = frequencies.to_offset('60min') - assert (result1 == result2) - - td = Timedelta(microseconds=1) - result = frequencies.to_offset(td) - expected = offsets.Micro(1) - assert (expected == result) - - td = Timedelta(microseconds=0) - pytest.raises(ValueError, lambda: frequencies.to_offset(td)) - - def 
test_anchored_shortcuts(self): - result = frequencies.to_offset('W') - expected = frequencies.to_offset('W-SUN') - assert (result == expected) - - result1 = frequencies.to_offset('Q') - result2 = frequencies.to_offset('Q-DEC') - expected = offsets.QuarterEnd(startingMonth=12) - assert (result1 == expected) - assert (result2 == expected) - - result1 = frequencies.to_offset('Q-MAY') - expected = offsets.QuarterEnd(startingMonth=5) - assert (result1 == expected) - - result1 = frequencies.to_offset('SM') - result2 = frequencies.to_offset('SM-15') - expected = offsets.SemiMonthEnd(day_of_month=15) - assert (result1 == expected) - assert (result2 == expected) - - result = frequencies.to_offset('SM-1') - expected = offsets.SemiMonthEnd(day_of_month=1) - assert (result == expected) - - result = frequencies.to_offset('SM-27') - expected = offsets.SemiMonthEnd(day_of_month=27) - assert (result == expected) - - result = frequencies.to_offset('SMS-2') - expected = offsets.SemiMonthBegin(day_of_month=2) - assert (result == expected) - - result = frequencies.to_offset('SMS-27') - expected = offsets.SemiMonthBegin(day_of_month=27) - assert (result == expected) - - # ensure invalid cases fail as expected - invalid_anchors = ['SM-0', 'SM-28', 'SM-29', - 'SM-FOO', 'BSM', 'SM--1', - 'SMS-1', 'SMS-28', 'SMS-30', - 'SMS-BAR', 'SMS-BYR' 'BSMS', - 'SMS--2'] - for invalid_anchor in invalid_anchors: - with pytest.raises(ValueError, match='Invalid frequency: '): - frequencies.to_offset(invalid_anchor) - - -def test_ms_vs_MS(): - left = frequencies.get_offset('ms') - right = frequencies.get_offset('MS') - assert left == offsets.Milli() - assert right == offsets.MonthBegin() - - -def test_rule_aliases(): - rule = frequencies.to_offset('10us') - assert rule == offsets.Micro(10) - - -class TestFrequencyCode(object): - - def test_freq_code(self): - assert get_freq('A') == 1000 - assert get_freq('3A') == 1000 - assert get_freq('-1A') == 1000 - - assert get_freq('Y') == 1000 - assert 
get_freq('3Y') == 1000 - assert get_freq('-1Y') == 1000 - - assert get_freq('W') == 4000 - assert get_freq('W-MON') == 4001 - assert get_freq('W-FRI') == 4005 - - for freqstr, code in compat.iteritems(_period_code_map): - result = get_freq(freqstr) - assert result == code - - result = resolution.get_freq_group(freqstr) - assert result == code // 1000 * 1000 - - result = resolution.get_freq_group(code) - assert result == code // 1000 * 1000 - - def test_freq_group(self): - assert resolution.get_freq_group('A') == 1000 - assert resolution.get_freq_group('3A') == 1000 - assert resolution.get_freq_group('-1A') == 1000 - assert resolution.get_freq_group('A-JAN') == 1000 - assert resolution.get_freq_group('A-MAY') == 1000 - - assert resolution.get_freq_group('Y') == 1000 - assert resolution.get_freq_group('3Y') == 1000 - assert resolution.get_freq_group('-1Y') == 1000 - assert resolution.get_freq_group('Y-JAN') == 1000 - assert resolution.get_freq_group('Y-MAY') == 1000 - - assert resolution.get_freq_group(offsets.YearEnd()) == 1000 - assert resolution.get_freq_group(offsets.YearEnd(month=1)) == 1000 - assert resolution.get_freq_group(offsets.YearEnd(month=5)) == 1000 - - assert resolution.get_freq_group('W') == 4000 - assert resolution.get_freq_group('W-MON') == 4000 - assert resolution.get_freq_group('W-FRI') == 4000 - assert resolution.get_freq_group(offsets.Week()) == 4000 - assert resolution.get_freq_group(offsets.Week(weekday=1)) == 4000 - assert resolution.get_freq_group(offsets.Week(weekday=5)) == 4000 - - def test_get_to_timestamp_base(self): - tsb = libfrequencies.get_to_timestamp_base - - assert (tsb(get_freq_code('D')[0]) == - get_freq_code('D')[0]) - assert (tsb(get_freq_code('W')[0]) == - get_freq_code('D')[0]) - assert (tsb(get_freq_code('M')[0]) == - get_freq_code('D')[0]) - - assert (tsb(get_freq_code('S')[0]) == - get_freq_code('S')[0]) - assert (tsb(get_freq_code('T')[0]) == - get_freq_code('S')[0]) - assert (tsb(get_freq_code('H')[0]) == - 
get_freq_code('S')[0]) - - def test_freq_to_reso(self): - Reso = resolution.Resolution - - assert Reso.get_str_from_freq('A') == 'year' - assert Reso.get_str_from_freq('Q') == 'quarter' - assert Reso.get_str_from_freq('M') == 'month' - assert Reso.get_str_from_freq('D') == 'day' - assert Reso.get_str_from_freq('H') == 'hour' - assert Reso.get_str_from_freq('T') == 'minute' - assert Reso.get_str_from_freq('S') == 'second' - assert Reso.get_str_from_freq('L') == 'millisecond' - assert Reso.get_str_from_freq('U') == 'microsecond' - assert Reso.get_str_from_freq('N') == 'nanosecond' - - for freq in ['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U', 'N']: - # check roundtrip - result = Reso.get_freq(Reso.get_str_from_freq(freq)) - assert freq == result - - for freq in ['D', 'H', 'T', 'S', 'L', 'U']: - result = Reso.get_freq(Reso.get_str(Reso.get_reso_from_freq(freq))) - assert freq == result - - def test_resolution_bumping(self): - # see gh-14378 - Reso = resolution.Resolution - - assert Reso.get_stride_from_decimal(1.5, 'T') == (90, 'S') - assert Reso.get_stride_from_decimal(62.4, 'T') == (3744, 'S') - assert Reso.get_stride_from_decimal(1.04, 'H') == (3744, 'S') - assert Reso.get_stride_from_decimal(1, 'D') == (1, 'D') - assert (Reso.get_stride_from_decimal(0.342931, 'H') == - (1234551600, 'U')) - assert Reso.get_stride_from_decimal(1.2345, 'D') == (106660800, 'L') - - with pytest.raises(ValueError): - Reso.get_stride_from_decimal(0.5, 'N') - - # too much precision in the input can prevent - with pytest.raises(ValueError): - Reso.get_stride_from_decimal(0.3429324798798269273987982, 'H') - - def test_get_freq_code(self): - # frequency str - assert (get_freq_code('A') == - (get_freq('A'), 1)) - assert (get_freq_code('3D') == - (get_freq('D'), 3)) - assert (get_freq_code('-2M') == - (get_freq('M'), -2)) - - # tuple - assert (get_freq_code(('D', 1)) == - (get_freq('D'), 1)) - assert (get_freq_code(('A', 3)) == - (get_freq('A'), 3)) - assert (get_freq_code(('M', -2)) == - 
(get_freq('M'), -2)) - - # numeric tuple - assert get_freq_code((1000, 1)) == (1000, 1) - - # offsets - assert (get_freq_code(offsets.Day()) == - (get_freq('D'), 1)) - assert (get_freq_code(offsets.Day(3)) == - (get_freq('D'), 3)) - assert (get_freq_code(offsets.Day(-2)) == - (get_freq('D'), -2)) - - assert (get_freq_code(offsets.MonthEnd()) == - (get_freq('M'), 1)) - assert (get_freq_code(offsets.MonthEnd(3)) == - (get_freq('M'), 3)) - assert (get_freq_code(offsets.MonthEnd(-2)) == - (get_freq('M'), -2)) - - assert (get_freq_code(offsets.Week()) == - (get_freq('W'), 1)) - assert (get_freq_code(offsets.Week(3)) == - (get_freq('W'), 3)) - assert (get_freq_code(offsets.Week(-2)) == - (get_freq('W'), -2)) - - # Monday is weekday=0 - assert (get_freq_code(offsets.Week(weekday=1)) == - (get_freq('W-TUE'), 1)) - assert (get_freq_code(offsets.Week(3, weekday=0)) == - (get_freq('W-MON'), 3)) - assert (get_freq_code(offsets.Week(-2, weekday=4)) == - (get_freq('W-FRI'), -2)) - - def test_frequency_misc(self): - assert (resolution.get_freq_group('T') == - FreqGroup.FR_MIN) - - code, stride = get_freq_code(offsets.Hour()) - assert code == FreqGroup.FR_HR - - code, stride = get_freq_code((5, 'T')) - assert code == FreqGroup.FR_MIN - assert stride == 5 - - offset = offsets.Hour() - result = frequencies.to_offset(offset) - assert result == offset - - result = frequencies.to_offset((5, 'T')) - expected = offsets.Minute(5) - assert result == expected - - with pytest.raises(ValueError, match='Invalid frequency'): - get_freq_code((5, 'baz')) - - with pytest.raises(ValueError, match='Invalid frequency'): - frequencies.to_offset('100foo') - - with pytest.raises(ValueError, match='Could not evaluate'): - frequencies.to_offset(('', '')) - - -_dti = DatetimeIndex - - -class TestFrequencyInference(object): - - def test_raise_if_period_index(self): - index = period_range(start="1/1/1990", periods=20, freq="M") - pytest.raises(TypeError, frequencies.infer_freq, index) - - def 
test_raise_if_too_few(self): - index = _dti(['12/31/1998', '1/3/1999']) - pytest.raises(ValueError, frequencies.infer_freq, index) - - def test_business_daily(self): - index = _dti(['01/01/1999', '1/4/1999', '1/5/1999']) - assert frequencies.infer_freq(index) == 'B' - - def test_business_daily_look_alike(self): - # GH 16624, do not infer 'B' when 'weekend' (2-day gap) in wrong place - index = _dti(['12/31/1998', '1/3/1999', '1/4/1999']) - assert frequencies.infer_freq(index) is None - - def test_day(self): - self._check_tick(timedelta(1), 'D') - - def test_day_corner(self): - index = _dti(['1/1/2000', '1/2/2000', '1/3/2000']) - assert frequencies.infer_freq(index) == 'D' - - def test_non_datetimeindex(self): - dates = to_datetime(['1/1/2000', '1/2/2000', '1/3/2000']) - assert frequencies.infer_freq(dates) == 'D' - - def test_hour(self): - self._check_tick(timedelta(hours=1), 'H') - - def test_minute(self): - self._check_tick(timedelta(minutes=1), 'T') - - def test_second(self): - self._check_tick(timedelta(seconds=1), 'S') - - def test_millisecond(self): - self._check_tick(timedelta(microseconds=1000), 'L') - - def test_microsecond(self): - self._check_tick(timedelta(microseconds=1), 'U') - - def test_nanosecond(self): - self._check_tick(np.timedelta64(1, 'ns'), 'N') - - def _check_tick(self, base_delta, code): - b = Timestamp(datetime.now()) - for i in range(1, 5): - inc = base_delta * i - index = _dti([b + inc * j for j in range(3)]) - if i > 1: - exp_freq = '%d%s' % (i, code) - else: - exp_freq = code - assert frequencies.infer_freq(index) == exp_freq - - index = _dti([b + base_delta * 7] + [b + base_delta * j for j in range( - 3)]) - assert frequencies.infer_freq(index) is None - - index = _dti([b + base_delta * j for j in range(3)] + [b + base_delta * - 7]) - - assert frequencies.infer_freq(index) is None - - def test_weekly(self): - days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] - - for day in days: - self._check_generated_range('1/1/2000', 'W-%s' % 
day) - - def test_week_of_month(self): - days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] - - for day in days: - for i in range(1, 5): - self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day)) - - def test_fifth_week_of_month(self): - # Only supports freq up to WOM-4. See #9425 - func = lambda: date_range('2014-01-01', freq='WOM-5MON') - pytest.raises(ValueError, func) - - def test_fifth_week_of_month_infer(self): - # Only attempts to infer up to WOM-4. See #9425 - index = DatetimeIndex(["2014-03-31", "2014-06-30", "2015-03-30"]) - assert frequencies.infer_freq(index) is None - - def test_week_of_month_fake(self): - # All of these dates are on same day of week and are 4 or 5 weeks apart - index = DatetimeIndex(["2013-08-27", "2013-10-01", "2013-10-29", - "2013-11-26"]) - assert frequencies.infer_freq(index) != 'WOM-4TUE' - - def test_monthly(self): - self._check_generated_range('1/1/2000', 'M') - - def test_monthly_ambiguous(self): - rng = _dti(['1/31/2000', '2/29/2000', '3/31/2000']) - assert rng.inferred_freq == 'M' - - def test_business_monthly(self): - self._check_generated_range('1/1/2000', 'BM') - - def test_business_start_monthly(self): - self._check_generated_range('1/1/2000', 'BMS') - - def test_quarterly(self): - for month in ['JAN', 'FEB', 'MAR']: - self._check_generated_range('1/1/2000', 'Q-%s' % month) - - def test_annual(self): - for month in MONTHS: - self._check_generated_range('1/1/2000', 'A-%s' % month) - - def test_business_annual(self): - for month in MONTHS: - self._check_generated_range('1/1/2000', 'BA-%s' % month) - - def test_annual_ambiguous(self): - rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) - assert rng.inferred_freq == 'A-JAN' - - def _check_generated_range(self, start, freq): - freq = freq.upper() - - gen = date_range(start, periods=7, freq=freq) - index = _dti(gen.values) - if not freq.startswith('Q-'): - assert frequencies.infer_freq(index) == gen.freqstr - else: - inf_freq = frequencies.infer_freq(index) - 
is_dec_range = inf_freq == 'Q-DEC' and gen.freqstr in ( - 'Q', 'Q-DEC', 'Q-SEP', 'Q-JUN', 'Q-MAR') - is_nov_range = inf_freq == 'Q-NOV' and gen.freqstr in ( - 'Q-NOV', 'Q-AUG', 'Q-MAY', 'Q-FEB') - is_oct_range = inf_freq == 'Q-OCT' and gen.freqstr in ( - 'Q-OCT', 'Q-JUL', 'Q-APR', 'Q-JAN') - assert is_dec_range or is_nov_range or is_oct_range - - gen = date_range(start, periods=5, freq=freq) - index = _dti(gen.values) - - if not freq.startswith('Q-'): - assert frequencies.infer_freq(index) == gen.freqstr - else: - inf_freq = frequencies.infer_freq(index) - is_dec_range = inf_freq == 'Q-DEC' and gen.freqstr in ( - 'Q', 'Q-DEC', 'Q-SEP', 'Q-JUN', 'Q-MAR') - is_nov_range = inf_freq == 'Q-NOV' and gen.freqstr in ( - 'Q-NOV', 'Q-AUG', 'Q-MAY', 'Q-FEB') - is_oct_range = inf_freq == 'Q-OCT' and gen.freqstr in ( - 'Q-OCT', 'Q-JUL', 'Q-APR', 'Q-JAN') - - assert is_dec_range or is_nov_range or is_oct_range - - def test_infer_freq(self): - rng = period_range('1959Q2', '2009Q3', freq='Q') - rng = Index(rng.to_timestamp('D', how='e').astype(object)) - assert rng.inferred_freq == 'Q-DEC' - - rng = period_range('1959Q2', '2009Q3', freq='Q-NOV') - rng = Index(rng.to_timestamp('D', how='e').astype(object)) - assert rng.inferred_freq == 'Q-NOV' - - rng = period_range('1959Q2', '2009Q3', freq='Q-OCT') - rng = Index(rng.to_timestamp('D', how='e').astype(object)) - assert rng.inferred_freq == 'Q-OCT' - - def test_infer_freq_tz(self): - - freqs = {'AS-JAN': - ['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01'], - 'Q-OCT': - ['2009-01-31', '2009-04-30', '2009-07-31', '2009-10-31'], - 'M': ['2010-11-30', '2010-12-31', '2011-01-31', '2011-02-28'], - 'W-SAT': - ['2010-12-25', '2011-01-01', '2011-01-08', '2011-01-15'], - 'D': ['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04'], - 'H': ['2011-12-31 22:00', '2011-12-31 23:00', - '2012-01-01 00:00', '2012-01-01 01:00']} - - # GH 7310 - for tz in [None, 'Australia/Sydney', 'Asia/Tokyo', 'Europe/Paris', - 'US/Pacific', 'US/Eastern']: - 
for expected, dates in compat.iteritems(freqs): - idx = DatetimeIndex(dates, tz=tz) - assert idx.inferred_freq == expected - - def test_infer_freq_tz_transition(self): - # Tests for #8772 - date_pairs = [['2013-11-02', '2013-11-5'], # Fall DST - ['2014-03-08', '2014-03-11'], # Spring DST - ['2014-01-01', '2014-01-03']] # Regular Time - freqs = ['3H', '10T', '3601S', '3600001L', '3600000001U', - '3600000000001N'] - - for tz in [None, 'Australia/Sydney', 'Asia/Tokyo', 'Europe/Paris', - 'US/Pacific', 'US/Eastern']: - for date_pair in date_pairs: - for freq in freqs: - idx = date_range(date_pair[0], date_pair[ - 1], freq=freq, tz=tz) - assert idx.inferred_freq == freq - - index = date_range("2013-11-03", periods=5, - freq="3H").tz_localize("America/Chicago") - assert index.inferred_freq is None - - def test_infer_freq_businesshour(self): - # GH 7905 - idx = DatetimeIndex( - ['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', - '2014-07-01 12:00', '2014-07-01 13:00', '2014-07-01 14:00']) - # hourly freq in a day must result in 'H' - assert idx.inferred_freq == 'H' - - idx = DatetimeIndex( - ['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', - '2014-07-01 12:00', '2014-07-01 13:00', '2014-07-01 14:00', - '2014-07-01 15:00', '2014-07-01 16:00', '2014-07-02 09:00', - '2014-07-02 10:00', '2014-07-02 11:00']) - assert idx.inferred_freq == 'BH' - - idx = DatetimeIndex( - ['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00', - '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00', - '2014-07-04 15:00', '2014-07-04 16:00', '2014-07-07 09:00', - '2014-07-07 10:00', '2014-07-07 11:00']) - assert idx.inferred_freq == 'BH' - - idx = DatetimeIndex( - ['2014-07-04 09:00', '2014-07-04 10:00', '2014-07-04 11:00', - '2014-07-04 12:00', '2014-07-04 13:00', '2014-07-04 14:00', - '2014-07-04 15:00', '2014-07-04 16:00', '2014-07-07 09:00', - '2014-07-07 10:00', '2014-07-07 11:00', '2014-07-07 12:00', - '2014-07-07 13:00', '2014-07-07 14:00', '2014-07-07 
15:00', - '2014-07-07 16:00', '2014-07-08 09:00', '2014-07-08 10:00', - '2014-07-08 11:00', '2014-07-08 12:00', '2014-07-08 13:00', - '2014-07-08 14:00', '2014-07-08 15:00', '2014-07-08 16:00']) - assert idx.inferred_freq == 'BH' - - def test_not_monotonic(self): - rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) - rng = rng[::-1] - assert rng.inferred_freq == '-1A-JAN' - - def test_non_datetimeindex2(self): - rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) - - vals = rng.to_pydatetime() - - result = frequencies.infer_freq(vals) - assert result == rng.inferred_freq - - def test_invalid_index_types(self): - - # test all index types - for i in [tm.makeIntIndex(10), tm.makeFloatIndex(10), - tm.makePeriodIndex(10)]: - pytest.raises(TypeError, lambda: frequencies.infer_freq(i)) - - # GH 10822 - # odd error message on conversions to datetime for unicode - if not is_platform_windows(): - for i in [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]: - pytest.raises(ValueError, lambda: frequencies.infer_freq(i)) - - def test_string_datetimelike_compat(self): - - # GH 6463 - expected = frequencies.infer_freq(['2004-01', '2004-02', '2004-03', - '2004-04']) - result = frequencies.infer_freq(Index(['2004-01', '2004-02', '2004-03', - '2004-04'])) - assert result == expected - - def test_series(self): - - # GH6407 - # inferring series - - # invalid type of Series - for s in [Series(np.arange(10)), Series(np.arange(10.))]: - pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) - - # a non-convertible string - pytest.raises(ValueError, lambda: frequencies.infer_freq( - Series(['foo', 'bar']))) - - # cannot infer on PeriodIndex - for freq in [None, 'L']: - s = Series(period_range('2013', periods=10, freq=freq)) - pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) - - # DateTimeIndex - for freq in ['M', 'L', 'S']: - s = Series(date_range('20130101', periods=10, freq=freq)) - inferred = frequencies.infer_freq(s) - assert inferred == freq - - s = 
Series(date_range('20130101', '20130110')) - inferred = frequencies.infer_freq(s) - assert inferred == 'D' - - def test_legacy_offset_warnings(self): - freqs = ['WEEKDAY', 'EOM', 'W@MON', 'W@TUE', 'W@WED', 'W@THU', - 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', - 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN', - 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', - 'Y@JAN', 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', - 'WOM@4MON', 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', - 'WOM@4TUE', 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', - 'WOM@4WED', 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', - 'WOM@4THU', 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', - 'WOM@4FRI'] - - msg = INVALID_FREQ_ERR_MSG - for freq in freqs: - with pytest.raises(ValueError, match=msg): - frequencies.get_offset(freq) - - with pytest.raises(ValueError, match=msg): - date_range('2011-01-01', periods=5, freq=freq) From e9de5f3e159296a46bc62aa42a6225b67d8a4f10 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral Date: Thu, 28 Feb 2019 16:53:29 +0100 Subject: [PATCH 047/110] BUG: Fix index type casting in read_json with orient='table' and float index (#25433) (#25434) --- doc/source/whatsnew/v0.25.0.rst | 2 ++ pandas/io/json/json.py | 28 +++++++++++++------ .../tests/io/json/test_json_table_schema.py | 11 ++------ pandas/tests/io/json/test_pandas.py | 23 +++++++++++++-- 4 files changed, 45 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a591c498d00c3..a6f7395f5177e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -210,6 +210,8 @@ I/O - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) +- Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers 
index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) +- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) - - - diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 725e2d28ffd67..4bae067ee5196 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -227,7 +227,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii, def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, - convert_axes=True, convert_dates=True, keep_default_dates=True, + convert_axes=None, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, lines=False, chunksize=None, compression='infer'): """ @@ -277,18 +277,25 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, 'table' as an allowed value for the ``orient`` argument typ : type of object to recover (series or frame), default 'frame' - dtype : boolean or dict, default True + dtype : boolean or dict, default None If True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all, applies only to the data. - Not applicable with ``orient='table'``. + For all ``orient`` values except ``'table'``, default is True. - .. versionchanged:: 0.25 + .. versionchanged:: 0.25.0 - Not applicable with ``orient='table'``. + Not applicable for ``orient='table'``. - convert_axes : boolean, default True + convert_axes : boolean, default None Try to convert the axes to the proper dtypes. + + For all ``orient`` values except ``'table'``, default is True. + + .. versionchanged:: 0.25.0 + + Not applicable for ``orient='table'``. 
+ convert_dates : boolean, default True List of columns to parse for dates; If True, then try to parse datelike columns default is True; a column label is datelike if @@ -417,8 +424,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, if orient == 'table' and dtype: raise ValueError("cannot pass both dtype and orient='table'") + if orient == 'table' and convert_axes: + raise ValueError("cannot pass both convert_axes and orient='table'") - dtype = orient != 'table' if dtype is None else dtype + if dtype is None and orient != 'table': + dtype = True + if convert_axes is None and orient != 'table': + convert_axes = True compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( @@ -692,7 +704,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, # don't try to coerce, unless a force conversion if use_dtypes: - if self.dtype is False: + if not self.dtype: return data, False elif self.dtype is True: pass diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 3002d1dfb5f8a..351b495e5d8fc 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -564,17 +564,10 @@ def test_multiindex(self, index_names): result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) - @pytest.mark.parametrize("strict_check", [ - pytest.param(True, marks=pytest.mark.xfail), - False - ]) - def test_empty_frame_roundtrip(self, strict_check): + def test_empty_frame_roundtrip(self): # GH 21287 df = pd.DataFrame([], columns=['a', 'b', 'c']) expected = df.copy() out = df.to_json(orient='table') result = pd.read_json(out, orient='table') - # TODO: When DF coercion issue (#21345) is resolved tighten type checks - tm.assert_frame_equal(expected, result, - check_dtype=strict_check, - check_index_type=strict_check) + tm.assert_frame_equal(expected, result) diff --git 
a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fecd0f0572757..ed598b730d960 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -194,7 +194,7 @@ def _check_orient(df, orient, dtype=None, numpy=False, else: unser = unser.sort_index() - if dtype is False: + if not dtype: check_dtype = False if not convert_axes and df.index.dtype.type == np.datetime64: @@ -1202,6 +1202,16 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after + @pytest.mark.parametrize('index', [None, [1, 2], [1., 2.], ['a', 'b'], + ['1', '2'], ['1.', '2.']]) + @pytest.mark.parametrize('columns', [['a', 'b'], ['1', '2'], ['1.', '2.']]) + def test_from_json_to_json_table_index_and_columns(self, index, columns): + # GH25433 GH25435 + expected = DataFrame([[1, 2], [3, 4]], index=index, columns=columns) + dfjson = expected.to_json(orient='table') + result = pd.read_json(dfjson, orient='table') + assert_frame_equal(result, expected) + def test_from_json_to_json_table_dtypes(self): # GH21345 expected = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) @@ -1214,9 +1224,18 @@ def test_read_json_table_dtype_raises(self, dtype): # GH21345 df = pd.DataFrame({'a': [1, 2], 'b': [3., 4.], 'c': ['5', '6']}) dfjson = df.to_json(orient='table') - with pytest.raises(ValueError): + msg = "cannot pass both dtype and orient='table'" + with pytest.raises(ValueError, match=msg): pd.read_json(dfjson, orient='table', dtype=dtype) + def test_read_json_table_convert_axes_raises(self): + # GH25433 GH25435 + df = DataFrame([[1, 2], [3, 4]], index=[1., 2.], columns=['1.', '2.']) + dfjson = df.to_json(orient='table') + msg = "cannot pass both convert_axes and orient='table'" + with pytest.raises(ValueError, match=msg): + pd.read_json(dfjson, orient='table', convert_axes=True) + @pytest.mark.parametrize('data, expected', [ (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']), {'columns': ['a', 'b'], 'data': [[1, 2], [4, 
5]]}), From 169a56a33fb7f4fad447e70869599a99832eeee5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 28 Feb 2019 09:38:47 -0800 Subject: [PATCH 048/110] BUG: Groupby.agg with reduction function with tz aware data (#25308) * BUG: Groupby.agg cannot reduce with tz aware data * Handle output always as UTC * Add whatsnew * isort and add another fixed groupby.first/last issue * bring condition at a higher level * Add try for _try_cast * Add comments * Don't pass the utc_dtype explicitly * Remove unused import * Use string dtype instead --- doc/source/whatsnew/v0.25.0.rst | 4 ++-- pandas/_libs/reduction.pyx | 4 +++- pandas/core/groupby/groupby.py | 19 +++++++++++++++++-- pandas/tests/groupby/aggregate/test_other.py | 15 +++++++++++++++ pandas/tests/groupby/test_nth.py | 20 ++++++++++++++++++++ 5 files changed, 57 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6f7395f5177e..f847c1d827186 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -229,8 +229,8 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) -- -- +- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) Reshaping diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 507567cf480d7..517d59c399179 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -342,7 +342,9 @@ cdef class SeriesGrouper: index = None else: values = dummy.values - if dummy.dtype != self.arr.dtype: + #
GH 23683: datetimetz types are equivalent to datetime types here + if (dummy.dtype != self.arr.dtype + and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c364f069bf53d..926da40deaff2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,7 +26,8 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) + ensure_float, is_datetime64tz_dtype, is_extension_array_dtype, + is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna from pandas.api.types import ( @@ -766,7 +767,21 @@ def _try_cast(self, result, obj, numeric_only=False): dtype = obj.dtype if not is_scalar(result): - if is_extension_array_dtype(dtype): + if is_datetime64tz_dtype(dtype): + # GH 23683 + # Prior results _may_ have been generated in UTC. + # Ensure we localize to UTC first before converting + # to the target timezone + try: + result = obj._values._from_sequence( + result, dtype='datetime64[ns, UTC]' + ) + result = result.astype(dtype) + except TypeError: + # _try_cast was called at a point where the result + # was already tz-aware + pass + elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. 
try: diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index b5214b11bddcc..cacfdb7694de1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -512,3 +512,18 @@ def test_agg_list_like_func(): expected = pd.DataFrame({'A': [str(x) for x in range(3)], 'B': [[str(x)] for x in range(3)]}) tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_with_timezone(): + # GH 23683 + df = pd.DataFrame({ + 'tag': [1, 1], + 'date': [ + pd.Timestamp('2018-01-01', tz='UTC'), + pd.Timestamp('2018-01-02', tz='UTC')] + }) + result = df.groupby('tag').agg({'date': lambda e: e.head(1)}) + expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')], + index=pd.Index([1], name='tag'), + columns=['date']) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 255d9a8acf2d0..7a3d189d3020e 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -278,6 +278,26 @@ def test_first_last_tz(data, expected_first, expected_last): assert_frame_equal(result, expected[['id', 'time']]) +@pytest.mark.parametrize('method, ts, alpha', [ + ['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'], + ['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b'] +]) +def test_first_last_tz_multi_column(method, ts, alpha): + # GH 21603 + df = pd.DataFrame({'group': [1, 1, 2], + 'category_string': pd.Series(list('abc')).astype( + 'category'), + 'datetimetz': pd.date_range('20130101', periods=3, + tz='US/Eastern')}) + result = getattr(df.groupby('group'), method)() + expepcted = pd.DataFrame({'category_string': [alpha, 'c'], + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expepcted) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex From 
ae1ab8993c5152ee14a1c4bb3577d268fb3b90e3 Mon Sep 17 00:00:00 2001 From: Gordon Blackadder Date: Thu, 28 Feb 2019 20:00:30 +0000 Subject: [PATCH 049/110] DOC: Fix docstring for read_sql_table (#25465) --- pandas/io/sql.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index aaface5415384..02fba52eac7f7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -182,26 +182,29 @@ def execute(sql, con, cur=None, params=None): def read_sql_table(table_name, con, schema=None, index_col=None, coerce_float=True, parse_dates=None, columns=None, chunksize=None): - """Read SQL database table into a DataFrame. + """ + Read SQL database table into a DataFrame. Given a table name and a SQLAlchemy connectable, returns a DataFrame. This function does not support DBAPI connections. Parameters ---------- - table_name : string + table_name : str Name of SQL table in database. - con : SQLAlchemy connectable (or database string URI) + con : SQLAlchemy connectable or str + A database URI could be provided as a str. SQLite DBAPI connection mode not supported. - schema : string, default None + schema : str, default None Name of SQL schema in database to query (if database flavor supports this). Uses default schema if None (default). - index_col : string or list of strings, optional, default: None + index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). - coerce_float : boolean, default True + coerce_float : bool, default True Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point. Can result in loss of Precision. - parse_dates : list or dict, default: None + parse_dates : list or dict, default None + The behavior is as follows: - List of column names to parse as dates.
- Dict of ``{column_name: format string}`` where format string is strftime compatible in case of parsing string times or is one of @@ -210,8 +213,8 @@ def read_sql_table(table_name, con, schema=None, index_col=None, to the keyword arguments of :func:`pandas.to_datetime` Especially useful with databases without native Datetime support, such as SQLite. - columns : list, default: None - List of column names to select from SQL table + columns : list, default None + List of column names to select from SQL table. chunksize : int, default None If specified, returns an iterator where `chunksize` is the number of rows to include in each chunk. @@ -219,15 +222,21 @@ def read_sql_table(table_name, con, schema=None, index_col=None, Returns ------- DataFrame + A SQL table is returned as two-dimensional data structure with labeled + axes. See Also -------- read_sql_query : Read SQL query into a DataFrame. - read_sql + read_sql : Read SQL query or database table into a DataFrame. Notes ----- Any datetime values with time zone information will be converted to UTC. + + Examples + -------- + >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP """ con = _engine_builder(con) From db978c716369064421c5ca71bd26002e0021e0d1 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 28 Feb 2019 22:21:39 +0100 Subject: [PATCH 050/110] ENH: Add Series.str.casefold (#25419) --- doc/source/reference/series.rst | 1 + doc/source/user_guide/text.rst | 1 + doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/strings.py | 19 +++++++++++++------ pandas/tests/test_strings.py | 11 ++++++++++- 5 files changed, 26 insertions(+), 7 deletions(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index a6ac40b5203bf..b406893e3414a 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -409,6 +409,7 @@ strings and apply several methods to it. 
These can be accessed like :template: autosummary/accessor_method.rst Series.str.capitalize + Series.str.casefold Series.str.cat Series.str.center Series.str.contains diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index e4f60a761750d..6f21a7d9beb36 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -600,6 +600,7 @@ Method Summary :meth:`~Series.str.partition`;Equivalent to ``str.partition`` :meth:`~Series.str.rpartition`;Equivalent to ``str.rpartition`` :meth:`~Series.str.lower`;Equivalent to ``str.lower`` + :meth:`~Series.str.casefold`;Equivalent to ``str.casefold`` :meth:`~Series.str.upper`;Equivalent to ``str.upper`` :meth:`~Series.str.find`;Equivalent to ``str.find`` :meth:`~Series.str.rfind`;Equivalent to ``str.rfind`` diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f847c1d827186..d1f1ea862110e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -22,6 +22,7 @@ Other Enhancements - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) +- ``Series.str`` has gained :meth:`Series.str.casefold` method to remove all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument.
The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - diff --git a/pandas/core/strings.py b/pandas/core/strings.py index cc7a4db515c42..9577b07360f65 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2926,7 +2926,7 @@ def rindex(self, sub, start=0, end=None): _shared_docs['casemethods'] = (""" Convert strings in the Series/Index to %(type)s. - + %(version)s Equivalent to :meth:`str.%(method)s`. Returns @@ -2943,6 +2943,7 @@ def rindex(self, sub, start=0, end=None): remaining to lowercase. Series.str.swapcase : Converts uppercase to lowercase and lowercase to uppercase. + Series.str.casefold: Removes all case distinctions in the string. Examples -------- @@ -2989,12 +2990,15 @@ def rindex(self, sub, start=0, end=None): 3 sWaPcAsE dtype: object """) - _shared_docs['lower'] = dict(type='lowercase', method='lower') - _shared_docs['upper'] = dict(type='uppercase', method='upper') - _shared_docs['title'] = dict(type='titlecase', method='title') + _shared_docs['lower'] = dict(type='lowercase', method='lower', version='') + _shared_docs['upper'] = dict(type='uppercase', method='upper', version='') + _shared_docs['title'] = dict(type='titlecase', method='title', version='') _shared_docs['capitalize'] = dict(type='be capitalized', - method='capitalize') - _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase') + method='capitalize', version='') + _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase', + version='') + _shared_docs['casefold'] = dict(type='be casefolded', method='casefold', + version='\n .. 
versionadded:: 0.25.0\n') lower = _noarg_wrapper(lambda x: x.lower(), docstring=_shared_docs['casemethods'] % _shared_docs['lower']) @@ -3010,6 +3014,9 @@ def rindex(self, sub, start=0, end=None): swapcase = _noarg_wrapper(lambda x: x.swapcase(), docstring=_shared_docs['casemethods'] % _shared_docs['swapcase']) + casefold = _noarg_wrapper(lambda x: x.casefold(), + docstring=_shared_docs['casemethods'] % + _shared_docs['casefold']) _shared_docs['ismethods'] = (""" Check whether all characters in each string are %(type)s. diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bbcdc24f58f9b..40a83f90c8dfd 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -76,7 +76,7 @@ def assert_series_or_index_equal(left, right): 'len', 'lower', 'lstrip', 'partition', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', - 'strip', 'swapcase', 'title', 'upper' + 'strip', 'swapcase', 'title', 'upper', 'casefold' ], [()] * 100, [{}] * 100)) ids, _, _ = zip(*_any_string_method) # use method name as fixture-id @@ -3440,3 +3440,12 @@ def test_method_on_bytes(self): expected = Series(np.array( ['ad', 'be', 'cf'], 'S2').astype(object)) tm.assert_series_equal(result, expected) + + @pytest.mark.skipif(compat.PY2, reason='not in python2') + def test_casefold(self): + # GH25405 + expected = Series(['ss', NA, 'case', 'ssd']) + s = Series(['ß', NA, 'case', 'ßd']) + result = s.str.casefold() + + tm.assert_series_equal(result, expected) From 0a61ecdf6b4ea61a67afb4e3862df79adc07053a Mon Sep 17 00:00:00 2001 From: Thein Oo Date: Thu, 28 Feb 2019 20:46:43 -0500 Subject: [PATCH 051/110] Fix PR10 error and Clean up docstrings from functions related to RT05 errors (#25132) --- ci/code_checks.sh | 4 +- pandas/core/algorithms.py | 13 +++-- pandas/core/arrays/categorical.py | 14 ++--- pandas/core/arrays/datetimelike.py | 6 +-- pandas/core/frame.py | 7 +-- pandas/core/generic.py | 32 +++++++---- pandas/core/indexes/base.py | 23 ++++---- 
pandas/core/indexes/multi.py | 6 +-- pandas/core/series.py | 2 + pandas/io/excel/_base.py | 2 +- pandas/plotting/_core.py | 86 +++++++++++++++--------------- pandas/plotting/_misc.py | 12 ++--- pandas/tseries/frequencies.py | 6 +-- 13 files changed, 118 insertions(+), 95 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ac6aade106ce6..c4840f1e836c4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -241,8 +241,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL06, GL07, GL09, SS04, PR03, PR05, PR10, EX04, RT04, RT05, SS05, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,GL07,GL09,SS04,PR03,PR04,PR05,EX04,RT04,RT05,SS05,SA05 + MSG='Validate docstrings (GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT04, RT05, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT04,RT05,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b056a357d0a51..4a71951e2435e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -288,10 +288,15 @@ def unique(values): Returns ------- - unique values. - If the input is an Index, the return is an Index - If the input is a Categorical dtype, the return is a Categorical - If the input is a Series/ndarray, the return will be an ndarray. + numpy.ndarray or ExtensionArray + + The return can be: + + * Index : when the input is an Index + * Categorical : when the input is a Categorical dtype + * ndarray : when the input is a Series/ndarray + + Return numpy.ndarray or ExtensionArray. 
See Also -------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 79e565df94eae..37a24a54be8b1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1289,7 +1289,7 @@ def __array__(self, dtype=None): Returns ------- - values : numpy array + numpy.array A numpy array of either the specified dtype or, if dtype==None (default), the same dtype as categorical.categories.dtype. @@ -1499,9 +1499,9 @@ def get_values(self): Returns ------- - values : numpy array + numpy.array A numpy array of the same dtype as categorical.categories.dtype or - Index if datetime / periods + Index if datetime / periods. """ # if we are a datetime and period index, return Index to keep metadata if is_datetimelike(self.categories): @@ -1540,7 +1540,7 @@ def argsort(self, *args, **kwargs): Returns ------- - argsorted : numpy array + numpy.array See Also -------- @@ -1593,7 +1593,7 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): Returns ------- - y : Categorical or None + Categorical or None See Also -------- @@ -1667,7 +1667,7 @@ def _values_for_rank(self): Returns ------- - numpy array + numpy.array """ from pandas import Series @@ -1695,7 +1695,7 @@ def ravel(self, order='C'): Returns ------- - raveled : numpy array + numpy.array """ return np.array(self) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 84536ac72a455..94668c74c1693 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -144,7 +144,7 @@ def strftime(self, date_format): Return an Index of formatted strings specified by date_format, which supports the same string format as the python standard library. Details of the string format can be found in `python string format - doc <%(URL)s>`__ + doc <%(URL)s>`__. 
Parameters ---------- @@ -748,7 +748,7 @@ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): mask the result if needed, convert to the provided dtype if its not None - This is an internal routine + This is an internal routine. """ if self._hasnans: @@ -1047,7 +1047,7 @@ def _sub_period_array(self, other): Returns ------- result : np.ndarray[object] - Array of DateOffset objects; nulls represented by NaT + Array of DateOffset objects; nulls represented by NaT. """ if not is_period_dtype(self): raise TypeError("cannot subtract {dtype}-dtype from {cls}" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a40733b7076b0..6b4d95055d06d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2696,7 +2696,7 @@ def get_value(self, index, col, takeable=False): Returns ------- - scalar value + scalar """ warnings.warn("get_value is deprecated and will be removed " @@ -2736,7 +2736,7 @@ def set_value(self, index, col, value, takeable=False): ---------- index : row label col : column label - value : scalar value + value : scalar takeable : interpret the index/col as indexers, default False Returns @@ -6896,7 +6896,7 @@ def round(self, decimals=0, *args, **kwargs): Returns ------- - DataFrame : + DataFrame A DataFrame with the affected columns rounded to the specified number of decimal places. @@ -7000,6 +7000,7 @@ def corr(self, method='pearson', min_periods=1): * spearman : Spearman rank correlation * callable: callable with input two 1d ndarrays and returning a float + .. versionadded:: 0.24.0 min_periods : int, optional diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 523543ada235c..eb427a42a249b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2807,14 +2807,17 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. decimal : str, default '.' Character recognized as decimal separator, e.g. ',' in Europe. + .. 
versionadded:: 0.18.0 multicolumn : bool, default True Use \multicolumn to enhance MultiIndex columns. The default will be read from the config module. + .. versionadded:: 0.20.0 multicolumn_format : str, default 'l' The alignment for multicolumns, similar to `column_format` The default will be read from the config module. + .. versionadded:: 0.20.0 multirow : bool, default False Use \multirow to enhance MultiIndex rows. Requires adding a @@ -2822,6 +2825,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. + .. versionadded:: 0.20.0 Returns @@ -4948,11 +4952,15 @@ def pipe(self, func, *args, **kwargs): Returns ------- - DataFrame, Series or scalar - If DataFrame.agg is called with a single function, returns a Series - If DataFrame.agg is called with several functions, returns a DataFrame - If Series.agg is called with single function, returns a scalar - If Series.agg is called with several functions, returns a Series. + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. %(see_also)s @@ -6879,11 +6887,15 @@ def asof(self, where, subset=None): ------- scalar, Series, or DataFrame - Scalar : when `self` is a Series and `where` is a scalar. - Series: when `self` is a Series and `where` is an array-like, - or when `self` is a DataFrame and `where` is a scalar. - DataFrame : when `self` is a DataFrame and `where` is an - array-like. 
+ The return can be: + + * scalar : when `self` is a Series and `where` is a scalar + * Series: when `self` is a Series and `where` is an array-like, + or when `self` is a DataFrame and `where` is a scalar + * DataFrame : when `self` is a DataFrame and `where` is an + array-like + + Return scalar, Series, or DataFrame. See Also -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1cdacc908b663..dee181fc1c569 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1443,7 +1443,7 @@ def sortlevel(self, level=None, ascending=True, sort_remaining=None): Returns ------- - sorted_index : Index + Index """ return self.sort_values(return_indexer=True, ascending=ascending) @@ -1461,7 +1461,7 @@ def _get_level_values(self, level): Returns ------- - values : Index + Index Calling object, as there is only one level in the Index. See Also @@ -1506,7 +1506,7 @@ def droplevel(self, level=0): Returns ------- - index : Index or MultiIndex + Index or MultiIndex """ if not isinstance(level, (tuple, list)): level = [level] @@ -1558,11 +1558,11 @@ def droplevel(self, level=0): Returns ------- grouper : Index - Index of values to group on + Index of values to group on. labels : ndarray of int or None - Array of locations in level_index + Array of locations in level_index. uniques : Index or None - Index of unique values for level + Index of unique values for level. """ @Appender(_index_shared_docs['_get_grouper_for_level']) @@ -2972,9 +2972,10 @@ def _convert_listlike_indexer(self, keyarr, kind=None): Returns ------- - tuple (indexer, keyarr) - indexer is an ndarray or None if cannot convert - keyarr are tuple-safe keys + indexer : numpy.ndarray or None + Return an ndarray or None if cannot convert. + keyarr : numpy.ndarray + Return tuple-safe keys. 
""" if isinstance(keyarr, Index): keyarr = self._convert_index_indexer(keyarr) @@ -3158,9 +3159,9 @@ def _reindex_non_unique(self, target): Returns ------- new_index : pd.Index - Resulting index + Resulting index. indexer : np.ndarray or None - Indices of output values in original index + Indices of output values in original index. """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 492d28476e1f0..616c17cd16f9a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -61,7 +61,7 @@ def _codes_to_ints(self, codes): Returns ------ int_keys : scalar or 1-dimensional array, of dtype uint64 - Integer(s) representing one combination (each) + Integer(s) representing one combination (each). """ # Shift the representation of each level by the pre-calculated number # of bits: @@ -101,7 +101,7 @@ def _codes_to_ints(self, codes): Returns ------ int_keys : int, or 1-dimensional array of dtype object - Integer(s) representing one combination (each) + Integer(s) representing one combination (each). """ # Shift the representation of each level by the pre-calculated number @@ -2195,7 +2195,7 @@ def reindex(self, target, method=None, level=None, limit=None, new_index : pd.MultiIndex Resulting index indexer : np.ndarray or None - Indices of output values in original index + Indices of output values in original index. """ # GH6552: preserve names when reindexing to non-named target diff --git a/pandas/core/series.py b/pandas/core/series.py index ad7c6af21f637..cada6663ce651 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1669,6 +1669,8 @@ def unique(self): * Sparse * IntegerNA + See Examples section. 
+ Examples -------- >>> pd.Series([2, 1, 3, 3], name='A').unique() diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8f7bf8e0466f9..c6d390692c789 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -510,7 +510,7 @@ class ExcelWriter(object): mode : {'w' or 'a'}, default 'w' File mode to use (write or append). - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0 Attributes ---------- diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 2c672f235f1e1..48d870bfc2e03 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1413,7 +1413,7 @@ def orientation(self): Returns ------- - axes : matplotlib.axes.Axes or numpy.ndarray of them + matplotlib.axes.Axes or numpy.ndarray of them See Also -------- @@ -1809,26 +1809,26 @@ def _plot(data, x=None, y=None, subplots=False, Allows plotting of one column versus another""" series_coord = "" -df_unique = """stacked : boolean, default False in line and +df_unique = """stacked : bool, default False in line and bar plots, and True in area plot. If True, create stacked plot. 
- sort_columns : boolean, default False + sort_columns : bool, default False Sort column names to determine plot ordering - secondary_y : boolean or sequence, default False + secondary_y : bool or sequence, default False Whether to plot on the secondary y-axis If a list/tuple, which columns to plot on secondary y-axis""" series_unique = """label : label argument to provide to plot - secondary_y : boolean or sequence of ints, default False + secondary_y : bool or sequence of ints, default False If True then y-axis will be on the right""" df_ax = """ax : matplotlib axes object, default None - subplots : boolean, default False + subplots : bool, default False Make separate subplots for each column - sharex : boolean, default True if ax is None else False + sharex : bool, default True if ax is None else False In case subplots=True, share x axis and set some x axis labels to invisible; defaults to True if ax is None otherwise False if an ax is passed in; Be aware, that passing in both an ax and sharex=True will alter all x axis labels for all axis in a figure! - sharey : boolean, default False + sharey : bool, default False In case subplots=True, share y axis and set some y axis labels to invisible layout : tuple (optional) @@ -1882,23 +1882,23 @@ def _plot(data, x=None, y=None, subplots=False, %(klass_kind)s %(klass_ax)s figsize : a tuple (width, height) in inches - use_index : boolean, default True + use_index : bool, default True Use index as ticks for x axis title : string or list Title to use for the plot. If a string is passed, print the string at the top of the figure. If a list is passed and `subplots` is True, print each item in the list above the corresponding subplot. 
- grid : boolean, default None (matlab style default) + grid : bool, default None (matlab style default) Axis grid lines legend : False/True/'reverse' Place legend on axis subplots style : list or dict matplotlib line style per column - logx : boolean, default False + logx : bool, default False Use log scaling on x axis - logy : boolean, default False + logy : bool, default False Use log scaling on y axis - loglog : boolean, default False + loglog : bool, default False Use log scaling on both x and y axes xticks : sequence Values to use for the xticks @@ -1913,12 +1913,12 @@ def _plot(data, x=None, y=None, subplots=False, colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. - colorbar : boolean, optional + colorbar : bool, optional If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots) position : float Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) - table : boolean, Series or DataFrame, default False + table : bool, Series or DataFrame, default False If True, draw a table using the data in the DataFrame and the data will be transposed to meet matplotlib's default layout. If a Series or DataFrame is passed, use passed data to draw a table. @@ -1927,7 +1927,7 @@ def _plot(data, x=None, y=None, subplots=False, detail. xerr : same types as yerr. 
%(klass_unique)s - mark_right : boolean, default True + mark_right : bool, default True When using a secondary_y axis, automatically mark the column labels with "(right)" in the legend `**kwds` : keywords @@ -1935,7 +1935,7 @@ def _plot(data, x=None, y=None, subplots=False, Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them Notes ----- @@ -2025,7 +2025,7 @@ def plot_series(data, kind='line', ax=None, # Series unique rot : int or float, default 0 The rotation angle of labels (in degrees) with respect to the screen coordinate system. - grid : boolean, default True + grid : bool, default True Setting this to True will show the grid. figsize : A tuple (width, height) in inches The size of the figure to create in matplotlib. @@ -2070,6 +2070,7 @@ def plot_series(data, kind='line', ax=None, # Series unique * :class:`~pandas.Series` * :class:`~numpy.array` (for ``return_type = None``) + Return Series or numpy.array. Use ``return_type='dict'`` when you want to tweak the appearance of the lines after plotting. In this case a dict containing the Lines @@ -2272,7 +2273,7 @@ def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, Returns ------- - fig : matplotlib.Figure + matplotlib.Figure """ import matplotlib.pyplot as plt @@ -2321,7 +2322,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, If passed, will be used to limit data to a subset of columns. by : object, optional If passed, then used to form histograms for separate groups. - grid : boolean, default True + grid : bool, default True Whether to show axis grid lines. xlabelsize : int, default None If specified changes the x-axis label size. @@ -2335,13 +2336,13 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, y labels rotated 90 degrees clockwise. ax : Matplotlib axes object, default None The axes to plot the histogram on. 
- sharex : boolean, default True if ax is None else False + sharex : bool, default True if ax is None else False In case subplots=True, share x axis and set some x axis labels to invisible; defaults to True if ax is None otherwise False if an ax is passed in. Note that passing in both an ax and sharex=True will alter all x axis labels for all subplots in a figure. - sharey : boolean, default False + sharey : bool, default False In case subplots=True, share y axis and set some y axis labels to invisible. figsize : tuple @@ -2360,7 +2361,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, Returns ------- - axes : matplotlib.AxesSubplot or numpy.ndarray of them + matplotlib.AxesSubplot or numpy.ndarray of them See Also -------- @@ -2428,7 +2429,7 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, If passed, then used to form histograms for separate groups ax : matplotlib axis object If not passed, uses gca() - grid : boolean, default True + grid : bool, default True Whether to show axis grid lines xlabelsize : int, default None If specified changes the x-axis label size @@ -2510,15 +2511,15 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, bins : int, default 50 figsize : tuple, optional layout : optional - sharex : boolean, default False - sharey : boolean, default False + sharex : bool, default False + sharey : bool, default False rot : int, default 90 grid : bool, default True kwargs : dict, keyword arguments passed to matplotlib.Axes.hist Returns ------- - axes : collection of Matplotlib Axes + collection of Matplotlib Axes """ _raise_if_no_mpl() _converter._WARN = False @@ -2752,7 +2753,7 @@ def line(self, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them Examples -------- @@ -2777,7 +2778,7 @@ def bar(self, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them 
+ :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='bar', **kwds) @@ -2793,7 +2794,7 @@ def barh(self, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='barh', **kwds) @@ -2809,7 +2810,7 @@ def box(self, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='box', **kwds) @@ -2827,7 +2828,7 @@ def hist(self, bins=10, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='hist', bins=bins, **kwds) @@ -2886,7 +2887,7 @@ def area(self, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='area', **kwds) @@ -2902,7 +2903,7 @@ def pie(self, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them """ return self(kind='pie', **kwds) @@ -2962,8 +2963,8 @@ def line(self, x=None, y=None, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or :class:`numpy.ndarray` - Returns an ndarray when ``subplots=True``. + :class:`matplotlib.axes.Axes` or :class:`numpy.ndarray` + Return an ndarray when ``subplots=True``. See Also -------- @@ -3027,7 +3028,7 @@ def bar(self, x=None, y=None, **kwds): Returns ------- - axes : matplotlib.axes.Axes or np.ndarray of them + matplotlib.axes.Axes or np.ndarray of them An ndarray is returned with one :class:`matplotlib.axes.Axes` per column when ``subplots=True``. @@ -3109,7 +3110,7 @@ def barh(self, x=None, y=None, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them. 
+ :class:`matplotlib.axes.Axes` or numpy.ndarray of them See Also -------- @@ -3196,7 +3197,7 @@ def box(self, by=None, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them See Also -------- @@ -3239,7 +3240,8 @@ def hist(self, by=None, bins=10, **kwds): Returns ------- - axes : matplotlib.AxesSubplot histogram. + :class:`matplotlib.AxesSubplot` + Return a histogram plot. See Also -------- @@ -3403,7 +3405,7 @@ def pie(self, y=None, **kwds): Returns ------- - axes : matplotlib.axes.Axes or np.ndarray of them. + matplotlib.axes.Axes or np.ndarray of them A NumPy array is returned when `subplots` is True. See Also @@ -3479,7 +3481,7 @@ def scatter(self, x, y, s=None, c=None, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + :class:`matplotlib.axes.Axes` or numpy.ndarray of them See Also -------- diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 21592a5b4a0a1..5171ea68fd497 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -178,7 +178,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): Returns ------- - axes : :class:`matplotlib.axes.Axes` + :class:`matplotlib.axes.Axes` See Also -------- @@ -302,7 +302,7 @@ def andrews_curves(frame, class_column, ax=None, samples=200, color=None, Returns ------- - ax : Matplotlib axis object + :class:`matplotlib.axes.Axes` """ from math import sqrt, pi @@ -389,7 +389,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): Returns ------- - fig : matplotlib.figure.Figure + matplotlib.figure.Figure Matplotlib figure.
See Also @@ -490,7 +490,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, Returns ------- - ax: matplotlib axis object + :class:`matplotlib.axes.Axes` Examples -------- @@ -579,7 +579,7 @@ def lag_plot(series, lag=1, ax=None, **kwds): Returns ------- - ax: Matplotlib axis object + :class:`matplotlib.axes.Axes` """ import matplotlib.pyplot as plt @@ -610,7 +610,7 @@ def autocorrelation_plot(series, ax=None, **kwds): Returns: ----------- - ax: Matplotlib axis object + :class:`matplotlib.axes.Axes` """ import matplotlib.pyplot as plt n = len(series) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 4802447cbc99d..1b782b430a1a7 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -67,7 +67,7 @@ def to_offset(freq): Returns ------- - delta : DateOffset + DateOffset None if freq is None. Raises @@ -214,7 +214,7 @@ def infer_freq(index, warn=True): Returns ------- - freq : string or None + str or None None if no discernible frequency TypeError if the index is not datetime-like ValueError if there are less than three values.
@@ -300,7 +300,7 @@ def get_freq(self): Returns ------- - freqstr : str or None + str or None """ if not self.is_monotonic or not self.index._is_unique: return None From 9bb98b8832d3eb90ae592cf8f1275c35edea778e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 1 Mar 2019 04:47:30 -0800 Subject: [PATCH 052/110] Fix unreliable test (#25496) --- pandas/tests/indexes/datetimes/test_datetime.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a3ee5fe39769f..c7147e6fe7063 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -100,8 +100,7 @@ def test_hash_error(self): def test_stringified_slice_with_tz(self): # GH#2658 - import datetime - start = datetime.datetime.now() + start = '2013-01-07' idx = date_range(start=start, freq="1d", periods=10, tz='US/Eastern') df = DataFrame(lrange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here From 011f0a6ebacb13619bf8225b3bc2370872b85769 Mon Sep 17 00:00:00 2001 From: Max van Deursen Date: Fri, 1 Mar 2019 13:57:49 +0100 Subject: [PATCH 053/110] DOC: Clarifying doc/make.py --single parameter (#25482) --- doc/make.py | 8 +++++--- doc/source/development/contributing.rst | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/make.py b/doc/make.py index 438c4a04a3f08..8b2a77987e663 100755 --- a/doc/make.py +++ b/doc/make.py @@ -294,14 +294,16 @@ def main(): help='number of jobs used by sphinx-build') argparser.add_argument('--no-api', default=False, - help='ommit api and autosummary', + help='omit api and autosummary', action='store_true') argparser.add_argument('--single', metavar='FILENAME', type=str, default=None, - help=('filename of section or method name to ' - 'compile, e.g. 
"indexing", "DataFrame.join"')) + help=('filename (relative to the "source" folder)' + ' of section or method name to compile, e.g. ' + '"development/contributing.rst",' + ' "ecosystem.rst", "pandas.DataFrame.join"')) argparser.add_argument('--python-path', type=str, default=os.path.dirname(DOC_PATH), diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 027f2d90bbb73..a87a66cd08ad1 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -428,10 +428,10 @@ reducing the turn-around time for checking your changes. python make.py clean python make.py --no-api - # compile the docs with only a single - # section, that which is in indexing.rst + # compile the docs with only a single section, relative to the "source" folder. + # For example, compiling only this guide (doc/source/development/contributing.rst) python make.py clean - python make.py --single indexing + python make.py --single development/contributing.rst # compile the reference docs for a single function python make.py clean From 1f8d7e07d7188e217145b45a8746dcc624c5f06c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 1 Mar 2019 16:52:07 +0000 Subject: [PATCH 054/110] fix MacPython / pandas-wheels ci failures (#25505) --- pandas/tests/test_algos.py | 2 +- pandas/tests/test_sorting.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c56bf944699e2..3f75c508d22f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -229,7 +229,7 @@ def test_complex_sorting(self): # gh 12666 - check no segfault x17 = np.array([complex(i) for i in range(17)], dtype=object) - msg = ("'<' not supported between instances of 'complex' and" + msg = (r"'(<|>)' not supported between instances of 'complex' and" r" 'complex'|" r"unorderable types: complex\(\) > complex\(\)") with pytest.raises(TypeError, match=msg): diff --git
a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index e83bdb1af9121..2a64947042979 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -409,9 +409,9 @@ def test_mixed_integer_from_list(self): def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) - msg = ("'<' not supported between instances of 'datetime.datetime'" - r" and 'int'|" - r"unorderable types: int\(\) > datetime.datetime\(\)") + msg = (r"'(<|>)' not supported between instances of" + r" 'datetime\.datetime' and 'int'|" + r"unorderable types: int\(\) > datetime\.datetime\(\)") if compat.PY2: # RuntimeWarning: tp_compare didn't return -1 or -2 for exception with warnings.catch_warnings(): From 1d3b4a57fd931520f53185aa120db9bcbf6f0c79 Mon Sep 17 00:00:00 2001 From: Nicholas Musolino Date: Fri, 1 Mar 2019 12:21:02 -0500 Subject: [PATCH 055/110] DOC: Reword Series.interpolate docstring for clarity (#25491) --- pandas/core/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb427a42a249b..ee8f9cba951b3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6618,10 +6618,10 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * 'pad': Fill in NaNs using existing values. * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', 'barycentric', 'polynomial': Passed to - `scipy.interpolate.interp1d`. Both 'polynomial' and 'spline' - require that you also specify an `order` (int), - e.g. ``df.interpolate(method='polynomial', order=5)``. - These use the numerical values of the index. + `scipy.interpolate.interp1d`. These methods use the numerical + values of the index. Both 'polynomial' and 'spline' require that + you also specify an `order` (int), e.g. + ``df.interpolate(method='polynomial', order=5)``. 
* 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima': Wrappers around the SciPy interpolation methods of similar names. See `Notes`. From ae4db8665484a0082949cef0acf16559c07c922f Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 1 Mar 2019 09:57:38 -0800 Subject: [PATCH 056/110] Changed insertion order to sys.path (#25486) --- doc/make.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/make.py b/doc/make.py index 8b2a77987e663..6ffbd3ef86e68 100755 --- a/doc/make.py +++ b/doc/make.py @@ -325,7 +325,7 @@ def main(): # the import of `python_path` correctly. The latter is used to resolve # the import within the module, injecting it into the global namespace os.environ['PYTHONPATH'] = args.python_path - sys.path.append(args.python_path) + sys.path.insert(0, args.python_path) globals()['pandas'] = importlib.import_module('pandas') # Set the matplotlib backend to the non-interactive Agg backend for all From 3e3c9019a11fbdf9e61fc36ca216fd128c25ae04 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 2 Mar 2019 16:08:08 -0500 Subject: [PATCH 057/110] TST: xfail non-writeable pytables tests with numpy 1.16x (#25517) --- pandas/compat/numpy/__init__.py | 4 ++- pandas/tests/indexes/multi/test_analytics.py | 4 +-- pandas/tests/io/test_pytables.py | 31 ++++++++++++++++++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index bc9af01a97467..6e9f768d8bd68 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -13,6 +13,7 @@ _np_version_under1p14 = _nlv < LooseVersion('1.14') _np_version_under1p15 = _nlv < LooseVersion('1.15') _np_version_under1p16 = _nlv < LooseVersion('1.16') +_np_version_under1p17 = _nlv < LooseVersion('1.17') if _nlv < '1.12': @@ -66,5 +67,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs): '_np_version_under1p13', '_np_version_under1p14', '_np_version_under1p15', - '_np_version_under1p16' + 
'_np_version_under1p16', + '_np_version_under1p17' ] diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 27a5ba9e5434a..d5a6e9acaa5f3 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -4,7 +4,7 @@ import pytest from pandas.compat import PY2, lrange -from pandas.compat.numpy import _np_version_under1p16 +from pandas.compat.numpy import _np_version_under1p17 import pandas as pd from pandas import Index, MultiIndex, date_range, period_range @@ -287,7 +287,7 @@ def test_numpy_ufuncs(idx, func): # test ufuncs of numpy. see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html - if _np_version_under1p16: + if _np_version_under1p17: expected_exception = AttributeError msg = "'tuple' object has no attribute '{}'".format(func.__name__) else: diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index b464903d8b4e0..69ff32d1b728b 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -34,6 +34,15 @@ tables = pytest.importorskip('tables') +# TODO: +# remove when gh-24839 is fixed; this affects numpy 1.16 +# and pytables 3.4.4 +xfail_non_writeable = pytest.mark.xfail( + LooseVersion(np.__version__) >= LooseVersion('1.16'), + reason=('gh-25511, gh-24839. 
pytables needs a ' + 'release beyond 3.4.4 to support numpy 1.16x')) + + _default_compressor = ('blosc' if LooseVersion(tables.__version__) >= LooseVersion('2.2') else 'zlib') @@ -862,6 +871,7 @@ def test_put_integer(self): df = DataFrame(np.random.randn(50, 100)) self._check_roundtrip(df, tm.assert_frame_equal) + @xfail_non_writeable def test_put_mixed_type(self): df = tm.makeTimeDataFrame() df['obj1'] = 'foo' @@ -1438,7 +1448,10 @@ def test_to_hdf_with_min_itemsize(self): tm.assert_series_equal(pd.read_hdf(path, 'ss4'), pd.concat([df['B'], df2['B']])) - @pytest.mark.parametrize("format", ['fixed', 'table']) + @pytest.mark.parametrize( + "format", + [pytest.param('fixed', marks=xfail_non_writeable), + 'table']) def test_to_hdf_errors(self, format): data = ['\ud800foo'] @@ -1815,6 +1828,7 @@ def test_pass_spec_to_storer(self): pytest.raises(TypeError, store.select, 'df', where=[('columns=A')]) + @xfail_non_writeable def test_append_misc(self): with ensure_clean_store(self.path) as store: @@ -2006,6 +2020,7 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block......
pytest.raises(TypeError, store.append, 'df_unimplemented', df) + @xfail_non_writeable @pytest.mark.skipif( LooseVersion(np.__version__) == LooseVersion('1.15.0'), reason=("Skipping pytables test when numpy version is " @@ -2245,6 +2260,7 @@ def test_float_index(self): s = Series(np.random.randn(10), index=index) self._check_roundtrip(s, tm.assert_series_equal) + @xfail_non_writeable def test_tuple_index(self): # GH #492 @@ -2257,6 +2273,7 @@ def test_tuple_index(self): simplefilter("ignore", pd.errors.PerformanceWarning) self._check_roundtrip(DF, tm.assert_frame_equal) + @xfail_non_writeable @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") def test_index_types(self): @@ -2320,6 +2337,7 @@ def test_timeseries_preepoch(self): except OverflowError: pytest.skip('known failer on some windows platforms') + @xfail_non_writeable @pytest.mark.parametrize("compression", [ False, pytest.param(True, marks=td.skip_if_windows_python_3) ]) @@ -2350,6 +2368,7 @@ def test_frame(self, compression): # empty self._check_roundtrip(df[:0], tm.assert_frame_equal) + @xfail_non_writeable def test_empty_series_frame(self): s0 = Series() s1 = Series(name='myseries') @@ -2363,8 +2382,10 @@ def test_empty_series_frame(self): self._check_roundtrip(df1, tm.assert_frame_equal) self._check_roundtrip(df2, tm.assert_frame_equal) - def test_empty_series(self): - for dtype in [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]']: + @xfail_non_writeable + @pytest.mark.parametrize( + 'dtype', [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]']) + def test_empty_series(self, dtype): s = Series(dtype=dtype) self._check_roundtrip(s, tm.assert_series_equal) @@ -2445,6 +2466,7 @@ def test_store_series_name(self): recons = store['series'] tm.assert_series_equal(recons, series) + @xfail_non_writeable @pytest.mark.parametrize("compression", [ False, pytest.param(True, marks=td.skip_if_windows_python_3) ]) @@ -3954,6 +3976,7 @@ def test_pytables_native2_read(self, datapath): d1 = 
store['detector'] assert isinstance(d1, DataFrame) + @xfail_non_writeable def test_legacy_table_fixed_format_read_py2(self, datapath): # GH 24510 # legacy table with fixed format written in Python 2 @@ -4117,6 +4140,7 @@ def test_unicode_longer_encoded(self): result = store.get('df') tm.assert_frame_equal(result, df) + @xfail_non_writeable def test_store_datetime_mixed(self): df = DataFrame( @@ -4677,6 +4701,7 @@ def test_complex_table(self): reread = read_hdf(path, 'df') assert_frame_equal(df, reread) + @xfail_non_writeable def test_complex_mixed_fixed(self): complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) From d7084616472ee39faa2758117dc7d6707b3a8bb3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 3 Mar 2019 01:41:53 +0000 Subject: [PATCH 058/110] =?UTF-8?q?STY:=20use=20pytest.raises=20context=20?= =?UTF-8?q?manager=20(arithmetic,=20arrays,=20computati=E2=80=A6=20(#25504?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/tests/arithmetic/test_timedelta64.py | 66 ++++++++++---- .../arrays/categorical/test_analytics.py | 29 +++--- .../arrays/categorical/test_operators.py | 88 +++++++++++++------ pandas/tests/arrays/sparse/test_libsparse.py | 10 ++- pandas/tests/computation/test_eval.py | 74 ++++++++++------ pandas/tests/dtypes/test_common.py | 15 ++-- pandas/tests/dtypes/test_dtypes.py | 18 ++-- 7 files changed, 205 insertions(+), 95 deletions(-) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index c31d7acad3111..0faed74d4a021 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -205,10 +205,20 @@ def test_subtraction_ops(self): td = Timedelta('1 days') dt = Timestamp('20130101') - pytest.raises(TypeError, lambda: tdi - dt) - pytest.raises(TypeError, lambda: tdi - dti) - pytest.raises(TypeError, lambda: td - dt) - pytest.raises(TypeError, 
lambda: td - dti) + msg = "cannot subtract a datelike from a TimedeltaArray" + with pytest.raises(TypeError, match=msg): + tdi - dt + with pytest.raises(TypeError, match=msg): + tdi - dti + + msg = (r"descriptor '__sub__' requires a 'datetime\.datetime' object" + " but received a 'Timedelta'") + with pytest.raises(TypeError, match=msg): + td - dt + + msg = "bad operand type for unary -: 'DatetimeArray'" + with pytest.raises(TypeError, match=msg): + td - dti result = dt - dti expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') @@ -265,19 +275,38 @@ def _check(result, expected): _check(result, expected) # tz mismatches - pytest.raises(TypeError, lambda: dt_tz - ts) - pytest.raises(TypeError, lambda: dt_tz - dt) - pytest.raises(TypeError, lambda: dt_tz - ts_tz2) - pytest.raises(TypeError, lambda: dt - dt_tz) - pytest.raises(TypeError, lambda: ts - dt_tz) - pytest.raises(TypeError, lambda: ts_tz2 - ts) - pytest.raises(TypeError, lambda: ts_tz2 - dt) - pytest.raises(TypeError, lambda: ts_tz - ts_tz2) + msg = ("Timestamp subtraction must have the same timezones or no" + " timezones") + with pytest.raises(TypeError, match=msg): + dt_tz - ts + msg = "can't subtract offset-naive and offset-aware datetimes" + with pytest.raises(TypeError, match=msg): + dt_tz - dt + msg = ("Timestamp subtraction must have the same timezones or no" + " timezones") + with pytest.raises(TypeError, match=msg): + dt_tz - ts_tz2 + msg = "can't subtract offset-naive and offset-aware datetimes" + with pytest.raises(TypeError, match=msg): + dt - dt_tz + msg = ("Timestamp subtraction must have the same timezones or no" + " timezones") + with pytest.raises(TypeError, match=msg): + ts - dt_tz + with pytest.raises(TypeError, match=msg): + ts_tz2 - ts + with pytest.raises(TypeError, match=msg): + ts_tz2 - dt + with pytest.raises(TypeError, match=msg): + ts_tz - ts_tz2 # with dti - pytest.raises(TypeError, lambda: dti - ts_tz) - pytest.raises(TypeError, lambda: dti_tz - ts) - 
pytest.raises(TypeError, lambda: dti_tz - ts_tz2) + with pytest.raises(TypeError, match=msg): + dti - ts_tz + with pytest.raises(TypeError, match=msg): + dti_tz - ts + with pytest.raises(TypeError, match=msg): + dti_tz - ts_tz2 result = dti_tz - dt_tz expected = TimedeltaIndex(['0 days', '1 days', '2 days']) @@ -349,8 +378,11 @@ def test_addition_ops(self): tm.assert_index_equal(result, expected) # unequal length - pytest.raises(ValueError, lambda: tdi + dti[0:1]) - pytest.raises(ValueError, lambda: tdi[0:1] + dti) + msg = "cannot add indices of unequal length" + with pytest.raises(ValueError, match=msg): + tdi + dti[0:1] + with pytest.raises(ValueError, match=msg): + tdi[0:1] + dti # random indexes with pytest.raises(NullFrequencyError): diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 5efcd527de8d8..7ce82d5bcdded 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -18,8 +18,11 @@ def test_min_max(self): # unordered cats have no min/max cat = Categorical(["a", "b", "c", "d"], ordered=False) - pytest.raises(TypeError, lambda: cat.min()) - pytest.raises(TypeError, lambda: cat.max()) + msg = "Categorical is not ordered for operation {}" + with pytest.raises(TypeError, match=msg.format('min')): + cat.min() + with pytest.raises(TypeError, match=msg.format('max')): + cat.max() cat = Categorical(["a", "b", "c", "d"], ordered=True) _min = cat.min() @@ -108,18 +111,24 @@ def test_searchsorted(self): tm.assert_numpy_array_equal(res_ser, exp) # Searching for a single value that is not from the Categorical - pytest.raises(KeyError, lambda: c1.searchsorted('cucumber')) - pytest.raises(KeyError, lambda: s1.searchsorted('cucumber')) + msg = r"Value\(s\) to be inserted must be in categories" + with pytest.raises(KeyError, match=msg): + c1.searchsorted('cucumber') + with pytest.raises(KeyError, match=msg): + s1.searchsorted('cucumber') # 
Searching for multiple values one of each is not from the Categorical - pytest.raises(KeyError, - lambda: c1.searchsorted(['bread', 'cucumber'])) - pytest.raises(KeyError, - lambda: s1.searchsorted(['bread', 'cucumber'])) + with pytest.raises(KeyError, match=msg): + c1.searchsorted(['bread', 'cucumber']) + with pytest.raises(KeyError, match=msg): + s1.searchsorted(['bread', 'cucumber']) # searchsorted call for unordered Categorical - pytest.raises(ValueError, lambda: c2.searchsorted('apple')) - pytest.raises(ValueError, lambda: s2.searchsorted('apple')) + msg = "Categorical not ordered" + with pytest.raises(ValueError, match=msg): + c2.searchsorted('apple') + with pytest.raises(ValueError, match=msg): + s2.searchsorted('apple') def test_unique(self): # categories are reordered based on value when ordered=False diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index b2965bbcc456a..e1264722aedcd 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import PY2 + import pandas as pd from pandas import Categorical, DataFrame, Series, date_range from pandas.tests.arrays.categorical.common import TestCategorical @@ -17,6 +19,7 @@ def test_categories_none_comparisons(self): 'a', 'c', 'c', 'c'], ordered=True) tm.assert_categorical_equal(factor, self.factor) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_comparisons(self): result = self.factor[self.factor == 'a'] @@ -95,16 +98,24 @@ def test_comparisons(self): # comparison (in both directions) with Series will raise s = Series(["b", "b", "b"]) - pytest.raises(TypeError, lambda: cat > s) - pytest.raises(TypeError, lambda: cat_rev > s) - pytest.raises(TypeError, lambda: s < cat) - pytest.raises(TypeError, lambda: s < cat_rev) + msg = ("Cannot compare a Categorical for op __gt__ with type" + r" ") + 
with pytest.raises(TypeError, match=msg): + cat > s + with pytest.raises(TypeError, match=msg): + cat_rev > s + with pytest.raises(TypeError, match=msg): + s < cat + with pytest.raises(TypeError, match=msg): + s < cat_rev # comparison with numpy.array will raise in both direction, but only on # newer numpy versions a = np.array(["b", "b", "b"]) - pytest.raises(TypeError, lambda: cat > a) - pytest.raises(TypeError, lambda: cat_rev > a) + with pytest.raises(TypeError, match=msg): + cat > a + with pytest.raises(TypeError, match=msg): + cat_rev > a # Make sure that unequal comparison take the categories order in # account @@ -163,16 +174,23 @@ def test_comparison_with_unknown_scalars(self): # for unequal comps, but not for equal/not equal cat = Categorical([1, 2, 3], ordered=True) - pytest.raises(TypeError, lambda: cat < 4) - pytest.raises(TypeError, lambda: cat > 4) - pytest.raises(TypeError, lambda: 4 < cat) - pytest.raises(TypeError, lambda: 4 > cat) + msg = ("Cannot compare a Categorical for op __{}__ with a scalar," + " which is not a category") + with pytest.raises(TypeError, match=msg.format('lt')): + cat < 4 + with pytest.raises(TypeError, match=msg.format('gt')): + cat > 4 + with pytest.raises(TypeError, match=msg.format('gt')): + 4 < cat + with pytest.raises(TypeError, match=msg.format('lt')): + 4 > cat tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") @pytest.mark.parametrize('data,reverse,base', [ (list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])] @@ -219,16 +237,26 @@ def test_comparisons(self, data, reverse, base): # categorical cannot be compared to Series or numpy array, and also # not the other way around - pytest.raises(TypeError, lambda: cat > s) - pytest.raises(TypeError, lambda: cat_rev > s) - pytest.raises(TypeError, lambda: cat > a) - pytest.raises(TypeError, lambda: 
cat_rev > a) + msg = ("Cannot compare a Categorical for op __gt__ with type" + r" ") + with pytest.raises(TypeError, match=msg): + cat > s + with pytest.raises(TypeError, match=msg): + cat_rev > s + with pytest.raises(TypeError, match=msg): + cat > a + with pytest.raises(TypeError, match=msg): + cat_rev > a - pytest.raises(TypeError, lambda: s < cat) - pytest.raises(TypeError, lambda: s < cat_rev) + with pytest.raises(TypeError, match=msg): + s < cat + with pytest.raises(TypeError, match=msg): + s < cat_rev - pytest.raises(TypeError, lambda: a < cat) - pytest.raises(TypeError, lambda: a < cat_rev) + with pytest.raises(TypeError, match=msg): + a < cat + with pytest.raises(TypeError, match=msg): + a < cat_rev @pytest.mark.parametrize('ctor', [ lambda *args, **kwargs: Categorical(*args, **kwargs), @@ -287,16 +315,21 @@ def test_numeric_like_ops(self): right=False, labels=cat_labels) # numeric ops should not succeed - for op in ['__add__', '__sub__', '__mul__', '__truediv__']: - pytest.raises(TypeError, - lambda: getattr(df, op)(df)) + for op, str_rep in [('__add__', r'\+'), + ('__sub__', '-'), + ('__mul__', r'\*'), + ('__truediv__', '/')]: + msg = r"Series cannot perform the operation {}".format(str_rep) + with pytest.raises(TypeError, match=msg): + getattr(df, op)(df) # reduction ops should not succeed (unless specifically defined, e.g. 
# min/max) s = df['value_group'] for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']: - pytest.raises(TypeError, - lambda: getattr(s, op)(numeric_only=False)) + msg = "Categorical cannot perform the operation {}".format(op) + with pytest.raises(TypeError, match=msg): + getattr(s, op)(numeric_only=False) # mad technically works because it takes always the numeric data @@ -306,8 +339,13 @@ def test_numeric_like_ops(self): np.sum(s) # numeric ops on a Series - for op in ['__add__', '__sub__', '__mul__', '__truediv__']: - pytest.raises(TypeError, lambda: getattr(s, op)(2)) + for op, str_rep in [('__add__', r'\+'), + ('__sub__', '-'), + ('__mul__', r'\*'), + ('__truediv__', '/')]: + msg = r"Series cannot perform the operation {}".format(str_rep) + with pytest.raises(TypeError, match=msg): + getattr(s, op)(2) # invalid ufunc with pytest.raises(TypeError): diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index 6e9d790bf85f3..2cbe7d9ea084c 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -449,11 +449,13 @@ def test_check_integrity(self): # also OK even though empty index = BlockIndex(1, locs, lengths) # noqa - # block extend beyond end - pytest.raises(Exception, BlockIndex, 10, [5], [10]) + msg = "Block 0 extends beyond end" + with pytest.raises(ValueError, match=msg): + BlockIndex(10, [5], [10]) - # block overlap - pytest.raises(Exception, BlockIndex, 10, [2, 5], [5, 3]) + msg = "Block 0 overlaps" + with pytest.raises(ValueError, match=msg): + BlockIndex(10, [2, 5], [5, 3]) def test_to_int_index(self): locs = [0, 10] diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index c1ba15f428eb7..a14d8e4471c23 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -285,10 +285,14 @@ def check_operands(left, right, cmp_op): def check_simple_cmp_op(self, lhs, cmp1, rhs): 
ex = 'lhs {0} rhs'.format(cmp1) + msg = (r"only list-like( or dict-like)? objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')bool(\]|')|" + "argument of type 'bool' is not iterable") if cmp1 in ('in', 'not in') and not is_list_like(rhs): - pytest.raises(TypeError, pd.eval, ex, engine=self.engine, - parser=self.parser, local_dict={'lhs': lhs, - 'rhs': rhs}) + with pytest.raises(TypeError, match=msg): + pd.eval(ex, engine=self.engine, parser=self.parser, + local_dict={'lhs': lhs, 'rhs': rhs}) else: expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -341,9 +345,11 @@ def check_floor_division(self, lhs, arith1, rhs): expected = lhs // rhs self.check_equal(res, expected) else: - pytest.raises(TypeError, pd.eval, ex, - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) + msg = (r"unsupported operand type\(s\) for //: 'VariableNode' and" + " 'VariableNode'") + with pytest.raises(TypeError, match=msg): + pd.eval(ex, local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine, parser=self.parser) def get_expected_pow_result(self, lhs, rhs): try: @@ -396,10 +402,14 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) + msg = (r"only list-like( or dict-like)? 
objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')float(\]|')|" + "argument of type 'float' is not iterable") if is_scalar(rhs) and cmp1 in skip_these: - pytest.raises(TypeError, pd.eval, ex, engine=self.engine, - parser=self.parser, local_dict={'lhs': lhs, - 'rhs': rhs}) + with pytest.raises(TypeError, match=msg): + pd.eval(ex, engine=self.engine, parser=self.parser, + local_dict={'lhs': lhs, 'rhs': rhs}) else: # compound if is_scalar(lhs) and is_scalar(rhs): @@ -1101,8 +1111,9 @@ def test_simple_arith_ops(self): ex3 = '1 {0} (x + 1)'.format(op) if op in ('in', 'not in'): - pytest.raises(TypeError, pd.eval, ex, - engine=self.engine, parser=self.parser) + msg = "argument of type 'int' is not iterable" + with pytest.raises(TypeError, match=msg): + pd.eval(ex, engine=self.engine, parser=self.parser) else: expec = _eval_single_bin(1, op, 1, self.engine) x = self.eval(ex, engine=self.engine, parser=self.parser) @@ -1236,19 +1247,25 @@ def test_assignment_fails(self): df = DataFrame(np.random.randn(5, 3), columns=list('abc')) df2 = DataFrame(np.random.randn(5, 3)) expr1 = 'df = df2' - pytest.raises(ValueError, self.eval, expr1, - local_dict={'df': df, 'df2': df2}) + msg = "cannot assign without a target object" + with pytest.raises(ValueError, match=msg): + self.eval(expr1, local_dict={'df': df, 'df2': df2}) def test_assignment_column(self): df = DataFrame(np.random.randn(5, 2), columns=list('ab')) orig_df = df.copy() # multiple assignees - pytest.raises(SyntaxError, df.eval, 'd c = a + b') + with pytest.raises(SyntaxError, match="invalid syntax"): + df.eval('d c = a + b') # invalid assignees - pytest.raises(SyntaxError, df.eval, 'd,c = a + b') - pytest.raises(SyntaxError, df.eval, 'Timestamp("20131001") = a + b') + msg = "left hand side of an assignment must be a single name" + with pytest.raises(SyntaxError, match=msg): + df.eval('d,c = a + b') + msg = "can't assign to function call" + with pytest.raises(SyntaxError, 
match=msg): + df.eval('Timestamp("20131001") = a + b') # single assignment - existing variable expected = orig_df.copy() @@ -1291,7 +1308,9 @@ def f(): # multiple assignment df = orig_df.copy() df.eval('c = a + b', inplace=True) - pytest.raises(SyntaxError, df.eval, 'c = a = b') + msg = "can only assign a single expression" + with pytest.raises(SyntaxError, match=msg): + df.eval('c = a = b') # explicit targets df = orig_df.copy() @@ -1545,21 +1564,24 @@ def test_check_many_exprs(self): def test_fails_and(self): df = DataFrame(np.random.randn(5, 3)) - pytest.raises(NotImplementedError, pd.eval, 'df > 2 and df > 3', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval('df > 2 and df > 3', local_dict={'df': df}, + parser=self.parser, engine=self.engine) def test_fails_or(self): df = DataFrame(np.random.randn(5, 3)) - pytest.raises(NotImplementedError, pd.eval, 'df > 2 or df > 3', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval('df > 2 or df > 3', local_dict={'df': df}, + parser=self.parser, engine=self.engine) def test_fails_not(self): df = DataFrame(np.random.randn(5, 3)) - pytest.raises(NotImplementedError, pd.eval, 'not df > 2', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) + msg = "'Not' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval('not df > 2', local_dict={'df': df}, parser=self.parser, + engine=self.engine) def test_fails_ampersand(self): df = DataFrame(np.random.randn(5, 3)) # noqa diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 62e96fd39a759..5c1f6ff405b3b 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -607,13 +607,16 @@ def test__get_dtype(input_param, result): 
assert com._get_dtype(input_param) == result -@pytest.mark.parametrize('input_param', [None, - 1, 1.2, - 'random string', - pd.DataFrame([1, 2])]) -def test__get_dtype_fails(input_param): +@pytest.mark.parametrize('input_param,expected_error_message', [ + (None, "Cannot deduce dtype from null object"), + (1, "data type not understood"), + (1.2, "data type not understood"), + ('random string', "data type 'random string' not understood"), + (pd.DataFrame([1, 2]), "data type not understood")]) +def test__get_dtype_fails(input_param, expected_error_message): # python objects - pytest.raises(TypeError, com._get_dtype, input_param) + with pytest.raises(TypeError, match=expected_error_message): + com._get_dtype(input_param) @pytest.mark.parametrize('input_param,result', [ diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 1c1442d6f2f23..4366f610871ff 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -38,7 +38,8 @@ def test_equality_invalid(self): assert not is_dtype_equal(self.dtype, np.int64) def test_numpy_informed(self): - pytest.raises(TypeError, np.dtype, self.dtype) + with pytest.raises(TypeError, match="data type not understood"): + np.dtype(self.dtype) assert not self.dtype == np.str_ assert not np.str_ == self.dtype @@ -87,8 +88,9 @@ def test_equality(self): def test_construction_from_string(self): result = CategoricalDtype.construct_from_string('category') assert is_dtype_equal(self.dtype, result) - pytest.raises( - TypeError, lambda: CategoricalDtype.construct_from_string('foo')) + msg = "cannot construct a CategoricalDtype" + with pytest.raises(TypeError, match=msg): + CategoricalDtype.construct_from_string('foo') def test_constructor_invalid(self): msg = "Parameter 'categories' must be list-like" @@ -202,8 +204,9 @@ def test_hash_vs_equality(self): assert hash(dtype2) != hash(dtype4) def test_construction(self): - pytest.raises(ValueError, - lambda: DatetimeTZDtype('ms', 
'US/Eastern')) + msg = "DatetimeTZDtype only supports ns units" + with pytest.raises(ValueError, match=msg): + DatetimeTZDtype('ms', 'US/Eastern') def test_subclass(self): a = DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]') @@ -226,8 +229,9 @@ def test_construction_from_string(self): result = DatetimeTZDtype.construct_from_string( 'datetime64[ns, US/Eastern]') assert is_dtype_equal(self.dtype, result) - pytest.raises(TypeError, - lambda: DatetimeTZDtype.construct_from_string('foo')) + msg = "Could not construct DatetimeTZDtype from 'foo'" + with pytest.raises(TypeError, match=msg): + DatetimeTZDtype.construct_from_string('foo') def test_construct_from_string_raises(self): with pytest.raises(TypeError, match="notatz"): From cc5b73e9807e5d4527fdd36187ebd11d744217de Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sat, 2 Mar 2019 18:44:22 -0700 Subject: [PATCH 059/110] BUG: Fix RecursionError during IntervalTree construction (#25498) --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/_libs/intervaltree.pxi.in | 2 +- pandas/tests/indexes/interval/test_interval_tree.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 4fcde7769b362..926239e7e5dc5 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -98,6 +98,7 @@ Bug Fixes - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) - Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`) +- Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`) - .. 
_whatsnew_0.242.contributors: diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index fb6f30c030f11..196841f35ed8d 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -284,7 +284,7 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: else: # calculate a pivot so we can create child nodes self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 + self.pivot = np.median(left / 2 + right / 2) left_set, right_set, center_set = self.classify_intervals( left, right) diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 90722e66d8d8c..46b2d12015a22 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -171,3 +171,13 @@ def test_is_overlapping_trivial(self, closed, left, right): # GH 23309 tree = IntervalTree(left, right, closed=closed) assert tree.is_overlapping is False + + def test_construction_overflow(self): + # GH 25485 + left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 + tree = IntervalTree(left, right) + + # pivot should be average of left/right medians + result = tree.root.pivot + expected = (50 + np.iinfo(np.int64).max) / 2 + assert result == expected From c66028cab32b32f3e0fbb592eba5d05cce5eb443 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 3 Mar 2019 01:46:07 +0000 Subject: [PATCH 060/110] STY: use pytest.raises context manager (plotting, reductions, scalar...) (#25483) * STY: use pytest.raises context manager (plotting, reductions, scalar...) 
* revert removed testing in test_timedelta.py * remove TODO from test_frame.py * skip py2 ci failure --- pandas/tests/plotting/test_boxplot_method.py | 21 ++++++--- pandas/tests/plotting/test_datetimelike.py | 13 +++-- pandas/tests/plotting/test_hist_method.py | 17 ++++--- pandas/tests/plotting/test_misc.py | 14 ++++-- pandas/tests/reductions/test_reductions.py | 4 +- pandas/tests/scalar/period/test_period.py | 21 +++++---- .../tests/scalar/timedelta/test_timedelta.py | 45 +++++++++++++----- .../tests/scalar/timestamp/test_timestamp.py | 8 +++- pandas/tests/sparse/frame/test_frame.py | 47 +++++++++++++------ pandas/tests/sparse/series/test_series.py | 34 +++++++++----- pandas/tests/tseries/offsets/test_offsets.py | 17 +++++-- .../tests/tseries/offsets/test_yqm_offsets.py | 20 +++++--- 12 files changed, 180 insertions(+), 81 deletions(-) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 7d721c7de3398..e6b9795aebe7c 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -267,13 +267,20 @@ def test_grouped_box_return_type(self): def test_grouped_box_layout(self): df = self.hist_df - pytest.raises(ValueError, df.boxplot, column=['weight', 'height'], - by=df.gender, layout=(1, 1)) - pytest.raises(ValueError, df.boxplot, - column=['height', 'weight', 'category'], - layout=(2, 1), return_type='dict') - pytest.raises(ValueError, df.boxplot, column=['weight', 'height'], - by=df.gender, layout=(-1, -1)) + msg = "Layout of 1x1 must be larger than required size 2" + with pytest.raises(ValueError, match=msg): + df.boxplot(column=['weight', 'height'], by=df.gender, + layout=(1, 1)) + + msg = "The 'layout' keyword is not supported when 'by' is None" + with pytest.raises(ValueError, match=msg): + df.boxplot(column=['height', 'weight', 'category'], + layout=(2, 1), return_type='dict') + + msg = "At least one dimension of layout must be positive" + with 
pytest.raises(ValueError, match=msg): + df.boxplot(column=['weight', 'height'], by=df.gender, + layout=(-1, -1)) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index ad79cc97f8b77..6702ad6cfb761 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -97,7 +97,9 @@ def test_nonnumeric_exclude(self): assert len(ax.get_lines()) == 1 # B was plotted self.plt.close(fig) - pytest.raises(TypeError, df['A'].plot) + msg = "Empty 'DataFrame': no numeric data to plot" + with pytest.raises(TypeError, match=msg): + df['A'].plot() def test_tsplot_deprecated(self): from pandas.tseries.plotting import tsplot @@ -140,10 +142,15 @@ def f(*args, **kwds): def test_both_style_and_color(self): ts = tm.makeTimeSeries() - pytest.raises(ValueError, ts.plot, style='b-', color='#000099') + msg = ("Cannot pass 'style' string with a color symbol and 'color' " + "keyword argument. 
Please use one or the other or pass 'style'" + " without a color symbol") + with pytest.raises(ValueError, match=msg): + ts.plot(style='b-', color='#000099') s = ts.reset_index(drop=True) - pytest.raises(ValueError, s.plot, style='b-', color='#000099') + with pytest.raises(ValueError, match=msg): + s.plot(style='b-', color='#000099') @pytest.mark.slow def test_high_freq(self): diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 7bdbdac54f7a6..4f0bef52b5e15 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -332,12 +332,17 @@ def test_grouped_hist_legacy2(self): @pytest.mark.slow def test_grouped_hist_layout(self): df = self.hist_df - pytest.raises(ValueError, df.hist, column='weight', by=df.gender, - layout=(1, 1)) - pytest.raises(ValueError, df.hist, column='height', by=df.category, - layout=(1, 3)) - pytest.raises(ValueError, df.hist, column='height', by=df.category, - layout=(-1, -1)) + msg = "Layout of 1x1 must be larger than required size 2" + with pytest.raises(ValueError, match=msg): + df.hist(column='weight', by=df.gender, layout=(1, 1)) + + msg = "Layout of 1x3 must be larger than required size 4" + with pytest.raises(ValueError, match=msg): + df.hist(column='height', by=df.category, layout=(1, 3)) + + msg = "At least one dimension of layout must be positive" + with pytest.raises(ValueError, match=msg): + df.hist(column='height', by=df.category, layout=(-1, -1)) with tm.assert_produces_warning(UserWarning): axes = _check_plot_works(df.hist, column='height', by=df.gender, diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 44b95f7d1b00b..98248586f3d27 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -278,14 +278,20 @@ def test_subplot_titles(self, iris): assert [p.get_title() for p in plot] == title # Case len(title) > len(df) - pytest.raises(ValueError, df.plot, 
subplots=True, - title=title + ["kittens > puppies"]) + msg = ("The length of `title` must equal the number of columns if" + " using `title` of type `list` and `subplots=True`") + with pytest.raises(ValueError, match=msg): + df.plot(subplots=True, title=title + ["kittens > puppies"]) # Case len(title) < len(df) - pytest.raises(ValueError, df.plot, subplots=True, title=title[:2]) + with pytest.raises(ValueError, match=msg): + df.plot(subplots=True, title=title[:2]) # Case subplots=False and title is of type list - pytest.raises(ValueError, df.plot, subplots=False, title=title) + msg = ("Using `title` of type `list` is not supported unless" + " `subplots=True` is passed") + with pytest.raises(ValueError, match=msg): + df.plot(subplots=False, title=title) # Case df with 3 numeric columns but layout of (2,2) plot = df.drop('SepalWidth', axis=1).plot(subplots=True, layout=(2, 2), diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8520855d14918..fbf7f610688ba 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -276,7 +276,9 @@ def test_timedelta_ops(self): # invalid ops for op in ['skew', 'kurt', 'sem', 'prod']: - pytest.raises(TypeError, getattr(td, op)) + msg = "reduction operation '{}' not allowed for this dtype" + with pytest.raises(TypeError, match=msg.format(op)): + getattr(td, op)() # GH#10040 # make sure NaT is properly handled by median() diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index d0f87618ad3af..8ca19745055a3 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -8,6 +8,7 @@ from pandas._libs.tslibs.ccalendar import DAYS, MONTHS from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas._libs.tslibs.parsing import DateParseError +from pandas._libs.tslibs.period import IncompatibleFrequency from 
pandas._libs.tslibs.timezones import dateutil_gettz, maybe_get_tz from pandas.compat import iteritems, text_type from pandas.compat.numpy import np_datetime64_compat @@ -35,7 +36,9 @@ def test_construction(self): i4 = Period('2005', freq='M') i5 = Period('2005', freq='m') - pytest.raises(ValueError, i1.__ne__, i4) + msg = r"Input has different freq=M from Period\(freq=A-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + i1 != i4 assert i4 == i5 i1 = Period.now('Q') @@ -74,9 +77,12 @@ def test_construction(self): freq='U') assert i1 == expected - pytest.raises(ValueError, Period, ordinal=200701) + msg = "Must supply freq for ordinal value" + with pytest.raises(ValueError, match=msg): + Period(ordinal=200701) - pytest.raises(ValueError, Period, '2007-1-1', freq='X') + with pytest.raises(ValueError, match="Invalid frequency: X"): + Period('2007-1-1', freq='X') def test_construction_bday(self): @@ -233,10 +239,6 @@ def test_period_constructor_offsets(self): freq='U') assert i1 == expected - pytest.raises(ValueError, Period, ordinal=200701) - - pytest.raises(ValueError, Period, '2007-1-1', freq='X') - def test_invalid_arguments(self): with pytest.raises(ValueError): Period(datetime.now()) @@ -925,8 +927,9 @@ def test_properties_secondly(self): class TestPeriodField(object): def test_get_period_field_array_raises_on_out_of_range(self): - pytest.raises(ValueError, libperiod.get_period_field_arr, -1, - np.empty(1), 0) + msg = "Buffer dtype mismatch, expected 'int64_t' but got 'double'" + with pytest.raises(ValueError, match=msg): + libperiod.get_period_field_arr(-1, np.empty(1), 0) class TestComparisons(object): diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index bf71c37aa9c3d..ee2c2e9e1959c 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -250,9 +250,13 @@ def check(value): assert rng.microseconds == 0 assert rng.nanoseconds 
== 0 - pytest.raises(AttributeError, lambda: rng.hours) - pytest.raises(AttributeError, lambda: rng.minutes) - pytest.raises(AttributeError, lambda: rng.milliseconds) + msg = "'Timedelta' object has no attribute '{}'" + with pytest.raises(AttributeError, match=msg.format('hours')): + rng.hours + with pytest.raises(AttributeError, match=msg.format('minutes')): + rng.minutes + with pytest.raises(AttributeError, match=msg.format('milliseconds')): + rng.milliseconds # GH 10050 check(rng.days) @@ -272,9 +276,13 @@ def check(value): assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 100 * 1000 + 123 assert rng.nanoseconds == 456 - pytest.raises(AttributeError, lambda: rng.hours) - pytest.raises(AttributeError, lambda: rng.minutes) - pytest.raises(AttributeError, lambda: rng.milliseconds) + msg = "'Timedelta' object has no attribute '{}'" + with pytest.raises(AttributeError, match=msg.format('hours')): + rng.hours + with pytest.raises(AttributeError, match=msg.format('minutes')): + rng.minutes + with pytest.raises(AttributeError, match=msg.format('milliseconds')): + rng.milliseconds # components tup = pd.to_timedelta(-1, 'us').components @@ -449,8 +457,12 @@ def test_round(self): assert r2 == s2 # invalid - for freq in ['Y', 'M', 'foobar']: - pytest.raises(ValueError, lambda: t1.round(freq)) + for freq, msg in [ + ('Y', ' is a non-fixed frequency'), + ('M', ' is a non-fixed frequency'), + ('foobar', 'Invalid frequency: foobar')]: + with pytest.raises(ValueError, match=msg): + t1.round(freq) t1 = timedelta_range('1 days', periods=3, freq='1 min 2 s 3 us') t2 = -1 * t1 @@ -495,11 +507,15 @@ def test_round(self): r1 = t1.round(freq) tm.assert_index_equal(r1, s1) r2 = t2.round(freq) - tm.assert_index_equal(r2, s2) + tm.assert_index_equal(r2, s2) # invalid - for freq in ['Y', 'M', 'foobar']: - pytest.raises(ValueError, lambda: t1.round(freq)) + for freq, msg in [ + ('Y', ' is a non-fixed frequency'), + ('M', ' is a non-fixed frequency'), + ('foobar', 
'Invalid frequency: foobar')]: + with pytest.raises(ValueError, match=msg): + t1.round(freq) def test_contains(self): # Checking for any NaT-like objects @@ -609,9 +625,12 @@ def test_overflow(self): assert np.allclose(result.value / 1000, expected.value / 1000) # sum - pytest.raises(ValueError, lambda: (s - s.min()).sum()) + msg = "overflow in timedelta operation" + with pytest.raises(ValueError, match=msg): + (s - s.min()).sum() s1 = s[0:10000] - pytest.raises(ValueError, lambda: (s1 - s1.min()).sum()) + with pytest.raises(ValueError, match=msg): + (s1 - s1.min()).sum() s2 = s[0:1000] result = (s2 - s2.min()).sum() diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 7d81d905eac4f..b55d00b44fd67 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -60,7 +60,9 @@ def check(value, equal): check(ts.hour, 9) check(ts.minute, 6) check(ts.second, 3) - pytest.raises(AttributeError, lambda: ts.millisecond) + msg = "'Timestamp' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + ts.millisecond check(ts.microsecond, 100) check(ts.nanosecond, 1) check(ts.dayofweek, 6) @@ -78,7 +80,9 @@ def check(value, equal): check(ts.hour, 23) check(ts.minute, 59) check(ts.second, 0) - pytest.raises(AttributeError, lambda: ts.millisecond) + msg = "'Timestamp' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + ts.millisecond check(ts.microsecond, 0) check(ts.nanosecond, 0) check(ts.dayofweek, 2) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index bfb5103c97adc..b31738794c854 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -7,7 +7,7 @@ import pytest from pandas._libs.sparse import BlockIndex, IntIndex -from pandas.compat import lrange +from pandas.compat import PY2, lrange from 
pandas.errors import PerformanceWarning import pandas as pd @@ -145,8 +145,9 @@ def test_constructor_ndarray(self, float_frame): tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=['A'])) # raise on level argument - pytest.raises(TypeError, float_frame.reindex, columns=['A'], - level=1) + msg = "Reindex by level not supported for sparse" + with pytest.raises(TypeError, match=msg): + float_frame.reindex(columns=['A'], level=1) # wrong length index / columns with pytest.raises(ValueError, match="^Index length"): @@ -433,7 +434,8 @@ def test_getitem(self): exp = sdf.reindex(columns=['a', 'b']) tm.assert_sp_frame_equal(result, exp) - pytest.raises(Exception, sdf.__getitem__, ['a', 'd']) + with pytest.raises(KeyError, match=r"\['d'\] not in index"): + sdf[['a', 'd']] def test_iloc(self, float_frame): @@ -504,7 +506,9 @@ def test_getitem_overload(self, float_frame): subframe = float_frame[indexer] tm.assert_index_equal(subindex, subframe.index) - pytest.raises(Exception, float_frame.__getitem__, indexer[:-1]) + msg = "Item wrong length 9 instead of 10" + with pytest.raises(ValueError, match=msg): + float_frame[indexer[:-1]] def test_setitem(self, float_frame, float_frame_int_kind, float_frame_dense, @@ -551,8 +555,9 @@ def _check_frame(frame, orig): assert len(frame['I'].sp_values) == N // 2 # insert ndarray wrong size - pytest.raises(Exception, frame.__setitem__, 'foo', - np.random.randn(N - 1)) + msg = "Length of values does not match length of index" + with pytest.raises(AssertionError, match=msg): + frame['foo'] = np.random.randn(N - 1) # scalar value frame['J'] = 5 @@ -625,17 +630,22 @@ def test_delitem(self, float_frame): def test_set_columns(self, float_frame): float_frame.columns = float_frame.columns - pytest.raises(Exception, setattr, float_frame, 'columns', - float_frame.columns[:-1]) + msg = ("Length mismatch: Expected axis has 4 elements, new values have" + " 3 elements") + with pytest.raises(ValueError, match=msg): + float_frame.columns = 
float_frame.columns[:-1] def test_set_index(self, float_frame): float_frame.index = float_frame.index - pytest.raises(Exception, setattr, float_frame, 'index', - float_frame.index[:-1]) + msg = ("Length mismatch: Expected axis has 10 elements, new values" + " have 9 elements") + with pytest.raises(ValueError, match=msg): + float_frame.index = float_frame.index[:-1] def test_ctor_reindex(self): idx = pd.Index([0, 1, 2, 3]) - with pytest.raises(ValueError, match=''): + msg = "Length of passed values is 2, index implies 4" + with pytest.raises(ValueError, match=msg): pd.SparseDataFrame({"A": [1, 2]}, index=idx) def test_append(self, float_frame): @@ -858,6 +868,7 @@ def test_describe(self, float_frame): str(float_frame) desc = float_frame.describe() # noqa + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_join(self, float_frame): left = float_frame.loc[:, ['A', 'B']] right = float_frame.loc[:, ['C', 'D']] @@ -865,7 +876,10 @@ def test_join(self, float_frame): tm.assert_sp_frame_equal(joined, float_frame, exact_indices=False) right = float_frame.loc[:, ['B', 'D']] - pytest.raises(Exception, left.join, right) + msg = (r"columns overlap but no suffix specified: Index\(\['B'\]," + r" dtype='object'\)") + with pytest.raises(ValueError, match=msg): + left.join(right) with pytest.raises(ValueError, match='Other Series must have a name'): float_frame.join(Series( @@ -1046,8 +1060,11 @@ def _check(frame): _check(float_frame_int_kind) # for now - pytest.raises(Exception, _check, float_frame_fill0) - pytest.raises(Exception, _check, float_frame_fill2) + msg = "This routine assumes NaN fill value" + with pytest.raises(TypeError, match=msg): + _check(float_frame_fill0) + with pytest.raises(TypeError, match=msg): + _check(float_frame_fill2) def test_transpose(self, float_frame, float_frame_int_kind, float_frame_dense, diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 7eed47d0de888..93cf629f20957 
100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -452,12 +452,13 @@ def _check_getitem(sp, dense): _check_getitem(self.ziseries, self.ziseries.to_dense()) # exception handling - pytest.raises(Exception, self.bseries.__getitem__, - len(self.bseries) + 1) + with pytest.raises(IndexError, match="Out of bounds access"): + self.bseries[len(self.bseries) + 1] # index not contained - pytest.raises(Exception, self.btseries.__getitem__, - self.btseries.index[-1] + BDay()) + msg = r"Timestamp\('2011-01-31 00:00:00', freq='B'\)" + with pytest.raises(KeyError, match=msg): + self.btseries[self.btseries.index[-1] + BDay()] def test_get_get_value(self): tm.assert_almost_equal(self.bseries.get(10), self.bseries[10]) @@ -523,8 +524,9 @@ def _compare(idx): self._check_all(_compare_with_dense) - pytest.raises(Exception, self.bseries.take, - [0, len(self.bseries) + 1]) + msg = "index 21 is out of bounds for size 20" + with pytest.raises(IndexError, match=msg): + self.bseries.take([0, len(self.bseries) + 1]) # Corner case # XXX: changed test. Why wsa this considered a corner case? 
@@ -1138,25 +1140,35 @@ def test_to_coo_text_names_text_row_levels_nosort(self): def test_to_coo_bad_partition_nonnull_intersection(self): ss = self.sparse_series[0] - pytest.raises(ValueError, ss.to_coo, ['A', 'B', 'C'], ['C', 'D']) + msg = "Is not a partition because intersection is not null" + with pytest.raises(ValueError, match=msg): + ss.to_coo(['A', 'B', 'C'], ['C', 'D']) def test_to_coo_bad_partition_small_union(self): ss = self.sparse_series[0] - pytest.raises(ValueError, ss.to_coo, ['A'], ['C', 'D']) + msg = "Is not a partition because union is not the whole" + with pytest.raises(ValueError, match=msg): + ss.to_coo(['A'], ['C', 'D']) def test_to_coo_nlevels_less_than_two(self): ss = self.sparse_series[0] ss.index = np.arange(len(ss.index)) - pytest.raises(ValueError, ss.to_coo) + msg = "to_coo requires MultiIndex with nlevels > 2" + with pytest.raises(ValueError, match=msg): + ss.to_coo() def test_to_coo_bad_ilevel(self): ss = self.sparse_series[0] - pytest.raises(KeyError, ss.to_coo, ['A', 'B'], ['C', 'D', 'E']) + with pytest.raises(KeyError, match="Level E not found"): + ss.to_coo(['A', 'B'], ['C', 'D', 'E']) def test_to_coo_duplicate_index_entries(self): ss = pd.concat([self.sparse_series[0], self.sparse_series[0]]).to_sparse() - pytest.raises(ValueError, ss.to_coo, ['A', 'B'], ['C', 'D']) + msg = ("Duplicate index entries are not allowed in to_coo" + " transformation") + with pytest.raises(ValueError, match=msg): + ss.to_coo(['A', 'B'], ['C', 'D']) def test_from_coo_dense_index(self): ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=True) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 621572da57541..e6f21a7b47c3b 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -9,6 +9,7 @@ from pandas._libs.tslibs.frequencies import ( INVALID_FREQ_ERR_MSG, get_freq_code, get_freq_str) import pandas._libs.tslibs.offsets as 
liboffsets +from pandas._libs.tslibs.offsets import ApplyTypeError import pandas.compat as compat from pandas.compat import range from pandas.compat.numpy import np_datetime64_compat @@ -150,7 +151,8 @@ def test_sub(self): # offset2 attr return off = self.offset2 - with pytest.raises(Exception): + msg = "Cannot subtract datetime from offset" + with pytest.raises(TypeError, match=msg): off - self.d assert 2 * off - off == off @@ -736,7 +738,10 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - pytest.raises(TypeError, BDay().apply, BMonthEnd()) + msg = ("Only know how to combine business day with datetime or" + " timedelta") + with pytest.raises(ApplyTypeError, match=msg): + BDay().apply(BMonthEnd()) class TestBusinessHour(Base): @@ -812,7 +817,8 @@ def test_sub(self): # we have to override test_sub here becasue self.offset2 is not # defined as self._offset(2) off = self.offset2 - with pytest.raises(Exception): + msg = "Cannot subtract datetime from offset" + with pytest.raises(TypeError, match=msg): off - self.d assert 2 * off - off == off @@ -1796,7 +1802,10 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - pytest.raises(Exception, CDay().apply, BMonthEnd()) + msg = ("Only know how to combine trading day with datetime, datetime64" + " or timedelta") + with pytest.raises(ApplyTypeError, match=msg): + CDay().apply(BMonthEnd()) def test_holidays(self): # Define a TradingDay offset diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index 8023ee3139dd5..9ee03d2e886f3 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -713,7 +713,8 @@ class TestYearBegin(Base): _offset = YearBegin def test_misspecified(self): - pytest.raises(ValueError, YearBegin, month=13) + with pytest.raises(ValueError, match="Month must go from 1 to 12"): + YearBegin(month=13) offset_cases = [] 
offset_cases.append((YearBegin(), { @@ -804,7 +805,8 @@ class TestYearEnd(Base): _offset = YearEnd def test_misspecified(self): - pytest.raises(ValueError, YearEnd, month=13) + with pytest.raises(ValueError, match="Month must go from 1 to 12"): + YearEnd(month=13) offset_cases = [] offset_cases.append((YearEnd(), { @@ -900,8 +902,11 @@ class TestBYearBegin(Base): _offset = BYearBegin def test_misspecified(self): - pytest.raises(ValueError, BYearBegin, month=13) - pytest.raises(ValueError, BYearEnd, month=13) + msg = "Month must go from 1 to 12" + with pytest.raises(ValueError, match=msg): + BYearBegin(month=13) + with pytest.raises(ValueError, match=msg): + BYearEnd(month=13) offset_cases = [] offset_cases.append((BYearBegin(), { @@ -993,8 +998,11 @@ class TestBYearEndLagged(Base): _offset = BYearEnd def test_bad_month_fail(self): - pytest.raises(Exception, BYearEnd, month=13) - pytest.raises(Exception, BYearEnd, month=0) + msg = "Month must go from 1 to 12" + with pytest.raises(ValueError, match=msg): + BYearEnd(month=13) + with pytest.raises(ValueError, match=msg): + BYearEnd(month=0) offset_cases = [] offset_cases.append((BYearEnd(month=6), { From ce4720575f65df38e59472d7f4d8543ffe7a0c13 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Sun, 3 Mar 2019 02:36:36 +0000 Subject: [PATCH 061/110] BUG: Fix potential segfault after pd.Categorical(pd.Series(...), categories=...) 
(#25368) --- doc/source/whatsnew/v0.24.2.rst | 2 ++ pandas/core/arrays/categorical.py | 13 ++++--------- .../tests/arrays/categorical/test_constructors.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 926239e7e5dc5..e80b1060e867d 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -31,6 +31,8 @@ Fixed Regressions - Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) +- Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) + .. 
_whatsnew_0242.enhancements: Enhancements diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 37a24a54be8b1..7f77a5dcce613 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -323,14 +323,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # we may have dtype.categories be None, and we need to # infer categories in a factorization step futher below - if is_categorical(values): - # GH23814, for perf, if values._values already an instance of - # Categorical, set values to codes, and run fastpath - if (isinstance(values, (ABCSeries, ABCIndexClass)) and - isinstance(values._values, type(self))): - values = values._values.codes.copy() - fastpath = True - if fastpath: self._codes = coerce_indexer_dtype(values, dtype.categories) self._dtype = self._dtype.update_dtype(dtype) @@ -382,7 +374,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values): - old_codes = (values.cat.codes if isinstance(values, ABCSeries) + old_codes = (values._values.codes if isinstance(values, ABCSeries) else values.codes) codes = _recode_for_categories(old_codes, values.dtype.categories, dtype.categories) @@ -2625,6 +2617,9 @@ def _recode_for_categories(codes, old_categories, new_categories): if len(old_categories) == 0: # All null anyway, so just retain the nulls return codes.copy() + elif new_categories.equals(old_categories): + # Same categories, so no need to actually recode + return codes.copy() indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), new_categories) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 25c299692ceca..f07e3aba53cd4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ 
b/pandas/tests/arrays/categorical/test_constructors.py @@ -212,6 +212,18 @@ def test_constructor(self): c = Categorical(np.array([], dtype='int64'), # noqa categories=[3, 2, 1], ordered=True) + def test_constructor_with_existing_categories(self): + # GH25318: constructing with pd.Series used to bogusly skip recoding + # categories + c0 = Categorical(["a", "b", "c", "a"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) + + c2 = Categorical(c0, categories=c1.categories) + tm.assert_categorical_equal(c1, c2) + + c3 = Categorical(Series(c0), categories=c1.categories) + tm.assert_categorical_equal(c1, c3) + def test_constructor_not_sequence(self): # https://github.com/pandas-dev/pandas/issues/16022 msg = r"^Parameter 'categories' must be list-like, was" From 0c193c654bba185e39057bc99245093dc877b7da Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 3 Mar 2019 02:59:51 +0000 Subject: [PATCH 062/110] Make DataFrame.to_html output full content (#24841) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/formats/html.py | 13 +++++++++++-- pandas/tests/io/formats/test_to_html.py | 23 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d1f1ea862110e..eb6e172648ef7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -209,6 +209,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not 
applicable because index dtype is already defined in the JSON schema (:issue:`25433`) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 456583509565e..66d13bf2668f9 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -12,7 +12,7 @@ from pandas.core.dtypes.generic import ABCMultiIndex -from pandas import compat +from pandas import compat, option_context from pandas.core.config import get_option from pandas.io.common import _is_url @@ -320,9 +320,15 @@ def _write_header(self, indent): self.write('', indent) + def _get_formatted_values(self): + with option_context('display.max_colwidth', 999999): + fmt_values = {i: self.fmt._format_col(i) + for i in range(self.ncols)} + return fmt_values + def _write_body(self, indent): self.write('', indent) - fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} + fmt_values = self._get_formatted_values() # write values if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): @@ -486,6 +492,9 @@ class NotebookFormatter(HTMLFormatter): DataFrame._repr_html_() and DataFrame.to_html(notebook=True) """ + def _get_formatted_values(self): + return {i: self.fmt._format_col(i) for i in range(self.ncols)} + def write_style(self): # We use the "scoped" attribute here so that the desired # style properties for the data frame are not then applied diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 554cfd306e2a7..428f1411a10a6 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -15,6 +15,15 @@ import pandas.io.formats.format as fmt +lorem_ipsum = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" + " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" + " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex" + " ea commodo consequat. 
Duis aute irure dolor in reprehenderit in" + " voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur" + " sint occaecat cupidatat non proident, sunt in culpa qui officia" + " deserunt mollit anim id est laborum.") + def expected_html(datapath, name): """ @@ -600,3 +609,17 @@ def test_to_html_render_links(render_links, expected, datapath): result = df.to_html(render_links=render_links) expected = expected_html(datapath, expected) assert result == expected + + +@pytest.mark.parametrize('method,expected', [ + ('to_html', lambda x:lorem_ipsum), + ('_repr_html_', lambda x:lorem_ipsum[:x - 4] + '...') # regression case +]) +@pytest.mark.parametrize('max_colwidth', [10, 20, 50, 100]) +def test_ignore_display_max_colwidth(method, expected, max_colwidth): + # see gh-17004 + df = DataFrame([lorem_ipsum]) + with pd.option_context('display.max_colwidth', max_colwidth): + result = getattr(df, method)() + expected = expected(max_colwidth) + assert expected in result From 42b4c975e55c59da7868f99f9949841aca12a08d Mon Sep 17 00:00:00 2001 From: Justin Zheng Date: Sat, 2 Mar 2019 20:46:44 -0800 Subject: [PATCH 063/110] BUG-16807-1 SparseFrame fills with default_fill_value if data is None (#24842) Closes gh-16807. 
--- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/sparse/frame.py | 4 ++-- pandas/tests/sparse/frame/test_frame.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index eb6e172648ef7..124ec8f4ab92c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -247,7 +247,7 @@ Sparse ^^^^^^ - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) -- +- Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`) - diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index e0af11d13774c..2d54b82a3c844 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -124,8 +124,8 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, columns = Index([]) else: for c in columns: - data[c] = SparseArray(np.nan, index=index, - kind=self._default_kind, + data[c] = SparseArray(self._default_fill_value, + index=index, kind=self._default_kind, fill_value=self._default_fill_value) mgr = to_manager(data, columns, index) if dtype is not None: diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index b31738794c854..888d1fa1bfe45 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -270,6 +270,19 @@ def test_type_coercion_at_construction(self): default_fill_value=0) tm.assert_sp_frame_equal(result, expected) + def test_default_dtype(self): + result = pd.SparseDataFrame(columns=list('ab'), index=range(2)) + expected = pd.SparseDataFrame([[np.nan, np.nan], [np.nan, np.nan]], + columns=list('ab'), index=range(2)) + tm.assert_sp_frame_equal(result, expected) + + def test_nan_data_with_int_dtype_raises_error(self): + sdf = 
pd.SparseDataFrame([[np.nan, np.nan], [np.nan, np.nan]], + columns=list('ab'), index=range(2)) + msg = "Cannot convert non-finite values" + with pytest.raises(ValueError, match=msg): + pd.SparseDataFrame(sdf, dtype=np.int64) + def test_dtypes(self): df = DataFrame(np.random.randn(10000, 4)) df.loc[:9998] = np.nan @@ -1263,6 +1276,14 @@ def test_notna(self): 'B': [True, False, True, True, False]}) tm.assert_frame_equal(res.to_dense(), exp) + def test_default_fill_value_with_no_data(self): + # GH 16807 + expected = pd.SparseDataFrame([[1.0, 1.0], [1.0, 1.0]], + columns=list('ab'), index=range(2)) + result = pd.SparseDataFrame(columns=list('ab'), index=range(2), + default_fill_value=1.0) + tm.assert_frame_equal(expected, result) + class TestSparseDataFrameArithmetic(object): From bd49d2f2af5c44d4f96031757743ba83e6b29408 Mon Sep 17 00:00:00 2001 From: yehia67 Date: Sun, 3 Mar 2019 22:30:18 +0200 Subject: [PATCH 064/110] DOC: Add conda uninstall pandas to contributing guide (#25490) * fix #25487 add modify documentation --- doc/source/development/contributing.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index a87a66cd08ad1..434df772ae9d1 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -178,6 +178,7 @@ We'll now kick off a three-step process: # Create and activate the build environment conda env create -f environment.yml conda activate pandas-dev + conda uninstall --force pandas # or with older versions of Anaconda: source activate pandas-dev From 705c44299332f05786b19d7e44363a38f5be03bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Mar 2019 10:39:41 -0800 Subject: [PATCH 065/110] fix segfault when running with cython coverage enabled, xref cython#2879 (#25529) --- pandas/_libs/tslibs/period.pyx | 96 +++++++++++++++++----------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git 
a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e38e9a1ca5df6..a5a50ea59753d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -138,11 +138,11 @@ cdef int64_t get_daytime_conversion_factor(int from_index, int to_index) nogil: return daytime_conversion_factor_matrix[row - 6][col - 6] -cdef int64_t nofunc(int64_t ordinal, asfreq_info *af_info): - return np.iinfo(np.int32).min +cdef int64_t nofunc(int64_t ordinal, asfreq_info *af_info) nogil: + return INT32_MIN -cdef int64_t no_op(int64_t ordinal, asfreq_info *af_info): +cdef int64_t no_op(int64_t ordinal, asfreq_info *af_info) nogil: return ordinal @@ -270,7 +270,8 @@ cdef int64_t DtoB_weekday(int64_t unix_date) nogil: return ((unix_date + 4) // 7) * 5 + ((unix_date + 4) % 7) - 4 -cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back, int64_t unix_date): +cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back, + int64_t unix_date) nogil: cdef: int day_of_week = dayofweek(dts.year, dts.month, dts.day) @@ -286,21 +287,23 @@ cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back, int64_t unix_date): return DtoB_weekday(unix_date) -cdef inline int64_t upsample_daytime(int64_t ordinal, asfreq_info *af_info): +cdef inline int64_t upsample_daytime(int64_t ordinal, + asfreq_info *af_info) nogil: if (af_info.is_end): return (ordinal + 1) * af_info.intraday_conversion_factor - 1 else: return ordinal * af_info.intraday_conversion_factor -cdef inline int64_t downsample_daytime(int64_t ordinal, asfreq_info *af_info): +cdef inline int64_t downsample_daytime(int64_t ordinal, + asfreq_info *af_info) nogil: return ordinal // (af_info.intraday_conversion_factor) cdef inline int64_t transform_via_day(int64_t ordinal, asfreq_info *af_info, freq_conv_func first_func, - freq_conv_func second_func): + freq_conv_func second_func) nogil: cdef: int64_t result @@ -313,7 +316,7 @@ cdef inline int64_t transform_via_day(int64_t ordinal, # Conversion _to_ Daily Freq cdef void 
AtoD_ym(int64_t ordinal, int64_t *year, - int *month, asfreq_info *af_info): + int *month, asfreq_info *af_info) nogil: year[0] = ordinal + 1970 month[0] = 1 @@ -327,7 +330,7 @@ cdef void AtoD_ym(int64_t ordinal, int64_t *year, year[0] -= 1 -cdef int64_t asfreq_AtoDT(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_AtoDT(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int64_t unix_date, year int month @@ -341,7 +344,7 @@ cdef int64_t asfreq_AtoDT(int64_t ordinal, asfreq_info *af_info): cdef void QtoD_ym(int64_t ordinal, int *year, - int *month, asfreq_info *af_info): + int *month, asfreq_info *af_info) nogil: year[0] = ordinal // 4 + 1970 month[0] = (ordinal % 4) * 3 + 1 @@ -353,7 +356,7 @@ cdef void QtoD_ym(int64_t ordinal, int *year, year[0] -= 1 -cdef int64_t asfreq_QtoDT(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_QtoDT(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int64_t unix_date int year, month @@ -366,12 +369,12 @@ cdef int64_t asfreq_QtoDT(int64_t ordinal, asfreq_info *af_info): return upsample_daytime(unix_date, af_info) -cdef void MtoD_ym(int64_t ordinal, int *year, int *month): +cdef void MtoD_ym(int64_t ordinal, int *year, int *month) nogil: year[0] = ordinal // 12 + 1970 month[0] = ordinal % 12 + 1 -cdef int64_t asfreq_MtoDT(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_MtoDT(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int64_t unix_date int year, month @@ -384,7 +387,7 @@ cdef int64_t asfreq_MtoDT(int64_t ordinal, asfreq_info *af_info): return upsample_daytime(unix_date, af_info) -cdef int64_t asfreq_WtoDT(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_WtoDT(int64_t ordinal, asfreq_info *af_info) nogil: ordinal = (ordinal * 7 + af_info.from_end - 4 + (7 - 1) * (af_info.is_end - 1)) return upsample_daytime(ordinal, af_info) @@ -393,7 +396,7 @@ cdef int64_t asfreq_WtoDT(int64_t ordinal, asfreq_info *af_info): # 
-------------------------------------------------------------------- # Conversion _to_ BusinessDay Freq -cdef int64_t asfreq_AtoB(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_AtoB(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int roll_back npy_datetimestruct dts @@ -404,7 +407,7 @@ cdef int64_t asfreq_AtoB(int64_t ordinal, asfreq_info *af_info): return DtoB(&dts, roll_back, unix_date) -cdef int64_t asfreq_QtoB(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_QtoB(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int roll_back npy_datetimestruct dts @@ -415,7 +418,7 @@ cdef int64_t asfreq_QtoB(int64_t ordinal, asfreq_info *af_info): return DtoB(&dts, roll_back, unix_date) -cdef int64_t asfreq_MtoB(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_MtoB(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int roll_back npy_datetimestruct dts @@ -426,7 +429,7 @@ cdef int64_t asfreq_MtoB(int64_t ordinal, asfreq_info *af_info): return DtoB(&dts, roll_back, unix_date) -cdef int64_t asfreq_WtoB(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_WtoB(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int roll_back npy_datetimestruct dts @@ -437,7 +440,7 @@ cdef int64_t asfreq_WtoB(int64_t ordinal, asfreq_info *af_info): return DtoB(&dts, roll_back, unix_date) -cdef int64_t asfreq_DTtoB(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_DTtoB(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int roll_back npy_datetimestruct dts @@ -452,7 +455,7 @@ cdef int64_t asfreq_DTtoB(int64_t ordinal, asfreq_info *af_info): # ---------------------------------------------------------------------- # Conversion _from_ Daily Freq -cdef int64_t asfreq_DTtoA(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_DTtoA(int64_t ordinal, asfreq_info *af_info) nogil: cdef: npy_datetimestruct dts @@ -464,7 +467,7 @@ cdef int64_t asfreq_DTtoA(int64_t ordinal, asfreq_info *af_info): return (dts.year - 1970) -cdef int 
DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year): +cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year) nogil: cdef: npy_datetimestruct dts int quarter @@ -485,7 +488,7 @@ cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year): return quarter -cdef int64_t asfreq_DTtoQ(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_DTtoQ(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int year, quarter @@ -495,7 +498,7 @@ cdef int64_t asfreq_DTtoQ(int64_t ordinal, asfreq_info *af_info): return ((year - 1970) * 4 + quarter - 1) -cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info) nogil: cdef: npy_datetimestruct dts @@ -504,7 +507,7 @@ cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info): return ((dts.year - 1970) * 12 + dts.month - 1) -cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info) nogil: ordinal = downsample_daytime(ordinal, af_info) return (ordinal + 3 - af_info.to_end) // 7 + 1 @@ -512,30 +515,30 @@ cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info): # -------------------------------------------------------------------- # Conversion _from_ BusinessDay Freq -cdef int64_t asfreq_BtoDT(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_BtoDT(int64_t ordinal, asfreq_info *af_info) nogil: ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 -3 return upsample_daytime(ordinal, af_info) -cdef int64_t asfreq_BtoA(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_BtoA(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoA) -cdef int64_t asfreq_BtoQ(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_BtoQ(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoQ) -cdef int64_t asfreq_BtoM(int64_t 
ordinal, asfreq_info *af_info): +cdef int64_t asfreq_BtoM(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoM) -cdef int64_t asfreq_BtoW(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_BtoW(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoW) @@ -544,25 +547,25 @@ cdef int64_t asfreq_BtoW(int64_t ordinal, asfreq_info *af_info): # ---------------------------------------------------------------------- # Conversion _from_ Annual Freq -cdef int64_t asfreq_AtoA(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_AtoA(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoA) -cdef int64_t asfreq_AtoQ(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_AtoQ(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoQ) -cdef int64_t asfreq_AtoM(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_AtoM(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoM) -cdef int64_t asfreq_AtoW(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_AtoW(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoW) @@ -571,25 +574,25 @@ cdef int64_t asfreq_AtoW(int64_t ordinal, asfreq_info *af_info): # ---------------------------------------------------------------------- # Conversion _from_ Quarterly Freq -cdef int64_t asfreq_QtoQ(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_QtoQ(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoQ) -cdef int64_t asfreq_QtoA(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_QtoA(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, 
asfreq_QtoDT, asfreq_DTtoA) -cdef int64_t asfreq_QtoM(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_QtoM(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoM) -cdef int64_t asfreq_QtoW(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_QtoW(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoW) @@ -598,19 +601,19 @@ cdef int64_t asfreq_QtoW(int64_t ordinal, asfreq_info *af_info): # ---------------------------------------------------------------------- # Conversion _from_ Monthly Freq -cdef int64_t asfreq_MtoA(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_MtoA(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoA) -cdef int64_t asfreq_MtoQ(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_MtoQ(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoQ) -cdef int64_t asfreq_MtoW(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_MtoW(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoW) @@ -619,25 +622,25 @@ cdef int64_t asfreq_MtoW(int64_t ordinal, asfreq_info *af_info): # ---------------------------------------------------------------------- # Conversion _from_ Weekly Freq -cdef int64_t asfreq_WtoA(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_WtoA(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoA) -cdef int64_t asfreq_WtoQ(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_WtoQ(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoQ) -cdef int64_t asfreq_WtoM(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_WtoM(int64_t ordinal, asfreq_info 
*af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoM) -cdef int64_t asfreq_WtoW(int64_t ordinal, asfreq_info *af_info): +cdef int64_t asfreq_WtoW(int64_t ordinal, asfreq_info *af_info) nogil: return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoW) @@ -971,7 +974,7 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): return qtr_freq -cdef inline int month_to_quarter(int month): +cdef inline int month_to_quarter(int month) nogil: return (month - 1) // 3 + 1 @@ -1024,9 +1027,6 @@ def periodarr_to_dt64arr(int64_t[:] periodarr, int freq): with nogil: for i in range(l): - if periodarr[i] == NPY_NAT: - out[i] = NPY_NAT - continue out[i] = period_ordinal_to_dt64(periodarr[i], freq) return out.base # .base to access underlying np.ndarray From f85f7a153d310137bcf92683191a5ce5bc57db58 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Mar 2019 10:40:36 -0800 Subject: [PATCH 066/110] TST: inline empty_frame = DataFrame({}) fixture (#24886) --- pandas/tests/frame/common.py | 2 +- pandas/tests/frame/conftest.py | 8 -------- pandas/tests/frame/test_analytics.py | 4 +++- pandas/tests/frame/test_api.py | 7 +++++-- pandas/tests/frame/test_apply.py | 12 +++++++++--- pandas/tests/frame/test_block_internals.py | 4 +++- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_reshape.py | 2 +- pandas/tests/series/conftest.py | 9 --------- pandas/tests/series/test_constructors.py | 4 +++- 10 files changed, 26 insertions(+), 28 deletions(-) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 2ea087c0510bf..5624f7c1303b6 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -85,7 +85,7 @@ def tzframe(self): @cache_readonly def empty(self): - return pd.DataFrame({}) + return pd.DataFrame() @cache_readonly def ts1(self): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 69ee614ab8d2a..fbe03325a3ad9 100644 --- 
a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -127,14 +127,6 @@ def timezone_frame(): return df -@pytest.fixture -def empty_frame(): - """ - Fixture for empty DataFrame - """ - return DataFrame({}) - - @pytest.fixture def simple_frame(): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 43a45bb915819..994187a62d862 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1096,7 +1096,9 @@ def test_operators_timedelta64(self): assert df['off1'].dtype == 'timedelta64[ns]' assert df['off2'].dtype == 'timedelta64[ns]' - def test_sum_corner(self, empty_frame): + def test_sum_corner(self): + empty_frame = DataFrame() + axis0 = empty_frame.sum(0) axis1 = empty_frame.sum(1) assert isinstance(axis0, Series) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 0934dd20638e4..e561b327e4fb0 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -142,7 +142,9 @@ def test_tab_completion(self): assert key not in dir(df) assert isinstance(df.__getitem__('A'), pd.DataFrame) - def test_not_hashable(self, empty_frame): + def test_not_hashable(self): + empty_frame = DataFrame() + df = self.klass([1]) pytest.raises(TypeError, hash, df) pytest.raises(TypeError, hash, empty_frame) @@ -171,7 +173,8 @@ def test_get_agg_axis(self, float_frame): pytest.raises(ValueError, float_frame._get_agg_axis, 2) - def test_nonzero(self, float_frame, float_string_frame, empty_frame): + def test_nonzero(self, float_frame, float_string_frame): + empty_frame = DataFrame() assert empty_frame.empty assert not float_frame.empty diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index a4cd1aa3bacb6..4d1e3e7ae1f38 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -74,8 +74,10 @@ def test_apply_mixed_datetimelike(self): result = df.apply(lambda x: x, axis=1) 
assert_frame_equal(result, df) - def test_apply_empty(self, float_frame, empty_frame): + def test_apply_empty(self, float_frame): # empty + empty_frame = DataFrame() + applied = empty_frame.apply(np.sqrt) assert applied.empty @@ -97,8 +99,10 @@ def test_apply_empty(self, float_frame, empty_frame): result = expected.apply(lambda x: x['a'], axis=1) assert_frame_equal(expected, result) - def test_apply_with_reduce_empty(self, empty_frame): + def test_apply_with_reduce_empty(self): # reduce with an empty DataFrame + empty_frame = DataFrame() + x = [] result = empty_frame.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, empty_frame) @@ -116,7 +120,9 @@ def test_apply_with_reduce_empty(self, empty_frame): # Ensure that x.append hasn't been called assert x == [] - def test_apply_deprecate_reduce(self, empty_frame): + def test_apply_deprecate_reduce(self): + empty_frame = DataFrame() + x = [] with tm.assert_produces_warning(FutureWarning): empty_frame.apply(x.append, axis=1, reduce=True) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 5419f4d5127f6..39d84f2e6086c 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -347,7 +347,9 @@ def test_copy(self, float_frame, float_string_frame): copy = float_string_frame.copy() assert copy._data is not float_string_frame._data - def test_pickle(self, float_string_frame, empty_frame, timezone_frame): + def test_pickle(self, float_string_frame, timezone_frame): + empty_frame = DataFrame() + unpickled = tm.round_trip_pickle(float_string_frame) assert_frame_equal(float_string_frame, unpickled) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a8a78b26e317c..b32255da324f4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -247,7 +247,7 @@ def test_constructor_dict(self): assert isna(frame['col3']).all() 
# Corner cases - assert len(DataFrame({})) == 0 + assert len(DataFrame()) == 0 # mix dict and array, wrong size - no spec for which error should raise # first diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index daac084f657af..4fe5172fefbcd 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -58,7 +58,7 @@ def test_pivot_duplicates(self): def test_pivot_empty(self): df = DataFrame({}, columns=['a', 'b', 'c']) result = df.pivot('a', 'b', 'c') - expected = DataFrame({}) + expected = DataFrame() tm.assert_frame_equal(result, expected, check_names=False) def test_pivot_integer_bug(self): diff --git a/pandas/tests/series/conftest.py b/pandas/tests/series/conftest.py index 431aacb1c8d56..367e7a1baa7f3 100644 --- a/pandas/tests/series/conftest.py +++ b/pandas/tests/series/conftest.py @@ -1,6 +1,5 @@ import pytest -from pandas import Series import pandas.util.testing as tm @@ -32,11 +31,3 @@ def object_series(): s = tm.makeObjectSeries() s.name = 'objects' return s - - -@pytest.fixture -def empty_series(): - """ - Fixture for empty Series - """ - return Series([], index=[]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index d92ca48751d0a..8525b877618c9 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -47,7 +47,9 @@ def test_scalar_conversion(self): assert int(Series([1.])) == 1 assert long(Series([1.])) == 1 - def test_constructor(self, datetime_series, empty_series): + def test_constructor(self, datetime_series): + empty_series = Series() + assert datetime_series.index.is_all_dates # Pass in Series From 1c9de6984406e89491f5e989bc62cdf9b288cc09 Mon Sep 17 00:00:00 2001 From: leerssej Date: Mon, 4 Mar 2019 10:54:12 -0800 Subject: [PATCH 067/110] DOC: Polishing typos out of doc/source/user_guide/indexing.rst (#25528) --- doc/source/user_guide/indexing.rst | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index be1745e2664a1..00d4dc9efc8cc 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -435,7 +435,7 @@ Selection By Position This is sometimes called ``chained assignment`` and should be avoided. See :ref:`Returning a View versus Copy `. -Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bounds is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. +Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method. The following are valid inputs: @@ -545,7 +545,7 @@ Selection By Callable .. versionadded:: 0.18.1 ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. -The ``callable`` must be a function with one argument (the calling Series, DataFrame or Panel) and that returns valid output for indexing. +The ``callable`` must be a function with one argument (the calling Series, DataFrame or Panel) that returns valid output for indexing. .. ipython:: python @@ -569,7 +569,7 @@ You can use callable indexing in ``Series``. df1.A.loc[lambda s: s > 0] Using these methods / indexers, you can chain data selection operations -without using temporary variable. +without using a temporary variable. .. 
ipython:: python @@ -907,7 +907,7 @@ of the DataFrame): df[df['A'] > 0] -List comprehensions and ``map`` method of Series can also be used to produce +List comprehensions and the ``map`` method of Series can also be used to produce more complex criteria: .. ipython:: python @@ -1556,7 +1556,7 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. ind ``set_names``, ``set_levels``, and ``set_codes`` also take an optional -`level`` argument +``level`` argument .. ipython:: python From 3bbcacf95b142776150a235dabb234806d87dff9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 4 Mar 2019 18:56:49 +0000 Subject: [PATCH 068/110] STY: use pytest.raises context manager (frame) (#25516) --- pandas/tests/frame/test_alter_axes.py | 3 +- pandas/tests/frame/test_analytics.py | 25 +++-- pandas/tests/frame/test_api.py | 19 +++- .../tests/frame/test_axis_select_reindex.py | 43 ++++++--- pandas/tests/frame/test_block_internals.py | 10 +- pandas/tests/frame/test_constructors.py | 51 ++++++---- pandas/tests/frame/test_convert_to.py | 8 +- pandas/tests/frame/test_dtypes.py | 20 ++-- pandas/tests/frame/test_indexing.py | 93 +++++++++++++------ pandas/tests/frame/test_missing.py | 39 ++++++-- pandas/tests/frame/test_mutate_columns.py | 4 +- pandas/tests/frame/test_nonunique_indexes.py | 15 ++- pandas/tests/frame/test_quantile.py | 11 ++- pandas/tests/frame/test_query_eval.py | 15 +-- pandas/tests/frame/test_replace.py | 8 +- pandas/tests/frame/test_reshape.py | 5 +- pandas/tests/frame/test_sorting.py | 8 +- pandas/tests/frame/test_timeseries.py | 28 ++++-- pandas/tests/frame/test_to_csv.py | 5 +- 19 files changed, 284 insertions(+), 126 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index a25e893e08900..f4a2a5f8032a0 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -633,7 +633,8 @@ def test_rename(self, float_frame): tm.assert_index_equal(renamed.index, Index(['BAR', 
'FOO'])) # have to pass something - pytest.raises(TypeError, float_frame.rename) + with pytest.raises(TypeError, match="must pass an index to rename"): + float_frame.rename() # partial columns renamed = float_frame.rename(columns={'C': 'foo', 'D': 'bar'}) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 994187a62d862..3363a45149fff 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -898,6 +898,7 @@ def test_var_std(self, datetime_frame): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") @pytest.mark.parametrize( "meth", ['sem', 'var', 'std']) def test_numeric_only_flag(self, meth): @@ -919,10 +920,12 @@ def test_numeric_only_flag(self, meth): tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside - pytest.raises(TypeError, lambda: getattr(df1, meth)( - axis=1, numeric_only=False)) - pytest.raises(TypeError, lambda: getattr(df2, meth)( - axis=1, numeric_only=False)) + msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" + with pytest.raises(TypeError, match=msg): + getattr(df1, meth)(axis=1, numeric_only=False) + msg = "could not convert string to float: 'a'" + with pytest.raises(TypeError, match=msg): + getattr(df2, meth)(axis=1, numeric_only=False) def test_sem(self, datetime_frame): result = datetime_frame.sem(ddof=4) @@ -1369,6 +1372,7 @@ def test_pct_change(self): # ---------------------------------------------------------------------- # Index of max / min + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_idxmin(self, float_frame, int_frame): frame = float_frame frame.loc[5:10] = np.nan @@ -1381,8 +1385,11 @@ def test_idxmin(self, float_frame, int_frame): skipna=skipna) tm.assert_series_equal(result, expected) - pytest.raises(ValueError, frame.idxmin, axis=2) + msg = "No axis named 2 for object type " + with 
pytest.raises(ValueError, match=msg): + frame.idxmin(axis=2) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_idxmax(self, float_frame, int_frame): frame = float_frame frame.loc[5:10] = np.nan @@ -1395,7 +1402,9 @@ def test_idxmax(self, float_frame, int_frame): skipna=skipna) tm.assert_series_equal(result, expected) - pytest.raises(ValueError, frame.idxmax, axis=2) + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + frame.idxmax(axis=2) # ---------------------------------------------------------------------- # Logical reductions @@ -1881,7 +1890,9 @@ def test_round_issue(self): tm.assert_index_equal(rounded.index, dfs.index) decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A']) - pytest.raises(ValueError, df.round, decimals) + msg = "Index of decimals must be unique" + with pytest.raises(ValueError, match=msg): + df.round(decimals) def test_built_in_round(self): if not compat.PY3: diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index e561b327e4fb0..118341276d799 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import long, lrange, range +from pandas.compat import PY2, long, lrange, range import pandas as pd from pandas import ( @@ -146,8 +146,12 @@ def test_not_hashable(self): empty_frame = DataFrame() df = self.klass([1]) - pytest.raises(TypeError, hash, df) - pytest.raises(TypeError, hash, empty_frame) + msg = ("'(Sparse)?DataFrame' objects are mutable, thus they cannot be" + " hashed") + with pytest.raises(TypeError, match=msg): + hash(df) + with pytest.raises(TypeError, match=msg): + hash(empty_frame) def test_new_empty_index(self): df1 = self.klass(np.random.randn(0, 3)) @@ -171,7 +175,9 @@ def test_get_agg_axis(self, float_frame): idx = float_frame._get_agg_axis(1) assert idx is float_frame.index - pytest.raises(ValueError, float_frame._get_agg_axis, 2) + msg = 
r"Axis must be 0 or 1 \(got 2\)" + with pytest.raises(ValueError, match=msg): + float_frame._get_agg_axis(2) def test_nonzero(self, float_frame, float_string_frame): empty_frame = DataFrame() @@ -354,12 +360,15 @@ def test_transpose(self, float_frame): for col, s in compat.iteritems(mixed_T): assert s.dtype == np.object_ + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_swapaxes(self): df = self.klass(np.random.randn(10, 5)) self._assert_frame_equal(df.T, df.swapaxes(0, 1)) self._assert_frame_equal(df.T, df.swapaxes(1, 0)) self._assert_frame_equal(df, df.swapaxes(0, 0)) - pytest.raises(ValueError, df.swapaxes, 2, 5) + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + df.swapaxes(2, 5) def test_axis_aliases(self, float_frame): f = float_frame diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index dea925dcde676..fb00776b33cbb 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.compat import lrange, lzip, u +from pandas.compat import PY2, lrange, lzip, u from pandas.errors import PerformanceWarning import pandas as pd @@ -38,8 +38,11 @@ def test_drop_names(self): assert obj.columns.name == 'second' assert list(df.columns) == ['d', 'e', 'f'] - pytest.raises(KeyError, df.drop, ['g']) - pytest.raises(KeyError, df.drop, ['g'], 1) + msg = r"\['g'\] not found in axis" + with pytest.raises(KeyError, match=msg): + df.drop(['g']) + with pytest.raises(KeyError, match=msg): + df.drop(['g'], 1) # errors = 'ignore' dropped = df.drop(['g'], errors='ignore') @@ -84,10 +87,14 @@ def test_drop(self): assert_frame_equal(simple.drop( [0, 3], axis='index'), simple.loc[[1, 2], :]) - pytest.raises(KeyError, simple.drop, 5) - pytest.raises(KeyError, simple.drop, 'C', 1) - pytest.raises(KeyError, simple.drop, [1, 5]) - pytest.raises(KeyError, 
simple.drop, ['A', 'C'], 1) + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop(5) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop('C', 1) + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop([1, 5]) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop(['A', 'C'], 1) # errors = 'ignore' assert_frame_equal(simple.drop(5, errors='ignore'), simple) @@ -444,7 +451,9 @@ def test_reindex_dups(self): assert_frame_equal(result, expected) # reindex fails - pytest.raises(ValueError, df.reindex, index=list(range(len(df)))) + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df.reindex(index=list(range(len(df)))) def test_reindex_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 @@ -963,10 +972,15 @@ def test_take(self): assert_frame_equal(result, expected, check_names=False) # illegal indices - pytest.raises(IndexError, df.take, [3, 1, 2, 30], axis=0) - pytest.raises(IndexError, df.take, [3, 1, 2, -31], axis=0) - pytest.raises(IndexError, df.take, [3, 1, 2, 5], axis=1) - pytest.raises(IndexError, df.take, [3, 1, 2, -5], axis=1) + msg = "indices are out-of-bounds" + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, 30], axis=0) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, -31], axis=0) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, 5], axis=1) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, -5], axis=1) # mixed-dtype order = [4, 1, 2, 0, 3] @@ -1037,6 +1051,7 @@ def test_reindex_corner(self): smaller = self.intframe.reindex(columns=['A', 'B', 'E']) assert smaller['E'].dtype == np.float64 + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_reindex_axis(self): cols = ['A', 'B', 'E'] with tm.assert_produces_warning(FutureWarning) as m: @@ -1052,7 +1067,9 @@ def test_reindex_axis(self): reindexed2 = 
self.intframe.reindex(index=rows) assert_frame_equal(reindexed1, reindexed2) - pytest.raises(ValueError, self.intframe.reindex_axis, rows, axis=2) + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + self.intframe.reindex_axis(rows, axis=2) # no-op case cols = self.frame.columns.copy() diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 39d84f2e6086c..4b06d2e35cdfc 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -274,10 +274,12 @@ def f(dtype): columns=["A", "B", "C"], dtype=dtype) - pytest.raises(NotImplementedError, f, - [("A", "datetime64[h]"), - ("B", "str"), - ("C", "int32")]) + msg = ("compound dtypes are not implemented in the DataFrame" + " constructor") + with pytest.raises(NotImplementedError, match=msg): + f([("A", "datetime64[h]"), + ("B", "str"), + ("C", "int32")]) # these work (though results may be unexpected) f('int64') diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b32255da324f4..fc642d211b30c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -12,7 +12,8 @@ import pytest from pandas.compat import ( - PY3, PY36, is_platform_little_endian, lmap, long, lrange, lzip, range, zip) + PY2, PY3, PY36, is_platform_little_endian, lmap, long, lrange, lzip, range, + zip) from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_integer_dtype @@ -58,8 +59,9 @@ def test_constructor_cast_failure(self): df['foo'] = np.ones((4, 2)).tolist() # this is not ok - pytest.raises(ValueError, df.__setitem__, tuple(['test']), - np.ones((4, 2))) + msg = "Wrong number of items passed 2, placement implies 1" + with pytest.raises(ValueError, match=msg): + df['test'] = np.ones((4, 2)) # this is ok df['foo2'] = np.ones((4, 2)).tolist() @@ -1259,7 +1261,9 @@ def 
test_constructor_Series_named(self): expected = DataFrame({0: s}) tm.assert_frame_equal(df, expected) - pytest.raises(ValueError, DataFrame, s, columns=[1, 2]) + msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)" + with pytest.raises(ValueError, match=msg): + DataFrame(s, columns=[1, 2]) # #2234 a = Series([], name='x') @@ -1433,8 +1437,10 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - pytest.raises(ValueError, DataFrame.from_dict, - OrderedDict([('b', 8), ('a', 5), ('a', 6)])) + msg = "If using all scalar values, you must pass an index" + with pytest.raises(ValueError, match=msg): + DataFrame.from_dict( + OrderedDict([('b', 8), ('a', 5), ('a', 6)])) def test_constructor_empty_with_string_dtype(self): # GH 9428 @@ -1465,8 +1471,11 @@ def test_constructor_single_value(self): dtype=object), index=[1, 2], columns=['a', 'c'])) - pytest.raises(ValueError, DataFrame, 'a', [1, 2]) - pytest.raises(ValueError, DataFrame, 'a', columns=['a', 'c']) + msg = "DataFrame constructor not properly called!" 
+ with pytest.raises(ValueError, match=msg): + DataFrame('a', [1, 2]) + with pytest.raises(ValueError, match=msg): + DataFrame('a', columns=['a', 'c']) msg = 'incompatible data and dtype' with pytest.raises(TypeError, match=msg): @@ -1692,6 +1701,7 @@ def test_constructor_series_copy(self): assert not (series['A'] == 5).all() + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_constructor_with_nas(self): # GH 5016 # na's in indices @@ -1704,9 +1714,11 @@ def check(df): # No NaN found -> error if len(indexer) == 0: - def f(): + msg = ("cannot do label indexing on" + r" " + r" with these indexers \[nan\] of ") + with pytest.raises(TypeError, match=msg): df.loc[:, np.nan] - pytest.raises(TypeError, f) # single nan should result in Series elif len(indexer) == 1: tm.assert_series_equal(df.iloc[:, indexer[0]], @@ -1782,13 +1794,15 @@ def test_constructor_categorical(self): tm.assert_frame_equal(df, expected) # invalid (shape) - pytest.raises(ValueError, - lambda: DataFrame([Categorical(list('abc')), - Categorical(list('abdefg'))])) + msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" + with pytest.raises(ValueError, match=msg): + DataFrame([Categorical(list('abc')), + Categorical(list('abdefg'))]) # ndim > 1 - pytest.raises(NotImplementedError, - lambda: Categorical(np.array([list('abcd')]))) + msg = "> 1 ndim Categorical are not supported at this time" + with pytest.raises(NotImplementedError, match=msg): + Categorical(np.array([list('abcd')])) def test_constructor_categorical_series(self): @@ -2164,8 +2178,11 @@ def test_from_records_bad_index_column(self): tm.assert_index_equal(df1.index, Index(df.C)) # should fail - pytest.raises(ValueError, DataFrame.from_records, df, index=[2]) - pytest.raises(KeyError, DataFrame.from_records, df, index=2) + msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)" + with pytest.raises(ValueError, match=msg): + DataFrame.from_records(df, index=[2]) + with pytest.raises(KeyError, 
match=r"^2$"): + DataFrame.from_records(df, index=2) def test_from_records_non_tuple(self): class Record(object): diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 601a4c6b72fe3..db60fbf0f8563 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -75,11 +75,15 @@ def test_to_dict_index_not_unique_with_index_orient(self): # GH22801 # Data loss when indexes are not unique. Raise ValueError. df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) - pytest.raises(ValueError, df.to_dict, orient='index') + msg = "DataFrame index must be unique for orient='index'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient='index') def test_to_dict_invalid_orient(self): df = DataFrame({'A': [0, 1]}) - pytest.raises(ValueError, df.to_dict, orient='xinvalid') + msg = "orient 'xinvalid' not understood" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient='xinvalid') def test_to_records_dt64(self): df = DataFrame([["one", "two", "three"], diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index a8776c84b98ca..b37bf02a6b8e7 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -154,8 +154,8 @@ def test_select_dtypes_include_using_list_like(self): ei = df[['h', 'i']] assert_frame_equal(ri, ei) - pytest.raises(NotImplementedError, - lambda: df.select_dtypes(include=['period'])) + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(include=['period']) def test_select_dtypes_exclude_using_list_like(self): df = DataFrame({'a': list('abc'), @@ -218,8 +218,8 @@ def test_select_dtypes_include_using_scalars(self): ei = df[['f']] assert_frame_equal(ri, ei) - pytest.raises(NotImplementedError, - lambda: df.select_dtypes(include='period')) + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(include='period') def test_select_dtypes_exclude_using_scalars(self): df 
= DataFrame({'a': list('abc'), @@ -245,8 +245,8 @@ def test_select_dtypes_exclude_using_scalars(self): ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']] assert_frame_equal(ri, ei) - pytest.raises(NotImplementedError, - lambda: df.select_dtypes(exclude='period')) + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(exclude='period') def test_select_dtypes_include_exclude_using_scalars(self): df = DataFrame({'a': list('abc'), @@ -601,8 +601,12 @@ def test_astype_dict_like(self, dtype_class): # in the keys of the dtype dict dt4 = dtype_class({'b': str, 2: str}) dt5 = dtype_class({'e': str}) - pytest.raises(KeyError, df.astype, dt4) - pytest.raises(KeyError, df.astype, dt5) + msg = ("Only a column name can be used for the key in a dtype mappings" + " argument") + with pytest.raises(KeyError, match=msg): + df.astype(dt4) + with pytest.raises(KeyError, match=msg): + df.astype(dt5) assert_frame_equal(df, original) # if the dtypes provided are the same as the original dtypes, the diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 19b8ae4eb6e0f..ffe54f7a94307 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -9,7 +9,7 @@ import pytest from pandas._libs.tslib import iNaT -from pandas.compat import long, lrange, lzip, map, range, zip +from pandas.compat import PY2, long, lrange, lzip, map, range, zip from pandas.core.dtypes.common import is_float_dtype, is_integer, is_scalar from pandas.core.dtypes.dtypes import CategoricalDtype @@ -431,8 +431,9 @@ def test_getitem_setitem_ix_negative_integers(self): def test_getattr(self): assert_series_equal(self.frame.A, self.frame['A']) - pytest.raises(AttributeError, getattr, self.frame, - 'NONEXISTENT_NAME') + msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'" + with pytest.raises(AttributeError, match=msg): + self.frame.NONEXISTENT_NAME def test_setattr_column(self): df = DataFrame({'foobar': 1}, 
index=lrange(10)) @@ -793,7 +794,8 @@ def test_delitem_corner(self): f = self.frame.copy() del f['D'] assert len(f.columns) == 3 - pytest.raises(KeyError, f.__delitem__, 'D') + with pytest.raises(KeyError, match=r"^'D'$"): + del f['D'] del f['B'] assert len(f.columns) == 2 @@ -842,7 +844,9 @@ def test_getitem_fancy_2d(self): with catch_warnings(record=True): simplefilter("ignore", DeprecationWarning) - pytest.raises(ValueError, f.ix.__getitem__, f > 0.5) + msg = "Cannot index with multidimensional key" + with pytest.raises(ValueError, match=msg): + f.ix[f > 0.5] def test_slice_floats(self): index = [52195.504153, 52196.303147, 52198.369883] @@ -865,6 +869,7 @@ def test_getitem_fancy_slice_integers_step(self): df.iloc[:8:2] = np.nan assert isna(df.iloc[:8:2]).values.all() + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_getitem_setitem_integer_slice_keyerrors(self): df = DataFrame(np.random.randn(10, 5), index=lrange(0, 20, 2)) @@ -887,8 +892,10 @@ def test_getitem_setitem_integer_slice_keyerrors(self): # non-monotonic, raise KeyError df2 = df.iloc[lrange(5) + lrange(5, 10)[::-1]] - pytest.raises(KeyError, df2.loc.__getitem__, slice(3, 11)) - pytest.raises(KeyError, df2.loc.__setitem__, slice(3, 11), 0) + with pytest.raises(KeyError, match=r"^3$"): + df2.loc[3:11] + with pytest.raises(KeyError, match=r"^3$"): + df2.loc[3:11] = 0 def test_setitem_fancy_2d(self): @@ -1077,6 +1084,7 @@ def test_fancy_getitem_int_labels(self): expected = df[3] assert_series_equal(result, expected) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_fancy_index_int_labels_exceptions(self): df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) @@ -1084,14 +1092,18 @@ def test_fancy_index_int_labels_exceptions(self): simplefilter("ignore", DeprecationWarning) # labels that aren't contained - pytest.raises(KeyError, df.ix.__setitem__, - ([0, 1, 2], [2, 3, 4]), 5) + with pytest.raises(KeyError, match=r"\[1\] not in index"): + 
df.ix[[0, 1, 2], [2, 3, 4]] = 5 # try to set indices not contained in frame - pytest.raises(KeyError, self.frame.ix.__setitem__, - ['foo', 'bar', 'baz'], 1) - pytest.raises(KeyError, self.frame.ix.__setitem__, - (slice(None, None), ['E']), 1) + msg = (r"None of \[Index\(\['foo', 'bar', 'baz'\]," + r" dtype='object'\)\] are in the \[index\]") + with pytest.raises(KeyError, match=msg): + self.frame.ix[['foo', 'bar', 'baz']] = 1 + msg = (r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" + r" \[columns\]") + with pytest.raises(KeyError, match=msg): + self.frame.ix[:, ['E']] = 1 # partial setting now allows this GH2578 # pytest.raises(KeyError, self.frame.ix.__setitem__, @@ -1504,6 +1516,7 @@ def test_getitem_setitem_boolean_multi(self): expected.loc[[0, 2], [1]] = 5 assert_frame_equal(df, expected) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_getitem_setitem_float_labels(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.random.randn(5, 5), index=index) @@ -1537,7 +1550,11 @@ def test_getitem_setitem_float_labels(self): df = DataFrame(np.random.randn(5, 5), index=index) # positional slicing only via iloc! 
- pytest.raises(TypeError, lambda: df.iloc[1.0:5]) + msg = ("cannot do slice indexing on" + r" with" + r" these indexers \[1.0\] of ") + with pytest.raises(TypeError, match=msg): + df.iloc[1.0:5] result = df.iloc[4:5] expected = df.reindex([5.0]) @@ -1744,11 +1761,16 @@ def test_getitem_setitem_ix_bool_keyerror(self): # #2199 df = DataFrame({'a': [1, 2, 3]}) - pytest.raises(KeyError, df.loc.__getitem__, False) - pytest.raises(KeyError, df.loc.__getitem__, True) + with pytest.raises(KeyError, match=r"^False$"): + df.loc[False] + with pytest.raises(KeyError, match=r"^True$"): + df.loc[True] - pytest.raises(KeyError, df.loc.__setitem__, False, 0) - pytest.raises(KeyError, df.loc.__setitem__, True, 0) + msg = "cannot use a single bool to index into setitem" + with pytest.raises(KeyError, match=msg): + df.loc[False] = 0 + with pytest.raises(KeyError, match=msg): + df.loc[True] = 0 def test_getitem_list_duplicates(self): # #1943 @@ -1813,6 +1835,7 @@ def test_set_value(self): self.frame.set_value(idx, col, 1) assert self.frame[col][idx] == 1 + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_set_value_resize(self): with tm.assert_produces_warning(FutureWarning, @@ -1849,7 +1872,9 @@ def test_set_value_resize(self): assert isna(res3['baz'].drop(['foobar'])).all() with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - pytest.raises(ValueError, res3.set_value, 'foobar', 'baz', 'sam') + msg = "could not convert string to float: 'sam'" + with pytest.raises(ValueError, match=msg): + res3.set_value('foobar', 'baz', 'sam') def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), @@ -1888,7 +1913,8 @@ def test_get_set_value_no_partial_indexing(self): df = DataFrame(index=index, columns=lrange(4)) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - pytest.raises(KeyError, df.get_value, 0, 1) + with pytest.raises(KeyError, match=r"^0$"): + df.get_value(0, 1) def 
test_single_element_ix_dont_upcast(self): self.frame['E'] = 1 @@ -2158,10 +2184,15 @@ def test_non_monotonic_reindex_methods(self): df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list('A')) # index is not monotonic increasing or decreasing - pytest.raises(ValueError, df_rev.reindex, df.index, method='pad') - pytest.raises(ValueError, df_rev.reindex, df.index, method='ffill') - pytest.raises(ValueError, df_rev.reindex, df.index, method='bfill') - pytest.raises(ValueError, df_rev.reindex, df.index, method='nearest') + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method='pad') + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method='ffill') + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method='bfill') + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method='nearest') def test_reindex_level(self): from itertools import permutations @@ -2669,14 +2700,20 @@ def _check_align(df, cond, other, check_dtypes=True): # invalid conditions df = default_frame err1 = (df + 1).values[0:2, :] - pytest.raises(ValueError, df.where, cond, err1) + msg = "other must be the same shape as self when an ndarray" + with pytest.raises(ValueError, match=msg): + df.where(cond, err1) err2 = cond.iloc[:2, :].values other1 = _safe_add(df) - pytest.raises(ValueError, df.where, err2, other1) + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + df.where(err2, other1) - pytest.raises(ValueError, df.mask, True) - pytest.raises(ValueError, df.mask, 0) + with pytest.raises(ValueError, match=msg): + df.mask(True) + with pytest.raises(ValueError, match=msg): + df.mask(0) # where inplace def _check_set(df, cond, check_dtypes=True): diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 77a3d4785d295..2f3b0a9f76de9 100644 --- 
a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import lrange +from pandas.compat import PY2, lrange import pandas.util._test_decorators as td import pandas as pd @@ -83,6 +83,7 @@ def test_dropIncompleteRows(self): tm.assert_index_equal(samesize_frame.index, self.frame.index) tm.assert_index_equal(inp_frame2.index, self.frame.index) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) df[2][:2] = np.nan @@ -139,7 +140,9 @@ def test_dropna(self): assert_frame_equal(dropped, expected) # bad input - pytest.raises(ValueError, df.dropna, axis=3) + msg = "No axis named 3 for object type " + with pytest.raises(ValueError, match=msg): + df.dropna(axis=3) def test_drop_and_dropna_caching(self): # tst that cacher updates @@ -158,10 +161,15 @@ def test_drop_and_dropna_caching(self): def test_dropna_corner(self): # bad input - pytest.raises(ValueError, self.frame.dropna, how='foo') - pytest.raises(TypeError, self.frame.dropna, how=None) + msg = "invalid how option: foo" + with pytest.raises(ValueError, match=msg): + self.frame.dropna(how='foo') + msg = "must specify how or thresh" + with pytest.raises(TypeError, match=msg): + self.frame.dropna(how=None) # non-existent column - 8303 - pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X']) + with pytest.raises(KeyError, match=r"^\['X'\]$"): + self.frame.dropna(subset=['A', 'X']) def test_dropna_multiple_axes(self): df = DataFrame([[1, np.nan, 2, 3], @@ -226,8 +234,12 @@ def test_fillna(self): result = self.mixed_frame.fillna(value=0) result = self.mixed_frame.fillna(method='pad') - pytest.raises(ValueError, self.tsframe.fillna) - pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill') + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + self.tsframe.fillna() + msg = "Cannot specify both 'value' 
and 'method'" + with pytest.raises(ValueError, match=msg): + self.tsframe.fillna(5, method='ffill') # mixed numeric (but no float16) mf = self.mixed_float.reindex(columns=['A', 'B', 'D']) @@ -595,11 +607,18 @@ def test_fillna_invalid_method(self): def test_fillna_invalid_value(self): # list - pytest.raises(TypeError, self.frame.fillna, [1, 2]) + msg = ("\"value\" parameter must be a scalar or dict, but you passed" + " a \"{}\"") + with pytest.raises(TypeError, match=msg.format('list')): + self.frame.fillna([1, 2]) # tuple - pytest.raises(TypeError, self.frame.fillna, (1, 2)) + with pytest.raises(TypeError, match=msg.format('tuple')): + self.frame.fillna((1, 2)) # frame with series - pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame) + msg = ("\"value\" parameter must be a scalar, dict or Series, but you" + " passed a \"DataFrame\"") + with pytest.raises(TypeError, match=msg): + self.frame.iloc[:, 0].fillna(self.frame) def test_fillna_col_reordering(self): cols = ["COL." + str(i) for i in range(5, 0, -1)] diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 1f4da1bbb0470..6bef7e3f65b21 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -177,7 +177,9 @@ def test_insert(self): with pytest.raises(ValueError, match='already exists'): df.insert(1, 'a', df['b']) - pytest.raises(ValueError, df.insert, 1, 'c', df['b']) + msg = "cannot insert c, already exists" + with pytest.raises(ValueError, match=msg): + df.insert(1, 'c', df['b']) df.columns.name = 'some_name' # preserve columns name field diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index a5bed14cf06d2..799d548100b5e 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -187,8 +187,11 @@ def check(result, expected=None): # reindex is invalid! 
df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) - pytest.raises(ValueError, df.reindex, columns=['bar']) - pytest.raises(ValueError, df.reindex, columns=['bar', 'foo']) + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df.reindex(columns=['bar']) + with pytest.raises(ValueError, match=msg): + df.reindex(columns=['bar', 'foo']) # drop df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], @@ -306,7 +309,9 @@ def check(result, expected=None): # boolean with the duplicate raises df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype='float64') - pytest.raises(ValueError, lambda: df[df.A > 6]) + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df[df.A > 6] # dup aligining operations should work # GH 5185 @@ -323,7 +328,9 @@ def check(result, expected=None): columns=['A', 'A']) # not-comparing like-labelled - pytest.raises(ValueError, lambda: df1 == df2) + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + df1 == df2 df1r = df1.reindex_like(df2) result = df1r == df2 diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index d1f1299a5202e..19b6636978643 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import PY2 + import pandas as pd from pandas import DataFrame, Series, Timestamp from pandas.tests.frame.common import TestData @@ -71,6 +73,7 @@ def test_quantile_axis_mixed(self): with pytest.raises(TypeError): df.quantile(.5, axis=1, numeric_only=False) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_quantile_axis_parameter(self): # GH 9543/9544 @@ -92,8 +95,12 @@ def test_quantile_axis_parameter(self): result = df.quantile(.5, axis="columns") assert_series_equal(result, expected) - pytest.raises(ValueError, 
df.quantile, 0.1, axis=-1) - pytest.raises(ValueError, df.quantile, 0.1, axis="column") + msg = "No axis named -1 for object type " + with pytest.raises(ValueError, match=msg): + df.quantile(0.1, axis=-1) + msg = "No axis named column for object type " + with pytest.raises(ValueError, match=msg): + df.quantile(0.1, axis="column") def test_quantile_interpolation(self): # see gh-10174 diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 0d06d0006a9e2..ba02cb54bcea1 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -78,10 +78,10 @@ def test_query_numexpr(self): result = df.eval('A+1', engine='numexpr') assert_series_equal(result, self.expected2, check_names=False) else: - pytest.raises(ImportError, - lambda: df.query('A>0', engine='numexpr')) - pytest.raises(ImportError, - lambda: df.eval('A+1', engine='numexpr')) + with pytest.raises(ImportError): + df.query('A>0', engine='numexpr') + with pytest.raises(ImportError): + df.eval('A+1', engine='numexpr') class TestDataFrameEval(TestData): @@ -852,9 +852,10 @@ def test_str_query_method(self, parser, engine): for lhs, op, rhs in zip(lhs, ops, rhs): ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) - pytest.raises(NotImplementedError, df.query, ex, - engine=engine, parser=parser, - local_dict={'strings': df.strings}) + msg = r"'(Not)?In' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + df.query(ex, engine=engine, parser=parser, + local_dict={'strings': df.strings}) else: res = df.query('"a" == strings', engine=engine, parser=parser) assert_frame_equal(res, expect) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 127a64da38ba3..50c66d3f8db00 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -837,7 +837,9 @@ def test_replace_input_formats_listlike(self): expected.replace(to_rep[i], values[i], inplace=True) 
assert_frame_equal(result, expected) - pytest.raises(ValueError, df.replace, to_rep, values[1:]) + msg = r"Replacement lists must match in length\. Expecting 3 got 2" + with pytest.raises(ValueError, match=msg): + df.replace(to_rep, values[1:]) def test_replace_input_formats_scalar(self): df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], @@ -850,7 +852,9 @@ def test_replace_input_formats_scalar(self): for k, v in compat.iteritems(df)} assert_frame_equal(filled, DataFrame(expected)) - pytest.raises(TypeError, df.replace, to_rep, [np.nan, 0, '']) + msg = "value argument must be scalar, dict, or Series" + with pytest.raises(TypeError, match=msg): + df.replace(to_rep, [np.nan, 0, '']) # list to scalar to_rep = [np.nan, 0, ''] diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 4fe5172fefbcd..8abf3a6706886 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -394,7 +394,10 @@ def test_stack_mixed_levels(self): # When mixed types are passed and the ints are not level # names, raise - pytest.raises(ValueError, df2.stack, level=['animal', 0]) + msg = ("level should contain all level names or all level numbers, not" + " a mixture of the two") + with pytest.raises(ValueError, match=msg): + df2.stack(level=['animal', 0]) # GH #8584: Having 0 in the level names could raise a # strange error about lexsort depth diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 85e6373b384e4..8b29394bcab84 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.compat import lrange +from pandas.compat import PY2, lrange import pandas as pd from pandas import ( @@ -21,6 +21,7 @@ class TestDataFrameSorting(TestData): + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_sort_values(self): frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 
3], columns=list('ABC')) @@ -54,8 +55,9 @@ def test_sort_values(self): sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) assert_frame_equal(sorted_df, expected) - pytest.raises(ValueError, lambda: frame.sort_values( - by=['A', 'B'], axis=2, inplace=True)) + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=['A', 'B'], axis=2, inplace=True) # by row (axis=1): GH 10806 sorted_df = frame.sort_values(by=3, axis=1) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 31e81a9ca77c2..716a9e30e4cc3 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -8,7 +8,7 @@ import pytest import pytz -from pandas.compat import product +from pandas.compat import PY2, product import pandas as pd from pandas import ( @@ -395,7 +395,9 @@ def test_tshift(self): assert_frame_equal(unshifted, inferred_ts) no_freq = self.tsframe.iloc[[0, 5, 7], :] - pytest.raises(ValueError, no_freq.tshift) + msg = "Freq was not given and was not set in the index" + with pytest.raises(ValueError, match=msg): + no_freq.tshift() def test_truncate(self): ts = self.tsframe[::3] @@ -436,9 +438,10 @@ def test_truncate(self): truncated = ts.truncate(after=end_missing) assert_frame_equal(truncated, expected) - pytest.raises(ValueError, ts.truncate, - before=ts.index[-1] - ts.index.freq, - after=ts.index[0] + ts.index.freq) + msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" + with pytest.raises(ValueError, match=msg): + ts.truncate(before=ts.index[-1] - ts.index.freq, + after=ts.index[0] + ts.index.freq) def test_truncate_copy(self): index = self.tsframe.index @@ -781,14 +784,18 @@ def test_between_time_axis_raises(self, axis): ts = DataFrame(rand_data, index=rng, columns=rng) stime, etime = ('08:00:00', '09:00:00') + msg = "Index must be DatetimeIndex" if axis in ['columns', 1]: ts.index = mask - pytest.raises(TypeError, 
ts.between_time, stime, etime) - pytest.raises(TypeError, ts.between_time, stime, etime, axis=0) + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime) + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime, axis=0) if axis in ['index', 0]: ts.columns = mask - pytest.raises(TypeError, ts.between_time, stime, etime, axis=1) + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime, axis=1) def test_operation_on_NaT(self): # Both NaT and Timestamp are in DataFrame. @@ -829,6 +836,7 @@ def test_datetime_assignment_with_NaT_and_diff_time_units(self): 'new': [1e9, None]}, dtype='datetime64[ns]') tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_frame_to_period(self): K = 5 @@ -854,7 +862,9 @@ def test_frame_to_period(self): pts = df.to_period('M', axis=1) tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) - pytest.raises(ValueError, df.to_period, axis=2) + msg = "No axis named 2 for object type " + with pytest.raises(ValueError, match=msg): + df.to_period(axis=2) @pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert']) def test_tz_convert_and_localize(self, fn): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 61eefccede5dd..54a8712a9c645 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -109,8 +109,9 @@ def test_to_csv_from_csv2(self): xp.columns = col_aliases assert_frame_equal(xp, rs) - pytest.raises(ValueError, self.frame2.to_csv, path, - header=['AA', 'X']) + msg = "Writing 4 cols but got 2 aliases" + with pytest.raises(ValueError, match=msg): + self.frame2.to_csv(path, header=['AA', 'X']) def test_to_csv_from_csv3(self): From 076b5a85bcbed979c3b8df5dd825b1b2771894ff Mon Sep 17 00:00:00 2001 From: Bharat Raghunathan Date: Tue, 5 Mar 2019 08:37:07 +0530 Subject: [PATCH 069/110] DOC: Fix #24268 by updating description for keep in Series.nlargest 
(#25358) * DOC: Fix #24268 by updating description for keep --- pandas/core/series.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cada6663ce651..f6598ed1ee614 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3098,8 +3098,10 @@ def nlargest(self, n=5, keep='first'): When there are duplicate values that cannot all fit in a Series of `n` elements: - - ``first`` : take the first occurrences based on the index order - - ``last`` : take the last occurrences based on the index order + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. - ``all`` : keep all occurrences. This can result in a Series of size larger than `n`. @@ -3194,8 +3196,10 @@ def nsmallest(self, n=5, keep='first'): When there are duplicate values that cannot all fit in a Series of `n` elements: - - ``first`` : take the first occurrences based on the index order - - ``last`` : take the last occurrences based on the index order + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. - ``all`` : keep all occurrences. This can result in a Series of size larger than `n`. @@ -3236,7 +3240,7 @@ def nsmallest(self, n=5, keep='first'): Monserat 5200 dtype: int64 - The `n` largest elements where ``n=5`` by default. + The `n` smallest elements where ``n=5`` by default. 
>>> s.nsmallest() Monserat 5200 From d10bbce349b293e9f753d799cd2ad032d5ec7333 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 5 Mar 2019 10:24:46 +0000 Subject: [PATCH 070/110] fix MacPython / pandas-wheels ci failures (#25537) --- pandas/tests/test_sorting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 2a64947042979..7528566e8326e 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -409,8 +409,9 @@ def test_mixed_integer_from_list(self): def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) - msg = (r"'(<|>)' not supported between instances of" - r" 'datetime\.datetime' and 'int'|" + msg = (r"'(<|>)' not supported between instances of ('" + r"datetime\.datetime' and 'int'|'int' and 'datetime\.datetime" + r"')|" r"unorderable types: int\(\) > datetime\.datetime\(\)") if compat.PY2: # RuntimeWarning: tp_compare didn't return -1 or -2 for exception From a54852a66f341425a83beb70cfe35a105b719794 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 5 Mar 2019 18:44:10 +0000 Subject: [PATCH 071/110] TST/CLN: Remove more Panel tests (#25550) --- pandas/tests/indexing/common.py | 45 +- .../tests/indexing/multiindex/test_panel.py | 57 -- pandas/tests/indexing/test_loc.py | 24 - pandas/tests/indexing/test_panel.py | 104 ---- pandas/tests/test_panel.py | 573 ------------------ 5 files changed, 4 insertions(+), 799 deletions(-) delete mode 100644 pandas/tests/indexing/multiindex/test_panel.py delete mode 100644 pandas/tests/indexing/test_panel.py delete mode 100644 pandas/tests/test_panel.py diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 91ea38920c702..1b74eeea1a8c3 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -4,15 +4,13 @@ from warnings import catch_warnings, filterwarnings import numpy as np -import pytest from pandas.compat 
import lrange from pandas.core.dtypes.common import is_scalar from pandas import ( - DataFrame, Float64Index, MultiIndex, Panel, Series, UInt64Index, - date_range) + DataFrame, Float64Index, MultiIndex, Series, UInt64Index, date_range) from pandas.util import testing as tm from pandas.io.formats.printing import pprint_thing @@ -31,11 +29,10 @@ def _axify(obj, key, axis): return tuple(axes) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") class Base(object): """ indexing comprehensive base class """ - _objs = {'series', 'frame', 'panel'} + _objs = {'series', 'frame'} _typs = {'ints', 'uints', 'labels', 'mixed', 'ts', 'floats', 'empty', 'ts_rev', 'multi'} @@ -45,31 +42,18 @@ def setup_method(self, method): self.frame_ints = DataFrame(np.random.randn(4, 4), index=lrange(0, 8, 2), columns=lrange(0, 12, 3)) - with catch_warnings(record=True): - self.panel_ints = Panel(np.random.rand(4, 4, 4), - items=lrange(0, 8, 2), - major_axis=lrange(0, 12, 3), - minor_axis=lrange(0, 16, 4)) self.series_uints = Series(np.random.rand(4), index=UInt64Index(lrange(0, 8, 2))) self.frame_uints = DataFrame(np.random.randn(4, 4), index=UInt64Index(lrange(0, 8, 2)), columns=UInt64Index(lrange(0, 12, 3))) - self.panel_uints = Panel(np.random.rand(4, 4, 4), - items=UInt64Index(lrange(0, 8, 2)), - major_axis=UInt64Index(lrange(0, 12, 3)), - minor_axis=UInt64Index(lrange(0, 16, 4))) self.series_floats = Series(np.random.rand(4), index=Float64Index(range(0, 8, 2))) self.frame_floats = DataFrame(np.random.randn(4, 4), index=Float64Index(range(0, 8, 2)), columns=Float64Index(range(0, 12, 3))) - self.panel_floats = Panel(np.random.rand(4, 4, 4), - items=Float64Index(range(0, 8, 2)), - major_axis=Float64Index(range(0, 12, 3)), - minor_axis=Float64Index(range(0, 16, 4))) m_idces = [MultiIndex.from_product([[1, 2], [3, 4]]), MultiIndex.from_product([[5, 6], [7, 8]]), @@ -80,31 +64,19 @@ def setup_method(self, method): self.frame_multi = DataFrame(np.random.randn(4, 4), index=m_idces[0], 
columns=m_idces[1]) - self.panel_multi = Panel(np.random.rand(4, 4, 4), - items=m_idces[0], - major_axis=m_idces[1], - minor_axis=m_idces[2]) self.series_labels = Series(np.random.randn(4), index=list('abcd')) self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) - self.panel_labels = Panel(np.random.randn(4, 4, 4), - items=list('abcd'), - major_axis=list('ABCD'), - minor_axis=list('ZYXW')) self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) self.frame_mixed = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8]) - self.panel_mixed = Panel(np.random.randn(4, 4, 4), - items=[2, 4, 'null', 8]) self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4)) self.frame_ts = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4)) - self.panel_ts = Panel(np.random.randn(4, 4, 4), - items=date_range('20130101', periods=4)) dates_rev = (date_range('20130101', periods=4) .sort_values(ascending=False)) @@ -112,12 +84,9 @@ def setup_method(self, method): index=dates_rev) self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev) - self.panel_ts_rev = Panel(np.random.randn(4, 4, 4), - items=dates_rev) self.frame_empty = DataFrame({}) self.series_empty = Series({}) - self.panel_empty = Panel({}) # form agglomerates for o in self._objs: @@ -273,7 +242,7 @@ def _print(result, error=None): else: axes = list(axes) else: - axes = [0, 1, 2] + axes = [0, 1] # check for o in objs: @@ -296,10 +265,4 @@ def _call(obj=obj): k2 = key2 _eq(t, o, a, obj, key1, k2) - # Panel deprecations - if isinstance(obj, Panel): - with catch_warnings(): - filterwarnings("ignore", "\nPanel*", FutureWarning) - _call() - else: - _call() + _call() diff --git a/pandas/tests/indexing/multiindex/test_panel.py b/pandas/tests/indexing/multiindex/test_panel.py deleted file mode 100644 index 314009146911a..0000000000000 --- a/pandas/tests/indexing/multiindex/test_panel.py +++ /dev/null @@ -1,57 +0,0 @@ 
-import numpy as np -import pytest - -from pandas import DataFrame, MultiIndex, Panel, Series -from pandas.util import testing as tm - - -@pytest.mark.filterwarnings('ignore:\\nPanel:FutureWarning') -class TestMultiIndexPanel(object): - - def test_iloc_getitem_panel_multiindex(self): - - # GH 7199 - # Panel with multi-index - multi_index = MultiIndex.from_tuples([('ONE', 'one'), - ('TWO', 'two'), - ('THREE', 'three')], - names=['UPPER', 'lower']) - - simple_index = [x[0] for x in multi_index] - wd1 = Panel(items=['First', 'Second'], - major_axis=['a', 'b', 'c', 'd'], - minor_axis=multi_index) - - wd2 = Panel(items=['First', 'Second'], - major_axis=['a', 'b', 'c', 'd'], - minor_axis=simple_index) - - expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] - result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG - tm.assert_frame_equal(result1, expected1) - - expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] - result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] - tm.assert_frame_equal(result2, expected2) - - expected1 = DataFrame(index=['a'], columns=multi_index, - dtype='float64') - result1 = wd1.iloc[0, [0], [0, 1, 2]] - tm.assert_frame_equal(result1, expected1) - - expected2 = DataFrame(index=['a'], columns=simple_index, - dtype='float64') - result2 = wd2.iloc[0, [0], [0, 1, 2]] - tm.assert_frame_equal(result2, expected2) - - # GH 7516 - mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) - p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), - items=['a', 'b', 'c'], major_axis=mi, - minor_axis=['u', 'v', 'w']) - result = p.iloc[:, 1, 0] - expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u') - tm.assert_series_equal(result, expected) - - result = p.loc[:, (1, 'y'), 'u'] - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 29f70929624fc..c4f98b892feb7 100644 --- a/pandas/tests/indexing/test_loc.py +++ 
b/pandas/tests/indexing/test_loc.py @@ -95,8 +95,6 @@ def test_loc_getitem_int(self): typs=['ints', 'uints'], axes=0) self.check_result('int label', 'loc', 3, 'ix', 3, typs=['ints', 'uints'], axes=1) - self.check_result('int label', 'loc', 4, 'ix', 4, - typs=['ints', 'uints'], axes=2) self.check_result('int label', 'loc', 2, 'ix', 2, typs=['label'], fails=KeyError) @@ -137,14 +135,10 @@ def test_loc_getitem_label_list(self): typs=['ints', 'uints'], axes=0) self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) - self.check_result('list lbl', 'loc', [4, 8, 12], 'ix', [4, 8, 12], - typs=['ints', 'uints'], axes=2) self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix', ['a', 'b', 'd'], typs=['labels'], axes=0) self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix', ['A', 'B', 'C'], typs=['labels'], axes=1) - self.check_result('list lbl', 'loc', ['Z', 'Y', 'W'], 'ix', - ['Z', 'Y', 'W'], typs=['labels'], axes=2) self.check_result('list lbl', 'loc', [2, 8, 'null'], 'ix', [2, 8, 'null'], typs=['mixed'], axes=0) self.check_result('list lbl', 'loc', @@ -167,11 +161,6 @@ def test_loc_getitem_label_list_with_missing(self): typs=['ints', 'uints', 'floats'], axes=1, fails=KeyError) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], - typs=['ints', 'uints', 'floats'], - axes=2, fails=KeyError) - # GH 17758 - MultiIndex and missing keys with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.check_result('list lbl', 'loc', [(1, 3), (1, 4), (2, 5)], @@ -194,8 +183,6 @@ def test_loc_getitem_label_list_fails(self): # fails self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], typs=['ints', 'uints'], axes=1, fails=KeyError) - self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints', 'uints'], axes=2, fails=KeyError) def test_loc_getitem_label_array_like(self): # array like @@ 
-203,8 +190,6 @@ def test_loc_getitem_label_array_like(self): 'ix', [0, 2, 4], typs=['ints', 'uints'], axes=0) self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index, 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) - self.check_result('array like', 'loc', Series(index=[4, 8, 12]).index, - 'ix', [4, 8, 12], typs=['ints', 'uints'], axes=2) def test_loc_getitem_bool(self): # boolean indexers @@ -222,8 +207,6 @@ def test_loc_getitem_int_slice(self): typs=['ints', 'uints'], axes=0) self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6], typs=['ints', 'uints'], axes=1) - self.check_result('int slice2', 'loc', slice(4, 8), 'ix', [4, 8], - typs=['ints', 'uints'], axes=2) def test_loc_to_fail(self): @@ -318,8 +301,6 @@ def test_loc_getitem_label_slice(self): 'ix', slice('a', 'c'), typs=['labels'], axes=0) self.check_result('lab slice', 'loc', slice('A', 'C'), 'ix', slice('A', 'C'), typs=['labels'], axes=1) - self.check_result('lab slice', 'loc', slice('W', 'Z'), - 'ix', slice('W', 'Z'), typs=['labels'], axes=2) self.check_result('ts slice', 'loc', slice('20130102', '20130104'), 'ix', slice('20130102', '20130104'), @@ -327,9 +308,6 @@ def test_loc_getitem_label_slice(self): self.check_result('ts slice', 'loc', slice('20130102', '20130104'), 'ix', slice('20130102', '20130104'), typs=['ts'], axes=1, fails=TypeError) - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=2, fails=TypeError) # GH 14316 self.check_result('ts slice rev', 'loc', slice('20130104', '20130102'), @@ -339,8 +317,6 @@ def test_loc_getitem_label_slice(self): typs=['mixed'], axes=0, fails=TypeError) self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), typs=['mixed'], axes=1, fails=KeyError) - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=2, fails=KeyError) self.check_result('mixed slice', 'loc', slice(2, 4, 2), 'ix', slice( 2, 4, 2), 
typs=['mixed'], axes=0, fails=TypeError) diff --git a/pandas/tests/indexing/test_panel.py b/pandas/tests/indexing/test_panel.py deleted file mode 100644 index 8033d19f330b3..0000000000000 --- a/pandas/tests/indexing/test_panel.py +++ /dev/null @@ -1,104 +0,0 @@ -from warnings import catch_warnings - -import numpy as np -import pytest - -from pandas import Panel, date_range -from pandas.util import testing as tm - - -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") -class TestPanel(object): - - def test_iloc_getitem_panel(self): - - with catch_warnings(record=True): - # GH 7189 - p = Panel(np.arange(4 * 3 * 2).reshape(4, 3, 2), - items=['A', 'B', 'C', 'D'], - major_axis=['a', 'b', 'c'], - minor_axis=['one', 'two']) - - result = p.iloc[1] - expected = p.loc['B'] - tm.assert_frame_equal(result, expected) - - result = p.iloc[1, 1] - expected = p.loc['B', 'b'] - tm.assert_series_equal(result, expected) - - result = p.iloc[1, 1, 1] - expected = p.loc['B', 'b', 'two'] - assert result == expected - - # combined - result = p.iloc[0, [True, True], [0, 1]] - expected = p.loc['A', ['a', 'b'], ['one', 'two']] - tm.assert_frame_equal(result, expected) - - # out-of-bounds exception - with pytest.raises(IndexError): - p.iloc[tuple([10, 5])] - - with pytest.raises(IndexError): - p.iloc[0, [True, True], [0, 1, 2]] - - # trying to use a label - with pytest.raises(ValueError): - p.iloc[tuple(['j', 'D'])] - - # GH - p = Panel( - np.random.rand(4, 3, 2), items=['A', 'B', 'C', 'D'], - major_axis=['U', 'V', 'W'], minor_axis=['X', 'Y']) - expected = p['A'] - - result = p.iloc[0, :, :] - tm.assert_frame_equal(result, expected) - - result = p.iloc[0, [True, True, True], :] - tm.assert_frame_equal(result, expected) - - result = p.iloc[0, [True, True, True], [0, 1]] - tm.assert_frame_equal(result, expected) - - with pytest.raises(IndexError): - p.iloc[0, [True, True, True], [0, 1, 2]] - - with pytest.raises(IndexError): - p.iloc[0, [True, True, True], [2]] - - def 
test_iloc_panel_issue(self): - - with catch_warnings(record=True): - # see gh-3617 - p = Panel(np.random.randn(4, 4, 4)) - - assert p.iloc[:3, :3, :3].shape == (3, 3, 3) - assert p.iloc[1, :3, :3].shape == (3, 3) - assert p.iloc[:3, 1, :3].shape == (3, 3) - assert p.iloc[:3, :3, 1].shape == (3, 3) - assert p.iloc[1, 1, :3].shape == (3, ) - assert p.iloc[1, :3, 1].shape == (3, ) - assert p.iloc[:3, 1, 1].shape == (3, ) - - @pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning") - def test_panel_getitem(self): - - with catch_warnings(record=True): - # with an object-like - # GH 9140 - class TestObject(object): - - def __str__(self): - return "TestObject" - - obj = TestObject() - - p = Panel(np.random.randn(1, 5, 4), items=[obj], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - - expected = p.iloc[0] - result = p[obj] - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py deleted file mode 100644 index b418091de8d7f..0000000000000 --- a/pandas/tests/test_panel.py +++ /dev/null @@ -1,573 +0,0 @@ -# -*- coding: utf-8 -*- -# pylint: disable=W0612,E1101 -from collections import OrderedDict -from datetime import datetime - -import numpy as np -import pytest - -from pandas.compat import lrange - -from pandas import DataFrame, MultiIndex, Series, date_range, notna -import pandas.core.panel as panelm -from pandas.core.panel import Panel -import pandas.util.testing as tm -from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, - makeCustomDataframe as mkdf, makeMixedDataFrame) - -from pandas.tseries.offsets import MonthEnd - - -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") -class PanelTests(object): - panel = None - - def not_hashable(self): - c_empty = Panel() - c = Panel(Panel([[[1]]])) - pytest.raises(TypeError, hash, c_empty) - pytest.raises(TypeError, hash, c) - - 
-@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") -class SafeForSparse(object): - - # issue 7692 - def test_raise_when_not_implemented(self): - p = Panel(np.arange(3 * 4 * 5).reshape(3, 4, 5), - items=['ItemA', 'ItemB', 'ItemC'], - major_axis=date_range('20130101', periods=4), - minor_axis=list('ABCDE')) - d = p.sum(axis=1).iloc[0] - ops = ['add', 'sub', 'mul', 'truediv', - 'floordiv', 'div', 'mod', 'pow'] - for op in ops: - with pytest.raises(NotImplementedError): - getattr(p, op)(d, axis=0) - - -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") -class CheckIndexing(object): - - def test_delitem_and_pop(self): - - values = np.empty((3, 3, 3)) - values[0] = 0 - values[1] = 1 - values[2] = 2 - - panel = Panel(values, lrange(3), lrange(3), lrange(3)) - - # did we delete the right row? - - panelc = panel.copy() - del panelc[0] - tm.assert_frame_equal(panelc[1], panel[1]) - tm.assert_frame_equal(panelc[2], panel[2]) - - panelc = panel.copy() - del panelc[1] - tm.assert_frame_equal(panelc[0], panel[0]) - tm.assert_frame_equal(panelc[2], panel[2]) - - panelc = panel.copy() - del panelc[2] - tm.assert_frame_equal(panelc[1], panel[1]) - tm.assert_frame_equal(panelc[0], panel[0]) - - def test_setitem(self): - # bad shape - p = Panel(np.random.randn(4, 3, 2)) - msg = (r"shape of value must be \(3, 2\), " - r"shape of given object was \(4, 2\)") - with pytest.raises(ValueError, match=msg): - p[0] = np.random.randn(4, 2) - - def test_setitem_ndarray(self): - timeidx = date_range(start=datetime(2009, 1, 1), - end=datetime(2009, 12, 31), - freq=MonthEnd()) - lons_coarse = np.linspace(-177.5, 177.5, 72) - lats_coarse = np.linspace(-87.5, 87.5, 36) - P = Panel(items=timeidx, major_axis=lons_coarse, - minor_axis=lats_coarse) - data = np.random.randn(72 * 36).reshape((72, 36)) - key = datetime(2009, 2, 28) - P[key] = data - - assert_almost_equal(P[key].values, data) - - def test_set_minor_major(self): - # GH 11014 - df1 = DataFrame(['a', 'a', 'a', np.nan, 'a', 
np.nan]) - df2 = DataFrame([1.0, np.nan, 1.0, np.nan, 1.0, 1.0]) - panel = Panel({'Item1': df1, 'Item2': df2}) - - newminor = notna(panel.iloc[:, :, 0]) - panel.loc[:, :, 'NewMinor'] = newminor - assert_frame_equal(panel.loc[:, :, 'NewMinor'], - newminor.astype(object)) - - newmajor = notna(panel.iloc[:, 0, :]) - panel.loc[:, 'NewMajor', :] = newmajor - assert_frame_equal(panel.loc[:, 'NewMajor', :], - newmajor.astype(object)) - - def test_getitem_fancy_slice(self): - pass - - def test_ix_setitem_slice_dataframe(self): - a = Panel(items=[1, 2, 3], major_axis=[11, 22, 33], - minor_axis=[111, 222, 333]) - b = DataFrame(np.random.randn(2, 3), index=[111, 333], - columns=[1, 2, 3]) - - a.loc[:, 22, [111, 333]] = b - - assert_frame_equal(a.loc[:, 22, [111, 333]], b) - - def test_ix_align(self): - from pandas import Series - b = Series(np.random.randn(10), name=0) - b.sort_values() - df_orig = Panel(np.random.randn(3, 10, 2)) - df = df_orig.copy() - - df.loc[0, :, 0] = b - assert_series_equal(df.loc[0, :, 0].reindex(b.index), b) - - df = df_orig.swapaxes(0, 1) - df.loc[:, 0, 0] = b - assert_series_equal(df.loc[:, 0, 0].reindex(b.index), b) - - df = df_orig.swapaxes(1, 2) - df.loc[0, 0, :] = b - assert_series_equal(df.loc[0, 0, :].reindex(b.index), b) - - def test_ix_frame_align(self): - # GH3830, panel assignent by values/frame - for dtype in ['float64', 'int64']: - - panel = Panel(np.arange(40).reshape((2, 4, 5)), - items=['a1', 'a2'], dtype=dtype) - df1 = panel.iloc[0] - df2 = panel.iloc[1] - - tm.assert_frame_equal(panel.loc['a1'], df1) - tm.assert_frame_equal(panel.loc['a2'], df2) - - # Assignment by Value Passes for 'a2' - panel.loc['a2'] = df1.values - tm.assert_frame_equal(panel.loc['a1'], df1) - tm.assert_frame_equal(panel.loc['a2'], df1) - - # Assignment by DataFrame Ok w/o loc 'a2' - panel['a2'] = df2 - tm.assert_frame_equal(panel.loc['a1'], df1) - tm.assert_frame_equal(panel.loc['a2'], df2) - - # Assignment by DataFrame Fails for 'a2' - panel.loc['a2'] = df2 - 
tm.assert_frame_equal(panel.loc['a1'], df1) - tm.assert_frame_equal(panel.loc['a2'], df2) - - def test_logical_with_nas(self): - d = Panel({'ItemA': {'a': [np.nan, False]}, - 'ItemB': {'a': [True, True]}}) - - result = d['ItemA'] | d['ItemB'] - expected = DataFrame({'a': [np.nan, True]}) - assert_frame_equal(result, expected) - - # this is autodowncasted here - result = d['ItemA'].fillna(False) | d['ItemB'] - expected = DataFrame({'a': [True, True]}) - assert_frame_equal(result, expected) - - -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") -class TestPanel(PanelTests, CheckIndexing, SafeForSparse): - - def test_constructor_cast(self): - # can't cast - data = [[['foo', 'bar', 'baz']]] - pytest.raises(ValueError, Panel, data, dtype=float) - - def test_constructor_empty_panel(self): - empty = Panel() - assert len(empty.items) == 0 - assert len(empty.major_axis) == 0 - assert len(empty.minor_axis) == 0 - - def test_constructor_observe_dtype(self): - # GH #411 - panel = Panel(items=lrange(3), major_axis=lrange(3), - minor_axis=lrange(3), dtype='O') - assert panel.values.dtype == np.object_ - - def test_constructor_dtypes(self): - # GH #797 - - def _check_dtype(panel, dtype): - for i in panel.items: - assert panel[i].values.dtype.name == dtype - - # only nan holding types allowed here - for dtype in ['float64', 'float32', 'object']: - panel = Panel(items=lrange(2), major_axis=lrange(10), - minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - panel = Panel(np.array(np.random.randn(2, 10, 5), dtype=dtype), - items=lrange(2), - major_axis=lrange(10), - minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - panel = Panel(np.array(np.random.randn(2, 10, 5), dtype='O'), - items=lrange(2), - major_axis=lrange(10), - minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in 
['float64', 'float32', 'int64', 'int32', 'object']: - panel = Panel( - np.random.randn(2, 10, 5), - items=lrange(2), major_axis=lrange(10), - minor_axis=lrange(5), - dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - df1 = DataFrame(np.random.randn(2, 5), - index=lrange(2), columns=lrange(5)) - df2 = DataFrame(np.random.randn(2, 5), - index=lrange(2), columns=lrange(5)) - panel = Panel.from_dict({'a': df1, 'b': df2}, dtype=dtype) - _check_dtype(panel, dtype) - - def test_constructor_fails_with_not_3d_input(self): - msg = "The number of dimensions required is 3" - with pytest.raises(ValueError, match=msg): - Panel(np.random.randn(10, 2)) - - def test_ctor_orderedDict(self): - keys = list(set(np.random.randint(0, 5000, 100)))[ - :50] # unique random int keys - d = OrderedDict([(k, mkdf(10, 5)) for k in keys]) - p = Panel(d) - assert list(p.items) == keys - - p = Panel.from_dict(d) - assert list(p.items) == keys - - def test_from_dict_mixed_orient(self): - df = tm.makeDataFrame() - df['foo'] = 'bar' - - data = {'k1': df, 'k2': df} - - panel = Panel.from_dict(data, orient='minor') - - assert panel['foo'].values.dtype == np.object_ - assert panel['A'].values.dtype == np.float64 - - def test_constructor_error_msgs(self): - msg = (r"Shape of passed values is \(3, 4, 5\), " - r"indices imply \(4, 5, 5\)") - with pytest.raises(ValueError, match=msg): - Panel(np.random.randn(3, 4, 5), - lrange(4), lrange(5), lrange(5)) - - msg = (r"Shape of passed values is \(3, 4, 5\), " - r"indices imply \(5, 4, 5\)") - with pytest.raises(ValueError, match=msg): - Panel(np.random.randn(3, 4, 5), - lrange(5), lrange(4), lrange(5)) - - msg = (r"Shape of passed values is \(3, 4, 5\), " - r"indices imply \(5, 5, 4\)") - with pytest.raises(ValueError, match=msg): - Panel(np.random.randn(3, 4, 5), - lrange(5), lrange(5), lrange(4)) - - def test_apply_slabs(self): - # with multi-indexes - # GH7469 - index = 
MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ( - 'two', 'a'), ('two', 'b')]) - dfa = DataFrame(np.array(np.arange(12, dtype='int64')).reshape( - 4, 3), columns=list("ABC"), index=index) - dfb = DataFrame(np.array(np.arange(10, 22, dtype='int64')).reshape( - 4, 3), columns=list("ABC"), index=index) - p = Panel({'f': dfa, 'g': dfb}) - result = p.apply(lambda x: x.sum(), axis=0) - - # on windows this will be in32 - result = result.astype('int64') - expected = p.sum(0) - assert_frame_equal(result, expected) - - def test_apply_no_or_zero_ndim(self): - # GH10332 - self.panel = Panel(np.random.rand(5, 5, 5)) - - result_int = self.panel.apply(lambda df: 0, axis=[1, 2]) - result_float = self.panel.apply(lambda df: 0.0, axis=[1, 2]) - result_int64 = self.panel.apply( - lambda df: np.int64(0), axis=[1, 2]) - result_float64 = self.panel.apply(lambda df: np.float64(0.0), - axis=[1, 2]) - - expected_int = expected_int64 = Series([0] * 5) - expected_float = expected_float64 = Series([0.0] * 5) - - assert_series_equal(result_int, expected_int) - assert_series_equal(result_int64, expected_int64) - assert_series_equal(result_float, expected_float) - assert_series_equal(result_float64, expected_float64) - - def test_fillna(self): - # limit not implemented when only value is specified - p = Panel(np.random.randn(3, 4, 5)) - p.iloc[0:2, 0:2, 0:2] = np.nan - pytest.raises(NotImplementedError, - lambda: p.fillna(999, limit=1)) - - def test_to_frame_multi_major(self): - idx = MultiIndex.from_tuples( - [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]) - df = DataFrame([[1, 'a', 1], [2, 'b', 1], - [3, 'c', 1], [4, 'd', 1]], - columns=['A', 'B', 'C'], index=idx) - wp = Panel({'i1': df, 'i2': df}) - expected_idx = MultiIndex.from_tuples( - [ - (1, 'one', 'A'), (1, 'one', 'B'), - (1, 'one', 'C'), (1, 'two', 'A'), - (1, 'two', 'B'), (1, 'two', 'C'), - (2, 'one', 'A'), (2, 'one', 'B'), - (2, 'one', 'C'), (2, 'two', 'A'), - (2, 'two', 'B'), (2, 'two', 'C') - ], - names=[None, None, 
'minor']) - expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3, - 'c', 1, 4, 'd', 1], - 'i2': [1, 'a', 1, 2, 'b', - 1, 3, 'c', 1, 4, 'd', 1]}, - index=expected_idx) - result = wp.to_frame() - assert_frame_equal(result, expected) - - wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. GH #5773 - result = wp.to_frame() - assert_frame_equal(result, expected[1:]) - - idx = MultiIndex.from_tuples( - [(1, 'two'), (1, 'one'), (2, 'one'), (np.nan, 'two')]) - df = DataFrame([[1, 'a', 1], [2, 'b', 1], - [3, 'c', 1], [4, 'd', 1]], - columns=['A', 'B', 'C'], index=idx) - wp = Panel({'i1': df, 'i2': df}) - ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'), - (1, 'two', 'C'), - (1, 'one', 'A'), - (1, 'one', 'B'), - (1, 'one', 'C'), - (2, 'one', 'A'), - (2, 'one', 'B'), - (2, 'one', 'C'), - (np.nan, 'two', 'A'), - (np.nan, 'two', 'B'), - (np.nan, 'two', 'C')], - names=[None, None, 'minor']) - expected.index = ex_idx - result = wp.to_frame() - assert_frame_equal(result, expected) - - def test_to_frame_multi_major_minor(self): - cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( - 2, 'two'), (3, 'three'), (4, 'four')]) - df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14], - ['a', 'b', 'w', 'x'], - ['c', 'd', 'y', 'z'], [-1, -2, -3, -4], - [-5, -6, -7, -8]], columns=cols, index=idx) - wp = Panel({'i1': df, 'i2': df}) - - exp_idx = MultiIndex.from_tuples( - [(1, 'one', 'C_A', 'C_1'), (1, 'one', 'C_A', 'C_2'), - (1, 'one', 'C_B', 'C_1'), (1, 'one', 'C_B', 'C_2'), - (1, 'two', 'C_A', 'C_1'), (1, 'two', 'C_A', 'C_2'), - (1, 'two', 'C_B', 'C_1'), (1, 'two', 'C_B', 'C_2'), - (2, 'one', 'C_A', 'C_1'), (2, 'one', 'C_A', 'C_2'), - (2, 'one', 'C_B', 'C_1'), (2, 'one', 'C_B', 'C_2'), - (2, 'two', 'C_A', 'C_1'), (2, 'two', 'C_A', 'C_2'), - (2, 'two', 'C_B', 'C_1'), (2, 'two', 'C_B', 'C_2'), - (3, 'three', 'C_A', 'C_1'), (3, 'three', 'C_A', 'C_2'), - (3, 'three', 'C_B', 
'C_1'), (3, 'three', 'C_B', 'C_2'), - (4, 'four', 'C_A', 'C_1'), (4, 'four', 'C_A', 'C_2'), - (4, 'four', 'C_B', 'C_1'), (4, 'four', 'C_B', 'C_2')], - names=[None, None, None, None]) - exp_val = [[1, 1], [2, 2], [11, 11], [12, 12], - [3, 3], [4, 4], - [13, 13], [14, 14], ['a', 'a'], - ['b', 'b'], ['w', 'w'], - ['x', 'x'], ['c', 'c'], ['d', 'd'], [ - 'y', 'y'], ['z', 'z'], - [-1, -1], [-2, -2], [-3, -3], [-4, -4], - [-5, -5], [-6, -6], - [-7, -7], [-8, -8]] - result = wp.to_frame() - expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx) - assert_frame_equal(result, expected) - - def test_to_frame_multi_drop_level(self): - idx = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')]) - df = DataFrame({'A': [np.nan, 1, 2]}, index=idx) - wp = Panel({'i1': df, 'i2': df}) - result = wp.to_frame() - exp_idx = MultiIndex.from_tuples( - [(2, 'one', 'A'), (2, 'two', 'A')], - names=[None, None, 'minor']) - expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx) - assert_frame_equal(result, expected) - - def test_panel_dups(self): - - # GH 4960 - # duplicates in an index - - # items - data = np.random.randn(5, 100, 5) - no_dup_panel = Panel(data, items=list("ABCDE")) - panel = Panel(data, items=list("AACDE")) - - expected = no_dup_panel['A'] - result = panel.iloc[0] - assert_frame_equal(result, expected) - - expected = no_dup_panel['E'] - result = panel.loc['E'] - assert_frame_equal(result, expected) - - # major - data = np.random.randn(5, 5, 5) - no_dup_panel = Panel(data, major_axis=list("ABCDE")) - panel = Panel(data, major_axis=list("AACDE")) - - expected = no_dup_panel.loc[:, 'A'] - result = panel.iloc[:, 0] - assert_frame_equal(result, expected) - - expected = no_dup_panel.loc[:, 'E'] - result = panel.loc[:, 'E'] - assert_frame_equal(result, expected) - - # minor - data = np.random.randn(5, 100, 5) - no_dup_panel = Panel(data, minor_axis=list("ABCDE")) - panel = Panel(data, minor_axis=list("AACDE")) - - expected = no_dup_panel.loc[:, :, 'A'] 
- result = panel.iloc[:, :, 0] - assert_frame_equal(result, expected) - - expected = no_dup_panel.loc[:, :, 'E'] - result = panel.loc[:, :, 'E'] - assert_frame_equal(result, expected) - - def test_filter(self): - pass - - def test_shift(self): - # mixed dtypes #6959 - data = [('item ' + ch, makeMixedDataFrame()) - for ch in list('abcde')] - data = dict(data) - mixed_panel = Panel.from_dict(data, orient='minor') - shifted = mixed_panel.shift(1) - assert_series_equal(mixed_panel.dtypes, shifted.dtypes) - - def test_numpy_round(self): - values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12], - [-1566.213, 88.88], [-12, 94.5]], - [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12], - [272.212, -99.99], [23, -76.5]]] - p = Panel(values, items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B']) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.round(p, out=p) - - # removing Panel before NumPy enforces, so just ignore - @pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") - def test_multiindex_get(self): - ind = MultiIndex.from_tuples( - [('a', 1), ('a', 2), ('b', 1), ('b', 2)], - names=['first', 'second']) - wp = Panel(np.random.random((4, 5, 5)), - items=ind, - major_axis=np.arange(5), - minor_axis=np.arange(5)) - f1 = wp['a'] - f2 = wp.loc['a'] - - assert (f1.items == [1, 2]).all() - assert (f2.items == [1, 2]).all() - - MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - names=['first', 'second']) - - def test_repr_empty(self): - empty = Panel() - repr(empty) - - @pytest.mark.parametrize('bad_kwarg, exception, msg', [ - # errors must be 'ignore' or 'raise' - ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), - ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') - ]) - def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): - pan = Panel([[[1.5, np.nan, 3.]]]) - with pytest.raises(exception, match=msg): - 
pan.update(pan, **bad_kwarg) - - def test_update_raise_on_overlap(self): - pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) - - with pytest.raises(ValueError, match='Data overlaps'): - pan.update(pan, errors='raise') - - @pytest.mark.parametrize('raise_conflict', [True, False]) - def test_update_deprecation(self, raise_conflict): - pan = Panel([[[1.5, np.nan, 3.]]]) - other = Panel([[[]]]) - with tm.assert_produces_warning(FutureWarning): - pan.update(other, raise_conflict=raise_conflict) - - -def test_panel_index(): - index = panelm.panel_index([1, 2, 3, 4], [1, 2, 3]) - expected = MultiIndex.from_arrays([np.tile([1, 2, 3, 4], 3), - np.repeat([1, 2, 3], 4)], - names=['time', 'panel']) - tm.assert_index_equal(index, expected) - - -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") -def test_panel_np_all(): - wp = Panel({"A": DataFrame({'b': [1, 2]})}) - result = np.all(wp) - assert result == np.bool_(True) From 221be3b4adde0f45927803b1c593b56d4678faeb Mon Sep 17 00:00:00 2001 From: Jop Vermeer Date: Tue, 5 Mar 2019 22:11:48 +0100 Subject: [PATCH 072/110] BUG: caught typeError in series.at (#25506) (#25533) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/series.py | 2 +- pandas/tests/indexing/test_scalar.py | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 124ec8f4ab92c..e1a1c975b5ed8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -150,7 +150,7 @@ Timezones - Bug in :func:`to_datetime` with ``utc=True`` and datetime strings that would apply previously parsed UTC offsets to subsequent arguments (:issue:`24992`) - Bug in :func:`Timestamp.tz_localize` and :func:`Timestamp.tz_convert` does not propagate ``freq`` (:issue:`25241`) -- +- Bug in :func:`Series.at` where setting 
:class:`Timestamp` with timezone raises ``TypeError`` (:issue:`25506`) Numeric ^^^^^^^ diff --git a/pandas/core/series.py b/pandas/core/series.py index f6598ed1ee614..03fc26efa4516 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1229,7 +1229,7 @@ def _set_value(self, label, value, takeable=False): self._values[label] = value else: self.index._engine.set_value(self._values, label, value) - except KeyError: + except (KeyError, TypeError): # set using a non-recursive method self.loc[label] = value diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 0cd41562541d1..20053264ac4f1 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -185,6 +185,14 @@ def test_at_with_tz(self): result = df.at[0, 'date'] assert result == expected + def test_series_set_tz_timestamp(self, tz_naive_fixture): + # GH 25506 + ts = Timestamp('2017-08-05 00:00:00+0100', tz=tz_naive_fixture) + result = Series(ts) + result.at[1] = ts + expected = Series([ts, ts]) + tm.assert_series_equal(result, expected) + def test_mixed_index_at_iat_loc_iloc_series(self): # GH 19860 s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) From 615fbb3ce4bb171e8610c590a9f9899c60028d54 Mon Sep 17 00:00:00 2001 From: Max van Deursen Date: Tue, 5 Mar 2019 23:15:19 +0100 Subject: [PATCH 073/110] ENH: Add errors parameter to DataFrame.rename (#25535) * ENH: GH13473 Add errors parameter to DataFrame.rename --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/frame.py | 40 +++++++++++++++++++++------ pandas/core/generic.py | 24 +++++++++++++++- pandas/tests/frame/test_alter_axes.py | 19 ++++++++++++- 4 files changed, 74 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e1a1c975b5ed8..5dd6ce168a0de 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -25,7 +25,7 @@ Other Enhancements - ``Series.str`` has gained 
:meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) -- +- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6b4d95055d06d..eadffb779734f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3911,7 +3911,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, @rewrite_axis_style_signature('mapper', [('copy', True), ('inplace', False), - ('level', None)]) + ('level', None), + ('errors', 'ignore')]) def rename(self, *args, **kwargs): """ Alter axes labels. @@ -3924,30 +3925,49 @@ def rename(self, *args, **kwargs): Parameters ---------- - mapper, index, columns : dict-like or function, optional - dict-like or functions transformations to apply to + mapper : dict-like or function + Dict-like or functions transformations to apply to that axis' values. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and ``columns``. - axis : int or str, optional + index : dict-like or function + Alternative to specifying axis (``mapper, axis=0`` + is equivalent to ``index=mapper``). + columns : dict-like or function + Alternative to specifying axis (``mapper, axis=1`` + is equivalent to ``columns=mapper``). + axis : int or str Axis to target with ``mapper``. Can be either the axis name ('index', 'columns') or number (0, 1). The default is 'index'. 
- copy : boolean, default True - Also copy underlying data - inplace : boolean, default False + copy : bool, default True + Also copy underlying data. + inplace : bool, default False Whether to return a new DataFrame. If True then value of copy is ignored. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. Returns ------- DataFrame + DataFrame with the renamed axis labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". See Also -------- - DataFrame.rename_axis + DataFrame.rename_axis : Set the name of the axis. Examples -------- @@ -3973,6 +3993,10 @@ def rename(self, *args, **kwargs): 1 2 5 2 3 6 + >>> df.rename(index=str, columns={"A": "a", "C": "c"}, errors="raise") + Traceback (most recent call last): + KeyError: ['C'] not found in axis + Using axis-style parameters >>> df.rename(str.lower, axis='columns') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ee8f9cba951b3..7915d98662c9e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -981,11 +981,23 @@ def rename(self, *args, **kwargs): level : int or level name, default None In case of a MultiIndex, only rename labels in the specified level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. Returns ------- renamed : %(klass)s (new object) + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". 
+ See Also -------- NDFrame.rename_axis @@ -1065,6 +1077,7 @@ def rename(self, *args, **kwargs): inplace = kwargs.pop('inplace', False) level = kwargs.pop('level', None) axis = kwargs.pop('axis', None) + errors = kwargs.pop('errors', 'ignore') if axis is not None: # Validate the axis self._get_axis_number(axis) @@ -1085,10 +1098,19 @@ def rename(self, *args, **kwargs): if v is None: continue f = com._get_rename_function(v) - baxis = self._get_block_manager_axis(axis) if level is not None: level = self.axes[axis]._get_level_number(level) + + # GH 13473 + if not callable(v): + indexer = self.axes[axis].get_indexer_for(v) + if errors == 'raise' and len(indexer[indexer == -1]): + missing_labels = [label for index, label in enumerate(v) + if indexer[index] == -1] + raise KeyError('{} not found in axis' + .format(missing_labels)) + result._data = result._data.rename_axis(f, axis=baxis, copy=copy, level=level) result._clear_item_cache() diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index f4a2a5f8032a0..bc5cf30d096fd 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -872,6 +872,23 @@ def test_rename_bug2(self): columns=["a"]) tm.assert_frame_equal(df, expected) + def test_rename_errors_raises(self): + df = DataFrame(columns=['A', 'B', 'C', 'D']) + with pytest.raises(KeyError, match='\'E\'] not found in axis'): + df.rename(columns={'A': 'a', 'E': 'e'}, errors='raise') + + @pytest.mark.parametrize('mapper, errors, expected_columns', [ + ({'A': 'a', 'E': 'e'}, 'ignore', ['a', 'B', 'C', 'D']), + ({'A': 'a'}, 'raise', ['a', 'B', 'C', 'D']), + (str.lower, 'raise', ['a', 'b', 'c', 'd'])]) + def test_rename_errors(self, mapper, errors, expected_columns): + # GH 13473 + # rename now works with errors parameter + df = DataFrame(columns=['A', 'B', 'C', 'D']) + result = df.rename(columns=mapper, errors=errors) + expected = DataFrame(columns=expected_columns) + tm.assert_frame_equal(result, 
expected) + def test_reorder_levels(self): index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], @@ -1329,7 +1346,7 @@ def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) assert parameters == {"self", "mapper", "index", "columns", "axis", - "inplace", "copy", "level"} + "inplace", "copy", "level", "errors"} @pytest.mark.skipif(PY2, reason="inspect.signature") def test_reindex_signature(self): From fbfe4abd28803d180458ca446ef4ccf19af40a45 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Tue, 5 Mar 2019 21:17:06 -0700 Subject: [PATCH 074/110] TST: Skip IntervalTree construction overflow test on 32bit (#25558) --- pandas/tests/indexes/interval/test_interval_tree.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 46b2d12015a22..5d9ef2a9a6c32 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -172,6 +172,7 @@ def test_is_overlapping_trivial(self, closed, left, right): tree = IntervalTree(left, right, closed=closed) assert tree.is_overlapping is False + @pytest.mark.skipif(compat.is_platform_32bit(), reason='GH 23440') def test_construction_overflow(self): # GH 25485 left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 From 3e652ac565e054b763ccbe23006c02704f6cd1b1 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 6 Mar 2019 02:13:12 -0700 Subject: [PATCH 075/110] DOC: Small fixes to 0.24.2 whatsnew (#25559) --- doc/source/whatsnew/v0.24.2.rst | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index e80b1060e867d..f864fcd04e3d4 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -24,13 +24,11 @@ Fixed Regressions - Fixed issue in 
``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`) - Fixed regression in :meth:`DataFrame.replace` where ``regex=True`` was only replacing patterns matching the start of the string (:issue:`25259`) - - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) -- Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ```Categorical`` data (:issue:`25299`) -- Fixed regression in subtraction between :class:`Series` objects with ``datetime64[ns]`` dtype incorrectly raising ``OverflowError`` when the `Series` on the right contains null values (:issue:`25317`) -- Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) +- Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ``Categorical`` data (:issue:`25299`) +- Fixed regression in subtraction between :class:`Series` objects with ``datetime64[ns]`` dtype incorrectly raising ``OverflowError`` when the ``Series`` on the right contains null values (:issue:`25317`) +- Fixed regression in :class:`TimedeltaIndex` where ``np.sum(index)`` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) - - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. 
(:issue:`25318`) .. _whatsnew_0242.enhancements: @@ -60,7 +58,7 @@ Bug Fixes **I/O** -- Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`); +- Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`) - Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`) - Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`) - Bug where float indexes could have misaligned values when printing (:issue:`25061`) @@ -86,7 +84,7 @@ Bug Fixes **Reshaping** -- Bug in :meth:`pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) +- Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) - @@ -103,7 +101,7 @@ Bug Fixes - Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`) - -.. _whatsnew_0.242.contributors: +.. 
_whatsnew_0242.contributors: Contributors ~~~~~~~~~~~~ From 07625af622c650f2ea124e3ddc5a80df37e6a2aa Mon Sep 17 00:00:00 2001 From: Kapil Patel Date: Thu, 7 Mar 2019 02:56:46 +0530 Subject: [PATCH 076/110] minor typo error (#25574) --- doc/source/getting_started/10min.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 972b562cfebba..50c53a56174c8 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -103,7 +103,7 @@ Display the index, columns: df.columns :meth:`DataFrame.to_numpy` gives a NumPy representation of the underlying data. -Note that his can be an expensive operation when your :class:`DataFrame` has +Note that this can be an expensive operation when your :class:`DataFrame` has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: **NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column**. 
When you call From 1c72dda3d9813f6f5f826eb64911b5f5bb38d989 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 6 Mar 2019 22:35:55 +0000 Subject: [PATCH 077/110] BUG: in error message raised when invalid axis parameter (#25553) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/generic.py | 4 ++-- pandas/tests/frame/test_analytics.py | 6 ++++-- pandas/tests/frame/test_api.py | 3 ++- pandas/tests/frame/test_axis_select_reindex.py | 3 ++- pandas/tests/frame/test_missing.py | 3 ++- pandas/tests/frame/test_quantile.py | 6 ++++-- pandas/tests/frame/test_sorting.py | 3 ++- pandas/tests/frame/test_timeseries.py | 3 ++- pandas/tests/series/test_analytics.py | 4 +++- pandas/tests/series/test_missing.py | 6 ++++-- pandas/tests/series/test_rank.py | 6 ++++-- pandas/tests/series/test_sorting.py | 6 +++++- pandas/tests/series/test_timeseries.py | 6 ++++-- 14 files changed, 41 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5dd6ce168a0de..ea08a0a6fe07b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -196,7 +196,7 @@ Missing ^^^^^^^ - Fixed misleading exception message in :meth:`Series.missing` if argument ``order`` is required, but omitted (:issue:`10633`, :issue:`24014`). 
-- +- Fixed class type displayed in exception message in :meth:`DataFrame.dropna` if invalid ``axis`` parameter passed (:issue:`25555`) - MultiIndex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7915d98662c9e..d7f71df99cdb6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -358,7 +358,7 @@ def _get_axis_number(cls, axis): except KeyError: pass raise ValueError('No axis named {0} for object type {1}' - .format(axis, type(cls))) + .format(axis, cls)) @classmethod def _get_axis_name(cls, axis): @@ -372,7 +372,7 @@ def _get_axis_name(cls, axis): except KeyError: pass raise ValueError('No axis named {0} for object type {1}' - .format(axis, type(cls))) + .format(axis, cls)) def _get_axis(self, axis): name = self._get_axis_name(axis) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 3363a45149fff..2969e8be2db03 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1385,7 +1385,8 @@ def test_idxmin(self, float_frame, int_frame): skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type <class 'type'>" + msg = ("No axis named 2 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) @@ -1402,7 +1403,8 @@ def test_idxmax(self, float_frame, int_frame): skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type <class 'type'>" + msg = ("No axis named 2 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 118341276d799..badfa0ca8fd15 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -366,7 +366,8 @@ def test_swapaxes(self): self._assert_frame_equal(df.T, df.swapaxes(0, 1)) self._assert_frame_equal(df.T, df.swapaxes(1, 0)) self._assert_frame_equal(df, df.swapaxes(0, 0)) - msg = "No axis named 2 for object type <class 'type'>" + msg = ("No
axis named 2 for object type" + r" <class 'pandas.core(.sparse)?.frame.(Sparse)?DataFrame'>") with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index fb00776b33cbb..cf8c55f00b061 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -1067,7 +1067,8 @@ def test_reindex_axis(self): reindexed2 = self.intframe.reindex(index=rows) assert_frame_equal(reindexed1, reindexed2) - msg = "No axis named 2 for object type <class 'type'>" + msg = ("No axis named 2 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): self.intframe.reindex_axis(rows, axis=2) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 2f3b0a9f76de9..189531c7b4459 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -140,7 +140,8 @@ def test_dropna(self): assert_frame_equal(dropped, expected) # bad input - msg = "No axis named 3 for object type <class 'type'>" + msg = ("No axis named 3 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): df.dropna(axis=3) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 19b6636978643..facbfdd0c032b 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -95,10 +95,12 @@ def test_quantile_axis_parameter(self): result = df.quantile(.5, axis="columns") assert_series_equal(result, expected) - msg = "No axis named -1 for object type <class 'type'>" + msg = ("No axis named -1 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) - msg = "No axis named column for object type <class 'type'>" + msg = ("No axis named column for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") diff --git a/pandas/tests/frame/test_sorting.py
b/pandas/tests/frame/test_sorting.py index 8b29394bcab84..baf50982d8ab0 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -55,7 +55,8 @@ def test_sort_values(self): sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) assert_frame_equal(sorted_df, expected) - msg = "No axis named 2 for object type <class 'type'>" + msg = ("No axis named 2 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): frame.sort_values(by=['A', 'B'], axis=2, inplace=True) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 716a9e30e4cc3..9965be9091451 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -862,7 +862,8 @@ def test_frame_to_period(self): pts = df.to_period('M', axis=1) tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) - msg = "No axis named 2 for object type <class 'type'>" + msg = ("No axis named 2 for object type" + " <class 'pandas.core.frame.DataFrame'>") with pytest.raises(ValueError, match=msg): df.to_period(axis=2) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1f265d574da15..d7d9c526503cb 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -771,6 +771,7 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_ptp(self): # GH21614 N = 1000 @@ -796,7 +797,8 @@ def test_ptp(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(s.ptp(level=0, skipna=False), expected) - msg = r"No axis named 1 for object type <(class|type) 'type'>" + msg = ("No axis named 1 for object type" + " <class 'pandas.core.series.Series'>") with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index f07dd1dfb5fda..ef9e575e60385 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -10,7 +10,7 @@ import
pytz from pandas._libs.tslib import iNaT -from pandas.compat import range +from pandas.compat import PY2, range from pandas.errors import PerformanceWarning import pandas.util._test_decorators as td @@ -654,6 +654,7 @@ def test_timedelta64_nan(self): # expected = (datetime_series >= -0.5) & (datetime_series <= 0.5) # assert_series_equal(selector, expected) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_dropna_empty(self): s = Series([]) assert len(s.dropna()) == 0 @@ -661,7 +662,8 @@ def test_dropna_empty(self): assert len(s) == 0 # invalid axis - msg = r"No axis named 1 for object type <(class|type) 'type'>" + msg = ("No axis named 1 for object type" + " <class 'pandas.core.series.Series'>") with pytest.raises(ValueError, match=msg): s.dropna(axis=1) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index dfcda889269ee..373083c077e28 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -9,7 +9,7 @@ from pandas._libs.algos import Infinity, NegInfinity from pandas._libs.tslib import iNaT import pandas.compat as compat -from pandas.compat import product +from pandas.compat import PY2, product import pandas.util._test_decorators as td from pandas import NaT, Series, Timestamp, date_range @@ -203,10 +203,12 @@ def test_rank_categorical(self): assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_rank_signature(self): s = Series([0, 1]) s.rank(method='average') - msg = r"No axis named average for object type <(class|type) 'type'>" + msg = ("No axis named average for object type" + " <class 'pandas.core.series.Series'>") with pytest.raises(ValueError, match=msg): s.rank('average') diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 216f84c8f077a..162fa4ac9ab52 100644 --- a/pandas/tests/series/test_sorting.py +++
b/pandas/tests/series/test_sorting.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import PY2 + from pandas import Categorical, DataFrame, IntervalIndex, MultiIndex, Series import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal, assert_series_equal @@ -88,6 +90,7 @@ def test_sort_values(self): with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_sort_index(self): rindex = list(self.ts.index) random.shuffle(rindex) @@ -109,7 +112,8 @@ def test_sort_index(self): sorted_series = random_order.sort_index(axis=0) assert_series_equal(sorted_series, self.ts) - msg = r"No axis named 1 for object type <(class|type) 'type'>" + msg = ("No axis named 1 for object type" + " <class 'pandas.core.series.Series'>") with pytest.raises(ValueError, match=msg): random_order.sort_values(axis=1) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index d082b023e1f27..b6896685dd474 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -8,7 +8,7 @@ from pandas._libs.tslib import iNaT from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas.compat import StringIO, lrange, product +from pandas.compat import PY2, StringIO, lrange, product from pandas.errors import NullFrequencyError import pandas.util._test_decorators as td @@ -867,6 +867,7 @@ def test_between_time_formats(self): for time_string in strings: assert len(ts.between_time(*time_string)) == expected_length + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_between_time_axis(self): # issue 8839 rng = date_range('1/1/2000', periods=100, freq='10min') @@ -876,7 +877,8 @@ def test_between_time_axis(self): assert len(ts.between_time(stime, etime)) == expected_length assert len(ts.between_time(stime, etime, axis=0)) == expected_length - msg = r"No axis named 1 for object type <(class|type)
'type'>" + msg = ("No axis named 1 for object type" + " <class 'pandas.core.series.Series'>") with pytest.raises(ValueError, match=msg): ts.between_time(stime, etime, axis=1) From a07c7b9698a6e63b12bacbef1084e52865223a14 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 6 Mar 2019 16:39:50 -0600 Subject: [PATCH 078/110] BLD: Fixed pip install with no numpy (#25568) --- doc/source/whatsnew/v0.24.2.rst | 1 + setup.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index f864fcd04e3d4..4ca9d57f3a2e5 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -30,6 +30,7 @@ Fixed Regressions - Fixed regression in :class:`TimedeltaIndex` where ``np.sum(index)`` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) +- Fixed pip installing from source into an environment without NumPy (:issue:`25193`) .. _whatsnew_0242.enhancements: diff --git a/setup.py b/setup.py index c8d29a2e4be5a..a83e07b50ed57 100755 --- a/setup.py +++ b/setup.py @@ -477,6 +477,11 @@ def maybe_cythonize(extensions, *args, **kwargs): # Avoid running cythonize on `python setup.py clean` # See https://github.com/cython/cython/issues/1495 return extensions + if not cython: + # Avoid trying to look up numpy when installing from sdist + # https://github.com/pandas-dev/pandas/issues/25193 + # TODO: See if this can be removed after pyproject.toml added.
+ return extensions numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') # TODO: Is this really necessary here? @@ -485,11 +490,8 @@ def maybe_cythonize(extensions, *args, **kwargs): numpy_incl not in ext.include_dirs): ext.include_dirs.append(numpy_incl) - if cython: - build_ext.render_templates(_pxifiles) - return cythonize(extensions, *args, **kwargs) - else: - return extensions + build_ext.render_templates(_pxifiles) + return cythonize(extensions, *args, **kwargs) def srcpath(name=None, suffix='.pyx', subdir='src'): From 09f4484416cb9163c10de7aa90c1755f08283e38 Mon Sep 17 00:00:00 2001 From: Joel Ostblom Date: Wed, 6 Mar 2019 18:21:07 -0800 Subject: [PATCH 079/110] Document the behavior of `axis=None` with `style.background_gradient` (#25551) --- pandas/io/formats/style.py | 42 +++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index c8b5dc6b9b7c0..b872f86eb8683 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -576,10 +576,10 @@ def apply(self, func, axis=0, subset=None, **kwargs): on ``axis``), and return an object with the same shape. Must return a DataFrame with identical index and column labels when ``axis=None`` - axis : int, str or None - apply to each column (``axis=0`` or ``'index'``) - or to each row (``axis=1`` or ``'columns'``) or - to the entire DataFrame at once with ``axis=None`` + axis : {0 or 'index', 1 or 'columns', None}, default 0 + apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. subset : IndexSlice a valid indexer to limit ``data`` to *before* applying the function. Consider using a pandas.IndexSlice @@ -894,10 +894,12 @@ def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0, matplotlib colormap low, high : float compress the range by these values. 
- axis : int or str - 1 or 'columns' for columnwise, 0 or 'index' for rowwise + axis : {0 or 'index', 1 or 'columns', None}, default 0 + apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. subset : IndexSlice - a valid slice for ``data`` to limit the style application to + a valid slice for ``data`` to limit the style application to. text_color_threshold : float or int luminance threshold for determining text color. Facilitates text visibility across varying background colors. From 0 to 1. @@ -1081,10 +1083,10 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100, ---------- subset : IndexSlice, optional A valid slice for `data` to limit the style application to. - axis : int, str or None, default 0 - Apply to each column (`axis=0` or `'index'`) - or to each row (`axis=1` or `'columns'`) or - to the entire DataFrame at once with `axis=None`. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. color : str or 2-tuple/list If a str is passed, the color is the same for both negative and positive numbers. If 2-tuple/list is used, the @@ -1149,11 +1151,12 @@ def highlight_max(self, subset=None, color='yellow', axis=0): Parameters ---------- subset : IndexSlice, default None - a valid slice for ``data`` to limit the style application to + a valid slice for ``data`` to limit the style application to. color : str, default 'yellow' - axis : int, str, or None; default 0 - 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise, - or ``None`` for tablewise + axis : {0 or 'index', 1 or 'columns', None}, default 0 + apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. 
Returns ------- @@ -1169,11 +1172,12 @@ def highlight_min(self, subset=None, color='yellow', axis=0): Parameters ---------- subset : IndexSlice, default None - a valid slice for ``data`` to limit the style application to + a valid slice for ``data`` to limit the style application to. color : str, default 'yellow' - axis : int, str, or None; default 0 - 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise, - or ``None`` for tablewise + axis : {0 or 'index', 1 or 'columns', None}, default 0 + apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. Returns ------- From 74a9ae309d057d5da50717ffb18fd3dd9212fad7 Mon Sep 17 00:00:00 2001 From: ezcitron <36384768+ezcitron@users.noreply.github.com> Date: Wed, 6 Mar 2019 18:22:19 -0800 Subject: [PATCH 080/110] fix minor typos in dsintro.rst (#25579) --- doc/source/getting_started/dsintro.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index 94bec5c5bc83d..c8a2399739cd5 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -576,14 +576,14 @@ To write code compatible with all versions of Python, split the assignment in tw .. warning:: - Dependent assignment maybe subtly change the behavior of your code between + Dependent assignment may subtly change the behavior of your code between Python 3.6 and older versions of Python. 
- If you wish write code that supports versions of python before and after 3.6, + If you wish to write code that supports versions of python before and after 3.6, you'll need to take care when passing ``assign`` expressions that - * Updating an existing column - * Referring to the newly updated column in the same ``assign`` + * Update an existing column + * Refer to the newly updated column in the same ``assign`` For example, we'll update column "A" and then refer to it when creating "B". @@ -665,8 +665,8 @@ row-wise. For example: df - df.iloc[0] -In the special case of working with time series data, and the DataFrame index -also contains dates, the broadcasting will be column-wise: +In the special case of working with time series data, if the DataFrame index +contains dates, the broadcasting will be column-wise: .. ipython:: python :okwarning: From b72e7ed320de4d5b9aa8e0b7f2166cee2c7e6fdf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Mar 2019 08:24:55 -0600 Subject: [PATCH 081/110] BUG: Handle readonly arrays in period_array (#25556) * BUG: Handle readonly arrays in period_array Closes #25403 --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/_libs/tslibs/period.pyx | 8 ++++++-- pandas/tests/arrays/test_period.py | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 4ca9d57f3a2e5..7da99590d5a0a 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -29,6 +29,7 @@ Fixed Regressions - Fixed regression in subtraction between :class:`Series` objects with ``datetime64[ns]`` dtype incorrectly raising ``OverflowError`` when the ``Series`` on the right contains null values (:issue:`25317`) - Fixed regression in :class:`TimedeltaIndex` where ``np.sum(index)`` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string 
with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) +- Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index a5a50ea59753d..c8eaa2cfd85c2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1438,7 +1438,9 @@ cdef accessor _get_accessor_func(int code): @cython.wraparound(False) @cython.boundscheck(False) -def extract_ordinals(object[:] values, freq): +def extract_ordinals(ndarray[object] values, freq): + # TODO: Change type to const object[:] when Cython supports that. + cdef: Py_ssize_t i, n = len(values) int64_t[:] ordinals = np.empty(n, dtype=np.int64) @@ -1472,7 +1474,9 @@ def extract_ordinals(object[:] values, freq): return ordinals.base # .base to access underlying np.ndarray -def extract_freq(object[:] values): +def extract_freq(ndarray[object] values): + # TODO: Change type to const object[:] when Cython supports that. 
+ cdef: Py_ssize_t i, n = len(values) object p diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index affe3b3854490..99255d819d28e 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -41,6 +41,22 @@ def test_period_array_ok(data, freq, expected): tm.assert_numpy_array_equal(result, expected) +def test_period_array_readonly_object(): + # https://github.com/pandas-dev/pandas/issues/25403 + pa = period_array([pd.Period('2019-01-01')]) + arr = np.asarray(pa, dtype='object') + arr.setflags(write=False) + + result = period_array(arr) + tm.assert_period_array_equal(result, pa) + + result = pd.Series(arr) + tm.assert_series_equal(result, pd.Series(pa)) + + result = pd.DataFrame({"A": arr}) + tm.assert_frame_equal(result, pd.DataFrame({"A": pa})) + + def test_from_datetime64_freq_changes(): # https://github.com/pandas-dev/pandas/issues/23438 arr = pd.date_range("2017", periods=3, freq="D") From 89a67f96c447c40b92ce7619f5ba26254ee72e73 Mon Sep 17 00:00:00 2001 From: Caleb Braun Date: Thu, 7 Mar 2019 15:40:39 -0500 Subject: [PATCH 082/110] DOC: Fix typo in tz_localize (#25598) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d7f71df99cdb6..0b81576404e2f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9321,7 +9321,7 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, times nonexistent : str, default 'raise' A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. Valid valuse are: + where clocks moved forward due to DST. 
Valid values are: - 'shift_forward' will shift the nonexistent time forward to the closest existing time From e6670da0b387cb808c0952ccae61d06f86aefef8 Mon Sep 17 00:00:00 2001 From: Johan von Forstner Date: Thu, 7 Mar 2019 21:42:28 +0100 Subject: [PATCH 083/110] BUG: secondary y axis could not be set to log scale (#25545) (#25586) --- doc/source/whatsnew/v0.24.2.rst | 2 +- pandas/plotting/_core.py | 3 +++ pandas/tests/plotting/test_series.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 7da99590d5a0a..2c6d1e01ed89b 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -92,7 +92,7 @@ Bug Fixes **Visualization** -- +- Bug in :meth:`Series.plot` where a secondary y axis could not be set to log scale (:issue:`25545`) - - diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 48d870bfc2e03..0ea92a57ac3f8 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -287,6 +287,9 @@ def _maybe_right_yaxis(self, ax, axes_num): if not self._has_plotted_object(orig_ax): # no data on left y orig_ax.get_yaxis().set_visible(False) + + if self.logy or self.loglog: + new_ax.set_yscale('log') return new_ax def _setup_subplots(self): diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 07a4b168a66f1..a234ea8f9416b 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -570,6 +570,18 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() tm.close() + @pytest.mark.slow + def test_secondary_logy(self): + # GH 25545 + s1 = Series(np.random.randn(30)) + s2 = Series(np.random.randn(30)) + + ax1 = s1.plot(logy=True) + ax2 = s2.plot(secondary_y=True, logy=True) + + assert ax1.get_yscale() == 'log' + assert ax2.get_yscale() == 'log' + @pytest.mark.slow def test_plot_fails_with_dupe_color_and_style(self): x = 
Series(randn(2)) From 5c159d2ea2450cc6501722c13b916581706d19fd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 Mar 2019 21:59:39 +0000 Subject: [PATCH 084/110] TST: add test for groupby on list of empty list (#25589) --- pandas/tests/groupby/test_groupby.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6a11f0ae9b44a..c062fb90ca43b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1714,3 +1714,12 @@ def test_groupby_multiindex_nat(): result = ser.groupby(level=1).mean() expected = pd.Series([3., 2.5], index=["a", "b"]) assert_series_equal(result, expected) + + +def test_groupby_empty_list_raises(): + # GH 5289 + values = zip(range(10), range(10)) + df = DataFrame(values, columns=['apple', 'b']) + msg = "Grouper and axis must be same length" + with pytest.raises(ValueError, match=msg): + df.groupby([[]]) From 0d893d080da96ebed42302032fe337d77ce5d9eb Mon Sep 17 00:00:00 2001 From: Richard Eames Date: Fri, 8 Mar 2019 14:52:27 -0700 Subject: [PATCH 085/110] TYPING: Small fixes to make stubgen happy (#25576) --- pandas/core/arrays/period.py | 2 +- pandas/core/arrays/sparse.py | 2 +- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/internals/blocks.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 3ddceb8c2839d..0ec1bc7a84231 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -710,7 +710,7 @@ def _raise_on_incompatible(left, right): # Constructor Helpers def period_array(data, freq=None, copy=False): - # type: (Sequence[Optional[Period]], Optional[Tick]) -> PeriodArray + # type: (Sequence[Optional[Period]], Optional[Tick], bool) -> PeriodArray """ Construct a new PeriodArray from a sequence of Period scalars. 
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 6114e578dc90f..9be2c9af169e8 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -397,6 +397,7 @@ def _get_fill(arr): def _sparse_array_op(left, right, op, name): + # type: (SparseArray, SparseArray, Callable, str) -> Any """ Perform a binary operation between two arrays. @@ -413,7 +414,6 @@ def _sparse_array_op(left, right, op, name): ------- SparseArray """ - # type: (SparseArray, SparseArray, Callable, str) -> Any if name.startswith('__'): # For lookups in _libs.sparse we need non-dunder op name name = name[2:-2] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 926da40deaff2..36dcb692bb079 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1041,7 +1041,7 @@ def _bool_agg(self, val_test, skipna): """ def objs_to_bool(vals): - # type: np.ndarray -> (np.ndarray, typing.Type) + # type: (np.ndarray) -> (np.ndarray, typing.Type) if is_object_dtype(vals): vals = np.array([bool(x) for x in vals]) else: @@ -1743,7 +1743,7 @@ def quantile(self, q=0.5, interpolation='linear'): """ def pre_processor(vals): - # type: np.ndarray -> (np.ndarray, Optional[typing.Type]) + # type: (np.ndarray) -> (np.ndarray, Optional[typing.Type]) if is_object_dtype(vals): raise TypeError("'quantile' cannot be performed against " "'object' dtypes!") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4e2c04dba8b04..ada663556899b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1827,13 +1827,13 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None, placement=self.mgr_locs) def shift(self, periods, axis=0, fill_value=None): + # type: (int, Optional[BlockPlacement], Any) -> List[ExtensionBlock] """ Shift the block by `periods`. Dispatches to underlying ExtensionArray and re-boxes in an ExtensionBlock. 
""" - # type: (int, Optional[BlockPlacement]) -> List[ExtensionBlock] return [ self.make_block_same_class( self.values.shift(periods=periods, fill_value=fill_value), From cc4a7e510f4bbd5aeb8138de16b888f4ce1551da Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 9 Mar 2019 02:02:55 +0000 Subject: [PATCH 086/110] CLN: Parmeterize test cases (#25355) --- pandas/tests/io/test_sql.py | 50 +++++++++++++------------------------ 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9d0bce3b342b4..806bd7f2b7c93 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -704,45 +704,29 @@ def test_complex(self): # Complex data type should raise error pytest.raises(ValueError, df.to_sql, 'test_complex', self.conn) - def test_to_sql_index_label(self): - temp_frame = DataFrame({'col1': range(4)}) - + @pytest.mark.parametrize("index_name,index_label,expected", [ # no index name, defaults to 'index' - sql.to_sql(temp_frame, 'test_index_label', self.conn) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == 'index' - + (None, None, "index"), # specifying index_label - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label='other_label') - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == "other_label" - + (None, "other_label", "other_label"), # using the index name - temp_frame.index.name = 'index_name' - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace') - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == "index_name" - + ("index_name", None, "index_name"), # has index name, but specifying index_label - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label='other_label') - frame = sql.read_sql_query('SELECT * FROM test_index_label', 
self.conn) - assert frame.columns[0] == "other_label" - + ("index_name", "other_label", "other_label"), # index name is integer - temp_frame.index.name = 0 - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace') - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == "0" - - temp_frame.index.name = None + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ]) + def test_to_sql_index_label(self, index_name, + index_label, expected): + temp_frame = DataFrame({'col1': range(4)}) + temp_frame.index.name = index_name + query = 'SELECT * FROM test_index_label' sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label=0) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == "0" + index_label=index_label) + frame = sql.read_sql_query(query, self.conn) + assert frame.columns[0] == expected def test_to_sql_index_label_multiindex(self): temp_frame = DataFrame({'col1': range(4)}, From 69905c5b01a5b84450eb158a029d985e24433964 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 9 Mar 2019 16:29:32 +0000 Subject: [PATCH 087/110] STY: use pytest.raises context manager (generic) (#25603) --- pandas/tests/generic/test_generic.py | 68 ++++++++++++++++++---------- pandas/tests/generic/test_series.py | 23 +++++++--- 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index c2f6cbf4c564c..6f2707f764920 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from pandas.compat import PY3, range, zip +from pandas.compat import PY2, PY3, range, zip from pandas.core.dtypes.common import is_scalar @@ -16,8 +16,6 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -import 
pandas.io.formats.printing as printing - # ---------------------------------------------------------------------- # Generic types test cases @@ -135,37 +133,51 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - pytest.raises(ValueError, lambda: bool(obj == 0)) - pytest.raises(ValueError, lambda: bool(obj == 1)) - pytest.raises(ValueError, lambda: bool(obj)) + msg = "The truth value of a {} is ambiguous".format( + self._typ.__name__) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) obj = self._construct(shape=4, value=1) - pytest.raises(ValueError, lambda: bool(obj == 0)) - pytest.raises(ValueError, lambda: bool(obj == 1)) - pytest.raises(ValueError, lambda: bool(obj)) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) obj = self._construct(shape=4, value=np.nan) - pytest.raises(ValueError, lambda: bool(obj == 0)) - pytest.raises(ValueError, lambda: bool(obj == 1)) - pytest.raises(ValueError, lambda: bool(obj)) + with pytest.raises(ValueError, match=msg): + bool(obj == 0) + with pytest.raises(ValueError, match=msg): + bool(obj == 1) + with pytest.raises(ValueError, match=msg): + bool(obj) # empty obj = self._construct(shape=0) - pytest.raises(ValueError, lambda: bool(obj)) + with pytest.raises(ValueError, match=msg): + bool(obj) # invalid behaviors obj1 = self._construct(shape=4, value=1) obj2 = self._construct(shape=4, value=1) - def f(): + with pytest.raises(ValueError, match=msg): if obj1: - printing.pprint_thing("this works and shouldn't") + pass - pytest.raises(ValueError, f) - pytest.raises(ValueError, lambda: obj1 and obj2) - pytest.raises(ValueError, lambda: obj1 or obj2) - pytest.raises(ValueError, lambda: not obj1) + with 
pytest.raises(ValueError, match=msg): + obj1 and obj2 + with pytest.raises(ValueError, match=msg): + obj1 or obj2 + with pytest.raises(ValueError, match=msg): + not obj1 def test_downcast(self): # test close downcasting @@ -200,9 +212,10 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"), - ("B", "str"), - ("C", "int32")]) + msg = ("compound dtypes are not implemented in the {} constructor" + .format(self._typ.__name__)) + with pytest.raises(NotImplementedError, match=msg): + f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) # these work (though results may be unexpected) f('int64') @@ -725,6 +738,7 @@ def test_sample(sel): with pytest.raises(ValueError): df.sample(1, weights=s4) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_squeeze(self): # noop for s in [tm.makeFloatSeries(), tm.makeStringSeries(), @@ -755,8 +769,14 @@ def test_squeeze(self): tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) tm.assert_series_equal(df.squeeze(axis='columns'), df.iloc[:, 0]) assert df.squeeze() == df.iloc[0, 0] - pytest.raises(ValueError, df.squeeze, axis=2) - pytest.raises(ValueError, df.squeeze, axis='x') + msg = ("No axis named 2 for object type ") + with pytest.raises(ValueError, match=msg): + df.squeeze(axis=2) + msg = ("No axis named x for object type ") + with pytest.raises(ValueError, match=msg): + df.squeeze(axis='x') df = tm.makeTimeDataFrame(3) tm.assert_frame_equal(df.squeeze(axis=0), df) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 10430ebde8225..b7d42e45253b0 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -102,23 +102,34 @@ def test_nonzero_single_element(self): s = Series([False]) assert not s.bool() + msg = "The truth value of a Series is ambiguous" # single item nan to raise for s in 
[Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False])]: - pytest.raises(ValueError, lambda: bool(s)) + with pytest.raises(ValueError, match=msg): + bool(s) + msg = "bool cannot act on a non-boolean single element Series" for s in [Series([np.nan]), Series([pd.NaT])]: - pytest.raises(ValueError, lambda: s.bool()) + with pytest.raises(ValueError, match=msg): + s.bool() # multiple bool are still an error + msg = "The truth value of a Series is ambiguous" for s in [Series([True, True]), Series([False, False])]: - pytest.raises(ValueError, lambda: bool(s)) - pytest.raises(ValueError, lambda: s.bool()) + with pytest.raises(ValueError, match=msg): + bool(s) + with pytest.raises(ValueError, match=msg): + s.bool() # single non-bool are an error for s in [Series([1]), Series([0]), Series(['a']), Series([0.0])]: - pytest.raises(ValueError, lambda: bool(s)) - pytest.raises(ValueError, lambda: s.bool()) + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(s) + msg = "bool cannot act on a non-boolean single element Series" + with pytest.raises(ValueError, match=msg): + s.bool() def test_metadata_propagation_indiv(self): # check that the metadata matches up on the resulting ops From 9352e69190f1385277e4a5e308285a358a5103aa Mon Sep 17 00:00:00 2001 From: Sergey Kopylov Date: Sat, 9 Mar 2019 19:32:43 +0300 Subject: [PATCH 088/110] Fix HTML syntax errors in README.md (#25615) --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ce22818705865..633673d5cd04f 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,9 @@ Package Status - status + status + License @@ -73,8 +74,8 @@ Gitter - + + From 976a2db444c20ee71895bda394193aa24e1e5734 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 9 Mar 2019 16:34:40 +0000 Subject: [PATCH 089/110] STY: use pytest.raises context manager (io.sql) (#25597) --- pandas/tests/io/test_sql.py | 120 
++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 806bd7f2b7c93..d51d9418a370b 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -28,7 +28,7 @@ import pytest import pandas.compat as compat -from pandas.compat import PY36, lrange, range, string_types +from pandas.compat import PY2, PY36, lrange, range, string_types from pandas.core.dtypes.common import ( is_datetime64_dtype, is_datetime64tz_dtype) @@ -400,8 +400,10 @@ def _to_sql_fail(self): self.test_frame1, 'test_frame1', if_exists='fail') assert self.pandasSQL.has_table('test_frame1') - pytest.raises(ValueError, self.pandasSQL.to_sql, - self.test_frame1, 'test_frame1', if_exists='fail') + msg = "Table 'test_frame1' already exists" + with pytest.raises(ValueError, match=msg): + self.pandasSQL.to_sql( + self.test_frame1, 'test_frame1', if_exists='fail') self.drop_table('test_frame1') @@ -563,8 +565,10 @@ def test_to_sql_fail(self): self.conn, if_exists='fail') assert sql.has_table('test_frame2', self.conn) - pytest.raises(ValueError, sql.to_sql, self.test_frame1, - 'test_frame2', self.conn, if_exists='fail') + msg = "Table 'test_frame2' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(self.test_frame1, 'test_frame2', + self.conn, if_exists='fail') def test_to_sql_replace(self): sql.to_sql(self.test_frame1, 'test_frame3', @@ -699,10 +703,11 @@ def test_timedelta(self): result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn) tm.assert_series_equal(result['foo'], df['foo'].astype('int64')) - def test_complex(self): + def test_complex_raises(self): df = DataFrame({'a': [1 + 1j, 2j]}) - # Complex data type should raise error - pytest.raises(ValueError, df.to_sql, 'test_complex', self.conn) + msg = "Complex datatypes not supported" + with pytest.raises(ValueError, match=msg): + df.to_sql('test_complex', self.conn) 
@pytest.mark.parametrize("index_name,index_label,expected", [ # no index name, defaults to 'index' @@ -758,10 +763,11 @@ def test_to_sql_index_label_multiindex(self): frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) assert frame.columns[:2].tolist() == ['C', 'D'] - # wrong length of index_label - pytest.raises(ValueError, sql.to_sql, temp_frame, - 'test_index_label', self.conn, if_exists='replace', - index_label='C') + msg = ("Length of 'index_label' should match number of levels, which" + " is 2") + with pytest.raises(ValueError, match=msg): + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace', index_label='C') def test_multiindex_roundtrip(self): df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], @@ -866,6 +872,8 @@ def test_escaped_table_name(self): @pytest.mark.single +@pytest.mark.skipif( + not SQLALCHEMY_INSTALLED, reason='SQLAlchemy not installed') class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): """ Test the public API as it would be used directly @@ -878,10 +886,7 @@ class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): mode = 'sqlalchemy' def connect(self): - if SQLALCHEMY_INSTALLED: - return sqlalchemy.create_engine('sqlite:///:memory:') - else: - pytest.skip('SQLAlchemy not installed') + return sqlalchemy.create_engine('sqlite:///:memory:') def test_read_table_columns(self): # test columns argument in read_table @@ -1091,20 +1096,21 @@ def test_sql_open_close(self): tm.assert_frame_equal(self.test_frame3, result) + @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason='SQLAlchemy is installed') def test_con_string_import_error(self): - if not SQLALCHEMY_INSTALLED: - conn = 'mysql://root@localhost/pandas_nosetest' - pytest.raises(ImportError, sql.read_sql, "SELECT * FROM iris", - conn) - else: - pytest.skip('SQLAlchemy is installed') + conn = 'mysql://root@localhost/pandas_nosetest' + msg = "Using URI string without sqlalchemy installed" + with pytest.raises(ImportError, match=msg): + 
sql.read_sql("SELECT * FROM iris", conn) def test_read_sql_delegate(self): iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) tm.assert_frame_equal(iris_frame1, iris_frame2) - pytest.raises(sql.DatabaseError, sql.read_sql, 'iris', self.conn) + msg = "Execution failed on sql 'iris': near \"iris\": syntax error" + with pytest.raises(sql.DatabaseError, match=msg): + sql.read_sql('iris', self.conn) def test_safe_names_warning(self): # GH 6798 @@ -1260,9 +1266,10 @@ def test_read_table_columns(self): tm.equalContents( iris_frame.columns.values, ['SepalLength', 'SepalLength']) - def test_read_table_absent(self): - pytest.raises( - ValueError, sql.read_sql_table, "this_doesnt_exist", con=self.conn) + def test_read_table_absent_raises(self): + msg = "Table this_doesnt_exist not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("this_doesnt_exist", con=self.conn) def test_default_type_conversion(self): df = sql.read_sql_table("types_test_data", self.conn) @@ -1601,8 +1608,9 @@ def test_dtype(self): meta.reflect() sqltype = meta.tables['dtype_test2'].columns['B'].type assert isinstance(sqltype, sqlalchemy.TEXT) - pytest.raises(ValueError, df.to_sql, - 'error', self.conn, dtype={'B': str}) + msg = "The type of B is not a SQLAlchemy type" + with pytest.raises(ValueError, match=msg): + df.to_sql('error', self.conn, dtype={'B': str}) # GH9083 df.to_sql('dtype_test3', self.conn, dtype={'B': sqlalchemy.String(10)}) @@ -1887,8 +1895,9 @@ def test_schema_support(self): res4 = sql.read_sql_table('test_schema_other', self.conn, schema='other') tm.assert_frame_equal(df, res4) - pytest.raises(ValueError, sql.read_sql_table, 'test_schema_other', - self.conn, schema='public') + msg = "Table test_schema_other not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table('test_schema_other', self.conn, schema='public') # different if_exists options @@ -2104,6 +2113,7 @@ def 
_get_sqlite_column_type(self, table, column): return ctype raise ValueError('Table %s, column %s not found' % (table, column)) + @pytest.mark.skipif(PY2, reason="pytest.raises match regex fails") def test_dtype(self): if self.flavor == 'mysql': pytest.skip('Not applicable to MySQL legacy') @@ -2120,8 +2130,9 @@ def test_dtype(self): assert self._get_sqlite_column_type( 'dtype_test2', 'B') == 'STRING' - pytest.raises(ValueError, df.to_sql, - 'error', self.conn, dtype={'B': bool}) + msg = r"B \(\) not a string" + with pytest.raises(ValueError, match=msg): + df.to_sql('error', self.conn, dtype={'B': bool}) # single dtype df.to_sql('single_dtype_test', self.conn, dtype='STRING') @@ -2153,8 +2164,9 @@ def test_illegal_names(self): # For sqlite, these should work fine df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) - # Raise error on blank - pytest.raises(ValueError, df.to_sql, "", self.conn) + msg = "Empty table or column name specified" + with pytest.raises(ValueError, match=msg): + df.to_sql("", self.conn) for ndx, weird_name in enumerate( ['test_weird_name]', 'test_weird_name[', @@ -2383,25 +2395,19 @@ def clean_up(test_table_to_drop): """ self.drop_table(test_table_to_drop) - # test if invalid value for if_exists raises appropriate error - pytest.raises(ValueError, - sql.to_sql, - frame=df_if_exists_1, - con=self.conn, - name=table_name, - if_exists='notvalidvalue') + msg = "'notvalidvalue' is not valid for if_exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, + if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='fail') - pytest.raises(ValueError, - sql.to_sql, - frame=df_if_exists_1, - con=self.conn, - name=table_name, - if_exists='fail') - + msg = "Table 'table_if_exists' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, 
+ if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='replace', index=False) @@ -2647,23 +2653,17 @@ def clean_up(test_table_to_drop): self.drop_table(test_table_to_drop) # test if invalid value for if_exists raises appropriate error - pytest.raises(ValueError, - sql.to_sql, - frame=df_if_exists_1, - con=self.conn, - name=table_name, - if_exists='notvalidvalue') + with pytest.raises(ValueError, match=""): + sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, + if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, if_exists='fail', index=False) - pytest.raises(ValueError, - sql.to_sql, - frame=df_if_exists_1, - con=self.conn, - name=table_name, - if_exists='fail') + with pytest.raises(ValueError, match=""): + sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, + if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, From e28ae70ba8bd8b4174143543d1660421bc67887b Mon Sep 17 00:00:00 2001 From: Misha Veldhoen Date: Sat, 9 Mar 2019 19:52:20 +0100 Subject: [PATCH 090/110] DOC: Cleanup docstring pandas.core.filter (#25618) --- pandas/core/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b81576404e2f..0128f01bddd10 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4534,11 +4534,11 @@ def filter(self, items=None, like=None, regex=None, axis=None): Parameters ---------- items : list-like - List of axis to restrict to (must not all be present). + Keep labels from axis which are in items. like : string - Keep axis where "arg in col == True". + Keep labels from axis for which "like in label == True". regex : string (regular expression) - Keep axis with re.search(regex, col) == True. 
+ Keep labels from axis for which re.search(regex, label) == True. axis : int or string axis name The axis to filter on. By default this is the info axis, 'index' for Series, 'columns' for DataFrame. @@ -4561,7 +4561,7 @@ def filter(self, items=None, like=None, regex=None, axis=None): Examples -------- - >>> df = pd.DataFrame(np.array(([1,2,3], [4,5,6])), + >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), ... index=['mouse', 'rabbit'], ... columns=['one', 'two', 'three']) From df771cc81285d944a41a06e3a05db26d7eb915d2 Mon Sep 17 00:00:00 2001 From: Kendall Masse Date: Sun, 10 Mar 2019 12:24:55 -0400 Subject: [PATCH 091/110] DOC: Remove makePanel from docs (#25609) (#25612) --- doc/source/getting_started/dsintro.rst | 44 -------- doc/source/whatsnew/v0.13.1.rst | 149 ++++++++++++++++++++----- doc/source/whatsnew/v0.20.0.rst | 64 ++++++++--- doc/source/whatsnew/v0.23.0.rst | 60 ++++++++-- 4 files changed, 220 insertions(+), 97 deletions(-) diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index c8a2399739cd5..373cffd30ff14 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -1030,47 +1030,3 @@ method: major_axis=pd.date_range('1/1/2000', periods=5), minor_axis=['a', 'b', 'c', 'd']) panel.to_frame() - - -.. _dsintro.deprecate_panel: - -Deprecate Panel ---------------- - -Over the last few years, pandas has increased in both breadth and depth, with new features, -datatype support, and manipulation routines. As a result, supporting efficient indexing and functional -routines for ``Series``, ``DataFrame`` and ``Panel`` has contributed to an increasingly fragmented and -difficult-to-understand code base. - -The 3-D structure of a ``Panel`` is much less common for many types of data analysis, -than the 1-D of the ``Series`` or the 2-D of the ``DataFrame``. Going forward it makes sense for -pandas to focus on these areas exclusively. 
- -Oftentimes, one can simply use a MultiIndex ``DataFrame`` for easily working with higher dimensional data. - -In addition, the ``xarray`` package was built from the ground up, specifically in order to -support the multi-dimensional analysis that is one of ``Panel`` s main use cases. -`Here is a link to the xarray panel-transition documentation `__. - -.. ipython:: python - :okwarning: - - import pandas.util.testing as tm - p = tm.makePanel() - p - -Convert to a MultiIndex DataFrame. - -.. ipython:: python - :okwarning: - - p.to_frame() - -Alternatively, one can convert to an xarray ``DataArray``. - -.. ipython:: python - :okwarning: - - p.to_xarray() - -You can see the full-documentation for the `xarray package `__. diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 8a89450be2f48..161b0ef395f05 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -222,60 +222,155 @@ Enhancements - Panel :meth:`~pandas.Panel.apply` will work on non-ufuncs. See :ref:`the docs`. - .. ipython:: python + .. code-block:: ipython + + In [28]: import pandas.util.testing as tm + + In [29]: panel = tm.makePanel(5) - import pandas.util.testing as tm - panel = tm.makePanel(5) - panel - panel['ItemA'] + In [30]: panel + Out[30]: + + Dimensions: 3 (items) x 5 (major_axis) x 4 (minor_axis) + Items axis: ItemA to ItemC + Major_axis axis: 2000-01-03 00:00:00 to 2000-01-07 00:00:00 + Minor_axis axis: A to D + + In [31]: panel['ItemA'] + Out[31]: + A B C D + 2000-01-03 -0.673690 0.577046 -1.344312 -1.469388 + 2000-01-04 0.113648 -1.715002 0.844885 0.357021 + 2000-01-05 -1.478427 -1.039268 1.075770 -0.674600 + 2000-01-06 0.524988 -0.370647 -0.109050 -1.776904 + 2000-01-07 0.404705 -1.157892 1.643563 -0.968914 + + [5 rows x 4 columns] Specifying an ``apply`` that operates on a Series (to return a single element) - .. ipython:: python + .. 
code-block:: ipython + + In [32]: panel.apply(lambda x: x.dtype, axis='items') + Out[32]: + A B C D + 2000-01-03 float64 float64 float64 float64 + 2000-01-04 float64 float64 float64 float64 + 2000-01-05 float64 float64 float64 float64 + 2000-01-06 float64 float64 float64 float64 + 2000-01-07 float64 float64 float64 float64 - panel.apply(lambda x: x.dtype, axis='items') + [5 rows x 4 columns] A similar reduction type operation - .. ipython:: python + .. code-block:: ipython + + In [33]: panel.apply(lambda x: x.sum(), axis='major_axis') + Out[33]: + ItemA ItemB ItemC + A -1.108775 -1.090118 -2.984435 + B -3.705764 0.409204 1.866240 + C 2.110856 2.960500 -0.974967 + D -4.532785 0.303202 -3.685193 - panel.apply(lambda x: x.sum(), axis='major_axis') + [4 rows x 3 columns] This is equivalent to - .. ipython:: python + .. code-block:: ipython + + In [34]: panel.sum('major_axis') + Out[34]: + ItemA ItemB ItemC + A -1.108775 -1.090118 -2.984435 + B -3.705764 0.409204 1.866240 + C 2.110856 2.960500 -0.974967 + D -4.532785 0.303202 -3.685193 - panel.sum('major_axis') + [4 rows x 3 columns] A transformation operation that returns a Panel, but is computing the z-score across the major_axis - .. ipython:: python + .. 
code-block:: ipython - result = panel.apply(lambda x: (x - x.mean()) / x.std(), - axis='major_axis') - result - result['ItemA'] + In [35]: result = panel.apply(lambda x: (x - x.mean()) / x.std(), + ....: axis='major_axis') + ....: + + In [36]: result + Out[36]: + + Dimensions: 3 (items) x 5 (major_axis) x 4 (minor_axis) + Items axis: ItemA to ItemC + Major_axis axis: 2000-01-03 00:00:00 to 2000-01-07 00:00:00 + Minor_axis axis: A to D + + In [37]: result['ItemA'] # noqa E999 + Out[37]: + A B C D + 2000-01-03 -0.535778 1.500802 -1.506416 -0.681456 + 2000-01-04 0.397628 -1.108752 0.360481 1.529895 + 2000-01-05 -1.489811 -0.339412 0.557374 0.280845 + 2000-01-06 0.885279 0.421830 -0.453013 -1.053785 + 2000-01-07 0.742682 -0.474468 1.041575 -0.075499 + + [5 rows x 4 columns] - Panel :meth:`~pandas.Panel.apply` operating on cross-sectional slabs. (:issue:`1148`) - .. ipython:: python + .. code-block:: ipython - def f(x): - return ((x.T - x.mean(1)) / x.std(1)).T + In [38]: def f(x): + ....: return ((x.T - x.mean(1)) / x.std(1)).T + ....: - result = panel.apply(f, axis=['items', 'major_axis']) - result - result.loc[:, :, 'ItemA'] + In [39]: result = panel.apply(f, axis=['items', 'major_axis']) - This is equivalent to the following + In [40]: result + Out[40]: + + Dimensions: 4 (items) x 5 (major_axis) x 3 (minor_axis) + Items axis: A to D + Major_axis axis: 2000-01-03 00:00:00 to 2000-01-07 00:00:00 + Minor_axis axis: ItemA to ItemC - .. ipython:: python + In [41]: result.loc[:, :, 'ItemA'] + Out[41]: + A B C D + 2000-01-03 0.012922 -0.030874 -0.629546 -0.757034 + 2000-01-04 0.392053 -1.071665 0.163228 0.548188 + 2000-01-05 -1.093650 -0.640898 0.385734 -1.154310 + 2000-01-06 1.005446 -1.154593 -0.595615 -0.809185 + 2000-01-07 0.783051 -0.198053 0.919339 -1.052721 + + [5 rows x 4 columns] - result = pd.Panel({ax: f(panel.loc[:, :, ax]) for ax in panel.minor_axis}) + This is equivalent to the following + + .. 
code-block:: ipython - result - result.loc[:, :, 'ItemA'] + In [42]: result = pd.Panel({ax: f(panel.loc[:, :, ax]) for ax in panel.minor_axis}) + + In [43]: result + Out[43]: + + Dimensions: 4 (items) x 5 (major_axis) x 3 (minor_axis) + Items axis: A to D + Major_axis axis: 2000-01-03 00:00:00 to 2000-01-07 00:00:00 + Minor_axis axis: ItemA to ItemC + + In [44]: result.loc[:, :, 'ItemA'] + Out[44]: + A B C D + 2000-01-03 0.012922 -0.030874 -0.629546 -0.757034 + 2000-01-04 0.392053 -1.071665 0.163228 0.548188 + 2000-01-05 -1.093650 -0.640898 0.385734 -1.154310 + 2000-01-06 1.005446 -1.154593 -0.595615 -0.809185 + 2000-01-07 0.783051 -0.198053 0.919339 -1.052721 + + [5 rows x 4 columns] Performance ~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index c720e075012eb..26fdee4685c4b 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -45,11 +45,6 @@ Check the :ref:`API Changes ` and :ref:`deprecations New features ~~~~~~~~~~~~ -.. ipython:: python - :suppress: - - import pandas.util.testing as tm - .. _whatsnew_0200.enhancements.agg: ``agg`` API for DataFrame/Series @@ -1363,24 +1358,65 @@ Deprecate Panel with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion. For more details see :ref:`Deprecate Panel ` documentation. (:issue:`13563`). -.. ipython:: python - :okwarning: +.. code-block:: ipython - p = tm.makePanel() - p + In [133]: import pandas.util.testing as tm + + In [134]: p = tm.makePanel() + + In [135]: p + Out[135]: + + Dimensions: 3 (items) x 3 (major_axis) x 4 (minor_axis) + Items axis: ItemA to ItemC + Major_axis axis: 2000-01-03 00:00:00 to 2000-01-05 00:00:00 + Minor_axis axis: A to D Convert to a MultiIndex DataFrame -.. ipython:: python +.. 
code-block:: ipython - p.to_frame() + In [136]: p.to_frame() + Out[136]: + ItemA ItemB ItemC + major minor + 2000-01-03 A 0.628776 -1.409432 0.209395 + B 0.988138 -1.347533 -0.896581 + C -0.938153 1.272395 -0.161137 + D -0.223019 -0.591863 -1.051539 + 2000-01-04 A 0.186494 1.422986 -0.592886 + B -0.072608 0.363565 1.104352 + C -1.239072 -1.449567 0.889157 + D 2.123692 -0.414505 -0.319561 + 2000-01-05 A 0.952478 -2.147855 -1.473116 + B -0.550603 -0.014752 -0.431550 + C 0.139683 -1.195524 0.288377 + D 0.122273 -1.425795 -0.619993 + + [12 rows x 3 columns] Convert to an xarray DataArray -.. ipython:: python - :okwarning: +.. code-block:: ipython - p.to_xarray() + In [137]: p.to_xarray() + Out[137]: + + array([[[ 0.628776, 0.988138, -0.938153, -0.223019], + [ 0.186494, -0.072608, -1.239072, 2.123692], + [ 0.952478, -0.550603, 0.139683, 0.122273]], + + [[-1.409432, -1.347533, 1.272395, -0.591863], + [ 1.422986, 0.363565, -1.449567, -0.414505], + [-2.147855, -0.014752, -1.195524, -1.425795]], + + [[ 0.209395, -0.896581, -0.161137, -1.051539], + [-0.592886, 1.104352, 0.889157, -0.319561], + [-1.473116, -0.43155 , 0.288377, -0.619993]]]) + Coordinates: + * items (items) object 'ItemA' 'ItemB' 'ItemC' + * major_axis (major_axis) datetime64[ns] 2000-01-03 2000-01-04 2000-01-05 + * minor_axis (minor_axis) object 'A' 'B' 'C' 'D' .. _whatsnew_0200.api_breaking.deprecate_group_agg_dict: diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index e52a36a922bd9..7ec5a39c3d384 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -646,29 +646,65 @@ Deprecate Panel with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion. For more details see :ref:`Deprecate Panel ` documentation. (:issue:`13563`, :issue:`18324`). -.. ipython:: python - :suppress: +.. 
code-block:: ipython - import pandas.util.testing as tm + In [75]: import pandas.util.testing as tm -.. ipython:: python - :okwarning: + In [76]: p = tm.makePanel() - p = tm.makePanel() - p + In [77]: p + Out[77]: + + Dimensions: 3 (items) x 3 (major_axis) x 4 (minor_axis) + Items axis: ItemA to ItemC + Major_axis axis: 2000-01-03 00:00:00 to 2000-01-05 00:00:00 + Minor_axis axis: A to D Convert to a MultiIndex DataFrame -.. ipython:: python +.. code-block:: ipython - p.to_frame() + In [78]: p.to_frame() + Out[78]: + ItemA ItemB ItemC + major minor + 2000-01-03 A 0.469112 0.721555 0.404705 + B -1.135632 0.271860 -1.039268 + C 0.119209 0.276232 -1.344312 + D -2.104569 0.113648 -0.109050 + 2000-01-04 A -0.282863 -0.706771 0.577046 + B 1.212112 -0.424972 -0.370647 + C -1.044236 -1.087401 0.844885 + D -0.494929 -1.478427 1.643563 + 2000-01-05 A -1.509059 -1.039575 -1.715002 + B -0.173215 0.567020 -1.157892 + C -0.861849 -0.673690 1.075770 + D 1.071804 0.524988 -1.469388 + + [12 rows x 3 columns] Convert to an xarray DataArray -.. ipython:: python - :okwarning: +.. code-block:: ipython - p.to_xarray() + In [79]: p.to_xarray() + Out[79]: + + array([[[ 0.469112, -1.135632, 0.119209, -2.104569], + [-0.282863, 1.212112, -1.044236, -0.494929], + [-1.509059, -0.173215, -0.861849, 1.071804]], + + [[ 0.721555, 0.27186 , 0.276232, 0.113648], + [-0.706771, -0.424972, -1.087401, -1.478427], + [-1.039575, 0.56702 , -0.67369 , 0.524988]], + + [[ 0.404705, -1.039268, -1.344312, -0.10905 ], + [ 0.577046, -0.370647, 0.844885, 1.643563], + [-1.715002, -1.157892, 1.07577 , -1.469388]]]) + Coordinates: + * items (items) object 'ItemA' 'ItemB' 'ItemC' + * major_axis (major_axis) datetime64[ns] 2000-01-03 2000-01-04 2000-01-05 + * minor_axis (minor_axis) object 'A' 'B' 'C' 'D' .. 
_whatsnew_0230.api_breaking.core_common: From 26cfa288edfc2db85d23d3e23cb6ce3888675963 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Mar 2019 13:26:33 -0400 Subject: [PATCH 092/110] TST: failing wheel building on PY2 and old numpy (#25631) closes #25630 --- pandas/tests/frame/test_constructors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fc642d211b30c..92ce6369a5109 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,7 +21,7 @@ import pandas as pd from pandas import ( Categorical, DataFrame, Index, MultiIndex, Series, Timedelta, Timestamp, - compat, date_range, isna) + _np_version_under1p13, compat, date_range, isna) from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -684,6 +684,8 @@ def test_constructor_ndarray(self): frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) assert len(frame) == 2 + @pytest.mark.skipif(PY2 & _np_version_under1p13, + reason="old numpy & py2") def test_constructor_maskedarray(self): self._check_basic_constructor(ma.masked_all) From 34e8efeaa5b55307f5e548c108a8d278909e8a10 Mon Sep 17 00:00:00 2001 From: James Cobon-Kerr Date: Sun, 10 Mar 2019 21:08:24 +0000 Subject: [PATCH 093/110] DOC: resolve all GL03 docstring validation errors (#25525) * Resolve GL03 docstring validation errors * Update code_checks.sh to validate GL03 * Remove newline concat in _build_option_description function * Resolve PR04 errors * Resolve remaining SS04 errors * Remove newline from end of _cnum_doc string * Move terminating quotes to same line to avoid line breaks --- ci/code_checks.sh | 4 ++-- pandas/_libs/tslibs/timedeltas.pyx | 1 - pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/sparse.py | 1 - pandas/core/base.py | 1 - pandas/core/config.py | 1 - pandas/core/frame.py | 2 -- pandas/core/generic.py | 14 +++----------- 
pandas/core/groupby/base.py | 4 +--- pandas/core/groupby/groupby.py | 11 +---------- pandas/core/indexes/base.py | 1 - pandas/io/feather_format.py | 1 - pandas/plotting/_misc.py | 1 - 13 files changed, 8 insertions(+), 36 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c4840f1e836c4..51df779341ed5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -241,8 +241,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT04, RT05, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT04,RT05,SA05 + MSG='Validate docstrings (GL03, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT04, RT05, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT04,RT05,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6e40063fb925a..37aa05659b70f 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1156,7 +1156,6 @@ class Timedelta(_Timedelta): Notes ----- The ``.value`` attribute is always in ns. 
- """ def __new__(cls, object value=_no_input, unit=None, **kwargs): cdef _Timedelta td_base diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 75cf658423210..89f2b9961a4d7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -119,7 +119,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = "\n{}\n".format(docstring) + f.__doc__ = docstring return property(f) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 9be2c9af169e8..fd7149edc8d7c 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -541,7 +541,6 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'integer', 'block'}, default 'integer' The type of storage for sparse locations. diff --git a/pandas/core/base.py b/pandas/core/base.py index f896596dd5216..c0f3df1b36c03 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -870,7 +870,6 @@ def to_numpy(self, dtype=None, copy=False): .. versionadded:: 0.24.0 - Parameters ---------- dtype : str or numpy.dtype, optional diff --git a/pandas/core/config.py b/pandas/core/config.py index 01664fffb1e27..b6264a5257dcb 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -651,7 +651,6 @@ def _build_option_description(k): .format(rkey=d.rkey if d.rkey else '')) s += u(')') - s += '\n\n' return s diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eadffb779734f..3996728a1cc90 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2184,7 +2184,6 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, Convert URLs to HTML links. .. 
versionadded:: 0.24.0 - %(returns)s See Also -------- @@ -6027,7 +6026,6 @@ def unstack(self, level=-1, fill_value=None): columns, considered measured variables (`value_vars`), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. - %(versionadded)s Parameters ---------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0128f01bddd10..a18db48bca2ae 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -948,7 +948,6 @@ def swaplevel(self, i=-2, j=-1, axis=0): The indexes ``i`` and ``j`` are now optional, and default to the two innermost levels of the index. - """ axis = self._get_axis_number(axis) result = self.copy() @@ -4951,9 +4950,7 @@ def pipe(self, func, *args, **kwargs): _shared_docs['aggregate'] = dedent(""" Aggregate using one or more operations over the specified axis. - %(versionadded)s - Parameters ---------- func : function, str, list or dict @@ -4983,17 +4980,13 @@ def pipe(self, func, *args, **kwargs): * DataFrame : when DataFrame.agg is called with several functions Return scalar, Series or DataFrame. - %(see_also)s - Notes ----- `agg` is an alias for `aggregate`. Use the alias. A passed user-defined-function will be passed a Series for evaluation. - - %(examples)s - """) + %(examples)s""") _shared_docs['transform'] = (""" Call ``func`` on self producing a %(klass)s with transformed values @@ -10307,7 +10300,7 @@ def _doc_parms(cls): Returns ------- -%(name1)s or %(name2)s (if level specified) +%(name1)s or %(name2)s (if level specified)\ %(see_also)s %(examples)s\ """ @@ -10464,8 +10457,7 @@ def _doc_parms(cls): %(name2)s.cumsum : Return cumulative sum over %(name2)s axis. %(name2)s.cumprod : Return cumulative product over %(name2)s axis. 
-%(examples)s -""" +%(examples)s""" _cummin_examples = """\ Examples diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index ebba4a0a9395d..903c898b68873 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -126,9 +126,7 @@ class where members are defined. property_wrapper_template = \ """@property def %(name)s(self) : - \""" - %(doc)s - \""" + \"""%(doc)s\""" return self.__getattr__('%(name)s')""" for name in whitelist: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 36dcb692bb079..3d0a6023ac29f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -221,8 +221,7 @@ class providing the base-class of operations. Examples -------- -%(examples)s -""" +%(examples)s""" _transform_template = """ Call function producing a like-indexed %(klass)s on each group and @@ -1106,9 +1105,7 @@ def mean(self, *args, **kwargs): Returns ------- pandas.Series or pandas.DataFrame - %(see_also)s - Examples -------- >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], @@ -1564,9 +1561,7 @@ def nth(self, n, dropna=None): dropna : None or str, optional apply the specified dropna operation before counting which row is the nth row. Needs to be None, 'any' or 'all' - %(see_also)s - Examples -------- @@ -2139,9 +2134,7 @@ def head(self, n=5): Essentially equivalent to ``.apply(lambda x: x.head(n))``, except ignores as_index flag. - %(see_also)s - Examples -------- @@ -2167,9 +2160,7 @@ def tail(self, n=5): Essentially equivalent to ``.apply(lambda x: x.tail(n))``, except ignores as_index flag. - %(see_also)s - Examples -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dee181fc1c569..29b9a47a92a48 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3104,7 +3104,6 @@ def reindex(self, target, method=None, level=None, limit=None, Resulting index. indexer : np.ndarray or None Indices of output values in original index. 
- """ # GH6552: preserve names when reindexing to non-named target # (i.e. neither Index nor Series). diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index d76e6b75d3762..b2c6dff4338b6 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -110,7 +110,6 @@ def read_feather(path, columns=None, use_threads=True): Returns ------- type of object stored in file - """ feather, pyarrow = _try_import() diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 5171ea68fd497..b8073c89892c5 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -303,7 +303,6 @@ def andrews_curves(frame, class_column, ax=None, samples=200, color=None, Returns ------- class:`matplotlip.axis.Axes` - """ from math import sqrt, pi import matplotlib.pyplot as plt From 8de86d4d17bf52f3e2578bb3d5a19d14e78ca922 Mon Sep 17 00:00:00 2001 From: Kendall Masse Date: Sun, 10 Mar 2019 17:36:38 -0400 Subject: [PATCH 094/110] BUG: Fix user-facing AssertionError with to_html (#25608) (#25620) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/io/formats/html.py | 4 ++-- pandas/tests/io/formats/test_to_html.py | 10 ++++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ea08a0a6fe07b..f5fa7a71e117c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,7 +214,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because 
column names are already defined in the JSON schema (:issue:`25435`) -- +- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`) - - diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 66d13bf2668f9..a543b21f287ec 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -163,8 +163,8 @@ def _write_table(self, indent=0): if isinstance(self.classes, str): self.classes = self.classes.split() if not isinstance(self.classes, (list, tuple)): - raise AssertionError('classes must be list or tuple, not {typ}' - .format(typ=type(self.classes))) + raise TypeError('classes must be a string, list, or tuple, ' + 'not {typ}'.format(typ=type(self.classes))) _classes.extend(self.classes) if self.table_id is None: diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 428f1411a10a6..9cb2704f65587 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -623,3 +623,13 @@ def test_ignore_display_max_colwidth(method, expected, max_colwidth): result = getattr(df, method)() expected = expected(max_colwidth) assert expected in result + + +@pytest.mark.parametrize("classes", [True, 0]) +def test_to_html_invalid_classes_type(classes): + # GH 25608 + df = DataFrame() + msg = "classes must be a string, list, or tuple" + + with pytest.raises(TypeError, match=msg): + df.to_html(classes=classes) From a247f802aec7a2ed57b4aff1c819f5b2b73e777e Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sun, 10 Mar 2019 22:41:26 +0100 Subject: [PATCH 095/110] Fixturize tests/frame/test_asof.py (#25628) --- pandas/tests/frame/test_asof.py | 53 +++++++++++++++++---------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py index 0947e6f252dab..4ba3431d102df
100644 --- a/pandas/tests/frame/test_asof.py +++ b/pandas/tests/frame/test_asof.py @@ -6,21 +6,26 @@ from pandas import DataFrame, Series, Timestamp, date_range, to_datetime import pandas.util.testing as tm -from .common import TestData +@pytest.fixture +def date_range_frame(): + """ + Fixture for DataFrame of ints with date_range index -class TestFrameAsof(TestData): - def setup_method(self, method): - self.N = N = 50 - self.rng = date_range('1/1/1990', periods=N, freq='53s') - self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, - index=self.rng) + Columns are ['A', 'B']. + """ + N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + return DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng) - def test_basic(self): - df = self.df.copy() + +class TestFrameAsof(): + + def test_basic(self, date_range_frame): + df = date_range_frame + N = 50 df.loc[15:30, 'A'] = np.nan - dates = date_range('1/1/1990', periods=self.N * 3, - freq='25s') + dates = date_range('1/1/1990', periods=N * 3, freq='25s') result = df.asof(dates) assert result.notna().all(1).all() @@ -35,11 +40,9 @@ def test_basic(self): rs = result[mask] assert (rs == 14).all(1).all() - def test_subset(self): + def test_subset(self, date_range_frame): N = 10 - rng = date_range('1/1/1990', periods=N, freq='53s') - df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, - index=rng) + df = date_range_frame.iloc[:N].copy() df.loc[4:8, 'A'] = np.nan dates = date_range('1/1/1990', periods=N * 3, freq='25s') @@ -54,20 +57,18 @@ def test_subset(self): expected = df.asof(dates) tm.assert_frame_equal(result, expected) - # B gives self.df.asof + # B gives df.asof result = df.asof(dates, subset='B') expected = df.resample('25s', closed='right').ffill().reindex(dates) expected.iloc[20:] = 9 tm.assert_frame_equal(result, expected) - def test_missing(self): + def test_missing(self, date_range_frame): # GH 15118 # no match found - `where` value before earliest date in index N = 10 - rng = 
date_range('1/1/1990', periods=N, freq='53s') - df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, - index=rng) + df = date_range_frame.iloc[:N].copy() result = df.asof('1989-12-31') expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31')) @@ -78,7 +79,7 @@ def test_missing(self): columns=['A', 'B'], dtype='float64') tm.assert_frame_equal(result, expected) - def test_all_nans(self): + def test_all_nans(self, date_range_frame): # GH 15713 # DataFrame is all nans result = DataFrame([np.nan]).asof([0]) @@ -86,14 +87,16 @@ def test_all_nans(self): tm.assert_frame_equal(result, expected) # testing non-default indexes, multiple inputs - dates = date_range('1/1/1990', periods=self.N * 3, freq='25s') - result = DataFrame(np.nan, index=self.rng, columns=['A']).asof(dates) + N = 150 + rng = date_range_frame.index + dates = date_range('1/1/1990', periods=N, freq='25s') + result = DataFrame(np.nan, index=rng, columns=['A']).asof(dates) expected = DataFrame(np.nan, index=dates, columns=['A']) tm.assert_frame_equal(result, expected) # testing multiple columns - dates = date_range('1/1/1990', periods=self.N * 3, freq='25s') - result = DataFrame(np.nan, index=self.rng, + dates = date_range('1/1/1990', periods=N, freq='25s') + result = DataFrame(np.nan, index=rng, columns=['A', 'B', 'C']).asof(dates) expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C']) tm.assert_frame_equal(result, expected) From cb43fa668c5dd7ac88c919b8f432a410c65319e7 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sun, 10 Mar 2019 22:45:01 +0100 Subject: [PATCH 096/110] Fixturize tests/frame/test_combine_concat.py (#25634) --- pandas/tests/frame/test_combine_concat.py | 33 +++++++++++------------ 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index c2364dc135a9a..c803d15a690c4 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ 
b/pandas/tests/frame/test_combine_concat.py @@ -11,12 +11,11 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -class TestDataFrameConcatCommon(TestData): +class TestDataFrameConcatCommon(): def test_concat_multiple_frames_dtypes(self): @@ -515,7 +514,7 @@ def test_concat_astype_dup_col(self): tm.assert_frame_equal(result, expected) -class TestDataFrameCombineFirst(TestData): +class TestDataFrameCombineFirst(): def test_combine_first_mixed(self): a = Series(['a', 'b'], index=lrange(2)) @@ -531,22 +530,22 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - def test_combine_first(self): + def test_combine_first(self, float_frame): # disjoint - head, tail = self.frame[:5], self.frame[5:] + head, tail = float_frame[:5], float_frame[5:] combined = head.combine_first(tail) - reordered_frame = self.frame.reindex(combined.index) + reordered_frame = float_frame.reindex(combined.index) assert_frame_equal(combined, reordered_frame) - assert tm.equalContents(combined.columns, self.frame.columns) + assert tm.equalContents(combined.columns, float_frame.columns) assert_series_equal(combined['A'], reordered_frame['A']) # same index - fcopy = self.frame.copy() + fcopy = float_frame.copy() fcopy['A'] = 1 del fcopy['C'] - fcopy2 = self.frame.copy() + fcopy2 = float_frame.copy() fcopy2['B'] = 0 del fcopy2['D'] @@ -570,20 +569,20 @@ def test_combine_first(self): assert (combined['A'][:10] == 0).all() # no overlap - f = self.frame[:10] - g = self.frame[10:] + f = float_frame[:10] + g = float_frame[10:] combined = f.combine_first(g) assert_series_equal(combined['A'].reindex(f.index), f['A']) assert_series_equal(combined['A'].reindex(g.index), g['A']) # corner cases - comb = self.frame.combine_first(self.empty) - assert_frame_equal(comb, 
self.frame) + comb = float_frame.combine_first(DataFrame({})) + assert_frame_equal(comb, float_frame) - comb = self.empty.combine_first(self.frame) - assert_frame_equal(comb, self.frame) + comb = DataFrame({}).combine_first(float_frame) + assert_frame_equal(comb, float_frame) - comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) + comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) assert "faz" in comb.index # #2525 @@ -850,7 +849,7 @@ def test_concat_datetime_datetime64_frame(self): pd.concat([df1, df2_obj]) -class TestDataFrameUpdate(TestData): +class TestDataFrameUpdate(): def test_update_nan(self): # #15593 #15617 From f2b578cfca9525c5ad4389fb3e5c08f4146c4cf9 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sun, 10 Mar 2019 22:49:39 +0100 Subject: [PATCH 097/110] Fixturize tests/frame/test_join.py (#25639) --- pandas/tests/frame/test_join.py | 42 ++++++++++++++------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 0508658766cd3..2c9fde652493d 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -4,7 +4,6 @@ import pytest from pandas import DataFrame, Index, period_range -from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -16,11 +15,6 @@ def frame_with_period_index(): index=period_range(start='2000', freq='A', periods=4)) -@pytest.fixture -def frame(): - return TestData().frame - - @pytest.fixture def left(): return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) @@ -63,11 +57,11 @@ def test_join(left, right, how, sort, expected): tm.assert_frame_equal(result, expected) -def test_join_index(frame): +def test_join_index(float_frame): # left / right - f = frame.loc[frame.index[:10], ['A', 'B']] - f2 = frame.loc[frame.index[5:], ['C', 'D']].iloc[::-1] + f = float_frame.loc[float_frame.index[:10], ['A', 'B']] + f2 = 
float_frame.loc[float_frame.index[5:], ['C', 'D']].iloc[::-1] joined = f.join(f2) tm.assert_index_equal(f.index, joined.index) @@ -91,7 +85,7 @@ def test_join_index(frame): # outer joined = f.join(f2, how='outer') - tm.assert_index_equal(joined.index, frame.index.sort_values()) + tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) with pytest.raises(ValueError, match='join method'): @@ -101,16 +95,16 @@ def test_join_index(frame): msg = 'columns overlap but no suffix' for how in ('outer', 'left', 'inner'): with pytest.raises(ValueError, match=msg): - frame.join(frame, how=how) + float_frame.join(float_frame, how=how) -def test_join_index_more(frame): - af = frame.loc[:, ['A', 'B']] - bf = frame.loc[::2, ['C', 'D']] +def test_join_index_more(float_frame): + af = float_frame.loc[:, ['A', 'B']] + bf = float_frame.loc[::2, ['C', 'D']] expected = af.copy() - expected['C'] = frame['C'][::2] - expected['D'] = frame['D'][::2] + expected['C'] = float_frame['C'][::2] + expected['D'] = float_frame['D'][::2] result = af.join(bf) tm.assert_frame_equal(result, expected) @@ -122,28 +116,28 @@ def test_join_index_more(frame): tm.assert_frame_equal(result, expected.loc[:, result.columns]) -def test_join_index_series(frame): - df = frame.copy() - s = df.pop(frame.columns[-1]) +def test_join_index_series(float_frame): + df = float_frame.copy() + s = df.pop(float_frame.columns[-1]) joined = df.join(s) # TODO should this check_names ? 
- tm.assert_frame_equal(joined, frame, check_names=False) + tm.assert_frame_equal(joined, float_frame, check_names=False) s.name = None with pytest.raises(ValueError, match='must have a name'): df.join(s) -def test_join_overlap(frame): - df1 = frame.loc[:, ['A', 'B', 'C']] - df2 = frame.loc[:, ['B', 'C', 'D']] +def test_join_overlap(float_frame): + df1 = float_frame.loc[:, ['A', 'B', 'C']] + df2 = float_frame.loc[:, ['B', 'C', 'D']] joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') - no_overlap = frame.loc[:, ['A', 'D']] + no_overlap = float_frame.loc[:, ['A', 'D']] expected = df1_suf.join(df2_suf).join(no_overlap) # column order not necessarily sorted From 81174601881af75033d9a43be06396a55cc17086 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sun, 10 Mar 2019 22:54:54 +0100 Subject: [PATCH 098/110] Fixturize tests/frame/test_mutate_columns.py (#25642) --- pandas/tests/frame/test_mutate_columns.py | 25 +++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 6bef7e3f65b21..211173371ac7e 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -8,14 +8,13 @@ from pandas.compat import PY36, lrange, range from pandas import DataFrame, Index, MultiIndex, Series -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal # Column add, remove, delete. 
-class TestDataFrameMutateColumns(TestData): +class TestDataFrameMutateColumns(): def test_assign(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) @@ -193,9 +192,9 @@ def test_insert(self): exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C']) assert_frame_equal(df, exp) - def test_delitem(self): - del self.frame['A'] - assert 'A' not in self.frame + def test_delitem(self, float_frame): + del float_frame['A'] + assert 'A' not in float_frame def test_delitem_multiindex(self): midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) @@ -223,16 +222,16 @@ def test_delitem_multiindex(self): with pytest.raises(KeyError): del df['A'] - def test_pop(self): - self.frame.columns.name = 'baz' + def test_pop(self, float_frame): + float_frame.columns.name = 'baz' - self.frame.pop('A') - assert 'A' not in self.frame + float_frame.pop('A') + assert 'A' not in float_frame - self.frame['foo'] = 'bar' - self.frame.pop('foo') - assert 'foo' not in self.frame - assert self.frame.columns.name == 'baz' + float_frame['foo'] = 'bar' + float_frame.pop('foo') + assert 'foo' not in float_frame + assert float_frame.columns.name == 'baz' # gh-10912: inplace ops cause caching issue a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[ From 16edaaf756ce0b3b1d5ab74ca754a9ccd774825a Mon Sep 17 00:00:00 2001 From: Bharat Raghunathan Date: Mon, 11 Mar 2019 03:29:14 +0530 Subject: [PATCH 099/110] BUG: Fix #25481 by fixing the error message in TypeError (#25540) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/plotting/_core.py | 5 ++--- pandas/tests/plotting/test_datetimelike.py | 2 +- pandas/tests/plotting/test_frame.py | 12 +++++++++--- pandas/tests/plotting/test_series.py | 8 ++++++-- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f5fa7a71e117c..284943cf49070 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -121,7 +121,7 @@ Performance Improvements Bug Fixes 
~~~~~~~~~ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) -- +- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) - Categorical diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 0ea92a57ac3f8..b9ec4d58db739 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -361,10 +361,9 @@ def _compute_plot_data(self): except AttributeError: is_empty = not len(numeric_data) - # no empty frames or series allowed + # no non-numeric frames or series allowed if is_empty: - raise TypeError('Empty {0!r}: no numeric data to ' - 'plot'.format(numeric_data.__class__.__name__)) + raise TypeError('no numeric data to plot') self.data = numeric_data diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 6702ad6cfb761..b9a29cc4ac27e 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -97,7 +97,7 @@ def test_nonnumeric_exclude(self): assert len(ax.get_lines()) == 1 # B was plotted self.plt.close(fig) - msg = "Empty 'DataFrame': no numeric data to plot" + msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): df['A'].plot() diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 98b241f5c8206..28806bb67c896 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -485,7 +485,9 @@ def test_subplots_timeseries_y_axis(self): ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") assert (ax_datetime_all_tz.get_lines()[0].get_data()[1] == testdata["datetime_all_tz"].values).all() - with pytest.raises(TypeError): + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): 
testdata.plot(y="text") @pytest.mark.xfail(reason='not support for period, categorical, ' @@ -2219,7 +2221,9 @@ def test_all_invalid_plot_data(self): for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue - with pytest.raises(TypeError): + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): df.plot(kind=kind) @pytest.mark.slow @@ -2230,7 +2234,9 @@ def test_partially_invalid_plot_data(self): for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue - with pytest.raises(TypeError): + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): df.plot(kind=kind) with tm.RNGContext(42): diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index a234ea8f9416b..aa78f38b75a10 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -706,7 +706,9 @@ def test_invalid_plot_data(self): for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue - with pytest.raises(TypeError): + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): s.plot(kind=kind, ax=ax) @pytest.mark.slow @@ -723,7 +725,9 @@ def test_partially_invalid_plot_data(self): for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue - with pytest.raises(TypeError): + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): s.plot(kind=kind, ax=ax) def test_invalid_kind(self): From f88613929d6c83952cb5a6bf70678a78be480baa Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Mar 2019 18:57:26 -0400 Subject: [PATCH 100/110] TST: xref #25630 (#25643) * TST: xref #25630 --- pandas/tests/frame/test_constructors.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 92ce6369a5109..1d5cbfec8de52 100644 --- 
a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -684,7 +684,7 @@ def test_constructor_ndarray(self): frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) assert len(frame) == 2 - @pytest.mark.skipif(PY2 & _np_version_under1p13, + @pytest.mark.skipif(PY2 and _np_version_under1p13, reason="old numpy & py2") def test_constructor_maskedarray(self): self._check_basic_constructor(ma.masked_all) @@ -702,6 +702,8 @@ def test_constructor_maskedarray(self): frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) + @pytest.mark.skipif(PY2 and _np_version_under1p13, + reason="old numpy & py2") def test_constructor_maskedarray_nonfloat(self): # masked int promoted to float mat = ma.masked_all((2, 3), dtype=int) @@ -769,6 +771,8 @@ def test_constructor_maskedarray_nonfloat(self): assert frame['A'][1] is True assert frame['C'][2] is False + @pytest.mark.skipif(PY2 and _np_version_under1p13, + reason="old numpy & py2") def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() @@ -791,6 +795,8 @@ def test_constructor_maskedarray_hardened(self): dtype=float) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(PY2 and _np_version_under1p13, + reason="old numpy & py2") def test_constructor_maskedrecarray_dtype(self): # Ensure constructor honors dtype data = np.ma.array( @@ -802,6 +808,8 @@ def test_constructor_maskedrecarray_dtype(self): columns=['date', 'price']) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(PY2 and _np_version_under1p13, + reason="old numpy & py2") def test_constructor_mrecarray(self): # Ensure mrecarray produces frame identical to dict of masked arrays # from GH3479 From 3099773f27c9d7847738589e449373a4ad072125 Mon Sep 17 00:00:00 2001 From: danielplawrence Date: Mon, 11 Mar 2019 11:58:34 +0000 Subject: [PATCH 101/110] DOC:Remove 
hard-coded examples from _flex_doc_SERIES (#24589) (#25524) * DOC:Remove hard-coded examples from _flex_doc_SERIES (#24589) --- pandas/core/ops.py | 286 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 226 insertions(+), 60 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index dbdabecafae3a..4d88ce6836ca4 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -384,57 +384,252 @@ def _get_op_name(op, special): # ----------------------------------------------------------------------------- # Docstring Generation and Templates +_add_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 +""" + +_sub_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.subtract(b, fill_value=0) +a 0.0 +b 1.0 +c 1.0 +d -1.0 +e NaN +dtype: float64 +""" + +_mul_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.multiply(b, fill_value=0) +a 1.0 +b 0.0 +c 0.0 +d 0.0 +e NaN +dtype: float64 +""" + +_div_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.divide(b, 
fill_value=0) +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" + +_floordiv_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.floordiv(b, fill_value=0) +a 1.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" + +_mod_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.mod(b, fill_value=0) +a 0.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" +_pow_example_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.pow(b, fill_value=0) +a 1.0 +b 1.0 +c 1.0 +d 0.0 +e NaN +dtype: float64 +""" + _op_descriptions = { # Arithmetic Operators 'add': {'op': '+', 'desc': 'Addition', - 'reverse': 'radd'}, + 'reverse': 'radd', + 'series_examples': _add_example_SERIES}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reverse': 'rsub'}, + 'reverse': 'rsub', + 'series_examples': _sub_example_SERIES}, 'mul': {'op': '*', 'desc': 'Multiplication', 'reverse': 'rmul', + 'series_examples': _mul_example_SERIES, 'df_examples': None}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reverse': 'rmod'}, + 'reverse': 'rmod', + 'series_examples': _mod_example_SERIES}, 'pow': {'op': '**', 'desc': 'Exponential power', 'reverse': 'rpow', + 'series_examples': _pow_example_SERIES, 'df_examples': None}, 'truediv': {'op': '/', 'desc': 'Floating division', 'reverse': 'rtruediv', + 'series_examples': _div_example_SERIES, 
'df_examples': None}, 'floordiv': {'op': '//', 'desc': 'Integer division', 'reverse': 'rfloordiv', + 'series_examples': _floordiv_example_SERIES, 'df_examples': None}, 'divmod': {'op': 'divmod', 'desc': 'Integer division and modulo', 'reverse': 'rdivmod', + 'series_examples': None, 'df_examples': None}, # Comparison Operators 'eq': {'op': '==', 'desc': 'Equal to', - 'reverse': None}, + 'reverse': None, + 'series_examples': None}, 'ne': {'op': '!=', 'desc': 'Not equal to', - 'reverse': None}, + 'reverse': None, + 'series_examples': None}, 'lt': {'op': '<', 'desc': 'Less than', - 'reverse': None}, + 'reverse': None, + 'series_examples': None}, 'le': {'op': '<=', 'desc': 'Less than or equal to', - 'reverse': None}, + 'reverse': None, + 'series_examples': None}, 'gt': {'op': '>', 'desc': 'Greater than', - 'reverse': None}, + 'reverse': None, + 'series_examples': None}, 'ge': {'op': '>=', 'desc': 'Greater than or equal to', - 'reverse': None} + 'reverse': None, + 'series_examples': None} } _op_names = list(_op_descriptions.keys()) @@ -472,51 +667,6 @@ def _get_op_name(op, special): See Also -------- Series.{reverse} - -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 ->>> a.add(b, fill_value=0) -a 2.0 -b 1.0 -c 1.0 -d 1.0 -e NaN -dtype: float64 ->>> a.subtract(b, fill_value=0) -a 0.0 -b 1.0 -c 1.0 -d -1.0 -e NaN -dtype: float64 ->>> a.multiply(b) -a 1.0 -b NaN -c NaN -d NaN -e NaN -dtype: float64 ->>> a.divide(b, fill_value=0) -a 1.0 -b inf -c inf -d 0.0 -e NaN -dtype: float64 """ _arith_doc_FRAME = """ @@ -906,16 +1056,32 @@ def _make_flex_doc(op_name, typ): if typ == 'series': base_doc = _flex_doc_SERIES - doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse']) + doc_no_examples = base_doc.format( + 
desc=op_desc['desc'], + op_name=op_name, + equiv=equiv, + reverse=op_desc['reverse'] + ) + if op_desc['series_examples']: + doc = doc_no_examples + op_desc['series_examples'] + else: + doc = doc_no_examples elif typ == 'dataframe': base_doc = _flex_doc_FRAME - doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse']) + doc = base_doc.format( + desc=op_desc['desc'], + op_name=op_name, + equiv=equiv, + reverse=op_desc['reverse'] + ) elif typ == 'panel': base_doc = _flex_doc_PANEL - doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse']) + doc = base_doc.format( + desc=op_desc['desc'], + op_name=op_name, + equiv=equiv, + reverse=op_desc['reverse'] + ) else: raise AssertionError('Invalid typ argument.') return doc From e2d0ad0922fd1bf26fcee2b8c664afaad452ac59 Mon Sep 17 00:00:00 2001 From: Daniel Luis Costa Date: Mon, 11 Mar 2019 09:02:04 -0300 Subject: [PATCH 102/110] DOC: require Return section only if return is not None nor commentary (#25008) * Return section only required if at least one return is not None nor commentary --- scripts/tests/test_validate_docstrings.py | 23 ++++++++++++- scripts/validate_docstrings.py | 42 +++++++++++++++++++++-- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 09fb5a30cbc3b..120f8d79819ff 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -231,6 +231,27 @@ def good_imports(self): """ pass + def no_returns(self): + """ + Say hello and have no returns. + """ + pass + + def empty_returns(self): + """ + Say hello and always return None. + + Since this function never returns a value, this + docstring doesn't need a return section. + """ + def say_hello(): + return "Hello World!" 
+ say_hello() + if True: + return + else: + return None + class BadGenericDocStrings(object): """Everything here has a bad docstring @@ -785,7 +806,7 @@ def test_good_class(self, capsys): @pytest.mark.parametrize("func", [ 'plot', 'sample', 'random_letters', 'sample_values', 'head', 'head1', - 'contains', 'mode', 'good_imports']) + 'contains', 'mode', 'good_imports', 'no_returns', 'empty_returns']) def test_good_functions(self, capsys, func): errors = validate_one(self._import_path( klass='GoodDocStrings', func=func))['errors'] diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 20f32124a2532..1c45c79ba7fba 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -26,6 +26,8 @@ import importlib import doctest import tempfile +import ast +import textwrap import flake8.main.application @@ -490,9 +492,45 @@ def yields(self): @property def method_source(self): try: - return inspect.getsource(self.obj) + source = inspect.getsource(self.obj) except TypeError: return '' + return textwrap.dedent(source) + + @property + def method_returns_something(self): + ''' + Check if the docstrings method can return something. + + Bare returns, returns valued None and returns from nested functions are + disconsidered. + + Returns + ------- + bool + Whether the docstrings method can return something. + ''' + + def get_returns_not_on_nested_functions(node): + returns = [node] if isinstance(node, ast.Return) else [] + for child in ast.iter_child_nodes(node): + # Ignore nested functions and its subtrees. + if not isinstance(child, ast.FunctionDef): + child_returns = get_returns_not_on_nested_functions(child) + returns.extend(child_returns) + return returns + + tree = ast.parse(self.method_source).body + if tree: + returns = get_returns_not_on_nested_functions(tree[0]) + return_values = [r.value for r in returns] + # Replace NameConstant nodes valued None for None. 
+ for i, v in enumerate(return_values): + if isinstance(v, ast.NameConstant) and v.value is None: + return_values[i] = None + return any(return_values) + else: + return False @property def first_line_ends_in_dot(self): @@ -691,7 +729,7 @@ def get_validation_data(doc): if doc.is_function_or_method: if not doc.returns: - if 'return' in doc.method_source: + if doc.method_returns_something: errs.append(error('RT01')) else: if len(doc.returns) == 1 and doc.returns[0][1]: From 99bffa23dccf0f38c5fc6f9a93c453f43e534e3d Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 11 Mar 2019 05:10:54 -0700 Subject: [PATCH 103/110] CLN: Removed debugging code (#25647) --- pandas/tests/computation/test_eval.py | 3 --- pandas/tests/internals/test_internals.py | 1 - pandas/util/testing.py | 29 ------------------------ 3 files changed, 33 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index a14d8e4471c23..062d1876141f8 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -253,9 +253,6 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): # local_dict={'lhs': lhs, 'rhs': rhs}, # engine=self.engine, parser=self.parser) # except AssertionError: - # import ipdb - # - # ipdb.set_trace() # raise else: expected = _eval_single_bin( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index bda486411e01e..4129184373a2a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -865,7 +865,6 @@ class TestIndexing(object): def test_get_slice(self): def assert_slice_ok(mgr, axis, slobj): - # import pudb; pudb.set_trace() mat = mgr.as_array() # we maybe using an ndarray to test slicing and diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a5ae1f6a4d960..6e88cd7f72dcd 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -638,35 +638,6 @@ def 
set_defaultencoding(encoding): sys.setdefaultencoding(orig) -# ----------------------------------------------------------------------------- -# Console debugging tools - - -def debug(f, *args, **kwargs): - from pdb import Pdb as OldPdb - try: - from IPython.core.debugger import Pdb - kw = dict(color_scheme='Linux') - except ImportError: - Pdb = OldPdb - kw = {} - pdb = Pdb(**kw) - return pdb.runcall(f, *args, **kwargs) - - -def pudebug(f, *args, **kwargs): - import pudb - return pudb.runcall(f, *args, **kwargs) - - -def set_trace(): - from IPython.core.debugger import Pdb - try: - Pdb(color_scheme='Linux').set_trace(sys._getframe().f_back) - except Exception: - from pdb import Pdb as OldPdb - OldPdb().set_trace(sys._getframe().f_back) - # ----------------------------------------------------------------------------- # contextmanager to ensure the file cleanup From 3dfb6d4546ff1f430daacb753d255ff4ce6d131c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 13:13:32 +0100 Subject: [PATCH 104/110] TST: fix incorrect sparse test (now failing on scipy master) (#25653) --- pandas/tests/arrays/sparse/test_array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 9c13a20726553..11b5bcf702e75 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1091,11 +1091,11 @@ def test_from_coo(self): row = [0, 3, 1, 0] col = [0, 3, 1, 2] data = [4, 5, 7, 9] - sp_array = sparse.coo_matrix(data, (row, col)) + sp_array = sparse.coo_matrix((data, (row, col))) result = pd.Series.sparse.from_coo(sp_array) - index = pd.MultiIndex.from_product([[0], [0, 1, 2, 3]]) - expected = pd.Series(data, index=index, dtype='Sparse[int]') + index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) + expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]') tm.assert_series_equal(result, expected) def 
test_to_coo(self): From de52d0b10c28318ae48390dbe7abcd1807a1154a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 13:41:36 +0100 Subject: [PATCH 105/110] Suppress incorrect warning in nargsort for timezone-aware DatetimeIndex (#25629) --- pandas/core/sorting.py | 9 ++++++++- pandas/tests/test_sorting.py | 10 +++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ef69939d6e978..0b5b017bec9ac 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,4 +1,5 @@ """ miscellaneous sorting / groupby utilities """ +import warnings import numpy as np @@ -254,7 +255,13 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): sorted_idx = np.roll(sorted_idx, cnt_null) return sorted_idx - items = np.asanyarray(items) + with warnings.catch_warnings(): + # https://github.com/pandas-dev/pandas/issues/25439 + # can be removed once ExtensionArrays are properly handled by nargsort + warnings.filterwarnings( + "ignore", category=FutureWarning, + message="Converting timezone-aware DatetimeArray to") + items = np.asanyarray(items) idx = np.arange(len(items)) mask = isna(items) non_nans = items[~mask] diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 7528566e8326e..fa8fbddd59118 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -9,7 +9,8 @@ from pandas.compat import PY2 -from pandas import DataFrame, MultiIndex, Series, compat, concat, merge +from pandas import ( + DataFrame, MultiIndex, Series, compat, concat, merge, to_datetime) from pandas.core import common as com from pandas.core.sorting import ( decons_group_index, get_group_index, is_int64_overflow_possible, @@ -183,6 +184,13 @@ def test_nargsort(self): exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) + def test_nargsort_datetimearray_warning(self): 
+ # https://github.com/pandas-dev/pandas/issues/25439 + # can be removed once the FutureWarning for np.array(DTA) is removed + data = to_datetime([0, 2, 0, 1]).tz_localize('Europe/Brussels') + with tm.assert_produces_warning(None): + nargsort(data) + class TestMerge(object): From c4fa3c93c7cee54249eaffc67d0e5a1e306e967c Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Mon, 11 Mar 2019 08:58:12 -0500 Subject: [PATCH 106/110] DOC: file obj for to_csv must be newline='' (#25624) --- doc/source/user_guide/io.rst | 4 ++-- pandas/core/generic.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b23a0f10e9e2b..1b5d96fa9c146 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1689,7 +1689,7 @@ The ``Series`` and ``DataFrame`` objects have an instance method ``to_csv`` whic allows storing the contents of the object as a comma-separated-values file. The function takes a number of arguments. Only the first is required. -* ``path_or_buf``: A string path to the file to write or a StringIO +* ``path_or_buf``: A string path to the file to write or a file object. If a file object it must be opened with `newline=''` * ``sep`` : Field delimiter for the output file (default ",") * ``na_rep``: A string representation of a missing value (default '') * ``float_format``: Format string for floating point numbers @@ -1702,7 +1702,7 @@ function takes a number of arguments. Only the first is required. * ``mode`` : Python write mode, default 'w' * ``encoding``: a string representing the encoding to use if the contents are non-ASCII, for Python versions prior to 3 -* ``line_terminator``: Character sequence denoting line end (default '\\n') +* ``line_terminator``: Character sequence denoting line end (default `os.linesep`) * ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). 
Note that if you have set a `float_format` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric * ``quotechar``: Character used to quote fields (default '"') * ``doublequote``: Control quoting of ``quotechar`` in fields (default True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a18db48bca2ae..f23aac9ad3a52 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2919,7 +2919,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, ---------- path_or_buf : str or file handle, default None File path or object, if None is provided the result is returned as - a string. + a string. If a file object is passed it should be opened with + `newline=''`, disabling universal newlines. .. versionchanged:: 0.24.0 From 43b949dc70ed4d592a9aaf3e622ba96c27f40a96 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Mon, 11 Mar 2019 10:26:54 -0500 Subject: [PATCH 107/110] BUG: to_csv line endings with compression (#25625) --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/io/common.py | 2 +- pandas/tests/frame/test_to_csv.py | 12 ++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 2c6d1e01ed89b..0f603515c61cc 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -32,6 +32,7 @@ Fixed Regressions - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compression (:issue:`25311`) .. 
_whatsnew_0242.enhancements: diff --git a/pandas/io/common.py b/pandas/io/common.py index ad054d77b3bc8..c1cacf39c5b08 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -434,7 +434,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, if (compat.PY3 and is_text and (compression or isinstance(f, need_text_wrapping))): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding) + f = TextIOWrapper(f, encoding=encoding, newline='') handles.append(f) if memory_map and hasattr(f, 'fileno'): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 54a8712a9c645..59bf3d00f979c 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1221,3 +1221,15 @@ def test_multi_index_header(self): '1,5,6,7,8'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected + + def test_gz_lineend(self): + # GH 25311 + df = pd.DataFrame({'a': [1, 2]}) + expected_rows = ['a', '1', '2'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + with ensure_clean('__test_gz_lineend.csv.gz') as path: + df.to_csv(path, index=False) + with tm.decompress_file(path, compression='gzip') as f: + result = f.read().decode('utf-8') + + assert result == expected From 63755498d82e9075826683983a907ce9b900b823 Mon Sep 17 00:00:00 2001 From: Tao He Date: Tue, 12 Mar 2019 00:09:27 +0800 Subject: [PATCH 108/110] BUG: Redefine IndexOpsMixin.size, fix #25580. 
(#25584) Signed-off-by: HE, Tao --- doc/source/whatsnew/v0.24.2.rst | 3 ++- pandas/core/base.py | 2 +- pandas/tests/resample/test_datetime_index.py | 12 ++++++++++++ pandas/tests/series/test_api.py | 7 +++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 0f603515c61cc..ee9419c79e265 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -102,7 +102,8 @@ Bug Fixes - Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) - Bug in :func:`merge` when merging an empty ``DataFrame`` with an ``Int64`` column or a non-empty ``DataFrame`` with an ``Int64`` column that is all ``NaN`` (:issue:`25183`) - Bug in ``IntervalTree`` where a ``RecursionError`` occurs upon construction due to an overflow when adding endpoints, which also causes :class:`IntervalIndex` to crash during indexing operations (:issue:`25485`) -- +- Bug in :attr:`Series.size` raising for some extension-array-backed ``Series``, rather than returning the size (:issue:`25580`) +- Bug in resampling raising for nullable integer-dtype columns (:issue:`25580`) .. _whatsnew_0242.contributors: diff --git a/pandas/core/base.py b/pandas/core/base.py index c0f3df1b36c03..9fc950b9e7b43 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -762,7 +762,7 @@ def size(self): """ Return the number of elements in the underlying data. 
""" - return self._values.size + return len(self._values) @property def flags(self): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ce675893d9907..ec05595536de4 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -101,6 +101,18 @@ def test_resample_basic(series, closed, expected): assert_series_equal(result, expected) +def test_resample_integerarray(): + # GH 25580, resample on IntegerArray + ts = pd.Series(range(9), + index=pd.date_range('1/1/2000', periods=9, freq='T'), + dtype='Int64') + result = ts.resample('3T').sum() + expected = Series([3, 12, 21], + index=pd.date_range('1/1/2000', periods=3, freq='3T'), + dtype="Int64") + assert_series_equal(result, expected) + + def test_resample_basic_grouper(series): s = series result = s.resample('5Min').last() diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 1f2e2b179c687..3ad9d54175f31 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -493,6 +493,13 @@ def test_tab_complete_warning(self, ip): with provisionalcompleter('ignore'): list(ip.Completer.completions('s.', 1)) + def test_integer_series_size(self): + # GH 25580 + s = Series(range(9)) + assert s.size == 9 + s = Series(range(9), dtype="Int64") + assert s.size == 9 + class TestCategoricalSeries(object): From dc7b4662b256b5519dce86ec3e3e12754da34b18 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 17:12:16 +0100 Subject: [PATCH 109/110] DOC: restore toctree maxdepth (#25134) * DOC: restore toctree maxdepth * make it hidden + make sure it is still in sidebar * Add toctree for front page manually * use commented toctree's instead of hidden for api pages to avoid warnings * 0.24.1 -> 0.25.0 * add comment in subsection toctrees to also update main manual toctree --- doc/source/development/index.rst | 3 + doc/source/getting_started/index.rst | 3 + 
doc/source/index.rst.template | 70 ++++++++++++++++++- doc/source/reference/index.rst | 65 +++++++++-------- .../themes/nature_with_gtoc/layout.html | 4 +- doc/source/user_guide/index.rst | 3 + 6 files changed, 112 insertions(+), 36 deletions(-) diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index d67a6c3a2ca04..a149f31118ed5 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -6,6 +6,9 @@ Development =========== +.. If you update this toctree, also update the manual toctree in the + main index.rst.template + .. toctree:: :maxdepth: 2 diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index 4c5d26461a667..eead28830f861 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -6,6 +6,9 @@ Getting started =============== +.. If you update this toctree, also update the manual toctree in the + main index.rst.template + .. toctree:: :maxdepth: 2 diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index d04e9194e71dc..f18c61b5e2f95 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -25,7 +25,7 @@ See the :ref:`overview` for more detail about what's in the library. {% if single_doc and single_doc.endswith('.rst') -%} .. toctree:: - :maxdepth: 2 + :maxdepth: 3 {{ single_doc[:-4] }} {% elif single_doc %} @@ -35,7 +35,8 @@ See the :ref:`overview` for more detail about what's in the library. {{ single_doc }} {% else -%} .. toctree:: - :maxdepth: 2 + :maxdepth: 3 + :hidden: {% endif %} {% if not single_doc -%} @@ -51,4 +52,67 @@ See the :ref:`overview` for more detail about what's in the library. 
{% if not single_doc -%} development/index whatsnew/index - {% endif -%} +{% endif -%} + + +* :doc:`whatsnew/v0.25.0` +* :doc:`install` +* :doc:`getting_started/index` + + * :doc:`getting_started/overview` + * :doc:`getting_started/10min` + * :doc:`getting_started/basics` + * :doc:`getting_started/dsintro` + * :doc:`getting_started/comparison/index` + * :doc:`getting_started/tutorials` + +* :doc:`user_guide/index` + + * :doc:`user_guide/io` + * :doc:`user_guide/indexing` + * :doc:`user_guide/advanced` + * :doc:`user_guide/merging` + * :doc:`user_guide/reshaping` + * :doc:`user_guide/text` + * :doc:`user_guide/missing_data` + * :doc:`user_guide/categorical` + * :doc:`user_guide/integer_na` + * :doc:`user_guide/visualization` + * :doc:`user_guide/computation` + * :doc:`user_guide/groupby` + * :doc:`user_guide/timeseries` + * :doc:`user_guide/timedeltas` + * :doc:`user_guide/style` + * :doc:`user_guide/options` + * :doc:`user_guide/enhancingperf` + * :doc:`user_guide/sparse` + * :doc:`user_guide/gotchas` + * :doc:`user_guide/cookbook` + +* :doc:`ecosystem` +* :doc:`reference/index` + + * :doc:`reference/io` + * :doc:`reference/general_functions` + * :doc:`reference/series` + * :doc:`reference/frame` + * :doc:`reference/arrays` + * :doc:`reference/panel` + * :doc:`reference/indexing` + * :doc:`reference/offset_frequency` + * :doc:`reference/window` + * :doc:`reference/groupby` + * :doc:`reference/resampling` + * :doc:`reference/style` + * :doc:`reference/plotting` + * :doc:`reference/general_utility_functions` + * :doc:`reference/extensions` + +* :doc:`development/index` + + * :doc:`development/contributing` + * :doc:`development/internals` + * :doc:`development/extending` + * :doc:`development/developer` + +* :doc:`whatsnew/index` diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index ef4676054473a..1e652c9e5497d 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -19,6 +19,9 @@ public functions related to 
data types in pandas. The ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are PRIVATE. Stable functionality in such modules is not guaranteed. +.. If you update this toctree, also update the manual toctree in the + main index.rst.template + .. toctree:: :maxdepth: 2 @@ -41,40 +44,40 @@ public functions related to data types in pandas. .. This is to prevent warnings in the doc build. We don't want to encourage .. these methods. -.. toctree:: - :hidden: - - api/pandas.DataFrame.blocks - api/pandas.DataFrame.as_matrix - api/pandas.DataFrame.ix - api/pandas.Index.asi8 - api/pandas.Index.data - api/pandas.Index.flags - api/pandas.Index.holds_integer - api/pandas.Index.is_type_compatible - api/pandas.Index.nlevels - api/pandas.Index.sort - api/pandas.Panel.agg - api/pandas.Panel.aggregate - api/pandas.Panel.blocks - api/pandas.Panel.empty - api/pandas.Panel.is_copy - api/pandas.Panel.items - api/pandas.Panel.ix - api/pandas.Panel.major_axis - api/pandas.Panel.minor_axis - api/pandas.Series.asobject - api/pandas.Series.blocks - api/pandas.Series.from_array - api/pandas.Series.ix - api/pandas.Series.imag - api/pandas.Series.real +.. + .. toctree:: + + api/pandas.DataFrame.blocks + api/pandas.DataFrame.as_matrix + api/pandas.DataFrame.ix + api/pandas.Index.asi8 + api/pandas.Index.data + api/pandas.Index.flags + api/pandas.Index.holds_integer + api/pandas.Index.is_type_compatible + api/pandas.Index.nlevels + api/pandas.Index.sort + api/pandas.Panel.agg + api/pandas.Panel.aggregate + api/pandas.Panel.blocks + api/pandas.Panel.empty + api/pandas.Panel.is_copy + api/pandas.Panel.items + api/pandas.Panel.ix + api/pandas.Panel.major_axis + api/pandas.Panel.minor_axis + api/pandas.Series.asobject + api/pandas.Series.blocks + api/pandas.Series.from_array + api/pandas.Series.ix + api/pandas.Series.imag + api/pandas.Series.real .. Can't convince sphinx to generate toctree for this class attribute. .. So we do it manually to avoid a warning -.. 
toctree:: - :hidden: +.. + .. toctree:: - api/pandas.api.extensions.ExtensionDtype.na_value + api/pandas.api.extensions.ExtensionDtype.na_value diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html index a2106605c5562..b3f13f99f44d4 100644 --- a/doc/source/themes/nature_with_gtoc/layout.html +++ b/doc/source/themes/nature_with_gtoc/layout.html @@ -19,7 +19,7 @@ {%- block sidebar1 %} {%- block sidebartoc %}

{{ _('Table Of Contents') }}

- {{ toctree() }} + {{ toctree(includehidden=True) }} {%- endblock %} {%- block sidebarsearch %}

{{ _('Search') }}

@@ -105,4 +105,4 @@

{{ _('Search') }}

var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); })(); -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index d39cf7103ab63..05df83decbd7e 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -15,6 +15,9 @@ Users brand-new to pandas should start with :ref:`10min`. Further information on any specific method can be obtained in the :ref:`api`. +.. If you update this toctree, also update the manual toctree in the + main index.rst.template + .. toctree:: :maxdepth: 2 From feeba90a2a36674c11f0fed32d1652cfec04c608 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 17:51:31 +0100 Subject: [PATCH 110/110] DOC: hardcode contributors for 0.24.x releases (#25662) --- doc/source/whatsnew/v0.24.1.rst | 13 ++++++++++++- doc/source/whatsnew/v0.24.2.rst | 31 ++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index be0a2eb682e87..8f963f1285e1b 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -82,4 +82,15 @@ Bug Fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v0.24.0..v0.24.1 +.. Including the contributors hardcoded for this release, as backporting with + MeeseeksDev loses the commit authors + +A total of 7 people contributed patches to this release. People with a "+" by their names contributed a patch for the first time. + +* Alex Buchkovsky +* Roman Yurchak +* h-vetinari +* jbrockmendel +* Jeremy Schendel +* Joris Van den Bossche +* Tom Augspurger diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index ee9419c79e265..8da33a46e79c6 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -110,4 +110,33 @@ Bug Fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v0.24.1..v0.24.2 +.. 
Including the contributors hardcoded for this release, as backporting with + MeeseeksDev loses the commit authors + +A total of 25 people contributed patches to this release. People with a "+" by their names contributed a patch for the first time. + +* Albert Villanova del Moral +* Arno Veenstra + +* chris-b1 +* Devin Petersohn + +* EternalLearner42 + +* Flavien Lambert + +* gfyoung +* Gioia Ballin +* jbrockmendel +* Jeff Reback +* Jeremy Schendel +* Johan von Forstner + +* Joris Van den Bossche +* Josh +* Justin Zheng +* Matthew Roeschke +* Max Bolingbroke + +* rbenes + +* Sterling Paramore + +* Tao He + +* Thomas A Caswell +* Tom Augspurger +* Vibhu Agarwal + +* William Ayd +* Zach Angell