From 490b189c2e8306709bdfab0c0bf145e0baf1832a Mon Sep 17 00:00:00 2001 From: Jon Wiggins <35540058+JonWiggins@users.noreply.github.com> Date: Mon, 3 Jan 2022 18:25:23 -0600 Subject: [PATCH] BUG: Add fix for hashing timestamps with folds (#44282) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_libs/tslibs/timestamps.pyx | 2 + pandas/tests/frame/methods/test_reindex.py | 37 +++++++++++++++++++ .../tests/scalar/timestamp/test_timestamp.py | 27 ++++++++++++++ 4 files changed, 67 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 029fb187740b3..6f724ec221c83 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -724,6 +724,7 @@ Datetimelike - Bug in :class:`DateOffset`` addition with :class:`Timestamp` where ``offset.nanoseconds`` would not be included in the result (:issue:`43968`, :issue:`36589`) - Bug in :meth:`Timestamp.fromtimestamp` not supporting the ``tz`` argument (:issue:`45083`) - Bug in :class:`DataFrame` construction from dict of :class:`Series` with mismatched index dtypes sometimes raising depending on the ordering of the passed dict (:issue:`44091`) +- Bug in :class:`Timestamp` hashing during some DST transitions causing a segmentation fault (:issue:`33931`, :issue:`40817`) - Timedelta diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 304ac9405c5e1..03ee62e59aa3d 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -180,6 +180,8 @@ cdef class _Timestamp(ABCTimestamp): def __hash__(_Timestamp self): if self.nanosecond: return hash(self.value) + if self.fold: + return datetime.__hash__(self.replace(fold=0)) return datetime.__hash__(self) def __richcmp__(_Timestamp self, object other, int op): diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index bee8025275b42..b70ceea845ee8 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ 
b/pandas/tests/frame/methods/test_reindex.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._libs.tslibs.timezones import dateutil_gettz as gettz + import pandas as pd from pandas import ( Categorical, @@ -78,6 +80,41 @@ def test_setitem_reset_index_dtypes(self): result = df2.reset_index() tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "timezone, year, month, day, hour", + [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], + ) + def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): + # see gh-40817 + test_timezone = gettz(timezone) + transition_1 = pd.Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=0, + tzinfo=test_timezone, + ) + transition_2 = pd.Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=1, + tzinfo=test_timezone, + ) + df = ( + DataFrame({"index": [transition_1, transition_2], "vals": ["a", "b"]}) + .set_index("index") + .reindex(["1", "2"]) + ) + tm.assert_frame_equal( + df, + DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"), + ) + class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 1c34b5c8e5475..f734c331e3def 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -445,6 +445,33 @@ def test_hash_equivalent(self): stamp = Timestamp(datetime(2011, 1, 1)) assert d[stamp] == 5 + @pytest.mark.parametrize( + "timezone, year, month, day, hour", + [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], + ) + def test_hash_timestamp_with_fold(self, timezone, year, month, day, hour): + # see gh-33931 + test_timezone = gettz(timezone) + transition_1 = Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + 
fold=0, + tzinfo=test_timezone, + ) + transition_2 = Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=1, + tzinfo=test_timezone, + ) + assert hash(transition_1) == hash(transition_2) + def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 with tm.assert_produces_warning(FutureWarning, match="freq"):