From d70d9b380dd23369a2db71f4d5748bf85d38011f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 26 Feb 2022 11:42:38 -0800 Subject: [PATCH] REF/TST: misplaced hashtable tests, rename HashTable.map map_keys_to_values (#46106) --- pandas/_libs/hashtable.pyi | 15 +- pandas/_libs/hashtable_class_helper.pxi.in | 12 +- pandas/core/sorting.py | 2 +- pandas/tests/libs/test_hashtable.py | 151 +++++++++++++++++++-- pandas/tests/series/methods/test_unique.py | 6 + pandas/tests/test_algos.py | 135 ------------------ 6 files changed, 163 insertions(+), 158 deletions(-) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index f4b90648a8dc8..481ff0d36c460 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -120,12 +120,6 @@ class HashTable: # TODO: `item` type is subclass-specific def get_item(self, item): ... # TODO: return type? def set_item(self, item) -> None: ... - # FIXME: we don't actually have this for StringHashTable or ObjectHashTable? - def map( - self, - keys: np.ndarray, # np.ndarray[subclass-specific] - values: np.ndarray, # const int64_t[:] - ) -> None: ... def map_locations( self, values: np.ndarray, # np.ndarray[subclass-specific] @@ -177,11 +171,16 @@ class Float64HashTable(HashTable): ... class Float32HashTable(HashTable): ... class Int64HashTable(HashTable): - # Only Int64HashTable has get_labels_groupby + # Only Int64HashTable has get_labels_groupby, map_keys_to_values def get_labels_groupby( self, - values: np.ndarray, # const int64_t[:] + values: npt.NDArray[np.int64], # const int64_t[:] ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ... + def map_keys_to_values( + self, + keys: npt.NDArray[np.int64], + values: npt.NDArray[np.int64], # const int64_t[:] + ) -> None: ... class Int32HashTable(HashTable): ... class Int16HashTable(HashTable): ... diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0446b675e07d7..6ddf8d42b9baa 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -435,6 +435,7 @@ cdef class {{name}}HashTable(HashTable): } cpdef get_item(self, {{dtype}}_t val): + # Used in core.sorting, IndexEngine.get_loc cdef: khiter_t k {{c_type}} cval @@ -446,6 +447,7 @@ cdef class {{name}}HashTable(HashTable): raise KeyError(val) cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): + # Used in libjoin cdef: khiter_t k int ret = 0 @@ -457,8 +459,13 @@ cdef class {{name}}HashTable(HashTable): else: raise KeyError(key) + {{if dtype == "int64" }} + # We only use this for int64, can reduce build size and make .pyi + # more accurate by only implementing it for int64 @cython.boundscheck(False) - def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None: + def map_keys_to_values( + self, const {{dtype}}_t[:] keys, const int64_t[:] values + ) -> None: cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -470,9 +477,11 @@ cdef class {{name}}HashTable(HashTable): key = {{to_c_type}}(keys[i]) k = kh_put_{{dtype}}(self.table, key, &ret) self.table.vals[k] = values[i] + {{endif}} @cython.boundscheck(False) def map_locations(self, const {{dtype}}_t[:] values) -> None: + # Used in libindex, safe_sort cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -488,6 +497,7 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) def lookup(self, const {{dtype}}_t[:] values) -> ndarray: # -> np.ndarray[np.intp] + # Used in safe_sort, IndexEngine.get_indexer cdef: Py_ssize_t i, n = len(values) int ret = 0 diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 21d9107a61cb7..c505406648d3d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -577,7 +577,7 @@ def get_flattened_list( arrays: DefaultDict[int, list[int]] = defaultdict(list) for labs, level in zip(labels, levels): table = hashtable.Int64HashTable(ngroups) - table.map(comp_ids, labs.astype(np.int64, copy=False)) + table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False)) for i in range(ngroups): arrays[i].append(level[table.get_item(i)]) return [tuple(array) for array in arrays.values()] diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index b9abe3388c1b0..6af81468ef1ef 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -1,4 +1,5 @@ from contextlib import contextmanager +import struct import tracemalloc import numpy as np @@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype): with pytest.raises(KeyError, match=str(index + 2)): table.get_item(index + 2) - def test_map(self, table_type, dtype, writable): - # PyObjectHashTable has no map-method - if table_type != ht.PyObjectHashTable: + def test_map_keys_to_values(self, table_type, dtype, writable): + # only Int64HashTable has this method + if table_type == ht.Int64HashTable: N = 77 table = table_type() keys = np.arange(N).astype(dtype) vals = np.arange(N).astype(np.int64) + N keys.flags.writeable = writable vals.flags.writeable = writable - table.map(keys, vals) + table.map_keys_to_values(keys, vals) for i in range(N): assert table.get_item(keys[i]) == i + N @@ -180,6 +181,124 @@ def test_no_reallocation(self, table_type, dtype, N): assert n_buckets_start == clean_table.get_state()["n_buckets"] +class TestHashTableUnsorted: + # TODO: moved from test_algos; may be redundancies with other tests + def test_string_hashtable_set_item_signature(self): + # GH#30419 fix typing in StringHashTable.set_item to prevent segfault + tbl = ht.StringHashTable() + + tbl.set_item("key", 1) + assert tbl.get_item("key") == 1 + + with pytest.raises(TypeError, match="'key' has incorrect type"): + # key arg typed as string, not object + tbl.set_item(4, 6) + with pytest.raises(TypeError, match="'val' has incorrect type"): + tbl.get_item(4) + + def test_lookup_nan(self, writable): + # GH#21688 ensure we can deal with readonly memory views + xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) + xs.setflags(write=writable) + m = ht.Float64HashTable() + m.map_locations(xs) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp)) + + def test_add_signed_zeros(self): + # GH#21866 inconsistent hash-function for float64 + # default hash-function would lead to different hash-buckets + # for 0.0 and -0.0 if there are more than 2^30 hash-buckets + # but this would mean 16GB + N = 4 # 12 * 10**8 would trigger the error, if you have enough memory + m = ht.Float64HashTable(N) + m.set_item(0.0, 0) + m.set_item(-0.0, 0) + assert len(m) == 1 # 0.0 and -0.0 are equivalent + + def test_add_different_nans(self): + # GH#21866 inconsistent hash-function for float64 + # create different nans from bit-patterns: + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] + assert NAN1 != NAN1 + assert NAN2 != NAN2 + # default hash function would lead to different hash-buckets + # for NAN1 and NAN2 even if there are only 4 buckets: + m = ht.Float64HashTable() + m.set_item(NAN1, 0) + m.set_item(NAN2, 0) + assert len(m) == 1 # NAN1 and NAN2 are equivalent + + def test_lookup_overflow(self, writable): + xs = np.array([1, 2, 2**63], dtype=np.uint64) + # GH 21688 ensure we can deal with readonly memory views + xs.setflags(write=writable) + m = ht.UInt64HashTable() + m.map_locations(xs) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp)) + + @pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize( + "htable, uniques, dtype, safely_resizes", + [ + (ht.PyObjectHashTable, ht.ObjectVector, "object", False), + (ht.StringHashTable, ht.ObjectVector, "object", True), + (ht.Float64HashTable, ht.Float64Vector, "float64", False), + (ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.Int32HashTable, ht.Int32Vector, "int32", False), + (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), + ], + ) + def test_vector_resize( + self, writable, htable, uniques, dtype, safely_resizes, nvals + ): + # Test for memory errors after internal vector + # reallocations (GH 7157) + vals = np.array(np.random.randn(1000), dtype=dtype) + + # GH 21688 ensures we can deal with read-only memory views + vals.setflags(write=writable) + + # initialise instances; cannot initialise in parametrization, + # as otherwise external views would be held on the array (which is + # one of the things this test is checking) + htable = htable() + uniques = uniques() + + # get_labels may append to uniques + htable.get_labels(vals[:nvals], uniques, 0, -1) + # to_array() sets an external_view_exists flag on uniques. + tmp = uniques.to_array() + oldshape = tmp.shape + + # subsequent get_labels() calls can no longer append to it + # (except for StringHashTables + ObjectVector) + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with pytest.raises(ValueError, match="external reference.*"): + htable.get_labels(vals, uniques, 0, -1) + + uniques.to_array() # should not raise here + assert tmp.shape == oldshape + + @pytest.mark.parametrize( + "hashtable", + [ + ht.PyObjectHashTable, + ht.StringHashTable, + ht.Float64HashTable, + ht.Int64HashTable, + ht.Int32HashTable, + ht.UInt64HashTable, + ], + ) + def test_hashtable_large_sizehint(self, hashtable): + # GH#22729 smoketest for not raising when passing a large size_hint + size_hint = np.iinfo(np.uint32).max + 1 + hashtable(size_hint=size_hint) + + class TestPyObjectHashTableWithNans: def test_nan_float(self): nan1 = float("nan") @@ -322,15 +441,6 @@ def test_get_set_contains_len(self, table_type, dtype): assert index in table assert table.get_item(index) == 41 - def test_map(self, table_type, dtype): - N = 332 - table = table_type() - keys = np.full(N, np.nan, dtype=dtype) - vals = (np.arange(N) + N).astype(np.int64) - table.map(keys, vals) - assert len(table) == 1 - assert table.get_item(np.nan) == 2 * N - 1 - def test_map_locations(self, table_type, dtype): N = 10 table = table_type() @@ -468,6 +578,21 @@ def test_unique_label_indices_intp(writable): tm.assert_numpy_array_equal(result, expected) +def test_unique_label_indices(): + + a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp) + + left = ht.unique_label_indices(a) + right = np.unique(a, return_index=True)[1] + + tm.assert_numpy_array_equal(left, right, check_dtype=False) + + a[np.random.choice(len(a), 10)] = -1 + left = ht.unique_label_indices(a) + right = np.unique(a, return_index=True)[1][1:] + tm.assert_numpy_array_equal(left, right, check_dtype=False) + + @pytest.mark.parametrize( "dtype", [ diff --git a/pandas/tests/series/methods/test_unique.py b/pandas/tests/series/methods/test_unique.py index 856fe6e7c4f04..2b7a7f59535f9 100644 --- a/pandas/tests/series/methods/test_unique.py +++ b/pandas/tests/series/methods/test_unique.py @@ -8,6 +8,12 @@ class TestUnique: + def test_unique_uint64(self): + ser = Series([1, 2, 2**63, 2**63], dtype=np.uint64) + res = ser.unique() + exp = np.array([1, 2, 2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(res, exp) + def test_unique_data_ownership(self): # it works! GH#1807 Series(Series(["a", "c", "b"]).unique()).sort_values() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0916f0b45719e..db09c15cd136b 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1531,110 +1531,6 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: - def test_string_hashtable_set_item_signature(self): - # GH#30419 fix typing in StringHashTable.set_item to prevent segfault - tbl = ht.StringHashTable() - - tbl.set_item("key", 1) - assert tbl.get_item("key") == 1 - - with pytest.raises(TypeError, match="'key' has incorrect type"): - # key arg typed as string, not object - tbl.set_item(4, 6) - with pytest.raises(TypeError, match="'val' has incorrect type"): - tbl.get_item(4) - - def test_lookup_nan(self, writable): - xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) - # GH 21688 ensure we can deal with readonly memory views - xs.setflags(write=writable) - m = ht.Float64HashTable() - m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp)) - - def test_add_signed_zeros(self): - # GH 21866 inconsistent hash-function for float64 - # default hash-function would lead to different hash-buckets - # for 0.0 and -0.0 if there are more than 2^30 hash-buckets - # but this would mean 16GB - N = 4 # 12 * 10**8 would trigger the error, if you have enough memory - m = ht.Float64HashTable(N) - m.set_item(0.0, 0) - m.set_item(-0.0, 0) - assert len(m) == 1 # 0.0 and -0.0 are equivalent - - def test_add_different_nans(self): - # GH 21866 inconsistent hash-function for float64 - # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] - assert NAN1 != NAN1 - assert NAN2 != NAN2 - # default hash function would lead to different hash-buckets - # for NAN1 and NAN2 even if there are only 4 buckets: - m = ht.Float64HashTable() - m.set_item(NAN1, 0) - m.set_item(NAN2, 0) - assert len(m) == 1 # NAN1 and NAN2 are equivalent - - def test_lookup_overflow(self, writable): - xs = np.array([1, 2, 2**63], dtype=np.uint64) - # GH 21688 ensure we can deal with readonly memory views - xs.setflags(write=writable) - m = ht.UInt64HashTable() - m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp)) - - def test_get_unique(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(s.unique(), exp) - - @pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case - @pytest.mark.parametrize( - "htable, uniques, dtype, safely_resizes", - [ - (ht.PyObjectHashTable, ht.ObjectVector, "object", False), - (ht.StringHashTable, ht.ObjectVector, "object", True), - (ht.Float64HashTable, ht.Float64Vector, "float64", False), - (ht.Int64HashTable, ht.Int64Vector, "int64", False), - (ht.Int32HashTable, ht.Int32Vector, "int32", False), - (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), - ], - ) - def test_vector_resize( - self, writable, htable, uniques, dtype, safely_resizes, nvals - ): - # Test for memory errors after internal vector - # reallocations (GH 7157) - vals = np.array(np.random.randn(1000), dtype=dtype) - - # GH 21688 ensures we can deal with read-only memory views - vals.setflags(write=writable) - - # initialise instances; cannot initialise in parametrization, - # as otherwise external views would be held on the array (which is - # one of the things this test is checking) - htable = htable() - uniques = uniques() - - # get_labels may append to uniques - htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array() sets an external_view_exists flag on uniques. - tmp = uniques.to_array() - oldshape = tmp.shape - - # subsequent get_labels() calls can no longer append to it - # (except for StringHashTables + ObjectVector) - if safely_resizes: - htable.get_labels(vals, uniques, 0, -1) - else: - with pytest.raises(ValueError, match="external reference.*"): - htable.get_labels(vals, uniques, 0, -1) - - uniques.to_array() # should not raise here - assert tmp.shape == oldshape - @pytest.mark.parametrize( "htable, tm_dtype", [ @@ -1715,37 +1611,6 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): expected_reconstruct = s_duplicated.dropna().values tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) - @pytest.mark.parametrize( - "hashtable", - [ - ht.PyObjectHashTable, - ht.StringHashTable, - ht.Float64HashTable, - ht.Int64HashTable, - ht.Int32HashTable, - ht.UInt64HashTable, - ], - ) - def test_hashtable_large_sizehint(self, hashtable): - # GH#22729 smoketest for not raising when passing a large size_hint - size_hint = np.iinfo(np.uint32).max + 1 - hashtable(size_hint=size_hint) - - -def test_unique_label_indices(): - - a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp) - - left = ht.unique_label_indices(a) - right = np.unique(a, return_index=True)[1] - - tm.assert_numpy_array_equal(left, right, check_dtype=False) - - a[np.random.choice(len(a), 10)] = -1 - left = ht.unique_label_indices(a) - right = np.unique(a, return_index=True)[1][1:] - tm.assert_numpy_array_equal(left, right, check_dtype=False) - class TestRank: @td.skip_if_no_scipy