Skip to content

Commit

Permalink
REF/TST: misplaced hashtable tests, rename HashTable.map to map_keys_to_values (pandas-dev#46106)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Feb 26, 2022
1 parent d673e5a commit d70d9b3
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 158 deletions.
15 changes: 7 additions & 8 deletions pandas/_libs/hashtable.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,6 @@ class HashTable:
# TODO: `item` type is subclass-specific
def get_item(self, item): ... # TODO: return type?
def set_item(self, item) -> None: ...
# FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
def map(
self,
keys: np.ndarray, # np.ndarray[subclass-specific]
values: np.ndarray, # const int64_t[:]
) -> None: ...
def map_locations(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
Expand Down Expand Up @@ -177,11 +171,16 @@ class Float64HashTable(HashTable): ...
class Float32HashTable(HashTable): ...

class Int64HashTable(HashTable):
# Only Int64HashTable has get_labels_groupby
# Only Int64HashTable has get_labels_groupby, map_keys_to_values
def get_labels_groupby(
self,
values: np.ndarray, # const int64_t[:]
values: npt.NDArray[np.int64], # const int64_t[:]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ...
def map_keys_to_values(
self,
keys: npt.NDArray[np.int64],
values: npt.NDArray[np.int64], # const int64_t[:]
) -> None: ...

class Int32HashTable(HashTable): ...
class Int16HashTable(HashTable): ...
Expand Down
12 changes: 11 additions & 1 deletion pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ cdef class {{name}}HashTable(HashTable):
}

cpdef get_item(self, {{dtype}}_t val):
# Used in core.sorting, IndexEngine.get_loc
cdef:
khiter_t k
{{c_type}} cval
Expand All @@ -446,6 +447,7 @@ cdef class {{name}}HashTable(HashTable):
raise KeyError(val)

cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
# Used in libjoin
cdef:
khiter_t k
int ret = 0
Expand All @@ -457,8 +459,13 @@ cdef class {{name}}HashTable(HashTable):
else:
raise KeyError(key)

{{if dtype == "int64" }}
# We only use this for int64, can reduce build size and make .pyi
# more accurate by only implementing it for int64
@cython.boundscheck(False)
def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None:
def map_keys_to_values(
self, const {{dtype}}_t[:] keys, const int64_t[:] values
) -> None:
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand All @@ -470,9 +477,11 @@ cdef class {{name}}HashTable(HashTable):
key = {{to_c_type}}(keys[i])
k = kh_put_{{dtype}}(self.table, key, &ret)
self.table.vals[k] = <Py_ssize_t>values[i]
{{endif}}

@cython.boundscheck(False)
def map_locations(self, const {{dtype}}_t[:] values) -> None:
# Used in libindex, safe_sort
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand All @@ -488,6 +497,7 @@ cdef class {{name}}HashTable(HashTable):
@cython.boundscheck(False)
def lookup(self, const {{dtype}}_t[:] values) -> ndarray:
# -> np.ndarray[np.intp]
# Used in safe_sort, IndexEngine.get_indexer
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def get_flattened_list(
arrays: DefaultDict[int, list[int]] = defaultdict(list)
for labs, level in zip(labels, levels):
table = hashtable.Int64HashTable(ngroups)
table.map(comp_ids, labs.astype(np.int64, copy=False))
table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False))
for i in range(ngroups):
arrays[i].append(level[table.get_item(i)])
return [tuple(array) for array in arrays.values()]
Expand Down
151 changes: 138 additions & 13 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from contextlib import contextmanager
import struct
import tracemalloc

import numpy as np
Expand Down Expand Up @@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype):
with pytest.raises(KeyError, match=str(index + 2)):
table.get_item(index + 2)

def test_map(self, table_type, dtype, writable):
# PyObjectHashTable has no map-method
if table_type != ht.PyObjectHashTable:
def test_map_keys_to_values(self, table_type, dtype, writable):
# only Int64HashTable has this method
if table_type == ht.Int64HashTable:
N = 77
table = table_type()
keys = np.arange(N).astype(dtype)
vals = np.arange(N).astype(np.int64) + N
keys.flags.writeable = writable
vals.flags.writeable = writable
table.map(keys, vals)
table.map_keys_to_values(keys, vals)
for i in range(N):
assert table.get_item(keys[i]) == i + N

Expand Down Expand Up @@ -180,6 +181,124 @@ def test_no_reallocation(self, table_type, dtype, N):
assert n_buckets_start == clean_table.get_state()["n_buckets"]


class TestHashTableUnsorted:
    """Assorted hashtable regression tests relocated from test_algos.

    Covers typed set_item signatures, readonly-view handling, hash-function
    consistency for signed zeros and distinct NaN bit patterns, vector
    reallocation safety, and large size hints.
    """

    # TODO: moved from test_algos; may be redundancies with other tests
    def test_string_hashtable_set_item_signature(self):
        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
        tbl = ht.StringHashTable()

        tbl.set_item("key", 1)
        assert tbl.get_item("key") == 1

        with pytest.raises(TypeError, match="'key' has incorrect type"):
            # key arg typed as string, not object
            tbl.set_item(4, 6)
        with pytest.raises(TypeError, match="'val' has incorrect type"):
            tbl.get_item(4)

    def test_lookup_nan(self, writable):
        # GH#21688 ensure we can deal with readonly memory views
        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
        xs.setflags(write=writable)
        m = ht.Float64HashTable()
        m.map_locations(xs)
        # lookup of the same values must return their own positions,
        # including the NaN entry
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    def test_add_signed_zeros(self):
        # GH#21866 inconsistent hash-function for float64
        # default hash-function would lead to different hash-buckets
        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
        # but this would mean 16GB
        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
        m = ht.Float64HashTable(N)
        m.set_item(0.0, 0)
        m.set_item(-0.0, 0)
        assert len(m) == 1  # 0.0 and -0.0 are equivalent

    def test_add_different_nans(self):
        # GH#21866 inconsistent hash-function for float64
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        # sanity check: both payloads really are NaN (NaN != NaN)
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        # default hash function would lead to different hash-buckets
        # for NAN1 and NAN2 even if there are only 4 buckets:
        m = ht.Float64HashTable()
        m.set_item(NAN1, 0)
        m.set_item(NAN2, 0)
        assert len(m) == 1  # NAN1 and NAN2 are equivalent

    def test_lookup_overflow(self, writable):
        # 2**63 exceeds int64 range, so only a uint64 table can hold it
        xs = np.array([1, 2, 2**63], dtype=np.uint64)
        # GH 21688 ensure we can deal with readonly memory views
        xs.setflags(write=writable)
        m = ht.UInt64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
    @pytest.mark.parametrize(
        "htable, uniques, dtype, safely_resizes",
        [
            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
            (ht.StringHashTable, ht.ObjectVector, "object", True),
            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
        ],
    )
    def test_vector_resize(
        self, writable, htable, uniques, dtype, safely_resizes, nvals
    ):
        # Test for memory errors after internal vector
        # reallocations (GH 7157)
        vals = np.array(np.random.randn(1000), dtype=dtype)

        # GH 21688 ensures we can deal with read-only memory views
        vals.setflags(write=writable)

        # initialise instances; cannot initialise in parametrization,
        # as otherwise external views would be held on the array (which is
        # one of the things this test is checking)
        htable = htable()
        uniques = uniques()

        # get_labels may append to uniques
        htable.get_labels(vals[:nvals], uniques, 0, -1)
        # to_array() sets an external_view_exists flag on uniques.
        tmp = uniques.to_array()
        oldshape = tmp.shape

        # subsequent get_labels() calls can no longer append to it
        # (except for StringHashTables + ObjectVector)
        if safely_resizes:
            htable.get_labels(vals, uniques, 0, -1)
        else:
            with pytest.raises(ValueError, match="external reference.*"):
                htable.get_labels(vals, uniques, 0, -1)

        uniques.to_array()  # should not raise here
        assert tmp.shape == oldshape

    @pytest.mark.parametrize(
        "hashtable",
        [
            ht.PyObjectHashTable,
            ht.StringHashTable,
            ht.Float64HashTable,
            ht.Int64HashTable,
            ht.Int32HashTable,
            ht.UInt64HashTable,
        ],
    )
    def test_hashtable_large_sizehint(self, hashtable):
        # GH#22729 smoketest for not raising when passing a large size_hint
        size_hint = np.iinfo(np.uint32).max + 1
        hashtable(size_hint=size_hint)


class TestPyObjectHashTableWithNans:
def test_nan_float(self):
nan1 = float("nan")
Expand Down Expand Up @@ -322,15 +441,6 @@ def test_get_set_contains_len(self, table_type, dtype):
assert index in table
assert table.get_item(index) == 41

def test_map(self, table_type, dtype):
N = 332
table = table_type()
keys = np.full(N, np.nan, dtype=dtype)
vals = (np.arange(N) + N).astype(np.int64)
table.map(keys, vals)
assert len(table) == 1
assert table.get_item(np.nan) == 2 * N - 1

def test_map_locations(self, table_type, dtype):
N = 10
table = table_type()
Expand Down Expand Up @@ -468,6 +578,21 @@ def test_unique_label_indices_intp(writable):
tm.assert_numpy_array_equal(result, expected)


def test_unique_label_indices():
    """unique_label_indices matches np.unique's first-occurrence indices.

    First checks agreement on purely non-negative labels, then marks a few
    entries as -1 (the sentinel for missing labels) and checks that those
    entries are excluded — np.unique sorts unique values ascending, so -1's
    first-occurrence index is dropped with ``[1:]``.
    """
    # Seed a local RandomState so the test is reproducible; the unseeded
    # global np.random previously made failures non-deterministic.
    rng = np.random.RandomState(1234)
    a = rng.randint(1, 1 << 10, 1 << 15).astype(np.intp)

    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]
    tm.assert_numpy_array_equal(left, right, check_dtype=False)

    a[rng.choice(len(a), 10)] = -1
    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right, check_dtype=False)


@pytest.mark.parametrize(
"dtype",
[
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/series/methods/test_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@


class TestUnique:
def test_unique_uint64(self):
ser = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
res = ser.unique()
exp = np.array([1, 2, 2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(res, exp)

def test_unique_data_ownership(self):
# it works! GH#1807
Series(Series(["a", "c", "b"]).unique()).sort_values()
Expand Down
Loading

0 comments on commit d70d9b3

Please sign in to comment.