From 661d88e0fa6dd032e135b62464361c0536bd18ce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 27 Feb 2022 12:07:38 -0800 Subject: [PATCH] TYP: groupby, sorting (#46133) --- pandas/core/groupby/grouper.py | 6 +++--- pandas/core/groupby/ops.py | 9 ++++++--- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/multi.py | 4 +++- pandas/core/reshape/reshape.py | 1 - pandas/core/sorting.py | 24 +++++++++++++++++------- pandas/tests/test_sorting.py | 4 ++-- 7 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e71e56574d766..3bffe59905a69 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -459,7 +459,7 @@ class Grouping: * groups : dict of {group -> label_list} """ - _codes: np.ndarray | None = None + _codes: npt.NDArray[np.signedinteger] | None = None _group_index: Index | None = None _passed_categorical: bool _all_grouper: Categorical | None @@ -614,7 +614,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: return values._reverse_indexer() @property - def codes(self) -> np.ndarray: + def codes(self) -> npt.NDArray[np.signedinteger]: if self._codes is not None: # _codes is set in __init__ for MultiIndex cases return self._codes @@ -657,7 +657,7 @@ def group_index(self) -> Index: return Index._with_infer(uniques, name=self.name) @cache_readonly - def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: + def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3d7058c173e0b..c3c191ff53532 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -797,7 +797,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @final @property - def codes(self) -> list[np.ndarray]: + def codes(self) -> list[npt.NDArray[np.signedinteger]]: return [ping.codes for ping in self.groupings] @property @@ -860,11 +860,14 @@ def codes_info(self) -> npt.NDArray[np.intp]: return ids @final - def _get_compressed_codes(self) -> tuple[np.ndarray, npt.NDArray[np.intp]]: + def _get_compressed_codes( + self, + ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: # The first returned ndarray may have any signed integer dtype if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) + # FIXME: compress_group_index's second return value is int64, not intp ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) @@ -875,7 +878,7 @@ def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[np.ndarray]: + def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: codes = self.codes ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 108646f8766a6..25112ead2f0b7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2187,7 +2187,9 @@ def _drop_level_numbers(self, levnums: list[int]): verify_integrity=False, ) - def _get_grouper_for_level(self, mapper, *, level=None): + def _get_grouper_for_level( + self, mapper, *, level=None + ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: """ Get index grouper corresponding to an index level diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 862f41e66d202..f40857059a794 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1481,7 +1481,9 @@ def _set_names(self, names, *, level=None, validate: bool = True): # -------------------------------------------------------------------- @doc(Index._get_grouper_for_level) - def _get_grouper_for_level(self, mapper, *, level): + def _get_grouper_for_level( + self, mapper, *, level=None + ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: indexer = self.codes[level] level_index = self.levels[level] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index da3a717cda55d..931cbd8c2ab56 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -185,7 +185,6 @@ def _make_selectors(self): self.group_index = comp_index self.mask = mask - self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups)) @cache_readonly diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index c505406648d3d..d41947510c1bb 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -224,7 +224,9 @@ def is_int64_overflow_possible(shape: Shape) -> bool: return the_prod >= lib.i8max -def decons_group_index(comp_labels, shape: Shape): +def _decons_group_index( + comp_labels: npt.NDArray[np.intp], shape: Shape +) -> list[npt.NDArray[np.intp]]: # reconstruct labels if is_int64_overflow_possible(shape): # at some point group indices are factorized, @@ -233,7 +235,7 @@ def decons_group_index(comp_labels, shape: Shape): label_list = [] factor = 1 - y = 0 + y = np.array(0) x = comp_labels for i in reversed(range(len(shape))): labels = (x - y) % (factor * shape[i]) // factor @@ -245,24 +247,32 @@ def decons_group_index(comp_labels, shape: Shape): def decons_obs_group_ids( - comp_ids: npt.NDArray[np.intp], obs_ids, shape: Shape, labels, xnull: bool -): + comp_ids: npt.NDArray[np.intp], + obs_ids: npt.NDArray[np.intp], + shape: Shape, + labels: Sequence[npt.NDArray[np.signedinteger]], + xnull: bool, +) -> list[npt.NDArray[np.intp]]: """ Reconstruct labels from observed group ids. Parameters ---------- comp_ids : np.ndarray[np.intp] + obs_ids: np.ndarray[np.intp] + shape : tuple[int] + labels : Sequence[np.ndarray[np.signedinteger]] xnull : bool If nulls are excluded; i.e. -1 labels are passed through. """ if not xnull: - lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") - shape = np.asarray(shape, dtype="i8") + lift + lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp) + arr_shape = np.asarray(shape, dtype=np.intp) + lift + shape = tuple(arr_shape) if not is_int64_overflow_possible(shape): # obs ids are deconstructable! take the fast route! - out = decons_group_index(obs_ids, shape) + out = _decons_group_index(obs_ids, shape) return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] indexer = unique_label_indices(comp_ids) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 37820fe31b6db..396c4d82d01fc 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -22,7 +22,7 @@ from pandas.core.algorithms import safe_sort import pandas.core.common as com from pandas.core.sorting import ( - decons_group_index, + _decons_group_index, get_group_index, is_int64_overflow_possible, lexsort_indexer, @@ -389,7 +389,7 @@ def align(df): ) def test_decons(codes_list, shape): group_index = get_group_index(codes_list, shape, sort=True, xnull=True) - codes_list2 = decons_group_index(group_index, shape) + codes_list2 = _decons_group_index(group_index, shape) for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b)