Skip to content

Commit

Permalink
PERF: assorted (pandas-dev#43822)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Sep 30, 2021
1 parent cafa4f0 commit 742ab04
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 16 deletions.
30 changes: 22 additions & 8 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ cnp.import_array()
from pandas._libs.algos import ensure_int64

from pandas._libs.arrays cimport NDArrayBacked
from pandas._libs.util cimport is_integer_object
from pandas._libs.util cimport (
is_array,
is_integer_object,
)


@cython.final
Expand Down Expand Up @@ -61,8 +64,15 @@ cdef class BlockPlacement:
self._has_array = True
else:
# Cython memoryview interface requires ndarray to be writeable.
arr = np.require(val, dtype=np.intp, requirements='W')
assert arr.ndim == 1, arr.shape
if (
not is_array(val)
or not cnp.PyArray_ISWRITEABLE(val)
or (<ndarray>val).descr.type_num != cnp.NPY_INTP
):
arr = np.require(val, dtype=np.intp, requirements='W')
else:
arr = val
# Caller is responsible for ensuring arr.ndim == 1
self._as_array = arr
self._has_array = True

Expand Down Expand Up @@ -254,11 +264,13 @@ cdef class BlockPlacement:

if slc is not None and slc.step == 1:
new_slc = slice(slc.start * factor, slc.stop * factor, 1)
new_placement = np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
# equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP)
else:
# Note: test_pivot_table_empty_aggfunc gets here with `slc is not None`
mapped = [
np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
# equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP)
for x in self
]
new_placement = np.concatenate(mapped)
Expand Down Expand Up @@ -681,15 +693,17 @@ cdef class BlockManager:
cnp.npy_intp length = self.shape[0]
SharedBlock blk
BlockPlacement bp
ndarray[intp_t] new_blknos, new_blklocs

# equiv: np.empty(length, dtype=np.intp)
new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
new_blknos.fill(-1)
new_blklocs.fill(-1)
# equiv: new_blknos.fill(-1)
cnp.PyArray_FILLWBYTE(new_blknos, -1)
cnp.PyArray_FILLWBYTE(new_blklocs, -1)

for blkno, blk in enumerate(self.blocks):
bp = blk.mgr_locs
bp = blk._mgr_locs
# Iterating over `bp` is a faster equivalent to
# new_blknos[bp.indexer] = blkno
# new_blklocs[bp.indexer] = np.arange(len(bp))
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1610,9 +1610,7 @@ def searchsorted(
value = cast(int, dtype.type(value))
else:
value = pd_array(cast(ArrayLike, value), dtype=dtype)
elif not (
is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
):
else:
# E.g. if `arr` is an array with dtype='datetime64[ns]'
# and `value` is a pd.Timestamp, we may need to convert value
arr = ensure_wrapped_if_datetimelike(arr)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ def apply(
# group might be modified
group_axes = group.axes
res = f(group)
if not _is_indexed_like(res, group_axes, axis):
if not mutated and not _is_indexed_like(res, group_axes, axis):
mutated = True
result_values.append(res)

Expand Down
11 changes: 10 additions & 1 deletion pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,16 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
if not isinstance(dtype, PeriodDtype):
return False
return dtype.freq == self.freq
# For the subset of DateOffsets that can be a dtype.freq, it
# suffices (and is much faster) to compare the dtype_code rather than
# the freq itself.
# See also: PeriodDtype.__eq__
freq = dtype.freq
own_freq = self.freq
return (
freq._period_dtype_code == own_freq._period_dtype_code
and freq.n == own_freq.n
)

# ------------------------------------------------------------------------
# Index Methods
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,7 +1227,7 @@ def idelete(self, indexer) -> BlockManager:
nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
new_columns = self.items[~is_deleted]
axes = [new_columns, self.axes[1]]
return type(self)(tuple(nbs), axes)
return type(self)(tuple(nbs), axes, verify_integrity=False)

# ----------------------------------------------------------------
# Block-wise Operation
Expand Down Expand Up @@ -2043,7 +2043,7 @@ def _merge_blocks(

def _fast_count_smallints(arr: npt.NDArray[np.intp]):
"""Faster version of set(arr) for sequences of small numbers."""
counts = np.bincount(arr.astype(np.int_, copy=False))
counts = np.bincount(arr)
nz = counts.nonzero()[0]
# Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
# in one benchmark by a factor of 11
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
def _has_infs(result) -> bool:
if isinstance(result, np.ndarray):
if result.dtype == "f8" or result.dtype == "f4":
# Note: outside of a nanops-specific test, we always have
# result.ndim == 1, so there is no risk of this ravel making a copy.
return lib.has_infs(result.ravel("K"))
try:
return np.isinf(result).any()
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,9 @@ def test_iget(self):
cols = Index(list("abc"))
values = np.random.rand(3, 3)
block = new_block(
values=values.copy(), placement=np.arange(3), ndim=values.ndim
values=values.copy(),
placement=np.arange(3, dtype=np.intp),
ndim=values.ndim,
)
mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))])

Expand Down

0 comments on commit 742ab04

Please sign in to comment.