PERF: assorted (pandas-dev#43822)

me-kbs · Sep 30, 2021 · 742ab04 · 742ab04
1 parent cafa4f0
commit 742ab04
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 16 deletions.
diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
@@ -24,7 +24,10 @@ cnp.import_array()
 from pandas._libs.algos import ensure_int64
 
 from pandas._libs.arrays cimport NDArrayBacked
-from pandas._libs.util cimport is_integer_object
+from pandas._libs.util cimport (
+    is_array,
+    is_integer_object,
+)
 
 
 @cython.final
@@ -61,8 +64,15 @@ cdef class BlockPlacement:
                 self._has_array = True
         else:
             # Cython memoryview interface requires ndarray to be writeable.
-            arr = np.require(val, dtype=np.intp, requirements='W')
-            assert arr.ndim == 1, arr.shape
+            if (
+                not is_array(val)
+                or not cnp.PyArray_ISWRITEABLE(val)
+                or (<ndarray>val).descr.type_num != cnp.NPY_INTP
+            ):
+                arr = np.require(val, dtype=np.intp, requirements='W')
+            else:
+                arr = val
+            # Caller is responsible for ensuring arr.ndim == 1
             self._as_array = arr
             self._has_array = True
 
@@ -254,11 +264,13 @@ cdef class BlockPlacement:
 
         if slc is not None and slc.step == 1:
             new_slc = slice(slc.start * factor, slc.stop * factor, 1)
-            new_placement = np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
+            # equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
+            new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP)
         else:
             # Note: test_pivot_table_empty_aggfunc gets here with `slc is not None`
             mapped = [
-                np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
+                # equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
+                cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP)
                 for x in self
             ]
             new_placement = np.concatenate(mapped)
@@ -681,15 +693,17 @@ cdef class BlockManager:
             cnp.npy_intp length = self.shape[0]
             SharedBlock blk
             BlockPlacement bp
+            ndarray[intp_t] new_blknos, new_blklocs
 
         # equiv: np.empty(length, dtype=np.intp)
         new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
         new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
-        new_blknos.fill(-1)
-        new_blklocs.fill(-1)
+        # equiv: new_blknos.fill(-1)
+        cnp.PyArray_FILLWBYTE(new_blknos, -1)
+        cnp.PyArray_FILLWBYTE(new_blklocs, -1)
 
         for blkno, blk in enumerate(self.blocks):
-            bp = blk.mgr_locs
+            bp = blk._mgr_locs
             # Iterating over `bp` is a faster equivalent to
             #  new_blknos[bp.indexer] = blkno
             #  new_blklocs[bp.indexer] = np.arange(len(bp))

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1610,9 +1610,7 @@ def searchsorted(
             value = cast(int, dtype.type(value))
         else:
             value = pd_array(cast(ArrayLike, value), dtype=dtype)
-    elif not (
-        is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
-    ):
+    else:
         # E.g. if `arr` is an array with dtype='datetime64[ns]'
         # and `value` is a pd.Timestamp, we may need to convert value
         arr = ensure_wrapped_if_datetimelike(arr)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -749,7 +749,7 @@ def apply(
             # group might be modified
             group_axes = group.axes
             res = f(group)
-            if not _is_indexed_like(res, group_axes, axis):
+            if not mutated and not _is_indexed_like(res, group_axes, axis):
                 mutated = True
             result_values.append(res)
 

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -308,7 +308,16 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
         """
         if not isinstance(dtype, PeriodDtype):
             return False
-        return dtype.freq == self.freq
+        # For the subset of DateOffsets that can be a dtype.freq, it
+        #  suffices (and is much faster) to compare the dtype_code rather than
+        #  the freq itself.
+        # See also: PeriodDtype.__eq__
+        freq = dtype.freq
+        own_freq = self.freq
+        return (
+            freq._period_dtype_code == own_freq._period_dtype_code
+            and freq.n == own_freq.n
+        )
 
     # ------------------------------------------------------------------------
     # Index Methods

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1227,7 +1227,7 @@ def idelete(self, indexer) -> BlockManager:
         nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
         new_columns = self.items[~is_deleted]
         axes = [new_columns, self.axes[1]]
-        return type(self)(tuple(nbs), axes)
+        return type(self)(tuple(nbs), axes, verify_integrity=False)
 
     # ----------------------------------------------------------------
     # Block-wise Operation
@@ -2043,7 +2043,7 @@ def _merge_blocks(
 
 def _fast_count_smallints(arr: npt.NDArray[np.intp]):
     """Faster version of set(arr) for sequences of small numbers."""
-    counts = np.bincount(arr.astype(np.int_, copy=False))
+    counts = np.bincount(arr)
     nz = counts.nonzero()[0]
     # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here,
     #  in one benchmark by a factor of 11

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -178,6 +178,8 @@ def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
 def _has_infs(result) -> bool:
     if isinstance(result, np.ndarray):
         if result.dtype == "f8" or result.dtype == "f4":
+            # Note: outside of an nanops-specific test, we always have
+            #  result.ndim == 1, so there is no risk of this ravel making a copy.
             return lib.has_infs(result.ravel("K"))
     try:
         return np.isinf(result).any()

diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
@@ -408,7 +408,9 @@ def test_iget(self):
         cols = Index(list("abc"))
         values = np.random.rand(3, 3)
         block = new_block(
-            values=values.copy(), placement=np.arange(3), ndim=values.ndim
+            values=values.copy(),
+            placement=np.arange(3, dtype=np.intp),
+            ndim=values.ndim,
         )
         mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))])