Parallelization: try explicit chunking with numba prange iterating over chunks #24

Huite · 2024-10-11T13:13:57Z

A current pain point is that for parallel queries of faces/boxes/edges, the tree is traversed twice: once to count and allocate, and again to actually store results. This is primarily because we have no threadsafe list or something to append to, across threads.

A simpler scheme, however, is to chunk by the number of threads, create a (C++-like) vector for each, and append to that.

I've setup a implementation:

In query.py:

@nb.njit(inline="always")
def locate_box_parallel(box: Box, tree: CellTreeData, indices: IntArray, indices_size: int, index: int) -> Tuple[int, int]:
    tree_bbox = as_box(tree.bbox)
    if not boxes_intersect(box, tree_bbox):
        return 0, indices, indices_size
    stack = allocate_stack()
    stack[0] = 0
    size = 1
    count = 0

    while size > 0:
        node_index, size = pop(stack, size)
        node = tree.nodes[node_index]
        # Check if it's a leaf
        if node["child"] == -1:
            # Iterate over the bboxes in the leaf
            for i in range(node["ptr"], node["ptr"] + node["size"]):
                bbox_index = tree.bb_indices[i]
                # As a named tuple: saves about 15% runtime
                leaf_box = as_box(tree.bb_coords[bbox_index])
                if boxes_intersect(box, leaf_box):
                    # Re-allocate if needed.
                    indices_length = indices.shape[0]
                    if indices_size >= indices_length:
                        new = np.empty((indices_length * 2, 2), dtype=IntDType)
                        new[: indices_length, :] = indices[:, :]
                        indices = new
                    indices[indices_size, 0] = index
                    indices[indices_size, 1] = bbox_index
                    indices_size += 1
                    count += 1
        else:
            dim = 1 if node["dim"] else 0
            minimum = 2 * dim
            maximum = 2 * dim + 1
            left = box[minimum] <= node["Lmax"]
            right = box[maximum] >= node["Rmin"]
            left_child = node["child"]
            right_child = left_child + 1

            if left and right:
                stack, size = push(stack, left_child, size)
                stack, size = push(stack, right_child, size)
            elif left:
                stack, size = push(stack, left_child, size)
            elif right:
                stack, size = push(stack, right_child, size)

    return count, indices, indices_size


@nb.njit(cache=True)
def locate_boxes_parallel(
    box_coords: FloatArray,
    tree: CellTreeData,
) -> IntArray:
    """Mutates box_indices and tree_indices."""
    n_box = box_coords.shape[0]
    indices = np.empty((n_box, 2), dtype=IntDType)
    total_count = 0
    indices_size = 0
    for box_index in range(n_box):
        box = as_box(box_coords[box_index])
        count, indices, indices_size = locate_box_parallel(box, tree, indices, indices_size, box_index)
        total_count += count
    return np.ascontiguousarray(indices[:total_count, :])


@nb.njit(cache=True, parallel=PARALLEL)
def locate_boxes_chunked(
    box_coords: FloatArray,
    tree: CellTreeData,
    n_chunks: int
):
    chunks = np.array_split(box_coords, n_chunks)
    indices = [np.empty((0, 2), dtype=IntDType) for _ in range(n_chunks)]

    for i in nb.prange(n_chunks):
        indices[i] = locate_boxes_parallel(chunks[i], tree)

    # numba seems unhappy with stack or concatenate.
    sizes = np.empty(n_chunks, dtype=IntDType)
    total_size = 0
    for i in range(n_chunks):
        size = len(indices[i])
        sizes[i] = size
        total_size += size

    ii = np.empty(total_size, dtype=IntDType)
    jj = np.empty(total_size, dtype=IntDType)
    start = 0
    for i in range(n_chunks):
        end = start + sizes[i]
        ii[start: end] = indices[i][:, 0]
        jj[start: end] = indices[i][:, 1]
        start = end

    return ii, jj

And calling it in celltree.py:

    def locate_boxes_new(self, bbox_coords: FloatArray) -> Tuple[IntArray, IntArray]:
        bbox_coords = cast_bboxes(bbox_coords)
        n_chunks = nb.get_num_threads()
        return locate_boxes_chunked(bbox_coords, self.celltree_data, n_chunks)

Then benchmarking it:

import os
# os.environ["NUMBA_NUM_THREADS"] = "24"

import numpy as np
import numba_celltree

triangles = np.fromfile("c:/tmp/triangles.bin", dtype=np.int64).reshape((-1, 3))
vertices = np.fromfile("c:/tmp/vertices.bin", dtype=np.float64).reshape((-1, 2))

tree = numba_celltree.CellTree2d(vertices, triangles, -1)
%timeit results = tree.locate_boxes(tree.bb_coords)
%timeit results = tree.locate_boxes_new(tree.bb_coords)

Unfortunately, results are rather disappointing:

404 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
470 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

The text was updated successfully, but these errors were encountered:

Huite · 2024-10-14T10:47:12Z

Fixed by #25

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Parallelization: try explicit chunking with numba prange iterating over chunks #24

Parallelization: try explicit chunking with numba prange iterating over chunks #24

Huite commented Oct 11, 2024 •

edited

Loading

Huite commented Oct 14, 2024

Parallelization: try explicit chunking with numba prange iterating over chunks #24

Parallelization: try explicit chunking with numba prange iterating over chunks #24

Comments

Huite commented Oct 11, 2024 • edited Loading

Huite commented Oct 14, 2024

Huite commented Oct 11, 2024 •

edited

Loading