Skip to content

Commit

Permalink
Prefix scan for various data types with inclusive/exclusive option
Browse files Browse the repository at this point in the history
This commit improves the existing ``jit_scan()`` function with support
for various data types:

- int32/uint32
- uint64
- float
- double

The user can now also specify whether the scan should be inclusive or
exclusive. Finally, the commit adds comments to facilitate future
modifications of this code.
  • Loading branch information
wjakob committed Sep 24, 2023
1 parent 8ecaaf2 commit 8930bc9
Show file tree
Hide file tree
Showing 16 changed files with 10,810 additions and 3,640 deletions.
2 changes: 1 addition & 1 deletion include/drjit-core/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ Array empty(size_t size) {
: AllocType::HostAsync,
byte_size);
return Array::steal(
jit_var_map_mem(Array::Backend, Array::Type, ptr, size, 1));
jit_var_mem_map(Array::Backend, Array::Type, ptr, size, 1));
}

template <typename Array>
Expand Down
48 changes: 31 additions & 17 deletions include/drjit-core/jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1592,30 +1592,44 @@ extern JIT_EXPORT void jit_memcpy_async(JIT_ENUM JitBackend backend, void *dst,
*/
extern JIT_EXPORT void jit_reduce(JIT_ENUM JitBackend backend, JIT_ENUM VarType type,
JIT_ENUM ReduceOp rtype,
const void *ptr, uint32_t size, void *out);
const void *in, uint32_t size, void *out);

/**
* \brief Perform an exclusive scan / prefix sum over an unsigned 32 bit integer
* array
/** \brief Compute a prefix sum over the given input array
*
* Both exclusive and inclusive variants are supported. If desired, the scan
* can be performed in-place (i.e., <tt>out == in</tt>). The operation runs
* asynchronously.
*
 * The operation is currently implemented for the following numeric types:
* ``VarType::Int32``, ``VarType::UInt32``, ``VarType::UInt64``,
* ``VarType::Float32``, and ``VarType::Float64``.
*
* If desired, the scan can be performed in-place (i.e. <tt>in == out</tt>).
* Note that the CUDA implementation will round up \c size to the maximum of
* the following three values for performance reasons:
* Note that the CUDA implementation may round \c size to the maximum of the
* following three values for performance and implementation-related reasons
* (the prefix sum uses a tree-based parallelization scheme).
*
* - the value 4,
* - the value 4
* - the next highest power of two (when size <= 4096),
* - the next highest multiple of 2K (when size > 4096),
*
 * For this reason, the supplied memory regions must be sufficiently large
* to avoid both out-of-bounds reads and writes. This is not an issue for
* memory obtained using \ref jit_malloc(), which internally rounds
* allocations to the next largest power of two and enforces a 64 byte minimum
* allocation size.
*
* Runs asynchronously.
*/
extern JIT_EXPORT void jit_scan_u32(JIT_ENUM JitBackend backend, const uint32_t *in,
uint32_t size, uint32_t *out);
* to avoid out-of-bounds reads and writes. This is not an issue for memory
* obtained using \ref jit_malloc(), which internally rounds allocations to the
* next largest power of two and enforces a 64 byte minimum allocation size.
*
* The CUDA backend implementation for *large* numeric types (double precision
* floats, 64 bit integers) has the following technical limitation: when
* reducing 64-bit integers, their values must be smaller than 2**62. When
* reducing double precision arrays, the two least significant mantissa bits
* are clamped to zero when forwarding the prefix from one 512-wide block to
* the next (at a very minor loss in accuracy). The reason is that the
 * operation requires two status bits to coordinate the prefix and status of
* each 512-wide block, and those must each fit into a single 64 bit value
* (128-bit writes aren't guaranteed to be atomic).
*/
extern JIT_EXPORT void jit_scan(JIT_ENUM JitBackend backend,
JIT_ENUM VarType type, int exclusive,
const void *in, uint32_t size, void *out);

/**
* \brief Compress a mask into a list of nonzero indices
Expand Down
2 changes: 1 addition & 1 deletion resources/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
COMPUTE_CAPABILITY=compute_70
CUDA_VER=10.2
NVCC=/usr/local/cuda-$(CUDA_VER)/bin/nvcc -m64 --ptx --expt-relaxed-constexpr
NVCC=/usr/local/cuda-$(CUDA_VER)/bin/nvcc -m64 --ptx --expt-relaxed-constexpr -std=c++14

all: kernels.h

Expand Down
4 changes: 4 additions & 0 deletions resources/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include <limits>

// Marks a function as a CUDA kernel entry point with C linkage
#define KERNEL extern "C" __global__
// Shorthand for device-side (GPU) functions
#define DEVICE __device__
// Request aggressive inlining for small device helpers
#define FINLINE __forceinline__
// Number of threads per warp on current NVIDIA hardware
#define WARP_SIZE 32
// Bit mask selecting all 32 lanes of a warp (for warp-wide intrinsics)
#define FULL_MASK 0xffffffff

template <typename T> struct SharedMemory {
__device__ inline static T *get() {
Expand Down
10 changes: 10 additions & 0 deletions resources/compress.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@

#include "common.h"

/// Store a 64-bit value to memory using the PTX ``st.cg`` cache operator
/// (cache at global level, i.e. bypass L1). NOTE(review): presumably used so
/// that the written value becomes visible to other thread blocks without a
/// full fence — confirm against the PTX ISA cache-operator semantics.
DEVICE FINLINE void store_cg(uint64_t *ptr, uint64_t val) {
asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val));
}

/// Load a 64-bit value from memory using the PTX ``ld.cg`` cache operator
/// (cache at global level, i.e. bypass L1).
///
/// \param ptr
///     Address to read from. The function only reads through the pointer,
///     so it is declared ``const`` (const-correctness fix; callers passing a
///     non-const pointer are unaffected).
/// \return
///     The 64-bit value read from \c ptr.
DEVICE FINLINE uint64_t load_cg(const uint64_t *ptr) {
    uint64_t retval;
    // 'volatile' prevents the compiler from caching/eliding the load,
    // which matters when another block may have updated the value.
    asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr));
    return retval;
}

KERNEL void compress_small(const uint8_t *in, uint32_t *out, uint32_t size, uint32_t *count_out) {
uint32_t *shared = SharedMemory<uint32_t>::get();

Expand Down
Loading

0 comments on commit 8930bc9

Please sign in to comment.