Skip to content

Commit

Permalink
Prefix scan for various data types with inclusive/exclusive option
Browse files Browse the repository at this point in the history
This commit improves the existing ``jit_scan()`` function with support
for various data types:

- int32/uint32
- uint64
- float
- double

The user can now also specify whether the scan should be inclusive or
exclusive. Finally, the commit adds comments to facilitate future
modifications of this code.
  • Loading branch information
wjakob committed Sep 24, 2023
1 parent 8ecaaf2 commit 8930bc9
Show file tree
Hide file tree
Showing 16 changed files with 10,810 additions and 3,640 deletions.
2 changes: 1 addition & 1 deletion include/drjit-core/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ Array empty(size_t size) {
: AllocType::HostAsync,
byte_size);
return Array::steal(
jit_var_map_mem(Array::Backend, Array::Type, ptr, size, 1));
jit_var_mem_map(Array::Backend, Array::Type, ptr, size, 1));
}

template <typename Array>
Expand Down
48 changes: 31 additions & 17 deletions include/drjit-core/jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1592,30 +1592,44 @@ extern JIT_EXPORT void jit_memcpy_async(JIT_ENUM JitBackend backend, void *dst,
*/
extern JIT_EXPORT void jit_reduce(JIT_ENUM JitBackend backend, JIT_ENUM VarType type,
JIT_ENUM ReduceOp rtype,
const void *ptr, uint32_t size, void *out);
const void *in, uint32_t size, void *out);

/**
* \brief Perform an exclusive scan / prefix sum over an unsigned 32 bit integer
* array
/** \brief Compute a prefix sum over the given input array
*
* Both exclusive and inclusive variants are supported. If desired, the scan
* can be performed in-place (i.e., <tt>out == in</tt>). The operation runs
* asynchronously.
*
 * The operation is currently implemented for the following numeric types:
* ``VarType::Int32``, ``VarType::UInt32``, ``VarType::UInt64``,
* ``VarType::Float32``, and ``VarType::Float64``.
*
* If desired, the scan can be performed in-place (i.e. <tt>in == out</tt>).
* Note that the CUDA implementation will round up \c size to the maximum of
* the following three values for performance reasons:
* Note that the CUDA implementation may round \c size to the maximum of the
* following three values for performance and implementation-related reasons
* (the prefix sum uses a tree-based parallelization scheme).
*
* - the value 4,
* - the value 4
* - the next highest power of two (when size <= 4096),
* - the next highest multiple of 2K (when size > 4096),
*
 * For this reason, the supplied memory regions must be sufficiently large
* to avoid both out-of-bounds reads and writes. This is not an issue for
* memory obtained using \ref jit_malloc(), which internally rounds
* allocations to the next largest power of two and enforces a 64 byte minimum
* allocation size.
*
* Runs asynchronously.
*/
extern JIT_EXPORT void jit_scan_u32(JIT_ENUM JitBackend backend, const uint32_t *in,
uint32_t size, uint32_t *out);
* to avoid out-of-bounds reads and writes. This is not an issue for memory
* obtained using \ref jit_malloc(), which internally rounds allocations to the
* next largest power of two and enforces a 64 byte minimum allocation size.
*
* The CUDA backend implementation for *large* numeric types (double precision
* floats, 64 bit integers) has the following technical limitation: when
* reducing 64-bit integers, their values must be smaller than 2**62. When
* reducing double precision arrays, the two least significant mantissa bits
* are clamped to zero when forwarding the prefix from one 512-wide block to
* the next (at a very minor loss in accuracy). The reason is that the
 * operation requires two status bits to coordinate the prefix and status of
* each 512-wide block, and those must each fit into a single 64 bit value
* (128-bit writes aren't guaranteed to be atomic).
*/
extern JIT_EXPORT void jit_scan(JIT_ENUM JitBackend backend,
JIT_ENUM VarType type, int exclusive,
const void *in, uint32_t size, void *out);

/**
* \brief Compress a mask into a list of nonzero indices
Expand Down
2 changes: 1 addition & 1 deletion resources/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
COMPUTE_CAPABILITY=compute_70
CUDA_VER=10.2
NVCC=/usr/local/cuda-$(CUDA_VER)/bin/nvcc -m64 --ptx --expt-relaxed-constexpr
NVCC=/usr/local/cuda-$(CUDA_VER)/bin/nvcc -m64 --ptx --expt-relaxed-constexpr -std=c++14

all: kernels.h

Expand Down
4 changes: 4 additions & 0 deletions resources/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include <limits>

// Marks a function as a CUDA kernel entry point with C linkage
#define KERNEL extern "C" __global__
// Shorthand for device-side (GPU) functions
#define DEVICE __device__
// Request aggressive inlining for small device helpers
#define FINLINE __forceinline__
// Number of threads per warp on current NVIDIA hardware
#define WARP_SIZE 32
// Bit mask selecting all 32 lanes of a warp (for warp-wide intrinsics)
#define FULL_MASK 0xffffffff

template <typename T> struct SharedMemory {
__device__ inline static T *get() {
Expand Down
10 changes: 10 additions & 0 deletions resources/compress.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@

#include "common.h"

/// Store a 64-bit value to memory using the PTX ``st.cg`` cache operator
/// (cache at global level, i.e. bypass L1). NOTE(review): presumably used so
/// that the written value becomes visible to other thread blocks without a
/// full fence — confirm against the PTX ISA cache-operator semantics.
DEVICE FINLINE void store_cg(uint64_t *ptr, uint64_t val) {
asm volatile("st.cg.u64 [%0], %1;" : : "l"(ptr), "l"(val));
}

/// Load a 64-bit value from memory using the PTX ``ld.cg`` cache operator
/// (cache at global level, i.e. bypass L1).
///
/// \param ptr
///     Address to read from. The function only reads through the pointer,
///     so it is declared ``const`` (const-correctness fix; callers passing a
///     non-const pointer are unaffected).
/// \return
///     The 64-bit value read from \c ptr.
DEVICE FINLINE uint64_t load_cg(const uint64_t *ptr) {
    uint64_t retval;
    // 'volatile' prevents the compiler from caching/eliding the load,
    // which matters when another block may have updated the value.
    asm volatile("ld.cg.u64 %0, [%1];" : "=l"(retval) : "l"(ptr));
    return retval;
}

KERNEL void compress_small(const uint8_t *in, uint32_t *out, uint32_t size, uint32_t *count_out) {
uint32_t *shared = SharedMemory<uint32_t>::get();

Expand Down
Loading

0 comments on commit 8930bc9

Please sign in to comment.