Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Wrap launch bounds #570

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cub/detail/detect_cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ namespace detail

#endif // CUB_RUNTIME_FUNCTION predefined

#ifndef CUB_DETAIL_LAUNCH_BOUNDS
#ifdef CUB_RDC_ENABLED
#define CUB_DETAIL_LAUNCH_BOUNDS(...)
#else // not defined CUB_RDC_ENABLED
#define CUB_DETAIL_LAUNCH_BOUNDS(...) \
__launch_bounds__(__VA_ARGS__)
#endif // CUB_RDC_ENABLED
#endif // CUB_DETAIL_LAUNCH_BOUNDS

#endif // Do not document

} // namespace detail
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ template <
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentHistogramPolicyT::BLOCK_THREADS))
gevtushenko marked this conversation as resolved.
Show resolved Hide resolved
__global__ void DeviceHistogramSweepKernel(
SampleIteratorT d_samples, ///< Input data to reduce
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper, ///< The number bins per final output histogram
Expand Down
6 changes: 4 additions & 2 deletions cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ template <bool UseVShmem,
typename CompareOpT,
typename KeyT,
typename ValueT>
void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
void __global__
DeviceMergeSortBlockSortKernel(bool ping,
KeyInputIteratorT keys_in,
ValueInputIteratorT items_in,
Expand Down Expand Up @@ -136,7 +137,8 @@ template <bool UseVShmem,
typename CompareOpT,
typename KeyT,
typename ValueT>
void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
void __global__
DeviceMergeSortMergeKernel(bool ping,
KeyIteratorT keys_ping,
ValueIteratorT items_ping,
Expand Down
18 changes: 9 additions & 9 deletions cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ template <
bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
typename KeyT, ///< Key type
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
CUB_DETAIL_LAUNCH_BOUNDS(int((ALT_DIGIT_BITS) ?
int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) :
int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)))
__global__ void DeviceRadixSortUpsweepKernel(
Expand Down Expand Up @@ -134,7 +134,7 @@ __global__ void DeviceRadixSortUpsweepKernel(
template <
typename ChainedPolicyT, ///< Chained tuning policy
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanBinsKernel(
OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
int num_counts) ///< [in] Total number of bin-counts
Expand Down Expand Up @@ -184,7 +184,7 @@ template <
typename KeyT, ///< Key type
typename ValueT, ///< Value type
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
CUB_DETAIL_LAUNCH_BOUNDS(int((ALT_DIGIT_BITS) ?
int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) :
int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)))
__global__ void DeviceRadixSortDownsweepKernel(
Expand Down Expand Up @@ -247,7 +247,7 @@ template <
typename KeyT, ///< Key type
typename ValueT, ///< Value type
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceRadixSortSingleTileKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
Expand Down Expand Up @@ -363,7 +363,7 @@ template <
typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator
typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
CUB_DETAIL_LAUNCH_BOUNDS(int((ALT_DIGIT_BITS) ?
ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedRadixSortKernel(
Expand Down Expand Up @@ -536,8 +536,8 @@ template <
bool IS_DESCENDING,
typename KeyT,
typename OffsetT>
__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
DeviceRadixSortHistogramKernel
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
__global__ void DeviceRadixSortHistogramKernel
(OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit)
{
typedef typename ChainedPolicyT::ActivePolicy::HistogramPolicy HistogramPolicyT;
Expand All @@ -555,8 +555,8 @@ template <
typename OffsetT,
typename PortionOffsetT,
typename AtomicOffsetT = PortionOffsetT>
__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
DeviceRadixSortOnesweepKernel
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
__global__ void DeviceRadixSortOnesweepKernel
(AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out,
const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out,
const ValueT* d_values_in, PortionOffsetT num_items, int current_bit, int num_bits)
Expand Down
6 changes: 3 additions & 3 deletions cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ template <typename ChainedPolicyT,
typename OffsetT,
typename ReductionOpT,
typename AccumT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
__global__ void DeviceReduceKernel(InputIteratorT d_in,
AccumT* d_out,
OffsetT num_items,
Expand Down Expand Up @@ -178,7 +178,7 @@ template <typename ChainedPolicyT,
typename ReductionOpT,
typename InitT,
typename AccumT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in,
OutputIteratorT d_out,
OffsetT num_items,
Expand Down Expand Up @@ -304,7 +304,7 @@ template <typename ChainedPolicyT,
typename ReductionOpT,
typename InitT,
typename AccumT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedReduceKernel(
InputIteratorT d_in,
OutputIteratorT d_out,
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ template <typename AgentReduceByKeyPolicyT,
typename ReductionOpT,
typename OffsetT,
typename AccumT>
__launch_bounds__(int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) __global__
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) __global__
void DeviceReduceByKeyKernel(KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ template <
typename ScanTileStateT, ///< Tile status interface type
typename EqualityOpT, ///< T equality operator type
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentRlePolicyT::BLOCK_THREADS))
__global__ void DeviceRleSweepKernel(
InputIteratorT d_in, ///< [in] Pointer to input sequence of data items
OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ template <typename ChainedPolicyT,
typename InitValueT,
typename OffsetT,
typename AccumT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
__global__ void DeviceScanKernel(InputIteratorT d_in,
OutputIteratorT d_out,
ScanTileStateT tile_state,
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ template <typename ChainedPolicyT,
typename OffsetT,
typename AccumT,
typename KeyT = cub::detail::value_t<KeysInputIteratorT>>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
__global__ void DeviceScanByKeyKernel(KeysInputIteratorT d_keys_in,
KeyT *d_keys_prev_in,
ValuesInputIteratorT d_values_in,
Expand Down
8 changes: 4 additions & 4 deletions cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ template <bool IS_DESCENDING,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename OffsetT>
__launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS)
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS)
__global__ void DeviceSegmentedSortFallbackKernel(
const KeyT *d_keys_in_orig,
KeyT *d_keys_out_orig,
Expand Down Expand Up @@ -298,7 +298,7 @@ template <bool IS_DESCENDING,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename OffsetT>
__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
__global__ void DeviceSegmentedSortKernelSmall(
unsigned int small_segments,
unsigned int medium_segments,
Expand Down Expand Up @@ -427,7 +427,7 @@ template <bool IS_DESCENDING,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT,
typename OffsetT>
__launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS)
CUB_DETAIL_LAUNCH_BOUNDS(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS)
__global__ void DeviceSegmentedSortKernelLarge(
const unsigned int *d_segments_indices,
const KeyT *d_keys_in_orig,
Expand Down Expand Up @@ -685,7 +685,7 @@ template <typename ChainedPolicyT,
typename ValueT,
typename BeginOffsetIteratorT,
typename EndOffsetIteratorT>
__launch_bounds__(1) __global__ void
CUB_DETAIL_LAUNCH_BOUNDS(1) __global__ void
DeviceSegmentedSortContinuationKernel(
LargeKernelT large_kernel,
SmallKernelT small_kernel,
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ template <
typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
typename OffsetT, ///< Signed integer type for global offsets
bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output
__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentSelectIfPolicyT::BLOCK_THREADS))
__global__ void DeviceSelectSweepKernel(
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable)
Expand Down
4 changes: 2 additions & 2 deletions cub/device/dispatch/dispatch_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ template <
typename CoordinateT, ///< Merge path coordinate type
bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1
bool HAS_BETA> ///< Whether the input parameter Beta is 0
__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(SpmvPolicyT::BLOCK_THREADS))
__global__ void DeviceSpmvKernel(
SpmvParams<ValueT, OffsetT> spmv_params, ///< [in] SpMV input parameter bundle
CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates
Expand Down Expand Up @@ -197,7 +197,7 @@ template <
typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values
typename OffsetT, ///< Signed integer type for global offsets
typename ScanTileStateT> ///< Tile status interface type
__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
__global__ void DeviceSegmentFixupKernel(
PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ template <typename AgentThreeWayPartitionPolicyT,
typename SelectFirstPartOp,
typename SelectSecondPartOp,
typename OffsetT>
__launch_bounds__(int(AgentThreeWayPartitionPolicyT::BLOCK_THREADS)) __global__
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentThreeWayPartitionPolicyT::BLOCK_THREADS)) __global__
void DeviceThreeWayPartitionKernel(InputIteratorT d_in,
FirstOutputIteratorT d_first_part_out,
SecondOutputIteratorT d_second_part_out,
Expand Down
2 changes: 1 addition & 1 deletion cub/device/dispatch/dispatch_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ template <
typename ScanTileStateT, ///< Tile status interface type
typename EqualityOpT, ///< Equality operator type
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS))
CUB_DETAIL_LAUNCH_BOUNDS(int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS))
__global__ void DeviceUniqueByKeySweepKernel(
KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys
ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values
Expand Down