diff --git a/.githooks/pre-commit b/.githooks/pre-commit index e166dadd03..78b5e1a034 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -4,7 +4,7 @@ # are installed, and if so, uses the installed version to format # the staged changes. -base=clang-format-12 +base=clang-format-13 format="" # Redirect output to stderr. diff --git a/Dockerfile b/Dockerfile index fb272596aa..028b2ee3e4 100755 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ build-essential \ cmake \ - clang-format-12 \ + clang-format-13 \ doxygen \ gdb \ git \ diff --git a/Jenkinsfile b/Jenkinsfile index ac3cfd1a0c..da824299c1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -609,7 +609,7 @@ pipeline { -o -iname \'*.cpp.in\' \ -o -iname \'*.cl\' \ | grep -v -E '(build/)|(install/)|(fin/)' \ - | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\'" + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-13 -style=file {} | diff - {}\'" } steps{ buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, needs_gpu:false, needs_reboot:false) diff --git a/README.md b/README.md index 5858223141..cd498a1933 100755 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ cmake --build . --config Release --target test_tensor All the code is formatted using `clang-format`. To format a file, use: ```shell -clang-format-10 -style=file -i +clang-format-13 -style=file -i ``` To format the code per commit, you can install githooks: diff --git a/docs/install/install.rst b/docs/install/install.rst index a24a5a6bf9..36cce82df2 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -288,7 +288,7 @@ All the code is formatted using `clang-format`. To format a file, use: .. code:: shell - clang-format-10 -style=file -i + clang-format-13 -style=file -i To format the code per commit, you can install githooks: diff --git a/src/composable_kernel/composable_kernel/include/gridwise_operation_wrapper.hpp b/src/composable_kernel/composable_kernel/include/gridwise_operation_wrapper.hpp index 0a1e07ec57..e6e49773f5 100644 --- a/src/composable_kernel/composable_kernel/include/gridwise_operation_wrapper.hpp +++ b/src/composable_kernel/composable_kernel/include/gridwise_operation_wrapper.hpp @@ -4,9 +4,9 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - run_gridwise_operation(Xs... xs) + run_gridwise_operation(Xs... xs) { GridwiseOp{}.Run(xs...); } diff --git a/src/composable_kernel/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp b/src/composable_kernel/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp index ee6a0b7427..0ed43dfc1e 100644 --- a/src/composable_kernel/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp +++ b/src/composable_kernel/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp @@ -90,7 +90,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1 template __device__ static CIndex - CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) { const index_t waveId = get_thread_local_1d_id() / WaveSize; @@ -308,7 +308,7 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline template __device__ static CIndex - CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) { const index_t waveId = get_thread_local_1d_id() / WaveSize; diff --git a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp index fe56d0d813..1a3f0d368b 100644 --- a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp +++ b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp @@ -23,16 +23,16 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_contraction_dlops_v1r2( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AGridDesc_GK0_GM0_GM10_GM11_GK1 a_grid_desc_gk0_gm0_gm10_gm11_gk1, - const BGridDesc_GK0_GN0_GN10_GN11_GK1 b_grid_desc_gk0_gn0_gn10_gn11_gk1, - const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, - const CGridBlockCluster_BlockId_To_GM10_GN10 c_grid_block_cluster_blockid_to_gm10_gn10) + kernel_contraction_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_GK0_GM0_GM10_GM11_GK1 a_grid_desc_gk0_gm0_gm10_gm11_gk1, + const BGridDesc_GK0_GN0_GN10_GN11_GK1 b_grid_desc_gk0_gn0_gn10_gn11_gk1, + const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + const CGridBlockCluster_BlockId_To_GM10_GN10 c_grid_block_cluster_blockid_to_gm10_gn10) { constexpr index_t shared_block_size = GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); diff --git a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp index d91159b884..861377026b 100644 --- a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp +++ b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp @@ -24,16 +24,16 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_dlops_v1r2( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AKM0M1GridDesc a_k_m0_m1_grid_desc, - const BKN0N1GridDesc b_k_n0_n1_grid_desc, - const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, - const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) + kernel_gemm_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AKM0M1GridDesc a_k_m0_m1_grid_desc, + const BKN0N1GridDesc b_k_n0_n1_grid_desc, + const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) { constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); @@ -66,15 +66,15 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_dlops_v1r2(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k_m0_m1_grid_desc, - const void CONSTANT* p_b_k_n0_n1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + kernel_gemm_dlops_v1r2(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k_m0_m1_grid_desc, + const void CONSTANT* p_b_k_n0_n1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { // first cast void CONSTANT void* to void* // second cast void* to Desc* diff --git a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp index 2653dd4340..e6f731cc7b 100644 --- a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp +++ b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp @@ -24,16 +24,16 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_dlops_v1r3( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AK0M0M1K1GridDesc a_k0_m0_m1_k1_grid_desc, - const BK0N0N1K1GridDesc b_k0_n0_n1_k1_grid_desc, - const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, - const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) + kernel_gemm_dlops_v1r3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AK0M0M1K1GridDesc a_k0_m0_m1_k1_grid_desc, + const BK0N0N1K1GridDesc b_k0_n0_n1_k1_grid_desc, + const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor c_blockid_to_m0_n0_block_cluster_adaptor) { constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); @@ -66,15 +66,15 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_dlops_v1r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc, - const void CONSTANT* p_b_k0_n0_n1_k1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + kernel_gemm_dlops_v1r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m0_m1_k1_grid_desc, + const void CONSTANT* p_b_k0_n0_n1_k1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { // first cast void CONSTANT void* to void* // second cast void* to Desc* diff --git a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp index 207f73072f..c08a045bbf 100644 --- a/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp +++ b/src/composable_kernel/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp @@ -22,15 +22,15 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const AK0MK1GridDesc a_k0_m_k1_grid_desc, - const BK0NK1GridDesc b_k0_n_k1_grid_desc, - const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc, - const CBlockClusterAdaptor c_block_cluster_adaptor) + kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AK0MK1GridDesc a_k0_m_k1_grid_desc, + const BK0NK1GridDesc b_k0_n_k1_grid_desc, + const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc, + const CBlockClusterAdaptor c_block_cluster_adaptor) { constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); @@ -56,15 +56,15 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, - const void CONSTANT* p_c_block_cluster_adaptor) + kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_block_cluster_adaptor) { constexpr index_t shared_block_size = GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); diff --git a/src/composable_kernel/composable_kernel/include/utility/magic_division.hpp b/src/composable_kernel/composable_kernel/include/utility/magic_division.hpp index 174c697501..b418106ac4 100644 --- a/src/composable_kernel/composable_kernel/include/utility/magic_division.hpp +++ b/src/composable_kernel/composable_kernel/include/utility/magic_division.hpp @@ -60,7 +60,7 @@ struct MagicDivision // integral_constant template __host__ __device__ static constexpr auto - CalculateMagicNumbers(integral_constant) + CalculateMagicNumbers(integral_constant) { constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor}); @@ -73,7 +73,7 @@ struct MagicDivision template __host__ __device__ static constexpr auto - CalculateMagicMultiplier(integral_constant) + CalculateMagicMultiplier(integral_constant) { constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor}); @@ -82,7 +82,7 @@ struct MagicDivision template __host__ __device__ static constexpr auto - CalculateMagicShift(integral_constant) + CalculateMagicShift(integral_constant) { constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor}); @@ -92,21 +92,21 @@ struct MagicDivision // integral_constant template __host__ __device__ static constexpr auto - CalculateMagicNumbers(integral_constant) + CalculateMagicNumbers(integral_constant) { return CalculateMagicNumbers(integral_constant{}); } template __host__ __device__ static constexpr auto - CalculateMagicMultiplier(integral_constant) + CalculateMagicMultiplier(integral_constant) { return CalculateMagicMultiplier(integral_constant{}); } template __host__ __device__ static constexpr auto - CalculateMagicShift(integral_constant) + CalculateMagicShift(integral_constant) { return CalculateMagicShift(integral_constant{}); } diff --git a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp index 09a7fffa3e..212a2de94e 100644 --- a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp +++ b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp @@ -210,16 +210,16 @@ extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcy extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k_m0_m1_grid_desc, - const void CONSTANT* p_b_k_n0_n1_grid_desc, - const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k_m0_m1_grid_desc, + const void CONSTANT* p_b_k_n0_n1_grid_desc, + const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp index 51d852617f..c31b14b328 100644 --- a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp +++ b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp @@ -206,16 +206,16 @@ extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kc extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { constexpr auto I0 = Number<0>{}; diff --git a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp index 30e4c518ce..2a86f143b5 100644 --- a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp +++ b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp @@ -206,16 +206,16 @@ extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_ky extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_a_k0_m_k1_grid_desc, - const void CONSTANT* p_b_k0_n_k1_grid_desc, - const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, - const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) + convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_a_k0_m_k1_grid_desc, + const void CONSTANT* p_b_k0_n_k1_grid_desc, + const void CONSTANT* p_c_m0_m1_m2_n_grid_desc, + const void CONSTANT* p_c_blockid_to_m0_n0_block_cluster_adaptor) { constexpr auto I0 = Number<0>{}; diff --git a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp index c1208ac3cb..7fb4271649 100644 --- a/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp +++ b/src/composable_kernel/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp @@ -231,13 +231,13 @@ convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(int N_, extern "C" __global__ void #if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - const void CONSTANT* p_desc_tuple) + convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const void CONSTANT* p_desc_tuple) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index e960c04018..126b5c365c 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -243,7 +243,6 @@ class HeartBeat /// * Solution should provide invoker /// * RunAndMeasureSolution must NOT be implemented. Invoker will be used instead. /// -/// clang-format-off /// ----------------------------------------------- /// Dataflow: /// Forward: @@ -261,7 +260,6 @@ class HeartBeat /// | kernel | --> wei[] (dw) /// bot[] (dy) --> +--------+ /// ------------------------------------------------ -/// clang-format-on template using RunAndMeasure_t = diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMatrixDescriptor.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMatrixDescriptor.hpp index 56158ccf72..7760b76b23 100644 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMatrixDescriptor.hpp +++ b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMatrixDescriptor.hpp @@ -53,14 +53,14 @@ __host__ __device__ constexpr auto make_ConstantMatrixDescriptor_packed(Number __host__ __device__ constexpr auto - make_ConstantMatrixDescriptor(Number, Number, Number) +make_ConstantMatrixDescriptor(Number, Number, Number) { return ConstantMatrixDescriptor{}; } template __host__ __device__ constexpr auto - make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) +make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated) { using TDesc = ConstantTensorDescriptor_deprecated; static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantTensorDescriptor_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantTensorDescriptor_deprecated.hpp index 78c1daad88..850f7d78e4 100644 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantTensorDescriptor_deprecated.hpp +++ b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantTensorDescriptor_deprecated.hpp @@ -372,7 +372,7 @@ struct ConstantTensorDescriptor_deprecated template __host__ __device__ static constexpr auto - StridedSlice(Number, Number, Number) + StridedSlice(Number, Number, Number) { constexpr index_t new_stride = Strides::Get(Number{}) * SliceStride; diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate.hpp index 1c63179295..1c65fb7a3a 100644 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate.hpp +++ b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate.hpp @@ -267,7 +267,7 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -275,7 +275,7 @@ struct TensorCoordinate template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp index 494ef1ddd8..a252268039 100644 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp +++ b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp @@ -327,14 +327,14 @@ struct TensorCoordinate_deprecated private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) { return NormalTensorCoordinate_deprecated>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) { return MergedTensorCoordinate_deprecated< ConstantMergedTensorDescriptor_deprecated>(); diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_descriptor_helper.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_descriptor_helper.hpp index e71096d567..f547f90a11 100644 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_descriptor_helper.hpp +++ b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_descriptor_helper.hpp @@ -51,7 +51,7 @@ template __host__ __device__ constexpr auto - transform_tensor_descriptor(LowTensorDescriptor, Transforms, LowDimensionIds, UpDimensionIds) +transform_tensor_descriptor(LowTensorDescriptor, Transforms, LowDimensionIds, UpDimensionIds) { return TransformedTensorDescriptor __host__ __device__ constexpr auto - reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, - Sequence, - Sequence, - Sequence) +reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) { return TransformedTensorDescriptor...>, @@ -78,7 +78,7 @@ __host__ __device__ constexpr auto // reorder a NativeTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -96,7 +96,7 @@ __host__ __device__ constexpr auto // reorder a TransformedTensorDescriptor template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) +reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor, MapLower2Upper) { static_assert(is_valid_sequence_map{}, "wrong! MapLower2Upper is not a valid map"); @@ -114,7 +114,7 @@ __host__ __device__ constexpr auto template __host__ __device__ constexpr auto - reorder_tensor_descriptor_given_upper2lower(LowerTensorDescriptor, MapUpper2Lower) +reorder_tensor_descriptor_given_upper2lower(LowerTensorDescriptor, MapUpper2Lower) { return reorder_tensor_descriptor_given_lower2upper( LowerTensorDescriptor{}, typename sequence_map_inverse::type{}); diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw.cpp index cfc55e9313..fee5bce6dd 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw( - const FLOAT* const __restrict__ p_out_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_in_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.cpp index da38031465..11ebc4e149 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.cpp @@ -4,10 +4,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_out_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_in_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw.cpp index afa27e199b..4cf762cca5 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_out_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_in_global) +__launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw.cpp index 5b485ddc32..0687e941d7 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw( - const FLOAT* const __restrict__ p_out_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_in_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_ncdhw_kczyx_nkdhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.cpp index 794cb02ab7..d648f2e468 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_out_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_in_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp index a215cb9ed4..bcac0c63d0 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_out_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_in_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_data_implicit_gemm_v4r1_xdlops_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_out_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_in_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp index 6b85751408..d7587d40f6 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp index 065f85911f..98f2766ec0 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp index ebf2aa4c21..4bcc4122df 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_out_global, - FLOAT_ACCUM* const __restrict__ p_wei_global) +__launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_out_global, + FLOAT_ACCUM* const __restrict__ p_wei_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp index c0db8b92c5..86767d1e46 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_out_global, - FLOAT_ACCUM* const __restrict__ p_wei_global) +__launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_backward_weights_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_out_global, + FLOAT_ACCUM* const __restrict__ p_wei_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp index d32bf2dc65..9e0d798dd5 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp index 0be48afa0b..0370a7c0c7 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_padded_gemm( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r5_xdlops_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r5_xdlops_nchw_kcyx_nkhw.cpp index ee705df074..2881e6e302 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r5_xdlops_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_forward_implicit_gemm_v4r5_xdlops_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_forward_implicit_gemm_v4r5_xdlops_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_DEPENDENT_BLOCK_SIZE) void gridwise_convolution_forward_implicit_gemm_v4r5_xdlops_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp index f2178f2af9..7fdf688ca8 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_gnchw_gkcyx_gnkhw_lds_double_buffer.cpp @@ -4,10 +4,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r1_gnchw_gkcyx_gnkhw_lds_double_buffer( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r1_gnchw_gkcyx_gnkhw_lds_double_buffer( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp index 7cd2ed0c80..4400f6e4ba 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.cpp @@ -4,10 +4,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp index 1c7fd21e3c..8f4be8d9c6 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp @@ -6,10 +6,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp index f862ce834d..92773ebc0f 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_ncdhw_kczyx_nkdhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp index ff1d531805..dd8069a37c 100644 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp +++ b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp @@ -3,10 +3,10 @@ #include "float_types.h" extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) +__launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw( + const FLOAT* const __restrict__ p_in_global, + const FLOAT* const __restrict__ p_wei_global, + FLOAT* const __restrict__ p_out_global) { using namespace ck; diff --git a/test/gtest/cache.cpp b/test/gtest/cache.cpp index abb5794561..76958efdec 100644 --- a/test/gtest/cache.cpp +++ b/test/gtest/cache.cpp @@ -39,9 +39,9 @@ std::vector random_bytes(size_t length) { auto randchar = []() -> char { - const char charset[] = "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; + const char charset[] = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; const size_t max_index = (sizeof(charset) - 1); return charset[prng::gen_0_to_B(max_index)]; }; diff --git a/test/include_inliner.cpp b/test/include_inliner.cpp index 62e6510d3f..6332fab03c 100644 --- a/test/include_inliner.cpp +++ b/test/include_inliner.cpp @@ -56,12 +56,12 @@ class InlinerTest const auto invalid_src = test_srcs / "invalid.cl"; const auto header_src = test_srcs / header_filename; - // clang-format-off + // clang-format off std::ofstream(valid_src.c_str()) << "#include <" << header_filename << ">\n" << "#include \"" << header_filename << "\"\n" << "//inliner-include-optional\n" << "#include " << std::endl; - // clang-format-on + // clang-format on std::ofstream(asm_src.c_str()) << ".include \"" << header_filename << "\"" << std::endl; std::ofstream(invalid_src.c_str()) << "#include " << std::endl;