From 113620a6a2c13a66ef6677a9dbfbf355f8aa8ce6 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 2 Jul 2024 11:02:38 +0000 Subject: [PATCH 01/46] UnfoldFwd4d driver, test and api --- driver/CMakeLists.txt | 1 + driver/dm_fold.cpp | 0 driver/dm_unfold.cpp | 39 ++ driver/driver.hpp | 5 +- driver/fold_driver.hpp | 0 driver/mloUnfoldHost.hpp | 114 +++++ driver/unfold_driver.hpp | 422 ++++++++++++++++++ include/miopen/miopen.h | 72 +++ src/CMakeLists.txt | 6 + src/fold.cpp | 82 ++++ src/fold/problem_description.cpp | 79 ++++ src/fold_api.cpp | 63 +++ src/include/miopen/fold.hpp | 50 +++ src/include/miopen/fold/invoke_params.hpp | 63 +++ .../miopen/fold/problem_description.hpp | 163 +++++++ src/include/miopen/fold/solvers.hpp | 75 ++++ src/include/miopen/solver_id.hpp | 4 +- src/include/miopen/tensor_view_utils.hpp | 80 ++++ src/kernels/MIOpenUnfold.cpp | 227 ++++++++++ src/kernels/tensor_view.hpp | 78 ++++ src/solver.cpp | 3 + src/solver/fold/fold_forward.cpp | 168 +++++++ src/solver/fold/unfold_forward.cpp | 178 ++++++++ test/cpu_fold.hpp | 104 +++++ test/gtest/fold.cpp | 97 ++++ test/gtest/fold.hpp | 218 +++++++++ 26 files changed, 2388 insertions(+), 3 deletions(-) create mode 100644 driver/dm_fold.cpp create mode 100644 driver/dm_unfold.cpp create mode 100644 driver/fold_driver.hpp create mode 100644 driver/mloUnfoldHost.hpp create mode 100644 driver/unfold_driver.hpp create mode 100644 src/fold.cpp create mode 100644 src/fold/problem_description.cpp create mode 100644 src/fold_api.cpp create mode 100644 src/include/miopen/fold.hpp create mode 100644 src/include/miopen/fold/invoke_params.hpp create mode 100644 src/include/miopen/fold/problem_description.hpp create mode 100644 src/include/miopen/fold/solvers.hpp create mode 100644 src/include/miopen/tensor_view_utils.hpp create mode 100644 src/kernels/MIOpenUnfold.cpp create mode 100644 src/kernels/tensor_view.hpp create mode 100644 src/solver/fold/fold_forward.cpp create mode 100644 src/solver/fold/unfold_forward.cpp create mode 100644 test/cpu_fold.hpp create mode 100644 test/gtest/fold.cpp create mode 100644 test/gtest/fold.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 224e550fed..8ca4ccd5c1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -51,6 +51,7 @@ add_executable(MIOpenDriver dm_softmax.cpp dm_sum.cpp dm_tensorop.cpp + dm_unfold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/dm_fold.cpp b/driver/dm_fold.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/driver/dm_unfold.cpp b/driver/dm_unfold.cpp new file mode 100644 index 0000000000..3d7ed56a91 --- /dev/null +++ b/driver/dm_unfold.cpp @@ -0,0 +1,39 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "registry_driver_maker.hpp"
+#include "unfold_driver.hpp"
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "unfold")
+        return new UnfoldDriver<float, float>();
+    if(base_arg == "unfoldfp16")
+        return new UnfoldDriver<float16, float>();
+    if(base_arg == "unfoldbfp16")
+        return new UnfoldDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index 4cfc2b544e..a7396d272f 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
         "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
-        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16]\n");
+        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" &&
        arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
        arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
-       arg != "catfp16" && arg != "catbfp16" && arg != "--version")
+       arg != "catfp16" && arg != "catbfp16" && arg != "unfold" && arg != "unfoldfp16" &&
+       arg != "unfoldbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp
new file mode 100644
index 0000000000..e69de29bb2
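The host reference introduced below (mloUnfoldHost.hpp) implements the im2col-style unfold: each output column gathers one sliding-window patch of the input. For spatial extent S_i, kernel K_i, padding p_i, dilation d_i, and stride s_i, the window count per dimension is L_i = floor((S_i + 2*p_i - d_i*(K_i - 1) - 1) / s_i) + 1, and an (N, C, H, W) input maps to an (N, C * prod_i K_i, prod_i L_i) output. With the driver defaults (input 2x5x3x4, kernel 2x3, stride 1x1, padding 0x0, dilation 1x1): L_H = (3 - 1 - 1)/1 + 1 = 2 and L_W = (4 - 2 - 1)/1 + 1 = 2, so the output shape is 2 x 30 x 4.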
diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp
new file mode 100644
index 0000000000..465bfa7b4f
--- /dev/null
+++ b/driver/mloUnfoldHost.hpp
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <../test/ford.hpp>
+#include "tensor_view.hpp"
+#include "miopen/tensor_view_utils.hpp"
+#include <miopen/miopen.h>
+#include <cstdint>
+#include <vector>
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloUnFoldFwd4DRunHost(Tgpu* input,
+                              const miopenTensorDescriptor_t inputDesc,
+                              Tcheck* ref_output,
+                              const miopenTensorDescriptor_t ref_outputDesc,
+                              const std::vector<int32_t> kernel_size,
+                              const std::vector<int32_t> stride,
+                              const std::vector<int32_t> padding,
+                              const std::vector<int32_t> dilation)
+{
+    auto input_tv   = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc));
+    auto output_tv  = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc));
+    auto input_dims = miopen::deref(inputDesc).GetLengths();
+    auto input_size = miopen::deref(inputDesc).GetSize();
+
+    const int LOCAL_SIZE = 256;
+    int spatial_dim_size = input_size - 2;
+    const int32_t N      = static_cast<int32_t>(input_dims[0]);
+    const int32_t C      = static_cast<int32_t>(input_dims[1]);
+    int32_t P = 1, L = 1;
+    std::vector<int32_t> ls;
+    for(int i = 0; i < spatial_dim_size; ++i)
+    {
+        P *= kernel_size[i];
+        int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * padding[i] -
+                     dilation[i] * (kernel_size[i] - 1) - 1) /
+                        stride[i] +
+                    1;
+        L *= l;
+        ls.push_back(l);
+    }
+    [[maybe_unused]] int32_t kernel_size_h = kernel_size[0];
+    int32_t kernel_size_w                  = kernel_size[1];
+    int32_t stride_h                       = stride[0];
+    int32_t stride_w                       = stride[1];
+    int32_t padding_h                      = padding[0];
+    int32_t padding_w                      = padding[1];
+    int32_t dilation_h                     = dilation[0];
+    int32_t dilation_w                     = dilation[1];
+    [[maybe_unused]] int32_t LH            = ls[0];
+    int32_t LW                             = ls[1];
+    int32_t H = static_cast<int32_t>(input_dims[2]);
+    int32_t W = static_cast<int32_t>(input_dims[3]);
+    int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE;
+    par_ford(work_size)([&](int gid) {
+        int ncp = gid / L, l = gid % L;
+        int nc = ncp / P, p = ncp % P;
+        int n = nc / C, c = nc % C;
+        if(n >= N)
+            return;
+
+        int lh = l / LW, lw = l % LW;                       // sliding window position
+        int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel
+        int h = lh * stride_h - padding_h + ph * dilation_h;
+        int w = lw * stride_w - padding_w + pw * dilation_w;
+
+        Tgpu x = static_cast<Tgpu>(0.0f);
+        if(0 <= h && h < H && 0 <= w && w < W)
+        {
+            long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h +
+                             input_tv.stride[1] * c + input_tv.stride[0] * n;
+            x = input[input_idx];
+        }
+
+        long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) +
+                          output_tv.stride[0] * n;
+        ref_output[output_idx] = static_cast<Tcheck>(x);
+    });
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloUnFoldBwd4DRunHost(Tgpu* input,
+                              const miopenTensorDescriptor_t inputDesc,
+                              Tcheck* ref_output,
+                              const miopenTensorDescriptor_t ref_outputDesc,
+                              const std::vector<int32_t> kernel_size,
+                              const std::vector<int32_t> stride,
+                              const std::vector<int32_t> padding,
+                              const std::vector<int32_t> dilation)
+{
+    return miopenStatusSuccess;
+}
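The output-shape computation above reappears in the driver's GetandSetData and in the solver invokers; a shared helper would keep them from drifting. A minimal standalone sketch of that computation (UnfoldOutputLengths is a hypothetical name, not part of the patch):

#include <cstdint>
#include <vector>

// Unfold output lengths {N, C * P, L} for an NCHW input, using the same
// window-count formula as mloUnFoldFwd4DRunHost above.
inline std::vector<int32_t> UnfoldOutputLengths(const std::vector<int32_t>& in, // {N, C, H, W}
                                                const std::vector<int32_t>& kernel,
                                                const std::vector<int32_t>& stride,
                                                const std::vector<int32_t>& padding,
                                                const std::vector<int32_t>& dilation)
{
    int32_t P = 1, L = 1;
    for(std::size_t i = 0; i < kernel.size(); ++i)
    {
        P *= kernel[i];
        // number of window positions along spatial dimension i
        L *= (in[i + 2] + 2 * padding[i] - dilation[i] * (kernel[i] - 1) - 1) / stride[i] + 1;
    }
    return {in[0], in[1] * P, L};
}

// e.g. UnfoldOutputLengths({2, 5, 3, 4}, {2, 3}, {1, 1}, {0, 0}, {1, 1}) yields {2, 30, 4}.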
diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp
new file mode 100644
index 0000000000..da835d4f3a
--- /dev/null
+++ b/driver/unfold_driver.hpp
@@ -0,0 +1,422 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_UNFOLD_DRIVER_HPP
+#define GUARD_MIOPEN_UNFOLD_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloUnfoldHost.hpp"
+#include "random.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "util_driver.hpp"
+
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+template <typename Tgpu, typename Tref>
+class UnfoldDriver : public Driver
+{
+public:
+    UnfoldDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&dinputDesc);
+        miopenCreateTensorDescriptor(&doutputDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+    std::vector<int> GetTensorLengthsFromCmdLine();
+    std::vector<int32_t> GetVectorInt32tFromCmdLine(std::string long_name);
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~UnfoldDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(dinputDesc);
+        miopenDestroyTensorDescriptor(doutputDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+
+    miopenTensorDescriptor_t doutputDesc;
+    miopenTensorDescriptor_t dinputDesc;
+
+    std::unique_ptr<GPUMem> input_dev;
+    std::unique_ptr<GPUMem> output_dev;
+
+    std::unique_ptr<GPUMem> doutput_dev;
+    std::unique_ptr<GPUMem> dinput_dev;
+
+    std::vector<Tgpu> input;
+    std::vector<Tgpu> output;
+
+    std::vector<Tgpu> doutput;
+    std::vector<Tgpu> dinput;
+
+    std::vector<Tref> output_host;
+
+    std::vector<Tref> doutput_host;
+
+    std::vector<int32_t> kernel_size;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+};
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::GetandSetData()
+{
+    std::vector<int> input_length = GetTensorLengthsFromCmdLine();
+
+    kernel_size = GetVectorInt32tFromCmdLine("kernelSize");
+    stride      = GetVectorInt32tFromCmdLine("stride");
+    padding     = GetVectorInt32tFromCmdLine("padding");
+    dilation    = GetVectorInt32tFromCmdLine("dilation");
+
+    int spatial_dim_size = input_length.size() - 2;
+
+    const int N = input_length[0];
+    const int C = input_length[1];
+
+    int P = 1, L = 1;
+    std::vector<int> ls;
+    for(int i = 0; i < spatial_dim_size; ++i)
+    {
+        P *= kernel_size[i];
+        int l = (input_length[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) /
+                    stride[i] +
+                1;
+        L *= l;
+        ls.push_back(l);
+    }
+
+    std::vector<int> output_length = {N, (C * P), L};
+    SetTensorNd(inputDesc, input_length, data_type);
+    SetTensorNd(outputDesc, output_length, data_type);
+
+    SetTensorNd(doutputDesc, output_length, data_type);
+    SetTensorNd(dinputDesc, input_length, data_type);
+
+    return miopenStatusSuccess;
+}
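+// With the flags registered in AddCmdLineArgs() below, the base args from
+// dm_unfold.cpp would be exercised in the usual MIOpenDriver fashion, e.g.
+// (the binary path and the second, non-default shape are illustrative only):
+//     ./bin/MIOpenDriver unfold -D 2,5,3,4 -k 2,3 -s 1,1 -p 0,0 -d 1,1 -V 1 -t 1
+//     ./bin/MIOpenDriver unfoldfp16 -D 16,32,16,16 -k 3,3 -V 1
+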
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag(
+        "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int");
+    inflags.AddInputFlag("DimLengths",
+                         'D',
+                         "2,5,3,4",
+                         "The dimensional lengths of the input tensor",
+                         "string");
+    inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str");
+    inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str");
+    inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str");
+    inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int> UnfoldDriver<Tgpu, Tref>::GetTensorLengthsFromCmdLine()
+{
+    std::string lengthsStr = inflags.GetValueStr("DimLengths");
+
+    std::vector<int> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(len);
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(len);
+
+    return (lengths);
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int32_t> UnfoldDriver<Tgpu, Tref>::GetVectorInt32tFromCmdLine(std::string long_name)
+{
+    std::string lengthsStr = inflags.GetValueStr(long_name);
+
+    std::vector<int32_t> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(static_cast<int32_t>(len));
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(static_cast<int32_t>(len));
+
+    return (lengths);
+}
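+// GetTensorLengthsFromCmdLine() and GetVectorInt32tFromCmdLine() above differ
+// only in element type; a single templated parser could serve both. A sketch
+// (ParseCsvInts is a hypothetical helper, not used elsewhere in this patch):
+//
+//     template <typename T>
+//     std::vector<T> ParseCsvInts(const std::string& s)
+//     {
+//         std::vector<T> out;
+//         std::stringstream ss(s); // requires <sstream>
+//         for(std::string tok; std::getline(ss, tok, ',');)
+//             out.push_back(static_cast<T>(std::stoi(tok)));
+//         return out;
+//     }
+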
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t input_sz  = GetTensorSize(inputDesc);
+    size_t output_sz = GetTensorSize(outputDesc);
+
+    size_t doutput_sz = GetTensorSize(doutputDesc);
+    size_t dinput_sz  = GetTensorSize(dinputDesc);
+
+    uint32_t ctx = 0;
+
+    input_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, input_sz, sizeof(Tgpu)));
+    output_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, output_sz, sizeof(Tgpu)));
+
+    doutput_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, doutput_sz, sizeof(Tgpu)));
+    dinput_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, dinput_sz, sizeof(Tgpu)));
+
+    input  = std::vector<Tgpu>(input_sz, static_cast<Tgpu>(0.0f));
+    output = std::vector<Tgpu>(output_sz, static_cast<Tgpu>(0.0f));
+
+    doutput = std::vector<Tgpu>(doutput_sz, static_cast<Tgpu>(1.0f));
+    dinput  = std::vector<Tgpu>(dinput_sz, static_cast<Tgpu>(0.0f));
+
+    output_host = std::vector<Tref>(output_sz, static_cast<Tref>(0.0f));
+
+    doutput_host = std::vector<Tref>(doutput_sz, static_cast<Tref>(0.0f));
+
+    int status;
+
+    for(int i = 0; i < input_sz; i++)
+        input[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    status = input_dev->ToGPU(GetStream(), input.data());
+
+    for(int i = 0; i < doutput_sz; i++)
+    {
+        doutput[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    }
+    status |= doutput_dev->ToGPU(GetStream(), doutput.data());
+    status |= dinput_dev->ToGPU(GetStream(), dinput.data());
+
+    if(status != 0)
+        std::cout << "Unfold Driver Error copying data to GPU\n" << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunForwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenUnfoldForward(GetHandle(),
+                            inputDesc,
+                            input_dev->GetMem(),
+                            outputDesc,
+                            output_dev->GetMem(),
+                            kernel_size.data(),
+                            kernel_size.size(),
+                            stride.data(),
+                            stride.size(),
+                            padding.data(),
+                            padding.size(),
+                            dilation.data(),
+                            dilation.size());
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Unfold Forward Elapsed: " << t.gettime_ms() / iter
+                      << " ms" << std::endl;
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time << " ms"
+                  << std::endl;
+    }
+
+    if(output_dev->FromGPU(GetStream(), output.data()) != 0)
+        std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize()
+                  << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunForwardCPU()
+{
+    mloUnFoldFwd4DRunHost<Tgpu, Tref>(input.data(),
+                                      inputDesc,
+                                      output_host.data(),
+                                      outputDesc,
+                                      kernel_size,
+                                      stride,
+                                      padding,
+                                      dilation);
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunBackwardGPU()
+{
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunBackwardCPU()
+{
+    return miopenStatusSuccess;
+}
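+// Tolerance rationale for GetTolerance() below: fp32 keeps the usual 1.5e-6
+// RMS threshold; fp16 gets 8.2e-3 because its mantissa is 13 bits shorter
+// (roughly 2^13 larger rounding error); bfloat16 is 3 bits shorter still, so
+// the fp16 threshold is scaled by 2^3 = 8, giving 6.56e-2.
+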
+template <typename Tgpu, typename Tref>
+Tref UnfoldDriver<Tgpu, Tref>::GetTolerance()
+{
+    // Computation error of fp16 is ~2^13 (=8192) bigger than
+    // the one of fp32 because mantissa is shorter by 13 bits.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error_output    = miopen::rms_range(output_host, output);
+
+    if(!std::isfinite(error_output) || error_output > tolerance)
+    {
+        std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance
+                  << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < "
+                  << tolerance << ')' << std::endl;
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::VerifyBackward()
+{
+    return miopenStatusSuccess;
+}
+
+#endif // GUARD_MIOPEN_UNFOLD_DRIVER_HPP
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index e768c7b349..ac4e08b63e 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -6582,6 +6582,78 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
 // CLOSEOUT BackendAPI DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+#ifdef MIOPEN_BETA_API
+// Fold APIs
+/** @addtogroup FOLD
+ *
+ *  @{
+ */
+/*! @brief Execute an unfold forward layer
+ *
+ * @param handle           MIOpen handle (input)
+ * @param inputDesc        Tensor descriptor for the data input tensor (input)
+ * @param input            Data tensor input (input)
+ * @param outputDesc       Tensor descriptor for the data output tensor (output)
+ * @param output           Data tensor output (output)
+ * @param kernel_size      Size of the sliding window in each spatial dimension (input)
+ * @param kernel_size_size Size of the kernel_size array (input)
+ * @param stride           Stride array of the sliding window (input)
+ * @param stride_size      Size of the stride array (input)
+ * @param padding          Padding array to be added on both sides of the input (input)
+ * @param padding_size     Size of the padding array (input)
+ * @param dilation         Dilation array controlling the stride of the elements within the
+ * neighborhood (input)
+ * @param dilation_size    Size of the dilation array (input)
+ * @return                 miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle,
+                                                 const miopenTensorDescriptor_t inputDesc,
+                                                 const void* input,
+                                                 const miopenTensorDescriptor_t outputDesc,
+                                                 void* output,
+                                                 const int32_t* kernel_size,
+                                                 const int kernel_size_size,
+                                                 const int32_t* stride,
+                                                 const int stride_size,
+                                                 const int32_t* padding,
+                                                 const int padding_size,
+                                                 const int32_t* dilation,
+                                                 const int dilation_size);
+
+// /*!
@brief Execute an unfold forward layer +// * +// * @param handle MIOpen handle (input) +// * @param inputDesc Tensor descriptor for data input tensor input (input) +// * @param input Data tensor input (input) +// * @param outputDesc Tensor descriptor for data output tensor output (output) +// * @param output Data tensor output (output) +// * @param kernel_size Size of the sliding box array (input) +// * @param kernel_size_size Size of the kernel_size array (input) +// * @param stride Stride array of the sliding box (input) +// * @param stride_size Size of the stride array (input) +// * @param padding Padding array to be added on input (input) +// * @param padding_size Size of the padding array (input) +// * @param dilation Dilation array control the stride of the elements within the neighborhood (input) +// * @param dilation_size Size of the dilation array (input) +// * @return miopenStatus_t +// */ +// MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, +// const miopenTensorDescriptor_t inputDesc, +// const void* input, +// const miopenTensorDescriptor_t outputDesc, +// void* output, +// const int32_t* kernel_size, +// const int kernel_size_size, +// const int32_t* stride, +// const int stride_size, +// const int32_t* padding, +// const int padding_size, +// const int32_t* dilation, +// const int dilation_size); + + /** @} */ +// CLOSEOUT FOLD DOXYGEN GROUP +#endif + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9671eed03c..34153587d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -117,6 +117,8 @@ set( MIOpen_Source expanduser.cpp find_controls.cpp find_db.cpp + fold_api.cpp + fold/problem_description.cpp fused_api.cpp fusion.cpp fusion/problem_description.cpp @@ -256,6 +258,7 @@ set( MIOpen_Source solver/conv_winoRxS.cpp solver/conv_winoRxS_fused.cpp solver/fft.cpp + solver/fold/unfold_forward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp @@ -421,6 +424,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/neuron.inc kernels/rocm_version.inc kernels/stride_array.hpp + kernels/tensor_view.hpp kernels/utilities.inc kernels/workaround_issue_1431.hpp kernels/xform_bidirect_winograd_code.inc @@ -499,6 +503,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv7x7c3h224w224k64u2v2p3q3f1.s kernels/xform_out.s kernels/gcnAsmBNBwdTrainSpatial.s + kernels/MIOpenUnfold.cpp kernels/MIOpenTensorKernels.cl kernels/MIOpenTensorKernelsHip.cpp kernels/MIOpenSubTensorOpWithScalarKernel.cl @@ -578,6 +583,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp argmax.cpp cat.cpp + fold.cpp groupnorm.cpp kernel_cache.cpp layer_norm.cpp diff --git a/src/fold.cpp b/src/fold.cpp new file mode 100644 index 0000000000..97dee5a3e2 --- /dev/null +++ b/src/fold.cpp @@ -0,0 +1,82 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include "miopen/fold/problem_description.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t UnfoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = + fold::UnfoldFwdProblemDescription{inputDesc, outputDesc, kernel_size, kernel_size_size, stride, stride_size, padding, padding_size, dilation, dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"UnfoldFwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp new file mode 100644 index 0000000000..d65ebd020b --- /dev/null +++ b/src/fold/problem_description.cpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace fold { + +// NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const +// { +// auto input_dtype = inputDesc.GetType(); +// auto output_dtype = outputDesc.GetType(); +// auto size = inputDesc.GetElementSize(); + +// std::ostringstream ss; + +// ss << "fold_fwd"; +// ss << "i_dtype" << input_dtype; +// ss << "o_dtype" << output_dtype; +// ss << "size" << size; + +// return NetworkConfig{ss.str()}; +// } + +NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Unfold_fwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_dims" ; + for (auto val : in_dims) { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +} // namespace fold + +} // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp new file mode 100644 index 0000000000..6c02dea728 --- /dev/null +++ b/src/fold_api.cpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include +#include +#include +#include +#include + +extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::UnfoldForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp new file mode 100644 index 0000000000..f536f22ce8 --- /dev/null +++ b/src/include/miopen/fold.hpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_FOLD_HPP_
+#define MIOPEN_FOLD_HPP_
+
+#include <miopen/common.hpp>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+miopenStatus_t UnfoldForward(Handle& handle,
+                             const TensorDescriptor& inputDesc,
+                             ConstData_t input,
+                             const TensorDescriptor& outputDesc,
+                             Data_t output,
+                             const int32_t* kernel_size,
+                             const int kernel_size_size,
+                             const int32_t* stride,
+                             const int stride_size,
+                             const int32_t* padding,
+                             const int padding_size,
+                             const int32_t* dilation,
+                             const int dilation_size);
+} // namespace miopen
+#endif // MIOPEN_FOLD_HPP_
diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp
new file mode 100644
index 0000000000..5bcaf6faf0
--- /dev/null
+++ b/src/include/miopen/fold/invoke_params.hpp
@@ -0,0 +1,63 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "miopen/miopen.h"
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+#include <cstdint>
+
+namespace miopen {
+
+namespace fold {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams() = default;
+
+    const TensorDescriptor* inputDesc  = nullptr;
+    const TensorDescriptor* outputDesc = nullptr;
+
+    ConstData_t input = nullptr;
+    Data_t output     = nullptr;
+
+    const int32_t* kernel_size = nullptr;
+    const int32_t* stride      = nullptr;
+    const int32_t* padding     = nullptr;
+    const int32_t* dilation    = nullptr;
+    int kernel_size_size       = 0;
+    int stride_size            = 0;
+    int padding_size           = 0;
+    int dilation_size          = 0;
+
+    std::size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+};
+
+} // namespace fold
+
+} // namespace miopen
diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp
new file mode 100644
index 0000000000..5dccce8782
--- /dev/null
+++ b/src/include/miopen/fold/problem_description.hpp
@@ -0,0 +1,163 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "miopen/errors.hpp" +#include "miopen/miopen.h" +#include +#include +#include + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace fold { + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +// struct FoldFwdProblemDescription : ProblemDescriptionBase +// { +// FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, +// const TensorDescriptor& outputDesc_, +// const int32_t* kernel_size_, +// const int kernel_size_size_, +// const int32_t* stride_, +// const int stride_size_, +// const int32_t* padding_, +// const int padding_size_, +// const int32_t* dilation_, +// const int dilation_size_) +// : inputDesc(inputDesc_), +// outputDesc(outputDesc_), +// kernel_size(kernel_size_), +// kernel_size_size(kernel_size_size_), +// stride(stride_), +// stride_size(stride_size_), +// padding(padding_), +// padding_size(padding_size_), +// dilation(dilation_), +// dilation_size(dilation_size_) +// { +// // IsValidSize(); +// } + +// // bool IsValidSize() const +// // { +// // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) +// // { +// // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG +// // MIOPEN_THROW(miopenStatusBadParm, +// // "Instance Norm: The input tensor dimension should be in range [2, 5]."); +// // #else +// // return false; +// // #endif +// // } +// // return true; +// // } + +// const TensorDescriptor& GetInputDesc() const { return inputDesc; } +// const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + +// NetworkConfig MakeNetworkConfig() const override; + +// public: +// TensorDescriptor inputDesc; +// TensorDescriptor outputDesc; +// const int32_t* kernel_size; +// const int kernel_size_size; +// const int32_t* stride; +// const int stride_size; +// const int32_t* padding; +// const int padding_size; +// const int32_t* dilation; +// const int dilation_size; +// }; + +struct UnfoldFwdProblemDescription : ProblemDescriptionBase +{ + UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : inputDesc(inputDesc_), + outputDesc(outputDesc_), + 
kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + // IsValidSize(); + } + +// bool IsValidSize() const +// { +// if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) +// { +// #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG +// MIOPEN_THROW(miopenStatusBadParm, +// "Instance Norm: The input tensor dimension should be in range [2, 5]."); +// #else +// return false; +// #endif +// } +// return true; +// } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + +} // namespace fold + +} // namespace miopen diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp new file mode 100644 index 0000000000..743a3b6194 --- /dev/null +++ b/src/include/miopen/fold/solvers.hpp @@ -0,0 +1,75 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace miopen { + +namespace solver { + +namespace fold { + +// using FoldFwdSolverBase = +// NonTunableSolverBase; + +// struct FoldFwd final : FoldFwdSolverBase +// { +// const std::string& SolverDbId() const override { return GetSolverDbId(); } + +// bool IsApplicable( +// const ExecutionContext& context, +// const miopen::fold::FoldFwdProblemDescription& problem) const override; + +// ConvSolution GetSolution( +// const ExecutionContext& context, +// const miopen::fold::FoldFwdProblemDescription& problem) const override; +// }; + +using UnfoldFwdSolverBase = + NonTunableSolverBase; + +struct UnfoldFwd final : UnfoldFwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable( + const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; + + ConvSolution GetSolution( + const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; +}; + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index c52dc020ac..ca3b700772 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -56,7 +56,9 @@ enum class Primitive Reduce, Cat, Mha, - Softmax + Softmax, + Fold, + Unfold, }; struct MIOPEN_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp new file mode 100644 index 0000000000..4a7c0b51ad --- /dev/null +++ b/src/include/miopen/tensor_view_utils.hpp @@ -0,0 +1,80 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_
+#define MIOPEN_TENSOR_VIEW_UTIL_HPP_
+
+#include "../../kernels/tensor_view.hpp"
+#include "miopen/tensor.hpp"
+
+namespace miopen {
+
+template <int N>
+inline tensor_view_t<N> get_inner_expanded_tv(const TensorDescriptor Desc)
+{
+    auto dims    = Desc.GetLengths();
+    auto strides = Desc.GetStrides();
+
+    tensor_view_t<N> tensor_view;
+    for(size_t i = 0; i < N; ++i)
+    {
+        if(i < dims.size())
+        {
+            tensor_view.stride[i] = strides[i];
+            tensor_view.size[i]   = dims[i];
+        }
+        else
+        {
+            tensor_view.stride[i] = (i == 0 ? 1 : strides[i - 1]);
+            tensor_view.size[i]   = 1;
+        }
+    }
+    return tensor_view;
+}
+
+template <int N>
+inline void slice_tv(tensor_view_t<N>& tensor_view, int32_t sliceCount, const int32_t* slices)
+{
+    for(int32_t i = 0; i < sliceCount; i++)
+    {
+        int32_t dim   = slices[4 * i + 0];
+        int32_t start = slices[4 * i + 1];
+        int32_t end   = slices[4 * i + 2];
+        int32_t step  = slices[4 * i + 3];
+
+        if(end > static_cast<int32_t>(tensor_view.size[dim]))
+            end = tensor_view.size[dim];
+
+        auto len = end - start;
+
+        tensor_view.size[dim] = (len + step - 1) / step;
+        tensor_view.stride[dim] *= step;
+    }
+}
+
+} // namespace miopen
+
+#endif // MIOPEN_TENSOR_VIEW_UTIL_HPP_
diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp
new file mode 100644
index 0000000000..1135797401
--- /dev/null
+++ b/src/kernels/MIOpenUnfold.cpp
@@ -0,0 +1,227 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void unfoldForward4D(const TIO* input, + TIO* output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) +{ + /* + * input = {N, C, H, W}, output = {N, C * P, L} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if (n >= N) return; + + + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; + + TIO x = 0; + if (0 <= h && h < H && 0 <= w && w < W) { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } + + long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; +} + +extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, + IN_OUT_TYPE* output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) +{ + unfoldForward4D( input, + output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); +} + +template +__device__ void unfoldBackward4D(const TIO* output_grad, + TIO* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + /* + * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if (n >= N) return; + + FLOAT_ACCUM sum = 0.0f; + for (int ph = 0; ph < kernel_size_h; ++ph) + { + for (int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if (lhsh % stride_h != 0) continue; + if (lwsw % stride_w != 0) continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if (lh < 0 || LH <= lh) continue; + if (lw < 0 || LW <= lw) continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + output_grad_tv.stride[0] * n; + sum += 
CVT_FLOAT2ACCUM(output_grad[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); +} + +extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, + IN_OUT_TYPE* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + unfoldBackward4D(output_grad, + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp new file mode 100644 index 0000000000..d35bfd93fc --- /dev/null +++ b/src/kernels/tensor_view.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_TENSOR_VIEW_HPP +#define GUARD_TENSOR_VIEW_HPP + +template +struct tensor_layout_t; + +template +struct tensor_view_t +{ + // Get index in tensor view at tensor layout + constexpr uint64_t get_tensor_view_idx(const tensor_layout_t& tensor_layout) + { + static_assert(N > 0); + uint64_t idx = 0; + for(auto i = 0; i < N; ++i) + { + idx += stride[i] * tensor_layout.layout[i]; + } + return idx; + } + uint64_t stride[N]; + uint64_t size[N]; +}; + +template +struct tensor_layout_t +{ + // Make tensor layout at index using tensor view + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t idx) + { + static_assert(N > 0); + uint64_t temp = idx; + if constexpr(N == 1) + { + layout[0] = idx; + } + else + { + for(auto i = N - 1; i > 1; --i) + { + layout[i] = temp % tensor_view.size[i]; + temp = temp / tensor_view.size[i]; + } + layout[1] = temp % tensor_view.size[1]; + layout[0] = temp / tensor_view.size[1]; + } + } + + uint64_t layout[N]; +}; + +#endif // GUARD_TENSOR_VIEW_HPP diff --git a/src/solver.cpp b/src/solver.cpp index f45f3058a6..97fa4637f3 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -648,6 +649,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Mha, mha::Mha{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); + // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp new file mode 100644 index 0000000000..d3e44c0d33 --- /dev/null +++ b/src/solver/fold/fold_forward.cpp @@ -0,0 +1,168 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+#endif // GUARD_TENSOR_VIEW_HPP
diff --git a/src/solver.cpp b/src/solver.cpp
index f45f3058a6..97fa4637f3 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -28,6 +28,7 @@
 #include
 #include
+#include <miopen/fold/solvers.hpp>
 #include
 #include
 #include
@@ -648,6 +649,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Mha, mha::Mha{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
+    // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp
new file mode 100644
index 0000000000..d3e44c0d33
--- /dev/null
+++ b/src/solver/fold/fold_forward.cpp
@@ -0,0 +1,168 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/fold/problem_description.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace fold {
+
+bool FoldFwd::IsApplicable(
+    [[maybe_unused]] const ExecutionContext& /*context*/,
+    const miopen::fold::FoldFwdProblemDescription& problem) const
+{
+    return true;
+}
+
+ConvSolution FoldFwd::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    const miopen::fold::FoldFwdProblemDescription& problem) const
+{
+    std::ignore = context;
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetOutputDesc().GetType();
+    auto input_dims = problem.GetInputDesc().GetLengths();
+
+    auto output_dims = problem.GetOutputDesc().GetLengths();
+    const int32_t N = static_cast<int32_t>(output_dims[0]);
+    const int32_t C = static_cast<int32_t>(output_dims[1]);
+    int32_t H = static_cast<int32_t>(output_dims[2]);
+    int32_t W = static_cast<int32_t>(output_dims[3]);
+
+    {
+        auto kernel = KernelInfo{};
+        kernel.kernel_file = "MIOpenUnfold.cpp";
+        kernel.kernel_name = "UnfoldBackward4D";
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        };
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        size_t xlocalsize = LOCAL_SIZE;
+        size_t xgridsize = AlignUp(N * C * H * W, LOCAL_SIZE);
+        size_t ylocalsize = 1;
+        size_t ygridsize = 1;
+        size_t zlocalsize = 1;
+        size_t zgridsize = 1;
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(ylocalsize);
+        kernel.l_wk.push_back(zlocalsize);
+
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(ygridsize);
+        kernel.g_wk.push_back(zgridsize);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::fold::InvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc));
+            auto input_dims = deref(params.inputDesc).GetLengths();
+            auto output_dims = deref(params.outputDesc).GetLengths();
+
+            int spatial_dim_size = output_dims.size() - 2;
+            const int32_t N = static_cast<int32_t>(output_dims[0]);
+            const int32_t C = static_cast<int32_t>(output_dims[1]);
+            int32_t P = 1, L = 1;
+            std::vector<int32_t> ls;
+            for (int i = 0; i < spatial_dim_size; ++i) {
+                P *= params.kernel_size[i];
+                int32_t l = (output_dims[i + 2] + 2 * params.padding[i] -
+                             params.dilation[i] * (params.kernel_size[i] - 1) - 1) /
+                            params.stride[i] +
+                            1;
+                L *= l;
+                ls.push_back(l);
+            }
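+            // Worked example of the window count above (a sketch using the
+            // driver's default 4D shape from this patch): for spatial dims
+            // H = 3, W = 4 with kernel {2, 3}, stride {1, 1}, padding {0, 0},
+            // dilation {1, 1}:
+            //     LH = (3 + 2*0 - 1*(2 - 1) - 1) / 1 + 1 = 2
+            //     LW = (4 + 2*0 - 1*(3 - 1) - 1) / 1 + 1 = 2
+            //     L  = LH * LW = 4, P = 2 * 3 = 6
+            // so the column tensor is {N, C * P, L} = {N, 6 * C, 4}. FoldFwd
+            // runs UnfoldBackward4D because fold (col2im) is the adjoint of
+            // unfold (im2col): each output pixel sums every column entry that
+            // unfold would have copied from it.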
+
+            int32_t kernel_size_h = params.kernel_size[0];
+            int32_t kernel_size_w = params.kernel_size[1];
+            int32_t stride_h = params.stride[0];
+            int32_t stride_w = params.stride[1];
+            int32_t padding_h = params.padding[0];
+            int32_t padding_w = params.padding[1];
+            int32_t dilation_h = params.dilation[0];
+            int32_t dilation_w = params.dilation[1];
+            int32_t LH = ls[0];
+            int32_t LW = ls[1];
+            int32_t H = static_cast<int32_t>(output_dims[2]);
+            int32_t W = static_cast<int32_t>(output_dims[3]);
+
+            kernel(params.input,
+                   params.output,
+                   N,
+                   C,
+                   H,
+                   W,
+                   P,
+                   L,
+                   LH,
+                   LW,
+                   kernel_size_h,
+                   kernel_size_w,
+                   stride_h,
+                   stride_w,
+                   padding_h,
+                   padding_w,
+                   dilation_h,
+                   dilation_w,
+                   input_tv,
+                   output_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace fold
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp
new file mode 100644
index 0000000000..68f8072e74
--- /dev/null
+++ b/src/solver/fold/unfold_forward.cpp
@@ -0,0 +1,178 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/fold/problem_description.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace fold {
+
+bool UnfoldFwd::IsApplicable(
+    [[maybe_unused]] const ExecutionContext& /*context*/,
+    const miopen::fold::UnfoldFwdProblemDescription& problem) const
+{
+    return true;
+}
+
+ConvSolution UnfoldFwd::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    const miopen::fold::UnfoldFwdProblemDescription& problem) const
+{
+    std::ignore = context;
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetOutputDesc().GetType();
+    auto input_dims = problem.GetInputDesc().GetLengths();
+
+    auto output_dims = problem.GetOutputDesc().GetLengths();
+    const int32_t N = static_cast<int32_t>(input_dims[0]);
+    const int32_t C = static_cast<int32_t>(input_dims[1]);
+    int spatial_dim_size = input_dims.size() - 2;
+    int32_t P = 1, L = 1;
+    std::vector<int32_t> ls;
+    for (int i = 0; i < spatial_dim_size; ++i) {
+        P *= problem.kernel_size[i];
+        int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * problem.padding[i] -
+                     problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) /
+                    problem.stride[i] +
+                    1;
+        L *= l;
+        ls.push_back(l);
+    }
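+    // Callers must size the output descriptor to match this arithmetic: for a
+    // 4D input {N, C, H, W} the unfold output is {N, C * P, L} with
+    // P = kernel_size_h * kernel_size_w and L the product of the per-dimension
+    // window counts computed above. A hypothetical shape helper (illustrative
+    // sketch, not part of MIOpen's API):
+    //
+    //     std::vector<size_t> UnfoldOutputLengths(const std::vector<size_t>& in,
+    //                                             const std::vector<int32_t>& k,
+    //                                             const std::vector<int32_t>& s,
+    //                                             const std::vector<int32_t>& p,
+    //                                             const std::vector<int32_t>& d)
+    //     {
+    //         int32_t P = 1, L = 1;
+    //         for(size_t i = 0; i < in.size() - 2; ++i)
+    //         {
+    //             P *= k[i];
+    //             L *= (static_cast<int32_t>(in[i + 2]) + 2 * p[i] -
+    //                   d[i] * (k[i] - 1) - 1) / s[i] + 1;
+    //         }
+    //         return {in[0], in[1] * static_cast<size_t>(P), static_cast<size_t>(L)};
+    //     }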
"ushort" : in_dtype}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); + auto input_dims = deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); + + int spatial_dim_size = input_dims.size() - 2; + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= params.kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + + kernel(params.input, + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp new file mode 100644 index 0000000000..0e9588e000 --- /dev/null +++ b/test/cpu_fold.hpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_CPU_UNFOLD_HPP
+#define GUARD_CPU_UNFOLD_HPP
+
+#pragma once
+
+#include "miopen/tensor.hpp"
+#include "tensor_holder.hpp"
+#include "tensor_view.hpp"
+#include "miopen/tensor_view_utils.hpp"
+
+template <class T>
+void cpu_unfold_fwd_4d(tensor<T> input_tensor,
+                       tensor<T>& ref_output_tensor,
+                       const std::vector<int32_t> kernel_size,
+                       const std::vector<int32_t> stride,
+                       const std::vector<int32_t> padding,
+                       const std::vector<int32_t> dilation)
+{
+    auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc);
+    auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc);
+    auto input_size = input_tensor.desc.GetSize();
+    auto input_dims = input_tensor.desc.GetLengths();
+
+    auto input = input_tensor.data.data();
+    auto output = ref_output_tensor.data.data();
+
+    const int LOCAL_SIZE = 256;
+    int spatial_dim_size = input_size - 2;
+
+    const int32_t N = static_cast<int32_t>(input_dims[0]);
+    const int32_t C = static_cast<int32_t>(input_dims[1]);
+
+    int32_t P = 1, L = 1;
+    std::vector<int32_t> ls;
+    for (int i = 0; i < spatial_dim_size; ++i) {
+        P *= kernel_size[i];
+        int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * padding[i] -
+                     dilation[i] * (kernel_size[i] - 1) - 1) /
+                    stride[i] +
+                    1;
+        L *= l;
+        ls.push_back(l);
+    }
+
+    int32_t kernel_size_h = kernel_size[0];
+    int32_t kernel_size_w = kernel_size[1];
+    int32_t stride_h = stride[0];
+    int32_t stride_w = stride[1];
+    int32_t padding_h = padding[0];
+    int32_t padding_w = padding[1];
+    int32_t dilation_h = dilation[0];
+    int32_t dilation_w = dilation[1];
+    int32_t LH = ls[0];
+    int32_t LW = ls[1];
+    int32_t H = static_cast<int32_t>(input_dims[2]);
+    int32_t W = static_cast<int32_t>(input_dims[3]);
+    int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE;
+    par_ford(work_size)([&](int gid) {
+        int ncp = gid / L, l = gid % L;
+        int nc = ncp / P, p = ncp % P;
+        int n = nc / C, c = nc % C;
+        if (n >= N) return;
+
+        int lh = l / LW, lw = l % LW;                       // sliding window position
+        int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel
+        int h = lh * stride_h - padding_h + ph * dilation_h;
+        int w = lw * stride_w - padding_w + pw * dilation_w;
+
+        T x = static_cast<T>(0.0f);
+        if (0 <= h && h < H && 0 <= w && w < W) {
+            long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n;
+            x = input[input_idx];
+        }
+
+        long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n;
+        output[output_idx] = x;
+    });
+}
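+// The reference assigns one flat work item per column element and unpacks it
+// with nested divmods, exactly mirroring the GPU kernel. Traced for a single
+// id (illustrative numbers matching the first test config, where N=2, C=5,
+// P=6, L=4, LW=2, kernel_size_w=3):
+//
+//     gid = 100:  l = 100 % 4 = 0,  ncp = 100 / 4 = 25
+//                 p = 25 % 6  = 1,  nc  = 25 / 6  = 4
+//                 c = 4 % 5   = 4,  n   = 4 / 5   = 0
+//     window (lh, lw) = (0, 0), kernel offset (ph, pw) = (0, 1)
+//     => reads input (n=0, c=4, h = 0*1 - 0 + 0*1 = 0, w = 0*1 - 0 + 1*1 = 1)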
+#endif
diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp
new file mode 100644
index 0000000000..d1843ae3c8
--- /dev/null
+++ b/test/gtest/fold.cpp
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "fold.hpp"
+#include "miopen/bfloat16.hpp"
+#include "tensor_holder.hpp"
+#include
+
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace fold {
+struct UnfoldForwardTestFloat32 : UnfoldFwdTest<float>
+{
+};
+
+struct UnfoldForwardTestFloat16 : UnfoldFwdTest<half_float::half>
+{
+};
+
+struct UnfoldForwardTestBFloat16 : UnfoldFwdTest<bfloat16>
+{
+};
+} // namespace fold
+using namespace fold;
+TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest)
+{
+    if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet,
+                         UnfoldForwardTestFloat32,
+                         testing::ValuesIn(UnfoldTestConfigs()));
+
+TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest)
+{
+    if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet,
+                         UnfoldForwardTestFloat16,
+                         testing::ValuesIn(UnfoldTestConfigs()));
+
+TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest)
+{
+    if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet,
+                         UnfoldForwardTestBFloat16,
+                         testing::ValuesIn(UnfoldTestConfigs()));
diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp
new file mode 100644
index 0000000000..eee1e79fef
--- /dev/null
+++ b/test/gtest/fold.hpp
@@ -0,0 +1,218 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "../driver/tensor_driver.hpp"
+#include "cpu_fold.hpp"
+#include "get_handle.hpp"
+#include "miopen/allocator.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct UnfoldTestCase
+{
+    size_t N;
+    size_t C;
+    size_t D;
+    size_t H;
+    size_t W;
+    std::vector<int32_t> kernelSize;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+    bool isContiguous = true;
+    friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc)
+    {
+        os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H
+           << " W:" << tc.W << " kernel_size:";
+        for (const auto& ks : tc.kernelSize) os << ks << " ";
+        os << "stride:";
+        for (const auto& s : tc.stride) os << s << " ";
+        os << "padding:";
+        for (const auto& p : tc.padding) os << p << " ";
+        os << "dilation:";
+        for (const auto& d : tc.dilation) os << d << " ";
+        os << "isContiguous:" << std::boolalpha << tc.isContiguous;
+        return os;
+    }
+
+    std::vector<size_t> GetInput()
+    {
+        if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, D, H, W});
+        }
+        else if((N != 0) && (C != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, H, W});
+        }
+        else if((N != 0) && (C != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, W});
+        }
+        else if((N != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, W});
+        }
+        else if(N != 0)
+        {
+            return std::vector<size_t>({N});
+        }
+        else
+        {
+            std::cout << "Error: invalid input tensor lengths" << std::endl;
+            return std::vector<size_t>({0});
+        }
+    }
+
+    std::vector<size_t> ComputeStrides(std::vector<size_t> inputDim) const
+    {
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size());
+        strides.back() = 1;
+        for(int i = inputDim.size() - 2; i >= 0; --i)
+            strides[i] = strides[i + 1] * inputDim[i + 1];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
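+// ComputeStrides yields a plain row-major layout when isContiguous is true;
+// otherwise it swaps the outermost and innermost lengths before the row-major
+// pass and swaps the resulting strides back, producing a transposed memory
+// layout behind an unchanged set of lengths. Worked example for lengths
+// {2, 5, 3, 4} (illustrative only; the configs below all use the contiguous
+// path):
+//
+//     isContiguous == true:  strides = {60, 12, 4, 1}
+//     isContiguous == false: swap dims -> {4, 5, 3, 2},
+//                            row-major -> {30, 6, 2, 1},
+//                            swap strides back -> {1, 6, 2, 30}
+//
+// so element (n, c, h, w) then lives at offset n + 6*c + 2*h + 30*w: the N and
+// W axes trade places in memory while the lengths stay {2, 5, 3, 4}.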
+
+std::vector<UnfoldTestCase> UnfoldTestConfigs()
+{ // n c d h w kernel_size stride padding dilation isContiguous
+    return {
+        {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true},
+        {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true},
+        {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true},
+    };
+}
+
+template <typename T>
+struct UnfoldFwdTest : public ::testing::TestWithParam<UnfoldTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config = GetParam();
+
+        std::vector<size_t> in_dims = config.GetInput();
+        std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
+
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_one = [&](auto...) { return 1; };
+        auto gen_zero = [&](auto...) { return 0; };
+        input = tensor<T>{in_dims, in_strides}.generate(gen_value);
+
+        int spatial_dim_size = in_dims.size() - 2;
+        const int32_t N = static_cast<int32_t>(in_dims[0]);
+        const int32_t C = static_cast<int32_t>(in_dims[1]);
+        int32_t P = 1, L = 1;
+        std::vector<int32_t> ls;
+        for (int i = 0; i < spatial_dim_size; ++i) {
+            P *= config.kernelSize[i];
+            int32_t l = (static_cast<int32_t>(in_dims[i + 2]) + 2 * config.padding[i] -
+                         config.dilation[i] * (config.kernelSize[i] - 1) - 1) /
+                        config.stride[i] +
+                        1;
+            L *= l;
+            ls.push_back(l);
+        }
+
+        std::vector<size_t> out_dims{static_cast<size_t>(N),
+                                     static_cast<size_t>(C * P),
+                                     static_cast<size_t>(L)};
+
+        output = tensor<T>{out_dims}.generate(gen_zero);
+        outputHost = tensor<T>{out_dims}.generate(gen_zero);
+
+        input_dev = handle.Write(input.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::UnfoldForward(handle,
+                                       input.desc,
+                                       input_dev.get(),
+                                       output.desc,
+                                       output_dev.get(),
+                                       config.kernelSize.data(),
+                                       static_cast<int>(config.kernelSize.size()),
+                                       config.stride.data(),
+                                       static_cast<int>(config.stride.size()),
+                                       config.padding.data(),
+                                       static_cast<int>(config.padding.size()),
+                                       config.dilation.data(),
+                                       static_cast<int>(config.dilation.size()));
+
+        cpu_unfold_fwd_4d(input,
+                          outputHost,
+                          config.kernelSize,
+                          config.stride,
+                          config.padding,
+                          config.dilation);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+        auto error_output = miopen::rms_range(outputHost, output);
+        EXPECT_TRUE(error_output < tolerance)
+            << "Error forward output beyond tolerance. Error: {" << error_output
+            << "}, Tolerance: " << tolerance;
+    }
+    UnfoldTestCase config;
+
+    tensor<T> input;
+    tensor<T> output;
+
+    tensor<T> outputHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+};

From 6902fdfda3e8c31c1ea7524ff6dc744a400ab359 Mon Sep 17 00:00:00 2001
From: Duong Le
Date: Thu, 4 Jul 2024 03:23:41 +0000
Subject: [PATCH 02/46] githook format

---
 driver/mloUnfoldHost.hpp                    | 103 +++----
 driver/unfold_driver.hpp                    | 101 +++----
 include/miopen/miopen.h                     |  32 ++-
 src/fold.cpp                                |  58 ++--
 src/fold/problem_description.cpp            |  21 +-
 src/fold_api.cpp                            |  48 ++--
 src/include/miopen/fold.hpp                 |  24 +-
 src/include/miopen/fold/invoke_params.hpp   |  22 +-
 .../miopen/fold/problem_description.hpp     |  46 +--
 src/include/miopen/fold/solvers.hpp         |  11 +-
 src/kernels/MIOpenUnfold.cpp                | 261 +++++++++---------
 src/solver/fold/fold_forward.cpp            |  99 ++++---
 src/solver/fold/unfold_forward.cpp          | 102 +++----
 test/cpu_fold.hpp                           |  83 +++---
 test/gtest/fold.hpp                         |  79 +++---
 15 files changed, 554 insertions(+), 536 deletions(-)

diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp
index 465bfa7b4f..7941eb63c4 100644
--- a/driver/mloUnfoldHost.hpp
+++ b/driver/mloUnfoldHost.hpp
@@ -35,80 +35,85 @@
 template <typename Tgpu, typename Tcheck>
 int32_t mloUnFoldFwd4DRunHost(Tgpu* input,
-                             const miopenTensorDescriptor_t inputDesc,
-                             Tcheck* ref_output,
-                             const miopenTensorDescriptor_t
ref_outputDesc, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) { - auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); - auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc)); - auto input_dims = miopen::deref(inputDesc).GetLengths(); - auto input_size = miopen::deref(inputDesc).GetSize(); + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc)); + auto input_dims = miopen::deref(inputDesc).GetLengths(); + auto input_size = miopen::deref(inputDesc).GetSize(); const int LOCAL_SIZE = 256; int spatial_dim_size = input_size - 2; - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - [[maybe_unused]] int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + [[maybe_unused]] int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { - int ncp = gid / L, l = gid % L; - int nc = ncp / P, p = ncp % P; - int n = nc / C, c = nc % C; - if (n >= N) return; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; - int lh = l / LW, lw = l % LW; // sliding window position - int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel - int h = lh * stride_h - padding_h + ph * dilation_h; - int w = lw * stride_w - padding_w + pw * dilation_w; + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; - Tgpu x = static_cast(0.0f); - if (0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; - x = input[input_idx]; - } + Tgpu x = static_cast(0.0f); + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } - long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - 
ref_output[output_idx] = static_cast(x); + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + ref_output[output_idx] = static_cast(x); }); - + return miopenStatusSuccess; } template int32_t mloUnFoldBwd4DRunHost(Tgpu* input, - const miopenTensorDescriptor_t inputDesc, - Tcheck* ref_output, - const miopenTensorDescriptor_t ref_outputDesc, - const std::vector kernel_size, - const std::vector stride, - const std::vector padding, - const std::vector dilation) + const miopenTensorDescriptor_t inputDesc, + Tcheck* ref_output, + const miopenTensorDescriptor_t ref_outputDesc, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) { return miopenStatusSuccess; } diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index da835d4f3a..cfa25d3a85 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -135,37 +135,27 @@ int UnfoldDriver::GetandSetData() std::vector input_length = GetTensorLengthsFromCmdLine(); kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); - stride = GetVectorInt32tFromCmdLine("stride"); - padding = GetVectorInt32tFromCmdLine("padding"); - dilation = GetVectorInt32tFromCmdLine("dilation"); - std::cout << "asdasdkernel_size " << kernel_size.size() << std::endl; - std::cout << "stride " << stride.size() << std::endl; - std::cout << "padding " << padding.size() << std::endl; - std::cout << "dilation " << dilation.size() << std::endl; - + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); int spatial_dim_size = input_length.size() - 2; - std::cout << "spatial_dim_size " << spatial_dim_size << std::endl; - const int N = input_length[0]; const int C = input_length[1]; int P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; - int l = (input_length[i + 2] + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / - stride[i] + 1; + int l = (input_length[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; L *= l; ls.push_back(l); } - std::cout << "input-asdasd " << input_length.size() << std::endl; - - std::vector output_length = {N, (C*P), L}; - std::cout << "output_length " << output_length.size() << std::endl; + std::vector output_length = {N, (C * P), L}; SetTensorNd(inputDesc, input_length, data_type); SetTensorNd(outputDesc, output_length, data_type); - SetTensorNd(doutputDesc, output_length, data_type); SetTensorNd(dinputDesc, input_length, data_type); @@ -175,12 +165,10 @@ int UnfoldDriver::GetandSetData() template int UnfoldDriver::AddCmdLineArgs() { - inflags.AddInputFlag("forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); - inflags.AddInputFlag("DimLengths", - 'D', - "2,5,3,4", - "The dimensional lengths of the input tensor", - "string"); + inflags.AddInputFlag( + "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); + inflags.AddInputFlag( + "DimLengths", 'D', "2,5,3,4", "The dimensional lengths of the input tensor", "string"); inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); inflags.AddInputFlag("padding", 'p', "0,0", "Stride (Default=0,0)", "str"); @@ -257,27 +245,27 @@ std::vector 
UnfoldDriver::GetVectorInt32tFromCmdLine(std::s template int UnfoldDriver::AllocateBuffersAndCopy() { - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); size_t doutput_sz = GetTensorSize(doutputDesc); size_t dinput_sz = GetTensorSize(dinputDesc); uint32_t ctx = 0; - input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); - output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); doutput_dev = std::unique_ptr(new GPUMem(ctx, doutput_sz, sizeof(Tgpu))); dinput_dev = std::unique_ptr(new GPUMem(ctx, dinput_sz, sizeof(Tgpu))); - input = std::vector(input_sz, static_cast(0.0f)); - output = std::vector(output_sz, static_cast(0.0f)); + input = std::vector(input_sz, static_cast(0.0f)); + output = std::vector(output_sz, static_cast(0.0f)); doutput = std::vector(doutput_sz, static_cast(1.0f)); dinput = std::vector(dinput_sz, static_cast(0.0f)); - output_host = std::vector(output_sz, static_cast(0.0f)); + output_host = std::vector(output_sz, static_cast(0.0f)); doutput_host = std::vector(doutput_sz, static_cast(0.0f)); @@ -312,18 +300,18 @@ int UnfoldDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenUnfoldForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -342,8 +330,8 @@ int UnfoldDriver::RunForwardGPU() float kernel_average_time = iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time - << " ms" << std::endl; + std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time << " ms" + << std::endl; } if(output_dev->FromGPU(GetStream(), output.data()) != 0) @@ -357,14 +345,13 @@ template int UnfoldDriver::RunForwardCPU() { mloUnFoldFwd4DRunHost(input.data(), - inputDesc, - output_host.data(), - outputDesc, - kernel_size, - stride, - padding, - dilation); - + inputDesc, + output_host.data(), + outputDesc, + kernel_size, + stride, + padding, + dilation); return miopenStatusSuccess; } @@ -402,13 +389,13 @@ int UnfoldDriver::VerifyForward() if(!std::isfinite(error_output) || error_output > tolerance) { - std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance - << std::endl; + std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance << std::endl; return EC_VerifyFwd; } else { - std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < " << tolerance << ')' << std::endl; + std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < " + << tolerance << ')' << std::endl; } return miopenStatusSuccess; } diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ac4e08b63e..45e8df42db 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6601,23 +6601,24 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * @param stride_size Size of the stride array (input) * @param padding Padding array to be added on input (input) * @param padding_size Size of the padding array (input) - * @param dilation Dilation array control the stride of the elements within the neighborhood (input) + * @param dilation Dilation array control the stride of the elements within the + * neighborhood (input) * @param dilation_size Size of the dilation array (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); // /*! 
@brief Execute an unfold forward layer // * @@ -6632,7 +6633,8 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, // * @param stride_size Size of the stride array (input) // * @param padding Padding array to be added on input (input) // * @param padding_size Size of the padding array (input) -// * @param dilation Dilation array control the stride of the elements within the neighborhood (input) +// * @param dilation Dilation array control the stride of the elements within the +// neighborhood (input) // * @param dilation_size Size of the dilation array (input) // * @return miopenStatus_t // */ @@ -6650,7 +6652,7 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, // const int32_t* dilation, // const int dilation_size); - /** @} */ +/** @} */ // CLOSEOUT FOLD DOXYGEN GROUP #endif diff --git a/src/fold.cpp b/src/fold.cpp index 97dee5a3e2..d545c01964 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -38,37 +38,45 @@ namespace miopen { miopenStatus_t UnfoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { - const auto problem = - fold::UnfoldFwdProblemDescription{inputDesc, outputDesc, kernel_size, kernel_size_size, stride, stride_size, padding, padding_size, dilation, dilation_size}; + const auto problem = fold::UnfoldFwdProblemDescription{inputDesc, + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; const auto invoke_params = [&]() { - auto tmp = fold::InvokeParams{}; - tmp.type = InvokeType::Run; - tmp.inputDesc = &inputDesc; + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; tmp.outputDesc = &outputDesc; - tmp.input = input; - tmp.output = output; - tmp.kernel_size = kernel_size; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; tmp.stride = stride; - tmp.padding = padding; - tmp.dilation = dilation; + tmp.padding = padding; + tmp.dilation = dilation; tmp.kernel_size_size = kernel_size_size; - tmp.stride_size = stride_size; - tmp.padding_size = padding_size; - tmp.dilation_size = dilation_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; return tmp; }(); diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index d65ebd020b..9a0c6ec921 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -51,10 +51,10 @@ namespace fold { NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = inputDesc.GetType(); - auto output_dtype = outputDesc.GetType(); - auto size = inputDesc.GetElementSize(); - auto in_dims = inputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); std::ostringstream ss; @@ -62,14 
+62,15 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const ss << "i_dtype" << input_dtype; ss << "o_dtype" << output_dtype; ss << "size" << size; - ss << "in_dims" ; - for (auto val : in_dims) { + ss << "in_dims"; + for(auto val : in_dims) + { ss << "_" << val; } - ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; - ss << "stride_" << stride[0] << "_" << stride[1]; - ss << "padding_" << padding[0] << "_" << padding[1]; - ss << "dilation_" << dilation[0] << "_" << dilation[1]; + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; return NetworkConfig{ss.str()}; } diff --git a/src/fold_api.cpp b/src/fold_api.cpp index 6c02dea728..1e6c97ef83 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -32,32 +32,32 @@ #include extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::UnfoldForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index f536f22ce8..33e879eb0a 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -34,17 +34,17 @@ struct Handle; struct TensorDescriptor; miopenStatus_t UnfoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index 5bcaf6faf0..318e312206 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -39,20 +39,20 @@ struct InvokeParams : public miopen::InvokeParams { InvokeParams() = default; - const TensorDescriptor* 
inputDesc = nullptr; - const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; - ConstData_t input = nullptr; - Data_t output = nullptr; + ConstData_t input = nullptr; + Data_t output = nullptr; const int32_t* kernel_size = nullptr; - const int32_t* stride = nullptr; - const int32_t* padding = nullptr; - const int32_t* dilation = nullptr; - int kernel_size_size = 0; - int stride_size = 0; - int padding_size = 0; - int dilation_size = 0; + const int32_t* stride = nullptr; + const int32_t* padding = nullptr; + const int32_t* dilation = nullptr; + int kernel_size_size = 0; + int stride_size = 0; + int padding_size = 0; + int dilation_size = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 5dccce8782..938abe6dae 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -74,7 +74,8 @@ bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); // // { // // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG // // MIOPEN_THROW(miopenStatusBadParm, -// // "Instance Norm: The input tensor dimension should be in range [2, 5]."); +// // "Instance Norm: The input tensor dimension should be in range [2, +// 5]."); // // #else // // return false; // // #endif @@ -114,31 +115,32 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase const int dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), - kernel_size(kernel_size_), - kernel_size_size(kernel_size_size_), - stride(stride_), - stride_size(stride_size_), - padding(padding_), - padding_size(padding_size_), - dilation(dilation_), - dilation_size(dilation_size_) + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) { // IsValidSize(); } -// bool IsValidSize() const -// { -// if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) -// { -// #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG -// MIOPEN_THROW(miopenStatusBadParm, -// "Instance Norm: The input tensor dimension should be in range [2, 5]."); -// #else -// return false; -// #endif -// } -// return true; -// } + // bool IsValidSize() const + // { + // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) + // { + // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + // MIOPEN_THROW(miopenStatusBadParm, + // "Instance Norm: The input tensor dimension should be in range [2, + // 5]."); + // #else + // return false; + // #endif + // } + // return true; + // } const TensorDescriptor& GetInputDesc() const { return inputDesc; } const TensorDescriptor& GetOutputDesc() const { return outputDesc; } diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp index 743a3b6194..0d2cbe282f 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -59,13 +59,12 @@ struct UnfoldFwd final : UnfoldFwdSolverBase { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable( - const ExecutionContext& context, - const miopen::fold::UnfoldFwdProblemDescription& problem) const override; + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; - ConvSolution GetSolution( 
- const ExecutionContext& context, - const miopen::fold::UnfoldFwdProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; }; } // namespace fold diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 1135797401..5c39a82e2c 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -54,174 +54,185 @@ __device__ void unfoldForward4D(const TIO* input, tensor_view_t<4> input_tv, tensor_view_t<3> output_tv) { - /* - * input = {N, C, H, W}, output = {N, C * P, L} - * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for - * formula) - * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} - */ + /* + * input = {N, C, H, W}, output = {N, C * P, L} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ const int gid = threadIdx.x + blockIdx.x * blockDim.x; int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; int n = nc / C, c = nc % C; - if (n >= N) return; + if(n >= N) + return; - - int lh = l / LW, lw = l % LW; // sliding window position - int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel int h = lh * stride_h - padding_h + ph * dilation_h; int w = lw * stride_w - padding_w + pw * dilation_w; TIO x = 0; - if (0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + + input_tv.stride[0] * n; x = input[input_idx]; } - long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - output[output_idx] = x; + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; } extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, - IN_OUT_TYPE* output, - int N, - int C, - int H, - int W, - int P, - int L, - int LH, - int LW, - int kernel_size_h, - int kernel_size_w, - int stride_h, - int stride_w, - int padding_h, - int padding_w, - int dilation_h, - int dilation_w, - tensor_view_t<4> input_tv, - tensor_view_t<3> output_tv) + IN_OUT_TYPE* output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) { - unfoldForward4D( input, - output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + unfoldForward4D(input, + output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); } template __device__ void unfoldBackward4D(const TIO* output_grad, - TIO* input_grad, - int N, - int C, - int H, - int W, - int P, - int L, - int LH, - int LW, - int kernel_size_h, - int kernel_size_w, - int stride_h, - int 
stride_w, - int padding_h, - int padding_w, - int dilation_h, - int dilation_w, - tensor_view_t<3> output_grad_tv, - tensor_view_t<4> input_grad_tv) + TIO* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) { - /* - * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} - * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for - * formula) - * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} - */ + /* + * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ const int gid = threadIdx.x + blockIdx.x * blockDim.x; int nch = gid / W, w = gid % W; int nc = nch / H, h = nch % H; int n = nc / C, c = nc % C; - if (n >= N) return; + if(n >= N) + return; FLOAT_ACCUM sum = 0.0f; - for (int ph = 0; ph < kernel_size_h; ++ph) + for(int ph = 0; ph < kernel_size_h; ++ph) { - for (int pw = 0; pw < kernel_size_w; ++pw) + for(int pw = 0; pw < kernel_size_w; ++pw) { int lhsh = h - ph * dilation_h + padding_h; int lwsw = w - pw * dilation_w + padding_w; - if (lhsh % stride_h != 0) continue; - if (lwsw % stride_w != 0) continue; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; int lh = lhsh / stride_h; int lw = lwsw / stride_w; - if (lh < 0 || LH <= lh) continue; - if (lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + output_grad_tv.stride[0] * n; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; sum += CVT_FLOAT2ACCUM(output_grad[output_grad_idx]); } } - long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); } extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, - IN_OUT_TYPE* input_grad, - int N, - int C, - int H, - int W, - int P, - int L, - int LH, - int LW, - int kernel_size_h, - int kernel_size_w, - int stride_h, - int stride_w, - int padding_h, - int padding_w, - int dilation_h, - int dilation_w, - tensor_view_t<3> output_grad_tv, - tensor_view_t<4> input_grad_tv) + IN_OUT_TYPE* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) { unfoldBackward4D(output_grad, - input_grad, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - output_grad_tv, - input_grad_tv); + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + 
padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); } diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index d3e44c0d33..67528b00b7 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -42,29 +42,27 @@ namespace solver { namespace fold { -bool FoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, - const miopen::fold::FoldFwdProblemDescription& problem) const +bool FoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::FoldFwdProblemDescription& problem) const { return true; } -ConvSolution FoldFwd::GetSolution( - [[maybe_unused]] const ExecutionContext& context, - const miopen::fold::FoldFwdProblemDescription& problem) const +ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const { std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; - auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); - auto dtype = problem.GetOutputDesc().GetType(); - auto input_dims = problem.GetInputDesc().GetLengths(); + auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + auto input_dims = problem.GetInputDesc().GetLengths(); auto output_dims = problem.GetOutputDesc().GetLengths(); - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); - int32_t H = static_cast(output_dims[2]); - int32_t W = static_cast(output_dims[3]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t H = static_cast(output_dims[2]); + int32_t W = static_cast(output_dims[3]); { auto kernel = KernelInfo{}; @@ -102,20 +100,21 @@ ConvSolution FoldFwd::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); - auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - auto input_dims = deref(params.inputDesc).GetLengths(); - auto output_dims = deref(params.outputDesc).GetLengths(); + auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + auto input_dims = deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); int spatial_dim_size = output_dims.size() - 2; - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= params.kernel_size[i]; int32_t l = (output_dims[i + 2] + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / params.stride[i] + 0; L *= l; @@ -124,37 +123,37 @@ ConvSolution FoldFwd::GetSolution( int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; - int32_t stride_h = params.stride[0]; - int32_t stride_w = params.stride[1]; - int32_t padding_h = params.padding[0]; - int32_t padding_w = params.padding[1]; - int32_t dilation_h = params.dilation[0]; - int32_t dilation_w = params.dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - 
int32_t H = static_cast(output_dims[2]); - int32_t W = static_cast(output_dims[3]); + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(output_dims[2]); + int32_t W = static_cast(output_dims[3]); kernel(params.input, - params.output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); }; }; diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 68f8072e74..b866b5d167 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -42,34 +42,33 @@ namespace solver { namespace fold { -bool UnfoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, - const miopen::fold::UnfoldFwdProblemDescription& problem) const +bool UnfoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::UnfoldFwdProblemDescription& problem) const { return true; } -ConvSolution UnfoldFwd::GetSolution( - [[maybe_unused]] const ExecutionContext& context, - const miopen::fold::UnfoldFwdProblemDescription& problem) const +ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const { std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; - auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); - auto dtype = problem.GetOutputDesc().GetType(); - auto input_dims = problem.GetInputDesc().GetLengths(); + auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + auto input_dims = problem.GetInputDesc().GetLengths(); - auto output_dims = problem.GetOutputDesc().GetLengths(); - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); + auto output_dims = problem.GetOutputDesc().GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); int spatial_dim_size = input_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= problem.kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * problem.padding[i] - - problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / problem.stride[i] + 1; L *= l; @@ -112,20 +111,21 @@ ConvSolution UnfoldFwd::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); - auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); - auto input_dims = deref(params.inputDesc).GetLengths(); - auto output_dims = deref(params.outputDesc).GetLengths(); + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); + auto input_dims = 
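// Shape sanity check for the forward direction: input {N, C, H, W} unfolds to
// {N, C * P, L}. E.g. N = 2, C = 3, H = W = 4 with a 2x2 kernel, stride 1, padding 0,
// dilation 1 gives P = 4 and LH = LW = 3, so the output is {2, 12, 9}, matching what
// torch.nn.Unfold produces for the same arguments.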
deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); int spatial_dim_size = input_dims.size() - 2; - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= params.kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / params.stride[i] + 1; L *= l; @@ -134,37 +134,37 @@ ConvSolution UnfoldFwd::GetSolution( int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; - int32_t stride_h = params.stride[0]; - int32_t stride_w = params.stride[1]; - int32_t padding_h = params.padding[0]; - int32_t padding_w = params.padding[1]; - int32_t dilation_h = params.dilation[0]; - int32_t dilation_w = params.dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); kernel(params.input, - params.output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); }; }; diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 0e9588e000..030d5722d2 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -35,19 +35,19 @@ template void cpu_unfold_fwd_4d(tensor input_tensor, - tensor& ref_output_tensor, - const std::vector kernel_size, - const std::vector stride, - const std::vector padding, - const std::vector dilation) + tensor& ref_output_tensor, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) { - auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); - auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); - auto input_size = input_tensor.desc.GetSize(); - auto input_dims = input_tensor.desc.GetLengths(); + auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); + auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); + auto input_size = input_tensor.desc.GetSize(); + auto input_dims = input_tensor.desc.GetLengths(); - auto input = input_tensor.data.data(); - auto output = ref_output_tensor.data.data(); + auto input = input_tensor.data.data(); + auto output = ref_output_tensor.data.data(); const int LOCAL_SIZE = 256; int spatial_dim_size = input_size - 2; @@ -57,10 +57,11 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * 
padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; @@ -69,36 +70,40 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { - int ncp = gid / L, l = gid % L; - int nc = ncp / P, p = ncp % P; - int n = nc / C, c = nc % C; - if (n >= N) return; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; - int lh = l / LW, lw = l % LW; // sliding window position - int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel - int h = lh * stride_h - padding_h + ph * dilation_h; - int w = lw * stride_w - padding_w + pw * dilation_w; + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; - T x = static_cast(0.0f); - if (0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; - x = input[input_idx]; - } + T x = static_cast(0.0f); + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } - long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - output[output_idx] = x; + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; }); } #endif diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index eee1e79fef..8900ea4827 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -51,15 +51,19 @@ struct UnfoldTestCase bool isContiguous = true; friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc) { - os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H - << " W:" << tc.W << " kernel_size:"; - for (const auto& ks : tc.kernelSize) os << ks << " "; + os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W + << " kernel_size:"; + for(const auto& ks : tc.kernelSize) + os << ks << " "; os << "stride:"; - for (const auto& s : tc.stride) os << s << " "; + for(const auto& s : tc.stride) + os << s << " "; os << "padding:"; - for (const auto& p : tc.padding) os << p << " "; + for(const auto& p : tc.padding) + os << p << " "; os << "dilation:"; - for (const auto& d : tc.dilation) os << d << " "; + for(const auto& d : tc.dilation) 
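// Worked example of the forward gather in cpu_unfold_fwd_4d above: output column
// l = lh * LW + lw and row c * P + (ph * kernel_size_w + pw) receive input pixel
// (h, w) = (lh * stride_h - padding_h + ph * dilation_h,
//           lw * stride_w - padding_w + pw * dilation_w); pixels that fall outside
// [0, H) x [0, W) (possible once padding > 0) are written as zero, which is why x is
// initialised to 0 before the bounds check.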
+ os << d << " "; os << "isContiguous:" << std::boolalpha << tc.isContiguous; return os; } @@ -128,8 +132,8 @@ struct UnfoldFwdTest : public ::testing::TestWithParam auto&& handle = get_handle(); config = GetParam(); - std::vector in_dims = config.GetInput(); - std::vector in_strides = config.ComputeStrides(in_dims); + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; auto gen_one = [&](auto...) { return 1; }; @@ -137,29 +141,29 @@ struct UnfoldFwdTest : public ::testing::TestWithParam input = tensor{in_dims, in_strides}.generate(gen_value); int spatial_dim_size = in_dims.size() - 2; - const int32_t N = static_cast(in_dims[0]); - const int32_t C = static_cast(in_dims[1]); + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= config.kernelSize[i]; int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - - config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / config.stride[i] + 1; L *= l; ls.push_back(l); } - std::vector out_dims{static_cast(N), - static_cast(C * P), - static_cast(L)}; + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; output = tensor{out_dims}.generate(gen_zero); outputHost = tensor{out_dims}.generate(gen_zero); - input_dev = handle.Write(input.data); - output_dev = handle.Write(output.data); + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); } void RunTest() @@ -168,25 +172,21 @@ struct UnfoldFwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::UnfoldForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); - - cpu_unfold_fwd_4d(input, - outputHost, - config.kernelSize, - config.stride, - config.padding, - config.dilation); + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_fwd_4d( + input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation); EXPECT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -201,10 +201,9 @@ struct UnfoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
if(std::is_same::value) tolerance *= 8.0; - auto error_output = miopen::rms_range(outputHost, output); - EXPECT_TRUE(error_output < tolerance) - << "Error forward output beyond tolerance Error: {" << error_output - << "}, Tolerance: " << tolerance; + auto error_output = miopen::rms_range(outputHost, output); + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" + << error_output << "}, Tolerance: " << tolerance; } UnfoldTestCase config; From 0f15ed504f7c49570ec0b408909088389a3e380b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 4 Jul 2024 03:24:18 +0000 Subject: [PATCH 03/46] githook format --- driver/unfold_driver.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index cfa25d3a85..1c7fb75bbf 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -134,13 +134,13 @@ int UnfoldDriver::GetandSetData() { std::vector input_length = GetTensorLengthsFromCmdLine(); - kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); - stride = GetVectorInt32tFromCmdLine("stride"); - padding = GetVectorInt32tFromCmdLine("padding"); - dilation = GetVectorInt32tFromCmdLine("dilation"); + kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); int spatial_dim_size = input_length.size() - 2; - const int N = input_length[0]; - const int C = input_length[1]; + const int N = input_length[0]; + const int C = input_length[1]; int P = 1, L = 1; std::vector ls; From 9b49b9dc66d2f32380af7db770ebe8dd7030ba73 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 4 Jul 2024 08:36:03 +0000 Subject: [PATCH 04/46] unfold backward driver and gtest --- driver/mloUnfoldHost.hpp | 79 ++++++++- driver/unfold_driver.hpp | 75 +++++++- include/miopen/miopen.h | 62 +++---- src/CMakeLists.txt | 1 + src/fold.cpp | 50 ++++++ src/fold/problem_description.cpp | 26 +++ src/fold_api.cpp | 31 ++++ src/include/miopen/fold.hpp | 14 ++ src/include/miopen/fold/invoke_params.hpp | 6 +- .../miopen/fold/problem_description.hpp | 118 ++++++------- src/include/miopen/fold/solvers.hpp | 15 ++ src/solver.cpp | 1 + src/solver/fold/unfold_backward.cpp | 167 ++++++++++++++++++ test/cpu_fold.hpp | 86 +++++++++ test/gtest/fold.cpp | 63 +++++++ test/gtest/fold.hpp | 94 ++++++++++ 16 files changed, 790 insertions(+), 98 deletions(-) create mode 100644 src/solver/fold/unfold_backward.cpp diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 7941eb63c4..466217feba 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -106,14 +106,85 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, } template -int32_t mloUnFoldBwd4DRunHost(Tgpu* input, - const miopenTensorDescriptor_t inputDesc, - Tcheck* ref_output, - const miopenTensorDescriptor_t ref_outputDesc, +int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, + const miopenTensorDescriptor_t dinputDesc, + Tgpu* doutput, + const miopenTensorDescriptor_t doutputDesc, const std::vector kernel_size, const std::vector stride, const std::vector padding, const std::vector dilation) { + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(dinputDesc)); + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(doutputDesc)); + auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); + auto input_size = miopen::deref(dinputDesc).GetSize(); + + const int LOCAL_SIZE = 256; + int 
spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + [[maybe_unused]] int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + float sum = 0.0f; + + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; + sum += static_cast(doutput[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + ref_dinput[input_grad_idx] = static_cast(sum); + }); + return miopenStatusSuccess; } diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index 1c7fb75bbf..57f92b3423 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -109,7 +109,7 @@ class UnfoldDriver : public Driver std::vector output_host; - std::vector doutput_host; + std::vector dinput_host; std::vector kernel_size; std::vector stride; @@ -156,8 +156,8 @@ int UnfoldDriver::GetandSetData() std::vector output_length = {N, (C * P), L}; SetTensorNd(inputDesc, input_length, data_type); SetTensorNd(outputDesc, output_length, data_type); - SetTensorNd(doutputDesc, output_length, data_type); SetTensorNd(dinputDesc, input_length, data_type); + SetTensorNd(doutputDesc, output_length, data_type); return miopenStatusSuccess; } @@ -267,7 +267,7 @@ int UnfoldDriver::AllocateBuffersAndCopy() output_host = std::vector(output_sz, static_cast(0.0f)); - doutput_host = std::vector(doutput_sz, static_cast(0.0f)); + dinput_host = std::vector(dinput_sz, static_cast(0.0f)); int status; @@ -358,12 +358,67 @@ int UnfoldDriver::RunForwardCPU() template int UnfoldDriver::RunBackwardGPU() { + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenUnfoldBackward(GetHandle(), + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + 
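// Timing note: the loop accumulates per-iteration kernel time and keeps the first
// iteration separately so that the reported average,
//     kernel_average_time = (kernel_total_time - kernel_first_time) / (iter - 1),
// excludes the warm-up run (kernel compilation and first-touch costs) whenever
// iter > 1, matching the pattern used elsewhere in MIOpenDriver.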
kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Unfold Backward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Unfold Backward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusSuccess; } template int UnfoldDriver::RunBackwardCPU() { + mloUnFoldBwd4DRunHost(dinput_host.data(), + inputDesc, + doutput.data(), + doutputDesc, + kernel_size, + stride, + padding, + dilation); return miopenStatusSuccess; } @@ -403,6 +458,20 @@ int UnfoldDriver::VerifyForward() template int UnfoldDriver::VerifyBackward() { + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Backward Unfold Verifies OK on CPU reference ({" << error_dinput << "} < " + << tolerance << ')' << std::endl; + } return miopenStatusSuccess; } diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 45e8df42db..9fae26ed6e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6620,37 +6620,37 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const int32_t* dilation, const int dilation_size); -// /*! @brief Execute an unfold forward layer -// * -// * @param handle MIOpen handle (input) -// * @param inputDesc Tensor descriptor for data input tensor input (input) -// * @param input Data tensor input (input) -// * @param outputDesc Tensor descriptor for data output tensor output (output) -// * @param output Data tensor output (output) -// * @param kernel_size Size of the sliding box array (input) -// * @param kernel_size_size Size of the kernel_size array (input) -// * @param stride Stride array of the sliding box (input) -// * @param stride_size Size of the stride array (input) -// * @param padding Padding array to be added on input (input) -// * @param padding_size Size of the padding array (input) -// * @param dilation Dilation array control the stride of the elements within the -// neighborhood (input) -// * @param dilation_size Size of the dilation array (input) -// * @return miopenStatus_t -// */ -// MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, -// const miopenTensorDescriptor_t inputDesc, -// const void* input, -// const miopenTensorDescriptor_t outputDesc, -// void* output, -// const int32_t* kernel_size, -// const int kernel_size_size, -// const int32_t* stride, -// const int stride_size, -// const int32_t* padding, -// const int padding_size, -// const int32_t* dilation, -// const int dilation_size); +/*! 
@brief Execute an unfold backward layer + * + * @param handle MIOpen handle (input) + * @param dinputDesc Tensor descriptor for data input grad tensor (output) + * @param dinput Data tensor input grad (output) + * @param doutputDesc Tensor descriptor for data output grad tensor (input) + * @param doutput Data tensor output grad (input) + * @param kernel_size Size of the sliding box array (input) + * @param kernel_size_size Size of the kernel_size array (input) + * @param stride Stride array of the sliding box (input) + * @param stride_size Size of the stride array (input) + * @param padding Padding array to be added on input (input) + * @param padding_size Size of the padding array (input) + * @param dilation Dilation array control the stride of the elements within the + neighborhood (input) + * @param dilation_size Size of the dilation array (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); /** @} */ // CLOSEOUT FOLD DOXYGEN GROUP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 34153587d3..ae2965b07c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -259,6 +259,7 @@ set( MIOpen_Source solver/conv_winoRxS_fused.cpp solver/fft.cpp solver/fold/unfold_forward.cpp + solver/fold/unfold_backward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp diff --git a/src/fold.cpp b/src/fold.cpp index d545c01964..1117cdc642 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -87,4 +87,54 @@ miopenStatus_t UnfoldForward(Handle& handle, return miopenStatusSuccess; } +miopenStatus_t UnfoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"UnfoldBwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + } // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index 9a0c6ec921..d0ecf629e9 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -75,6 +75,32 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const return NetworkConfig{ss.str()}; } +NetworkConfig 
UnfoldBwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = dinputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Unfold_bwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_grad_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + } // namespace fold } // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp index 1e6c97ef83..cb50b194ea 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -61,3 +61,34 @@ extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, dilation_size); }); } + +extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::UnfoldBackward(miopen::deref(handle), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 33e879eb0a..7bb0cc946a 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -46,5 +46,19 @@ miopenStatus_t UnfoldForward(Handle& handle, const int padding_size, const int32_t* dilation, const int dilation_size); + +miopenStatus_t UnfoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index 318e312206..b256680d8e 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -41,10 +41,14 @@ struct InvokeParams : public miopen::InvokeParams const TensorDescriptor* inputDesc = nullptr; const TensorDescriptor* outputDesc = nullptr; - ConstData_t input = nullptr; Data_t output = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* doutputDesc = nullptr; + Data_t dinput = nullptr; + ConstData_t doutput = nullptr; + const int32_t* kernel_size = nullptr; const int32_t* stride = nullptr; const int32_t* padding = nullptr; diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 938abe6dae..9e4e5b427f 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -42,65 +42,6 @@ namespace fold { bool checkSameLength(const TensorDescriptor& x, const 
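// For illustration, MakeNetworkConfig serialises every field that selects or shapes
// the generated kernel into the find-db cache key. Assuming a float backward problem
// with dinput {2, 3, 4, 4}, a 2x2 kernel, stride 1, padding 0 and dilation 1, the key
// would look roughly like
//     Unfold_bwdi_dtype1o_dtype1size96in_grad_dims_2_3_4_4kernel_size_2_2stride_1_1padding_0_0dilation_1_1
// (exact dtype rendering aside); two problems that differ in any of these fields can
// then never share a cached solution.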
TensorDescriptor& y); -// struct FoldFwdProblemDescription : ProblemDescriptionBase -// { -// FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, -// const TensorDescriptor& outputDesc_, -// const int32_t* kernel_size_, -// const int kernel_size_size_, -// const int32_t* stride_, -// const int stride_size_, -// const int32_t* padding_, -// const int padding_size_, -// const int32_t* dilation_, -// const int dilation_size_) -// : inputDesc(inputDesc_), -// outputDesc(outputDesc_), -// kernel_size(kernel_size_), -// kernel_size_size(kernel_size_size_), -// stride(stride_), -// stride_size(stride_size_), -// padding(padding_), -// padding_size(padding_size_), -// dilation(dilation_), -// dilation_size(dilation_size_) -// { -// // IsValidSize(); -// } - -// // bool IsValidSize() const -// // { -// // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) -// // { -// // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG -// // MIOPEN_THROW(miopenStatusBadParm, -// // "Instance Norm: The input tensor dimension should be in range [2, -// 5]."); -// // #else -// // return false; -// // #endif -// // } -// // return true; -// // } - -// const TensorDescriptor& GetInputDesc() const { return inputDesc; } -// const TensorDescriptor& GetOutputDesc() const { return outputDesc; } - -// NetworkConfig MakeNetworkConfig() const override; - -// public: -// TensorDescriptor inputDesc; -// TensorDescriptor outputDesc; -// const int32_t* kernel_size; -// const int kernel_size_size; -// const int32_t* stride; -// const int stride_size; -// const int32_t* padding; -// const int padding_size; -// const int32_t* dilation; -// const int dilation_size; -// }; - struct UnfoldFwdProblemDescription : ProblemDescriptionBase { UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_, @@ -160,6 +101,65 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase const int dilation_size; }; +struct UnfoldBwdProblemDescription : ProblemDescriptionBase +{ + UnfoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : dinputDesc(dinputDesc_), + doutputDesc(doutputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + // IsValidSize(); + } + + // bool IsValidSize() const + // { + // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) + // { + // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + // MIOPEN_THROW(miopenStatusBadParm, + // "Instance Norm: The input tensor dimension should be in range [2, + // 5]."); + // #else + // return false; + // #endif + // } + // return true; + // } + + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor dinputDesc; + TensorDescriptor doutputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + } // namespace fold } // namespace miopen diff --git a/src/include/miopen/fold/solvers.hpp 
b/src/include/miopen/fold/solvers.hpp index 0d2cbe282f..d463bb0251 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -67,6 +67,21 @@ struct UnfoldFwd final : UnfoldFwdSolverBase const miopen::fold::UnfoldFwdProblemDescription& problem) const override; }; +using UnfoldBwdSolverBase = + NonTunableSolverBase; + +struct UnfoldBwd final : UnfoldBwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const override; +}; + } // namespace fold } // namespace solver diff --git a/src/solver.cpp b/src/solver.cpp index 97fa4637f3..8e3d5afcb3 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -651,6 +651,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp new file mode 100644 index 0000000000..c8613d9cca --- /dev/null +++ b/src/solver/fold/unfold_backward.cpp @@ -0,0 +1,167 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
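// Structurally, UnfoldBwd follows the same non-tunable solver pattern as UnfoldFwd:
// IsApplicable gates the problem, GetSolution emits a single KernelInfo plus an
// invoker factory, and the SolverDbId registered in solver.cpp identifies it in the
// find database. Ids are assigned by ++id at registration time, which is why the
// solver.cpp hunk appends the new entry rather than inserting it mid-list.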
+ * + *******************************************************************************/ + +#include "miopen/fold/problem_description.hpp" +#include "miopen/miopen.h" +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool UnfoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::UnfoldBwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); + + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + + { + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldBackward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * H * W, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.dinputDesc)); + auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.doutputDesc)); + auto input_grad_dims = deref(params.dinputDesc).GetLengths(); + auto output_grad_dims = deref(params.doutputDesc).GetLengths(); + + int spatial_dim_size = input_grad_dims.size() - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= params.kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH 
= ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + + kernel(params.doutput, + params.dinput, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 030d5722d2..4c9427f2d4 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -106,4 +106,90 @@ void cpu_unfold_fwd_4d(tensor input_tensor, output[output_idx] = x; }); } + +template +void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, + tensor doutput_tensor, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) +{ + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); + auto input_size = ref_dinput_tensor.desc.GetSize(); + auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); + + auto input_grad = ref_dinput_tensor.data.data(); + auto output_grad = doutput_tensor.data.data(); + + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + float sum = 0.0f; + + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; + sum += static_cast(output_grad[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad[input_grad_idx] = static_cast(sum); + }); +} #endif diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index d1843ae3c8..0e9fe9ddd8 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -43,6 +43,18 @@ struct UnfoldForwardTestFloat16 : UnfoldFwdTest struct UnfoldForwardTestBFloat16 : UnfoldFwdTest { }; + +struct 
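// Design note shared by the UnfoldBackward4D kernel and cpu_unfold_bwd_4d above:
// parallelising over input-gradient elements (one thread per (n, c, h, w)) turns what
// would otherwise be an atomic scatter, with each output column updating P input
// pixels, into a race-free gather in which every input_grad element is written by
// exactly one thread.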
UnfoldBackwardTestFloat32 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +{ +}; }; // namespace fold using namespace fold; TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) @@ -95,3 +107,54 @@ TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, UnfoldForwardTestBFloat16, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat32, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestBFloat16, + testing::ValuesIn(UnfoldTestConfigs())); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 8900ea4827..150edf0a47 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -215,3 +215,97 @@ struct UnfoldFwdTest : public ::testing::TestWithParam miopen::Allocator::ManageDataPtr input_dev; miopen::Allocator::ManageDataPtr output_dev; }; + + +template +struct UnfoldBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); + + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) 
{ return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + int spatial_dim_size = in_dims.size() - 2; + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= config.kernelSize[i]; + int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; + + doutput = tensor{out_dims}.generate(gen_value); + + dinput_dev = handle.Write(dinput.data); + doutput_dev = handle.Write(doutput.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::UnfoldBackward(handle, + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_bwd_4d( + dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + auto error_dinput = miopen::rms_range(dinputHost, dinput); + EXPECT_TRUE(error_dinput < tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; + } + UnfoldTestCase config; + + tensor dinput; + tensor doutput; + + tensor dinputHost; + + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr doutput_dev; +}; From 83fba5690843a85a8533d1647736bf32e10b17fa Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 4 Jul 2024 08:36:22 +0000 Subject: [PATCH 05/46] githook format --- driver/mloUnfoldHost.hpp | 45 ++++++++++----------- driver/unfold_driver.hpp | 41 +++++++++---------- include/miopen/miopen.h | 24 ++++++------ src/fold.cpp | 32 +++++++-------- src/fold_api.cpp | 48 +++++++++++------------ src/include/miopen/fold.hpp | 24 ++++++------ src/include/miopen/fold/invoke_params.hpp | 8 ++-- src/solver/fold/unfold_backward.cpp | 14 +++---- test/cpu_fold.hpp | 11 +++--- test/gtest/fold.hpp | 36 ++++++++--------- 10 files changed, 143 insertions(+), 140 deletions(-) diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 466217feba..fcfd5f4a6b 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -118,12 +118,12 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(dinputDesc)); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(doutputDesc)); auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); - auto input_size = miopen::deref(dinputDesc).GetSize(); + auto input_size = miopen::deref(dinputDesc).GetSize(); - const int LOCAL_SIZE = 256; - int spatial_dim_size = input_size - 2; - const int32_t N = 
static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); [[maybe_unused]] int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) @@ -136,19 +136,19 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, L *= l; ls.push_back(l); } - int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_grad_dims[2]); - int32_t W = static_cast(input_grad_dims[3]); - int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int nch = gid / W, w = gid % W; int nc = nch / H, h = nch % H; @@ -174,15 +174,16 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, continue; if(lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + - output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + - output_grad_tv.stride[0] * n; + long output_grad_idx = + output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; sum += static_cast(doutput[output_grad_idx]); } } long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + - input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; ref_dinput[input_grad_idx] = static_cast(sum); }); diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index 57f92b3423..d565d192f5 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -367,18 +367,18 @@ int UnfoldDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenUnfoldBackward(GetHandle(), - dinputDesc, - dinput_dev->GetMem(), - doutputDesc, - doutput_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -412,13 +412,13 @@ template int UnfoldDriver::RunBackwardCPU() { mloUnFoldBwd4DRunHost(dinput_host.data(), - inputDesc, - doutput.data(), - doutputDesc, - kernel_size, - stride, - padding, - dilation); + inputDesc, + doutput.data(), + doutputDesc, + kernel_size, + stride, + padding, + dilation); return miopenStatusSuccess; } @@ -464,7 +464,8 @@ int UnfoldDriver::VerifyBackward() 
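// The backward check mirrors VerifyForward: the RMS distance between host and GPU
// gradients must be finite and below GetTolerance(). The std::isfinite guard matters
// because a NaN error would compare false against the tolerance and otherwise report
// success.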
if(!std::isfinite(error_dinput) || error_dinput > tolerance) { - std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; + std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance + << std::endl; return EC_VerifyFwd; } else diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 9fae26ed6e..56633f57c2 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6639,18 +6639,18 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); /** @} */ // CLOSEOUT FOLD DOXYGEN GROUP diff --git a/src/fold.cpp b/src/fold.cpp index 1117cdc642..8a028d379e 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -88,18 +88,18 @@ miopenStatus_t UnfoldForward(Handle& handle, } miopenStatus_t UnfoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, doutputDesc, @@ -115,10 +115,10 @@ miopenStatus_t UnfoldBackward(Handle& handle, const auto invoke_params = [&]() { auto tmp = fold::InvokeParams{}; tmp.type = InvokeType::Run; - tmp.dinputDesc = &dinputDesc; - tmp.doutputDesc = &doutputDesc; - tmp.dinput = dinput; - tmp.doutput = doutput; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; tmp.kernel_size = kernel_size; tmp.stride = stride; tmp.padding = padding; diff --git a/src/fold_api.cpp b/src/fold_api.cpp index cb50b194ea..ba9f2fd805 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -63,32 +63,32 @@ extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, } extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t 
doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::UnfoldBackward(miopen::deref(handle), - miopen::deref(dinputDesc), - DataCast(dinput), - miopen::deref(doutputDesc), - DataCast(doutput), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 7bb0cc946a..040bb681ea 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -48,17 +48,17 @@ miopenStatus_t UnfoldForward(Handle& handle, const int dilation_size); miopenStatus_t UnfoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index b256680d8e..da89023f17 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -41,13 +41,13 @@ struct InvokeParams : public miopen::InvokeParams const TensorDescriptor* inputDesc = nullptr; const TensorDescriptor* outputDesc = nullptr; - ConstData_t input = nullptr; - Data_t output = nullptr; + ConstData_t input = nullptr; + Data_t output = nullptr; const TensorDescriptor* dinputDesc = nullptr; const TensorDescriptor* doutputDesc = nullptr; - Data_t dinput = nullptr; - ConstData_t doutput = nullptr; + Data_t dinput = nullptr; + ConstData_t doutput = nullptr; const int32_t* kernel_size = nullptr; const int32_t* stride = nullptr; diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index c8613d9cca..249f08592c 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -54,15 +54,15 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; - auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); - auto dtype = problem.GetDoutputDesc().GetType(); - auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); - const int32_t N = static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); - int32_t H = 
static_cast(input_grad_dims[2]); - int32_t W = static_cast(input_grad_dims[3]); + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); { auto kernel = KernelInfo{}; diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 4c9427f2d4..46f7552083 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -117,7 +117,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, { auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); - auto input_size = ref_dinput_tensor.desc.GetSize(); + auto input_size = ref_dinput_tensor.desc.GetSize(); auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); auto input_grad = ref_dinput_tensor.data.data(); @@ -180,15 +180,16 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, continue; if(lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + - output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + - output_grad_tv.stride[0] * n; + long output_grad_idx = + output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; sum += static_cast(output_grad[output_grad_idx]); } } long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + - input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; input_grad[input_grad_idx] = static_cast(sum); }); } diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 150edf0a47..f15c5b6a5f 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -216,7 +216,6 @@ struct UnfoldFwdTest : public ::testing::TestWithParam miopen::Allocator::ManageDataPtr output_dev; }; - template struct UnfoldBwdTest : public ::testing::TestWithParam { @@ -232,8 +231,8 @@ struct UnfoldBwdTest : public ::testing::TestWithParam auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) 
{ return 0; }; - dinput = tensor{in_dims, in_strides}.generate(gen_zero); - dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); @@ -254,7 +253,7 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector out_dims{ static_cast(N), static_cast(C * P), static_cast(L)}; - doutput = tensor{out_dims}.generate(gen_value); + doutput = tensor{out_dims}.generate(gen_value); dinput_dev = handle.Write(dinput.data); doutput_dev = handle.Write(doutput.data); @@ -266,18 +265,18 @@ struct UnfoldBwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::UnfoldBackward(handle, - dinput.desc, - dinput_dev.get(), - doutput.desc, - doutput_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); cpu_unfold_bwd_4d( dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); @@ -296,8 +295,9 @@ struct UnfoldBwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_dinput = miopen::rms_range(dinputHost, dinput); - EXPECT_TRUE(error_dinput < tolerance) << "Error backward input_grad beyond tolerance Error: {" - << error_dinput << "}, Tolerance: " << tolerance; + EXPECT_TRUE(error_dinput < tolerance) + << "Error backward input_grad beyond tolerance Error: {" << error_dinput + << "}, Tolerance: " << tolerance; } UnfoldTestCase config; From 101794f07ed0d67386e391cc5b5a073dd2ddc330 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 10 Jul 2024 07:43:30 +0000 Subject: [PATCH 06/46] Add foldfwd, foldbwd, problem_description verification, gtest and driver --- driver/CMakeLists.txt | 1 + driver/dm_fold.cpp | 39 ++ driver/driver.hpp | 4 +- driver/fold_driver.hpp | 474 ++++++++++++++++++ include/miopen/miopen.h | 64 +++ src/CMakeLists.txt | 2 + src/fold.cpp | 100 ++++ src/fold/problem_description.cpp | 80 ++- src/fold_api.cpp | 62 +++ src/include/miopen/fold.hpp | 28 ++ .../miopen/fold/problem_description.hpp | 395 +++++++++++++-- src/include/miopen/fold/solvers.hpp | 46 +- src/solver.cpp | 3 +- src/solver/fold/fold_backward.cpp | 178 +++++++ src/solver/fold/fold_forward.cpp | 4 +- test/cpu_fold.hpp | 6 +- test/gtest/fold.cpp | 60 +-- test/gtest/fold.hpp | 90 ++-- test/gtest/unfold.cpp | 160 ++++++ test/gtest/unfold.hpp | 311 ++++++++++++ 20 files changed, 1957 insertions(+), 150 deletions(-) create mode 100644 src/solver/fold/fold_backward.cpp create mode 100644 test/gtest/unfold.cpp create mode 100644 test/gtest/unfold.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 8ca4ccd5c1..c115cf435f 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -52,6 +52,7 @@ add_executable(MIOpenDriver dm_sum.cpp dm_tensorop.cpp dm_unfold.cpp + dm_fold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/dm_fold.cpp b/driver/dm_fold.cpp 
b/driver/dm_fold.cpp
index e69de29bb2..d7a8e2cb9a 100644
--- a/driver/dm_fold.cpp
+++ b/driver/dm_fold.cpp
@@ -0,0 +1,39 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "registry_driver_maker.hpp"
+#include "fold_driver.hpp"
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "fold")
+        return new FoldDriver<float, float>();
+    if(base_arg == "foldfp16")
+        return new FoldDriver<float16, float>();
+    if(base_arg == "foldbfp16")
+        return new FoldDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index a7396d272f..f26d7053f3 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
         "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
-        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16]\n");
+        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -177,7 +177,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
        arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
        arg != "catfp16" && arg != "catbfp16" && arg != "unfold" && arg != "unfoldfp16" &&
-       arg != "unfoldbfp16" && arg != "--version")
+       arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp
index e69de29bb2..117538452e 100644
--- a/driver/fold_driver.hpp
+++ b/driver/fold_driver.hpp
@@ -0,0 +1,474 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_FOLD_DRIVER_HPP
+#define GUARD_MIOPEN_FOLD_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloUnfoldHost.hpp"
+#include "random.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "util_driver.hpp"
+
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include
+#include
+#include
+#include
+#include
+
+template <typename Tgpu, typename Tref>
+class FoldDriver : public Driver
+{
+public:
+    FoldDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&dinputDesc);
+        miopenCreateTensorDescriptor(&doutputDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+    std::vector<int> GetTensorLengthsFromCmdLine();
+    std::vector<int32_t> GetVectorInt32tFromCmdLine(std::string long_name);
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~FoldDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(dinputDesc);
+        miopenDestroyTensorDescriptor(doutputDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+
+    miopenTensorDescriptor_t doutputDesc;
+    miopenTensorDescriptor_t dinputDesc;
+
+    std::unique_ptr<GPUMem> input_dev;
+    std::unique_ptr<GPUMem> output_dev;
+
+    std::unique_ptr<GPUMem> doutput_dev;
+    std::unique_ptr<GPUMem> dinput_dev;
+
+    std::vector<Tgpu> input;
+    std::vector<Tgpu> output;
+
+    std::vector<Tgpu> doutput;
+    std::vector<Tgpu> dinput;
+
+    std::vector<Tref> output_host;
+
+    std::vector<Tref> dinput_host;
+
+    std::vector<int32_t> output_size;
+    std::vector<int32_t> kernel_size;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+};
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::GetandSetData()
+{
+    std::vector<int> input_length = GetTensorLengthsFromCmdLine();
+
+    output_size = GetVectorInt32tFromCmdLine("outputSize");
+    kernel_size = GetVectorInt32tFromCmdLine("kernelSize");
+    stride      = GetVectorInt32tFromCmdLine("stride");
+    padding     = GetVectorInt32tFromCmdLine("padding");
+    dilation    = GetVectorInt32tFromCmdLine("dilation");
+    const int N = input_length[0];
+    int C       = input_length[1];
+    for(int32_t i : kernel_size)
+    {
+        C = C / i;
+    }
+
+    std::vector<int> output_length = {N, C, output_size[0], output_size[1]};
+    SetTensorNd(inputDesc, input_length, data_type);
+    SetTensorNd(outputDesc, output_length, data_type);
+    SetTensorNd(dinputDesc, input_length, data_type);
+    SetTensorNd(doutputDesc, output_length, data_type);
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag(
+        "forw", 'F', "1", "Run Fold Forward (Default=1) or both Forward and Backward (0)", "int");
+    inflags.AddInputFlag(
+        "DimLengths", 'D', "3,12,12", "The dimensional lengths of the input tensor", "string");
+    inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=4,5)", "str");
+    inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,2)", "str");
+    inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str");
+    inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str");
+    inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int> FoldDriver<Tgpu, Tref>::GetTensorLengthsFromCmdLine()
+{
+    std::string lengthsStr = inflags.GetValueStr("DimLengths");
+
+    std::vector<int> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(len);
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(len);
+
+    return (lengths);
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int32_t> FoldDriver<Tgpu, Tref>::GetVectorInt32tFromCmdLine(std::string long_name)
+{
+    std::string lengthsStr = inflags.GetValueStr(long_name);
+
+    std::vector<int32_t> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(static_cast<int32_t>(len));
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(static_cast<int32_t>(len));
+
+    return (lengths);
+}
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t input_sz  = GetTensorSize(inputDesc);
+    size_t output_sz = GetTensorSize(outputDesc);
+
+    size_t doutput_sz = GetTensorSize(doutputDesc);
+    size_t dinput_sz  = GetTensorSize(dinputDesc);
+
+    uint32_t ctx = 0;
+
+    input_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, input_sz, sizeof(Tgpu)));
+    output_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, output_sz, sizeof(Tgpu)));
+
+    doutput_dev = std::unique_ptr<GPUMem>(new
GPUMem(ctx, doutput_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, dinput_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0.0f)); + output = std::vector(output_sz, static_cast(0.0f)); + + doutput = std::vector(doutput_sz, static_cast(1.0f)); + dinput = std::vector(dinput_sz, static_cast(0.0f)); + + output_host = std::vector(output_sz, static_cast(0.0f)); + + dinput_host = std::vector(dinput_sz, static_cast(0.0f)); + + int status; + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + status = input_dev->ToGPU(GetStream(), input.data()); + + for(int i = 0; i < doutput_sz; i++) + { + doutput[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + } + status |= doutput_dev->ToGPU(GetStream(), doutput.data()); + status |= dinput_dev->ToGPU(GetStream(), dinput.data()); + + if(status != 0) + std::cout << "Fold Driver Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenFoldForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Fold Forward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunForwardCPU() +{ + mloUnFoldBwd4DRunHost(output_host.data(), + outputDesc, + input.data(), + inputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +int FoldDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenFoldBackward(GetHandle(), + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Fold Backward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunBackwardCPU() +{ + mloUnFoldFwd4DRunHost(doutput.data(), + doutputDesc, + dinput_host.data(), + dinputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +Tref FoldDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int FoldDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_output = miopen::rms_range(output_host, output); + + if(!std::isfinite(error_output) || error_output > tolerance) + { + std::cout << "Forward Fold FAILED: {" << error_output << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Fold Verifies OK on CPU reference ({" << error_output << "} < " + << tolerance << ')' << std::endl; + } + return miopenStatusSuccess; +} + +template +int FoldDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Backward Fold Verifies OK on CPU reference ({" << error_dinput << "} < " + << tolerance << ')' << std::endl; + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_FOLD_DRIVER_HPP diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 56633f57c2..a45ece12fe 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6588,6 +6588,70 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * * @{ */ +/*! 
@brief Execute a fold forward layer
+ *
+ * @param handle           MIOpen handle (input)
+ * @param inputDesc        Tensor descriptor for data input tensor input (input)
+ * @param input            Data tensor input (input)
+ * @param outputDesc       Tensor descriptor for data output tensor output (output)
+ * @param output           Data tensor output (output)
+ * @param kernel_size      Size of the sliding box array (input)
+ * @param kernel_size_size Size of the kernel_size array (input)
+ * @param stride           Stride array of the sliding box (input)
+ * @param stride_size      Size of the stride array (input)
+ * @param padding          Padding array to be added on input (input)
+ * @param padding_size     Size of the padding array (input)
+ * @param dilation         Dilation array controlling the stride of the elements within the
+ *                         neighborhood (input)
+ * @param dilation_size    Size of the dilation array (input)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle,
+                                               const miopenTensorDescriptor_t inputDesc,
+                                               const void* input,
+                                               const miopenTensorDescriptor_t outputDesc,
+                                               void* output,
+                                               const int32_t* kernel_size,
+                                               const int kernel_size_size,
+                                               const int32_t* stride,
+                                               const int stride_size,
+                                               const int32_t* padding,
+                                               const int padding_size,
+                                               const int32_t* dilation,
+                                               const int dilation_size);
+
+/*! @brief Execute a fold backward layer
+ *
+ * @param handle           MIOpen handle (input)
+ * @param dinputDesc       Tensor descriptor for data input grad tensor (output)
+ * @param dinput           Data tensor input grad (output)
+ * @param doutputDesc      Tensor descriptor for data output grad tensor (input)
+ * @param doutput          Data tensor output grad (input)
+ * @param kernel_size      Size of the sliding box array (input)
+ * @param kernel_size_size Size of the kernel_size array (input)
+ * @param stride           Stride array of the sliding box (input)
+ * @param stride_size      Size of the stride array (input)
+ * @param padding          Padding array to be added on input (input)
+ * @param padding_size     Size of the padding array (input)
+ * @param dilation         Dilation array controlling the stride of the elements within the
+ *                         neighborhood (input)
+ * @param dilation_size    Size of the dilation array (input)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle,
+                                                const miopenTensorDescriptor_t dinputDesc,
+                                                void* dinput,
+                                                const miopenTensorDescriptor_t doutputDesc,
+                                                const void* doutput,
+                                                const int32_t* kernel_size,
+                                                const int kernel_size_size,
+                                                const int32_t* stride,
+                                                const int stride_size,
+                                                const int32_t* padding,
+                                                const int padding_size,
+                                                const int32_t* dilation,
+                                                const int dilation_size);
+
 /*!
@brief Execute an unfold forward layer * * @param handle MIOpen handle (input) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae2965b07c..bd057795a3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -260,6 +260,8 @@ set( MIOpen_Source solver/fft.cpp solver/fold/unfold_forward.cpp solver/fold/unfold_backward.cpp + solver/fold/fold_forward.cpp + solver/fold/fold_backward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp diff --git a/src/fold.cpp b/src/fold.cpp index 8a028d379e..d2ff285af1 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -137,4 +137,104 @@ miopenStatus_t UnfoldBackward(Handle& handle, return miopenStatusSuccess; } +miopenStatus_t FoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = fold::FoldFwdProblemDescription{inputDesc, + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"FoldFwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t FoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = fold::FoldBwdProblemDescription{dinputDesc, + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"FoldBwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + } // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index d0ecf629e9..39202fd372 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -33,22 +33,6 @@ namespace miopen { namespace fold { -// NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const -// { 
-// auto input_dtype = inputDesc.GetType(); -// auto output_dtype = outputDesc.GetType(); -// auto size = inputDesc.GetElementSize(); - -// std::ostringstream ss; - -// ss << "fold_fwd"; -// ss << "i_dtype" << input_dtype; -// ss << "o_dtype" << output_dtype; -// ss << "size" << size; - -// return NetworkConfig{ss.str()}; -// } - NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = inputDesc.GetType(); @@ -101,6 +85,70 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const return NetworkConfig{ss.str()}; } +NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Fold_fwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "out_dims"; + for (auto val: out_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = dinputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Fold_bwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_grad_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "out_grad_dims"; + for (auto val: out_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + } // namespace fold } // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp index ba9f2fd805..fb22fa90b4 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -92,3 +92,65 @@ extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, dilation_size); }); } + +extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::FoldForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} + +extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + 
const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::FoldBackward(miopen::deref(handle), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 040bb681ea..d94a42ee5a 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -60,5 +60,33 @@ miopenStatus_t UnfoldBackward(Handle& handle, const int padding_size, const int32_t* dilation, const int dilation_size); + +miopenStatus_t FoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); + +miopenStatus_t FoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 9e4e5b427f..f89a90eac2 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -65,23 +65,77 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase dilation(dilation_), dilation_size(dilation_size_) { - // IsValidSize(); + IsValidSize(); + IsValidType(); } - // bool IsValidSize() const - // { - // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) - // { - // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - // MIOPEN_THROW(miopenStatusBadParm, - // "Instance Norm: The input tensor dimension should be in range [2, - // 5]."); - // #else - // return false; - // #endif - // } - // return true; - // } + bool IsValidSize() const + { + if(inputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = inputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Argument length should be 2D"); +#else + return false; +#endif + } + auto input_dims = inputDesc.GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector output_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + auto output_dims = outputDesc.GetLengths(); + if (output_dims != output_dims_desired) + { +#if 
MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Invalid output dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (inputDesc.GetType() != outputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input tensor and output tensor has mismatch type."); +#else + return false; +#endif + } + return true; + } const TensorDescriptor& GetInputDesc() const { return inputDesc; } const TensorDescriptor& GetOutputDesc() const { return outputDesc; } @@ -124,23 +178,304 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase dilation(dilation_), dilation_size(dilation_size_) { - // IsValidSize(); + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(dinputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input gradient tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = dinputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Argument length should be 2D"); +#else + return false; +#endif + } + auto input_dims = dinputDesc.GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector output_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + auto output_dims = doutputDesc.GetLengths(); + if (output_dims != output_dims_desired) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Invalid output gradient dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (dinputDesc.GetType() != doutputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); +#else + return false; +#endif + } + return true; + } + + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor dinputDesc; + TensorDescriptor doutputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + +struct FoldFwdProblemDescription : ProblemDescriptionBase +{ + FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : inputDesc(inputDesc_), + outputDesc(outputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + 
padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(outputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The output tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = outputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Argument length should be 2D"); +#else + return false; +#endif + } + auto input_dims = inputDesc.GetLengths(); + auto output_dims = outputDesc.GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector input_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + if (input_dims != input_dims_desired) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Invalid input dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (inputDesc.GetType() != outputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The input tensor and output tensor has mismatch type."); +#else + return false; +#endif + } + return true; + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + +struct FoldBwdProblemDescription : ProblemDescriptionBase +{ + FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : dinputDesc(dinputDesc_), + doutputDesc(doutputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(doutputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The output gradient tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = doutputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Argument length should be 2D"); +#else + return false; 
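// A worked instance of the placement count used throughout these validity checks
// (editorial sketch; the helper name num_blocks is hypothetical and only restates
// the formula already used above):
// l = (size + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1, integer division.
// For size = 4, padding = 0, dilation = 1, kernel = 2, stride = 1 this gives
// l = (4 + 0 - 1 - 1) / 1 + 1 = 3 window positions along that axis.
//
//     constexpr int32_t num_blocks(int32_t size, int32_t pad, int32_t dil, int32_t k, int32_t s)
//     {
//         return (size + 2 * pad - dil * (k - 1) - 1) / s + 1;
//     }
//     static_assert(num_blocks(4, 0, 1, 2, 1) == 3, "three placements along a size-4 axis");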
+#endif + } + auto input_dims = dinputDesc.GetLengths(); + auto output_dims = doutputDesc.GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector input_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + if (input_dims != input_dims_desired) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Invalid input gradient dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (dinputDesc.GetType() != doutputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The input gradient tensor and output gradient tensor has mismatch type."); +#else + return false; +#endif + } + return true; } - // bool IsValidSize() const - // { - // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) - // { - // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - // MIOPEN_THROW(miopenStatusBadParm, - // "Instance Norm: The input tensor dimension should be in range [2, - // 5]."); - // #else - // return false; - // #endif - // } - // return true; - // } const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp index d463bb0251..e92213f434 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -36,22 +36,6 @@ namespace solver { namespace fold { -// using FoldFwdSolverBase = -// NonTunableSolverBase; - -// struct FoldFwd final : FoldFwdSolverBase -// { -// const std::string& SolverDbId() const override { return GetSolverDbId(); } - -// bool IsApplicable( -// const ExecutionContext& context, -// const miopen::fold::FoldFwdProblemDescription& problem) const override; - -// ConvSolution GetSolution( -// const ExecutionContext& context, -// const miopen::fold::FoldFwdProblemDescription& problem) const override; -// }; - using UnfoldFwdSolverBase = NonTunableSolverBase; @@ -82,6 +66,36 @@ struct UnfoldBwd final : UnfoldBwdSolverBase const miopen::fold::UnfoldBwdProblemDescription& problem) const override; }; +using FoldFwdSolverBase = + NonTunableSolverBase; + +struct FoldFwd final : FoldFwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const override; +}; + +using FoldBwdSolverBase = + NonTunableSolverBase; + +struct FoldBwd final : FoldBwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; +}; + } // namespace fold } // namespace solver diff --git a/src/solver.cpp b/src/solver.cpp index 8e3d5afcb3..f47b766272 100644 --- a/src/solver.cpp +++ 
b/src/solver.cpp
@@ -649,9 +649,10 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Mha, mha::Mha{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
-    // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId());
     Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId());
     Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId());
 
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp
new file mode 100644
index 0000000000..e7696b10a3
--- /dev/null
+++ b/src/solver/fold/fold_backward.cpp
@@ -0,0 +1,178 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
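// Editorial note on the solver that follows (an observation, not patch content):
// FoldBwd::GetSolution deliberately launches the "UnfoldForward4D" kernel. Fold
// (col2im) and unfold (im2col) are adjoint linear maps, so the gradient of fold
// with respect to its input is exactly an unfold of the output gradient, and the
// existing kernel can be reused with doutput as the source and dinput as the
// destination, as the invoker below does.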
+ * + *******************************************************************************/ + +#include "miopen/fold/problem_description.hpp" +#include "miopen/miopen.h" +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool FoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::FoldBwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); + + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); + int spatial_dim_size = output_grad_dims.size() - 2; + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= problem.kernel_size[i]; + int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * problem.padding[i] - + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + { + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IN_OUT_TYPE", in_dtype == "bfloat16" ? 
"ushort" : in_dtype}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.dinputDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.doutputDesc)); + auto input_grad_dims = deref(params.dinputDesc).GetLengths(); + auto output_grad_dims = deref(params.doutputDesc).GetLengths(); + + int spatial_dim_size = output_grad_dims.size() - 2; + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= params.kernel_size[i]; + int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(output_grad_dims[2]); + int32_t W = static_cast(output_grad_dims[3]); + + kernel(params.doutput, + params.dinput, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 67528b00b7..5b8f638cb1 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -113,10 +113,10 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte for(int i = 0; i < spatial_dim_size; ++i) { P *= params.kernel_size[i]; - int32_t l = (output_dims[i + 2] + 2 * params.padding[i] - + int32_t l = (static_cast(output_dims[i + 2]) + 2 * params.padding[i] - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / params.stride[i] + - 0; + 1; L *= l; ls.push_back(l); } diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 46f7552083..de34115177 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -68,7 +68,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, ls.push_back(l); } - int32_t kernel_size_h = kernel_size[0]; + [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -76,7 +76,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t padding_w = padding[1]; int32_t dilation_h = dilation[0]; int32_t 
dilation_w = dilation[1]; - int32_t LH = ls[0]; + [[maybe_unused]] int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -129,7 +129,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - int32_t P = 1, L = 1; + [[maybe_unused]] int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 0e9fe9ddd8..6bd24d931f 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -32,32 +32,32 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace fold { -struct UnfoldForwardTestFloat32 : UnfoldFwdTest +struct FoldForwardTestFloat32 : FoldFwdTest { }; -struct UnfoldForwardTestFloat16 : UnfoldFwdTest +struct FoldForwardTestFloat16 : FoldFwdTest { }; -struct UnfoldForwardTestBFloat16 : UnfoldFwdTest +struct FoldForwardTestBFloat16 : FoldFwdTest { }; -struct UnfoldBackwardTestFloat32 : UnfoldBwdTest +struct FoldBackwardTestFloat32 : FoldBwdTest { }; -struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +struct FoldBackwardTestFloat16 : FoldBwdTest { }; -struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +struct FoldBackwardTestBFloat16 : FoldBwdTest { }; }; // namespace fold using namespace fold; -TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) +TEST_P(FoldForwardTestFloat32, FoldForwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -70,11 +70,11 @@ TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, + FoldForwardTestFloat32, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) +TEST_P(FoldForwardTestFloat16, FoldForwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -87,11 +87,11 @@ TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, + FoldForwardTestFloat16, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) +TEST_P(FoldForwardTestBFloat16, FoldForwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -104,11 +104,11 @@ TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestBFloat16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, + FoldForwardTestBFloat16, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -121,11 +121,11 @@ TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, + FoldBackwardTestFloat32, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -138,11 +138,11 @@ TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat16, - 
testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, + FoldBackwardTestFloat16, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -155,6 +155,6 @@ TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestBFloat16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, + FoldBackwardTestBFloat16, + testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index f15c5b6a5f..9b7f883528 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -37,22 +37,26 @@ #include #include -struct UnfoldTestCase +struct FoldTestCase { size_t N; size_t C; size_t D; size_t H; size_t W; + std::vector outputSize; std::vector kernelSize; std::vector stride; std::vector padding; std::vector dilation; bool isContiguous = true; - friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc) + friend std::ostream& operator<<(std::ostream& os, const FoldTestCase& tc) { - os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W - << " kernel_size:"; + os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W; + os << " output_size:"; + for(const auto& outs : tc.outputSize) + os << outs << " "; + os << " kernel_size:"; for(const auto& ks : tc.kernelSize) os << ks << " "; os << "stride:"; @@ -111,20 +115,21 @@ struct UnfoldTestCase } }; -std::vector UnfoldTestConfigs() +std::vector FoldTestConfigs() { // n c d h w padding return { - {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true}, - {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true}, - {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true}, - {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true}, - {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, - {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 12}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 13}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, }; } template -struct UnfoldFwdTest : public ::testing::TestWithParam +struct FoldFwdTest : public ::testing::TestWithParam { protected: void SetUp() override @@ -136,28 +141,18 @@ struct UnfoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - auto gen_one = [&](auto...) { return 1; }; + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) 
{ return 0; }; input = tensor{in_dims, in_strides}.generate(gen_value); - - int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); - const int32_t C = static_cast(in_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) + int32_t C = static_cast(in_dims[1]); + for (int32_t i : config.kernelSize) { - P *= config.kernelSize[i]; - int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - - config.dilation[i] * (config.kernelSize[i] - 1) - 1) / - config.stride[i] + - 1; - L *= l; - ls.push_back(l); + C = C / i; } std::vector out_dims{ - static_cast(N), static_cast(C * P), static_cast(L)}; + static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; output = tensor{out_dims}.generate(gen_zero); outputHost = tensor{out_dims}.generate(gen_zero); @@ -171,7 +166,7 @@ struct UnfoldFwdTest : public ::testing::TestWithParam auto&& handle = get_handle(); miopenStatus_t status; - status = miopen::UnfoldForward(handle, + status = miopen::FoldForward(handle, input.desc, input_dev.get(), output.desc, @@ -185,8 +180,8 @@ struct UnfoldFwdTest : public ::testing::TestWithParam config.dilation.data(), static_cast(config.dilation.size())); - cpu_unfold_fwd_4d( - input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation); + cpu_unfold_bwd_4d( + outputHost, input, config.kernelSize, config.stride, config.padding, config.dilation); EXPECT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -201,11 +196,15 @@ struct UnfoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. if(std::is_same::value) tolerance *= 8.0; + for (int i = 0; i < 10; ++i) + { + std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] << std::endl; + } auto error_output = miopen::rms_range(outputHost, output); EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } - UnfoldTestCase config; + FoldTestCase config; tensor input; tensor output; @@ -217,7 +216,7 @@ struct UnfoldFwdTest : public ::testing::TestWithParam }; template -struct UnfoldBwdTest : public ::testing::TestWithParam +struct FoldBwdTest : public ::testing::TestWithParam { protected: void SetUp() override @@ -229,29 +228,20 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - auto gen_one = [&](auto...) { return 1; }; + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) 
{ return 0; }; dinput = tensor{in_dims, in_strides}.generate(gen_zero); dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); - int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); - const int32_t C = static_cast(in_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) + int32_t C = static_cast(in_dims[1]); + for (int32_t i : config.kernelSize) { - P *= config.kernelSize[i]; - int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - - config.dilation[i] * (config.kernelSize[i] - 1) - 1) / - config.stride[i] + - 1; - L *= l; - ls.push_back(l); + C = C / i; } std::vector out_dims{ - static_cast(N), static_cast(C * P), static_cast(L)}; + static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; doutput = tensor{out_dims}.generate(gen_value); @@ -264,7 +254,7 @@ struct UnfoldBwdTest : public ::testing::TestWithParam auto&& handle = get_handle(); miopenStatus_t status; - status = miopen::UnfoldBackward(handle, + status = miopen::FoldBackward(handle, dinput.desc, dinput_dev.get(), doutput.desc, @@ -278,8 +268,8 @@ struct UnfoldBwdTest : public ::testing::TestWithParam config.dilation.data(), static_cast(config.dilation.size())); - cpu_unfold_bwd_4d( - dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + cpu_unfold_fwd_4d( + doutput, dinputHost, config.kernelSize, config.stride, config.padding, config.dilation); EXPECT_EQ(status, miopenStatusSuccess); dinput.data = handle.Read(dinput_dev, dinput.data.size()); @@ -299,7 +289,7 @@ struct UnfoldBwdTest : public ::testing::TestWithParam << "Error backward input_grad beyond tolerance Error: {" << error_dinput << "}, Tolerance: " << tolerance; } - UnfoldTestCase config; + FoldTestCase config; tensor dinput; tensor doutput; diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp new file mode 100644 index 0000000000..b97c96d567 --- /dev/null +++ b/test/gtest/unfold.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "unfold.hpp" +#include "miopen/bfloat16.hpp" +#include "tensor_holder.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace unfold { +struct UnfoldForwardTestFloat32 : UnfoldFwdTest +{ +}; + +struct UnfoldForwardTestFloat16 : UnfoldFwdTest +{ +}; + +struct UnfoldForwardTestBFloat16 : UnfoldFwdTest +{ +}; + +struct UnfoldBackwardTestFloat32 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +{ +}; +}; // namespace unfold +using namespace unfold; +TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, + UnfoldForwardTestFloat32, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, + UnfoldForwardTestFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, + UnfoldForwardTestBFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat32, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestBFloat16, + testing::ValuesIn(UnfoldTestConfigs())); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp new file mode 100644 index 0000000000..686a1e8f02 --- /dev/null +++ b/test/gtest/unfold.hpp @@ -0,0 +1,311 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "../driver/tensor_driver.hpp"
+#include "cpu_fold.hpp"
+#include "get_handle.hpp"
+#include "miopen/allocator.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct UnfoldTestCase
+{
+    size_t N;
+    size_t C;
+    size_t D;
+    size_t H;
+    size_t W;
+    std::vector<int32_t> kernelSize;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+    bool isContiguous = true;
+    friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc)
+    {
+        os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W
+           << " kernel_size:";
+        for(const auto& ks : tc.kernelSize)
+            os << ks << " ";
+        os << "stride:";
+        for(const auto& s : tc.stride)
+            os << s << " ";
+        os << "padding:";
+        for(const auto& p : tc.padding)
+            os << p << " ";
+        os << "dilation:";
+        for(const auto& d : tc.dilation)
+            os << d << " ";
+        os << "isContiguous:" << std::boolalpha << tc.isContiguous;
+        return os;
+    }
+
+    std::vector<size_t> GetInput()
+    {
+        if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, D, H, W});
+        }
+        else if((N != 0) && (C != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, H, W});
+        }
+        else if((N != 0) && (C != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, W});
+        }
+        else if((N != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, W});
+        }
+        else if((N != 0))
+        {
+            return std::vector<size_t>({N});
+        }
+        else
+        {
+            std::cout << "Error Input Tensor Lengths\n" << std::endl;
+            return std::vector<size_t>({0});
+        }
+    }
+
+    std::vector<size_t> ComputeStrides(std::vector<size_t> inputDim) const
+    {
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size());
+        strides.back() = 1;
+        for(int i = inputDim.size() - 2; i >= 0; --i)
+            strides[i] = strides[i + 1] * inputDim[i + 1];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
+
+std::vector<UnfoldTestCase> UnfoldTestConfigs()
+{ // n c d h w padding
+    return {
+        {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true},
+        {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true},
+        {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true},
+    };
+}
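Aside: the isContiguous=false path in ComputeStrides above builds a transposed memory layout by swapping the outermost and innermost dimensions before laying the tensor out densely, then swapping the corresponding strides back. A standalone copy of that rule for checking layouts by hand (a sketch, not fixture code):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Mirrors the fixture's stride rule: swap outer/inner dims, compute
// dense row-major strides, then swap the matching strides back.
std::vector<size_t> compute_strides(std::vector<size_t> dims, bool contiguous)
{
    if(!contiguous)
        std::swap(dims.front(), dims.back());
    std::vector<size_t> strides(dims.size());
    strides.back() = 1;
    for(int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
        strides[i] = strides[i + 1] * dims[i + 1];
    if(!contiguous)
        std::swap(strides.front(), strides.back());
    return strides;
}

int main()
{
    // Shape {2, 5, 3, 4}: contiguous gives {60, 12, 4, 1};
    // non-contiguous gives {1, 6, 2, 30}, i.e. N and W trade places in memory.
    for(size_t s : compute_strides({2, 5, 3, 4}, false))
        std::cout << s << " ";
    std::cout << "\n";
    return 0;
}

All configs in UnfoldTestConfigs() currently pass true, so the transposed path is exercised only when a config opts out of contiguity.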
+
+template <class T>
+struct UnfoldFwdTest : public ::testing::TestWithParam<UnfoldTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        std::vector<size_t> in_dims    = config.GetInput();
+        std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
+
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        [[maybe_unused]] auto gen_one = [&](auto...) { return 1; };
+        auto gen_zero = [&](auto...) { return 0; };
+        input = tensor<T>{in_dims, in_strides}.generate(gen_value);
+
+        int spatial_dim_size = in_dims.size() - 2;
+        const int32_t N      = static_cast<int32_t>(in_dims[0]);
+        const int32_t C      = static_cast<int32_t>(in_dims[1]);
+        int32_t P = 1, L = 1;
+        std::vector<int32_t> ls;
+        for(int i = 0; i < spatial_dim_size; ++i)
+        {
+            P *= config.kernelSize[i];
+            int32_t l = (static_cast<int32_t>(in_dims[i + 2]) + 2 * config.padding[i] -
+                         config.dilation[i] * (config.kernelSize[i] - 1) - 1) /
+                            config.stride[i] +
+                        1;
+            L *= l;
+            ls.push_back(l);
+        }
+
+        std::vector<size_t> out_dims{
+            static_cast<size_t>(N), static_cast<size_t>(C * P), static_cast<size_t>(L)};
+
+        output     = tensor<T>{out_dims}.generate(gen_zero);
+        outputHost = tensor<T>{out_dims}.generate(gen_zero);
+
+        input_dev  = handle.Write(input.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::UnfoldForward(handle,
+                                       input.desc,
+                                       input_dev.get(),
+                                       output.desc,
+                                       output_dev.get(),
+                                       config.kernelSize.data(),
+                                       static_cast<int>(config.kernelSize.size()),
+                                       config.stride.data(),
+                                       static_cast<int>(config.stride.size()),
+                                       config.padding.data(),
+                                       static_cast<int>(config.padding.size()),
+                                       config.dilation.data(),
+                                       static_cast<int>(config.dilation.size()));
+
+        cpu_unfold_fwd_4d(
+            input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+        auto error_output = miopen::rms_range(outputHost, output);
+        EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {"
+                                              << error_output << "}, Tolerance: " << tolerance;
+    }
+    UnfoldTestCase config;
+
+    tensor<T> input;
+    tensor<T> output;
+
+    tensor<T> outputHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+};
+
+template <class T>
+struct UnfoldBwdTest : public ::testing::TestWithParam<UnfoldTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        std::vector<size_t> in_dims    = config.GetInput();
+        std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
+
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        [[maybe_unused]] auto gen_one = [&](auto...) { return 1; };
+        auto gen_zero = [&](auto...)
{ return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + int spatial_dim_size = in_dims.size() - 2; + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= config.kernelSize[i]; + int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; + + doutput = tensor{out_dims}.generate(gen_value); + + dinput_dev = handle.Write(dinput.data); + doutput_dev = handle.Write(doutput.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::UnfoldBackward(handle, + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_bwd_4d( + dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + auto error_dinput = miopen::rms_range(dinputHost, dinput); + EXPECT_TRUE(error_dinput < tolerance) + << "Error backward input_grad beyond tolerance Error: {" << error_dinput + << "}, Tolerance: " << tolerance; + } + UnfoldTestCase config; + + tensor dinput; + tensor doutput; + + tensor dinputHost; + + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr doutput_dev; +}; From 9286ce7336264a1be722bf5bad4a2515868132dc Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 10 Jul 2024 07:44:38 +0000 Subject: [PATCH 07/46] githook format --- driver/driver.hpp | 6 +- driver/fold_driver.hpp | 75 ++++---- include/miopen/miopen.h | 84 ++++----- src/fold.cpp | 84 ++++----- src/fold/problem_description.cpp | 8 +- src/fold_api.cpp | 96 +++++----- src/include/miopen/fold.hpp | 48 ++--- .../miopen/fold/problem_description.hpp | 175 ++++++++---------- src/include/miopen/fold/solvers.hpp | 10 +- src/solver/fold/fold_backward.cpp | 10 +- test/cpu_fold.hpp | 22 +-- test/gtest/fold.hpp | 93 +++++----- test/gtest/unfold.hpp | 14 +- 13 files changed, 355 insertions(+), 370 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index f26d7053f3..68a0421e41 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -151,7 +151,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n"); + "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT 
(concurrency-mt-unsafe) } @@ -177,7 +178,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && arg != "unfold" && arg != "unfoldfp16" && - arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && arg != "--version") + arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp index 117538452e..4278624928 100644 --- a/driver/fold_driver.hpp +++ b/driver/fold_driver.hpp @@ -135,14 +135,14 @@ int FoldDriver::GetandSetData() { std::vector input_length = GetTensorLengthsFromCmdLine(); - output_size = GetVectorInt32tFromCmdLine("outputSize"); - kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); - stride = GetVectorInt32tFromCmdLine("stride"); - padding = GetVectorInt32tFromCmdLine("padding"); - dilation = GetVectorInt32tFromCmdLine("dilation"); - const int N = input_length[0]; - int C = input_length[1]; - for (int32_t i : kernel_size) + output_size = GetVectorInt32tFromCmdLine("outputSize"); + kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); + const int N = input_length[0]; + int C = input_length[1]; + for(int32_t i : kernel_size) { C = C / i; } @@ -295,18 +295,18 @@ int FoldDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenFoldForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -320,8 +320,8 @@ int FoldDriver::RunForwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter - << " ms" << std::endl; + std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter << " ms" + << std::endl; float kernel_average_time = iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; @@ -362,18 +362,18 @@ int FoldDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenFoldBackward(GetHandle(), - dinputDesc, - dinput_dev->GetMem(), - doutputDesc, - doutput_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -387,8 +387,8 @@ int FoldDriver::RunBackwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter - << " ms" << std::endl; + std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter << " ms" + << std::endl; float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; @@ -459,8 +459,7 @@ int FoldDriver::VerifyBackward() if(!std::isfinite(error_dinput) || error_dinput > tolerance) { - std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance - << std::endl; + std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; return EC_VerifyFwd; } else diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a45ece12fe..51485db6e7 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6607,50 +6607,50 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); - /*! @brief Execute an unfold backward layer - * - * @param handle MIOpen handle (input) - * @param dinputDesc Tensor descriptor for data input grad tensor (output) - * @param dinput Data tensor input grad (output) - * @param doutputDesc Tensor descriptor for data output grad tensor (input) - * @param doutput Data tensor output grad (input) - * @param kernel_size Size of the sliding box array (input) - * @param kernel_size_size Size of the kernel_size array (input) - * @param stride Stride array of the sliding box (input) - * @param stride_size Size of the stride array (input) - * @param padding Padding array to be added on input (input) - * @param padding_size Size of the padding array (input) - * @param dilation Dilation array control the stride of the elements within the - neighborhood (input) - * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t - */ +/*! 
@brief Execute an unfold backward layer +* +* @param handle MIOpen handle (input) +* @param dinputDesc Tensor descriptor for data input grad tensor (output) +* @param dinput Data tensor input grad (output) +* @param doutputDesc Tensor descriptor for data output grad tensor (input) +* @param doutput Data tensor output grad (input) +* @param kernel_size Size of the sliding box array (input) +* @param kernel_size_size Size of the kernel_size array (input) +* @param stride Stride array of the sliding box (input) +* @param stride_size Size of the stride array (input) +* @param padding Padding array to be added on input (input) +* @param padding_size Size of the padding array (input) +* @param dilation Dilation array control the stride of the elements within the +neighborhood (input) +* @param dilation_size Size of the dilation array (input) +* @return miopenStatus_t +*/ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); /*! @brief Execute an unfold forward layer * diff --git a/src/fold.cpp b/src/fold.cpp index d2ff285af1..470d8eb6de 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -138,29 +138,29 @@ miopenStatus_t UnfoldBackward(Handle& handle, } miopenStatus_t FoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { const auto problem = fold::FoldFwdProblemDescription{inputDesc, - outputDesc, - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size}; + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; const auto invoke_params = [&]() { auto tmp = fold::InvokeParams{}; @@ -188,29 +188,29 @@ miopenStatus_t FoldForward(Handle& handle, } miopenStatus_t FoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* 
kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { const auto problem = fold::FoldBwdProblemDescription{dinputDesc, - doutputDesc, - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size}; + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; const auto invoke_params = [&]() { auto tmp = fold::InvokeParams{}; diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index 39202fd372..ce34de1a16 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -91,7 +91,7 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const auto output_dtype = outputDesc.GetType(); auto size = inputDesc.GetElementSize(); auto in_dims = inputDesc.GetLengths(); - auto out_dims = outputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); std::ostringstream ss; @@ -105,7 +105,7 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const ss << "_" << val; } ss << "out_dims"; - for (auto val: out_dims) + for(auto val : out_dims) { ss << "_" << val; } @@ -123,7 +123,7 @@ NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const auto output_dtype = doutputDesc.GetType(); auto size = dinputDesc.GetElementSize(); auto in_dims = dinputDesc.GetLengths(); - auto out_dims = doutputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); std::ostringstream ss; @@ -137,7 +137,7 @@ NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const ss << "_" << val; } ss << "out_grad_dims"; - for (auto val: out_dims) + for(auto val : out_dims) { ss << "_" << val; } diff --git a/src/fold_api.cpp b/src/fold_api.cpp index fb22fa90b4..f59c209785 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -94,63 +94,63 @@ extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, } extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::FoldForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int 
kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::FoldBackward(miopen::deref(handle), - miopen::deref(dinputDesc), - DataCast(dinput), - miopen::deref(doutputDesc), - DataCast(doutput), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index d94a42ee5a..3ac7e878f7 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -62,31 +62,31 @@ miopenStatus_t UnfoldBackward(Handle& handle, const int dilation_size); miopenStatus_t FoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); miopenStatus_t FoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index f89a90eac2..ebaadb5386 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -74,21 +74,17 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase if(inputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: The input tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input tensor should be 4D."); #else return false; #endif } int spatial_dim_size = inputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != 
spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); #else return false; #endif @@ -98,24 +94,23 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector output_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; + std::vector output_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; auto output_dims = outputDesc.GetLengths(); - if (output_dims != output_dims_desired) + if(output_dims != output_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Invalid output dimension"); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output dimension"); #else return false; #endif @@ -125,7 +120,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (inputDesc.GetType() != outputDesc.GetType()) + if(inputDesc.GetType() != outputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, @@ -187,21 +182,17 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase if(dinputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: The input gradient tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input gradient tensor should be 4D."); #else return false; #endif } int spatial_dim_size = dinputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); #else return false; #endif @@ -211,24 +202,23 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector output_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; + std::vector output_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; auto output_dims = doutputDesc.GetLengths(); - if (output_dims != output_dims_desired) + if(output_dims != output_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Invalid output gradient dimension"); + 
MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output gradient dimension"); #else return false; #endif @@ -238,11 +228,12 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (dinputDesc.GetType() != doutputDesc.GetType()) + if(dinputDesc.GetType() != doutputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); + MIOPEN_THROW( + miopenStatusBadParm, + "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); #else return false; #endif @@ -271,15 +262,15 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase struct FoldFwdProblemDescription : ProblemDescriptionBase { FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& outputDesc_, - const int32_t* kernel_size_, - const int kernel_size_size_, - const int32_t* stride_, - const int stride_size_, - const int32_t* padding_, - const int padding_size_, - const int32_t* dilation_, - const int dilation_size_) + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), kernel_size(kernel_size_), @@ -300,48 +291,43 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase if(outputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: The output tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Fold: The output tensor should be 4D."); #else return false; #endif } int spatial_dim_size = outputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); #else return false; #endif } - auto input_dims = inputDesc.GetLengths(); + auto input_dims = inputDesc.GetLengths(); auto output_dims = outputDesc.GetLengths(); - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector input_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; - if (input_dims != input_dims_desired) + std::vector input_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + if(input_dims != input_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Invalid input dimension"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input dimension"); #else return false; #endif @@ -351,7 +337,7 @@ struct 
FoldFwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (inputDesc.GetType() != outputDesc.GetType()) + if(inputDesc.GetType() != outputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, @@ -384,15 +370,15 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase struct FoldBwdProblemDescription : ProblemDescriptionBase { FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, - const TensorDescriptor& doutputDesc_, - const int32_t* kernel_size_, - const int kernel_size_size_, - const int32_t* stride_, - const int stride_size_, - const int32_t* padding_, - const int padding_size_, - const int32_t* dilation_, - const int dilation_size_) + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) : dinputDesc(dinputDesc_), doutputDesc(doutputDesc_), kernel_size(kernel_size_), @@ -413,48 +399,43 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase if(doutputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: The output gradient tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Fold: The output gradient tensor should be 4D."); #else return false; #endif } int spatial_dim_size = doutputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); #else return false; #endif } - auto input_dims = dinputDesc.GetLengths(); + auto input_dims = dinputDesc.GetLengths(); auto output_dims = doutputDesc.GetLengths(); - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector input_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; - if (input_dims != input_dims_desired) + std::vector input_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + if(input_dims != input_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Invalid input gradient dimension"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input gradient dimension"); #else return false; #endif @@ -464,11 +445,12 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (dinputDesc.GetType() != doutputDesc.GetType()) + if(dinputDesc.GetType() != doutputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: The input gradient tensor and output gradient tensor has mismatch type."); + 
MIOPEN_THROW( + miopenStatusBadParm, + "Fold: The input gradient tensor and output gradient tensor has mismatch type."); #else return false; #endif @@ -476,7 +458,6 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase return true; } - const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp index e92213f434..1ff3ef7566 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -76,9 +76,8 @@ struct FoldFwd final : FoldFwdSolverBase bool IsApplicable(const ExecutionContext& context, const miopen::fold::FoldFwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::fold::FoldFwdProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const override; }; using FoldBwdSolverBase = @@ -91,9 +90,8 @@ struct FoldBwd final : FoldBwdSolverBase bool IsApplicable(const ExecutionContext& context, const miopen::fold::FoldBwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::fold::FoldBwdProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; }; } // namespace fold diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index e7696b10a3..a1327be94e 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -43,13 +43,13 @@ namespace solver { namespace fold { bool FoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - const miopen::fold::FoldBwdProblemDescription& problem) const + const miopen::fold::FoldBwdProblemDescription& problem) const { return true; } ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, - const miopen::fold::FoldBwdProblemDescription& problem) const + const miopen::fold::FoldBwdProblemDescription& problem) const { std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; @@ -59,8 +59,8 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_grad_dims = problem.GetDinputDesc().GetLengths(); auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); - const int32_t N = static_cast(output_grad_dims[0]); - const int32_t C = static_cast(output_grad_dims[1]); + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); int spatial_dim_size = output_grad_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; @@ -68,7 +68,7 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte { P *= problem.kernel_size[i]; int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * problem.padding[i] - - problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / problem.stride[i] + 1; L *= l; diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index de34115177..373cc30917 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -69,18 +69,18 @@ void cpu_unfold_fwd_4d(tensor input_tensor, } [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = 
stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; [[maybe_unused]] int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 9b7f883528..7e71c5ce2f 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -141,18 +141,20 @@ struct FoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - input = tensor{in_dims, in_strides}.generate(gen_value); - const int32_t N = static_cast(in_dims[0]); - int32_t C = static_cast(in_dims[1]); - for (int32_t i : config.kernelSize) + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + input = tensor{in_dims, in_strides}.generate(gen_value); + const int32_t N = static_cast(in_dims[0]); + int32_t C = static_cast(in_dims[1]); + for(int32_t i : config.kernelSize) { C = C / i; } - std::vector out_dims{ - static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; + std::vector out_dims{static_cast(N), + static_cast(C), + static_cast(config.outputSize[0]), + static_cast(config.outputSize[1])}; output = tensor{out_dims}.generate(gen_zero); outputHost = tensor{out_dims}.generate(gen_zero); @@ -167,18 +169,18 @@ struct FoldFwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::FoldForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); cpu_unfold_bwd_4d( outputHost, input, config.kernelSize, config.stride, config.padding, config.dilation); @@ -196,9 +198,10 @@ struct FoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
if(std::is_same::value) tolerance *= 8.0; - for (int i = 0; i < 10; ++i) + for(int i = 0; i < 10; ++i) { - std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] << std::endl; + std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] + << std::endl; } auto error_output = miopen::rms_range(outputHost, output); EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" @@ -228,20 +231,22 @@ struct FoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - dinput = tensor{in_dims, in_strides}.generate(gen_zero); - dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); - - const int32_t N = static_cast(in_dims[0]); - int32_t C = static_cast(in_dims[1]); - for (int32_t i : config.kernelSize) + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + const int32_t N = static_cast(in_dims[0]); + int32_t C = static_cast(in_dims[1]); + for(int32_t i : config.kernelSize) { C = C / i; } - std::vector out_dims{ - static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; + std::vector out_dims{static_cast(N), + static_cast(C), + static_cast(config.outputSize[0]), + static_cast(config.outputSize[1])}; doutput = tensor{out_dims}.generate(gen_value); @@ -255,18 +260,18 @@ struct FoldBwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::FoldBackward(handle, - dinput.desc, - dinput_dev.get(), - doutput.desc, - doutput_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); cpu_unfold_fwd_4d( doutput, dinputHost, config.kernelSize, config.stride, config.padding, config.dilation); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 686a1e8f02..3a839024b2 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -136,9 +136,9 @@ struct UnfoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - input = tensor{in_dims, in_strides}.generate(gen_value); + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + input = tensor{in_dims, in_strides}.generate(gen_value); int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); @@ -229,10 +229,10 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - dinput = tensor{in_dims, in_strides}.generate(gen_zero); - dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); From e59ce36d740b91608b03d46a5b6b4da8ea05815b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 12 Jul 2024 08:44:18 +0000 Subject: [PATCH 08/46] update doc and miopen.h description --- docs/reference/index.rst | 1 + include/miopen/miopen.h | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02bcb88622..c6ab9521b7 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -32,3 +32,4 @@ The MIOpen API library is structured as follows: * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`Argmax<./argmax>` (experimental) + * :doc:`Fold <./fold>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 51485db6e7..a019b4f7c6 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6588,12 +6588,12 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * * @{ */ -/*! @brief Execute an unfold forward layer +/*! @brief Execute a fold forward layer * * @param handle MIOpen handle (input) * @param inputDesc Tensor descriptor for data input tensor input (input) * @param input Data tensor input (input) - * @param outputDesc Tensor descriptor for data output tensor output (output) + * @param outputDesc Tensor descriptor for data output tensor output (input) * @param output Data tensor output (output) * @param kernel_size Size of the sliding box array (input) * @param kernel_size_size Size of the kernel_size array (input) @@ -6620,10 +6620,10 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, const int kernel_size_size, const int32_t* stride, const int stride_size, const int32_t* padding, const int padding_size, const int32_t* dilation, const int dilation_size); -/*! @brief Execute an unfold backward layer +/*! @brief Execute a fold backward layer * * @param handle MIOpen handle (input) -* @param dinputDesc Tensor descriptor for data input grad tensor (output) +* @param dinputDesc Tensor descriptor for data input grad tensor (input) * @param dinput Data tensor input grad (output) * @param doutputDesc Tensor descriptor for data output grad tensor (input) * @param doutput Data tensor output grad (input) * @param kernel_size Size of the sliding box array (input) * @param kernel_size_size Size of the kernel_size array (input) @@ -6657,7 +6657,7 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, * @param handle MIOpen handle (input) * @param inputDesc Tensor descriptor for data input tensor input (input) * @param input Data tensor input (input) - * @param outputDesc Tensor descriptor for data output tensor output (output) + * @param outputDesc Tensor descriptor for data output tensor output (input) * @param output Data tensor output (output) * @param kernel_size Size of the sliding box array (input) * @param kernel_size_size Size of the kernel_size array (input)
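Note: for readers new to this API, a minimal host-side sketch of calling miopenUnfoldForward as documented above. The shapes mirror the unfold driver defaults, the helper name and buffer arguments are illustrative, and error checking plus device allocation are elided.

#include <miopen/miopen.h>

// Hedged sketch: unfold a float tensor of shape (2, 5, 3, 4) with a 2x3 window.
// Expected output shape: (N, C * 2 * 3, L) = (2, 30, 4), since each spatial dim
// yields ((size - dilation * (k - 1) - 1) / stride) + 1 = 2 blocks.
void unfold_example(miopenHandle_t handle, const float* in_dev, float* out_dev)
{
    miopenTensorDescriptor_t inDesc, outDesc;
    miopenCreateTensorDescriptor(&inDesc);
    miopenCreateTensorDescriptor(&outDesc);
    miopenSet4dTensorDescriptor(inDesc, miopenFloat, 2, 5, 3, 4);
    int outLens[3] = {2, 30, 4};
    miopenSetTensorDescriptor(outDesc, miopenFloat, 3, outLens, nullptr); // packed strides

    int32_t kernel[2]   = {2, 3};
    int32_t stride[2]   = {1, 1};
    int32_t padding[2]  = {0, 0};
    int32_t dilation[2] = {1, 1};
    miopenUnfoldForward(handle, inDesc, in_dev, outDesc, out_dev,
                        kernel, 2, stride, 2, padding, 2, dilation, 2);

    miopenDestroyTensorDescriptor(inDesc);
    miopenDestroyTensorDescriptor(outDesc);
}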
@@ -6687,7 +6687,7 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, /*! @brief Execute an unfold backward layer * * @param handle MIOpen handle (input) * - * @param dinputDesc Tensor descriptor for data input grad tensor (output) + * @param dinputDesc Tensor descriptor for data input grad tensor (input) * @param dinput Data tensor input grad (output) * @param doutputDesc Tensor descriptor for data output grad tensor (input) * @param doutput Data tensor output grad (input) From c11493403883df17a9cd5f5dfd109a024ccdab54 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Jul 2024 05:43:46 +0000 Subject: [PATCH 09/46] Update driver help text --- driver/fold_driver.hpp | 4 ++-- driver/unfold_driver.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp index 4278624928..1468a77e0d 100644 --- a/driver/fold_driver.hpp +++ b/driver/fold_driver.hpp @@ -166,8 +166,8 @@ int FoldDriver::AddCmdLineArgs() inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=4,5)", "str"); inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,2)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); - inflags.AddInputFlag("padding", 'p', "0,0", "Stride (Default=0,0)", "str"); - inflags.AddInputFlag("dilation", 'd', "1,1", "Stride (Default=1,1)", "str"); + inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); + inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index d565d192f5..e9a3665eaa 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -171,8 +171,8 @@ int UnfoldDriver::AddCmdLineArgs() "DimLengths", 'D', "2,5,3,4", "The dimensional lengths of the input tensor", "string"); inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); - inflags.AddInputFlag("padding", 'p', "0,0", "Stride (Default=0,0)", "str"); - inflags.AddInputFlag("dilation", 'd', "1,1", "Stride (Default=1,1)", "str"); + inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); + inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); From bd5db593af74bd68f03e44465a4a1beec55b8056 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Jul 2024 06:14:14 +0000 Subject: [PATCH 10/46] Change IN_OUT_TYPE to FLOAT --- src/kernels/MIOpenUnfold.cpp | 12 ++++++------ src/solver/fold/fold_backward.cpp | 1 - src/solver/fold/fold_forward.cpp | 1 - src/solver/fold/unfold_backward.cpp | 1 - src/solver/fold/unfold_forward.cpp | 1 - 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 5c39a82e2c..9b36edd28d 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -86,8 +86,8 @@ __device__ void unfoldForward4D(const TIO* input, output[output_idx] = x; } -extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, - IN_OUT_TYPE* output, +extern "C" 
__global__ void UnfoldForward4D(const FLOAT* input, + FLOAT* output, int N, int C, int H, @@ -107,7 +107,7 @@ extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, tensor_view_t<4> input_tv, tensor_view_t<3> output_tv) { - unfoldForward4D(input, + unfoldForward4D(input, output, N, C, @@ -194,8 +194,8 @@ __device__ void unfoldBackward4D(const TIO* output_grad, input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); } -extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, - IN_OUT_TYPE* input_grad, +extern "C" __global__ void UnfoldBackward4D(const FLOAT* output_grad, + FLOAT* input_grad, int N, int C, int H, @@ -215,7 +215,7 @@ extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, tensor_view_t<3> output_grad_tv, tensor_view_t<4> input_grad_tv) { - unfoldBackward4D(output_grad, + unfoldBackward4D(output_grad, input_grad, N, C, diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index a1327be94e..2b09ac3529 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -85,7 +85,6 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 5b8f638cb1..0150c8b9fb 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -74,7 +74,6 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 249f08592c..38d0812307 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -74,7 +74,6 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index b866b5d167..1aa10d9eea 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -85,7 +85,6 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? 
"ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); From 4bb5855eb4623006da4f4f62f29343cbfdfba5b9 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Jul 2024 06:16:32 +0000 Subject: [PATCH 11/46] add __restrict__ to tensor pointer --- src/kernels/MIOpenUnfold.cpp | 98 ++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 9b36edd28d..84a272ef80 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -32,9 +32,9 @@ #include "float_types.h" #include "tensor_view.hpp" -template -__device__ void unfoldForward4D(const TIO* input, - TIO* output, +template +__device__ void unfoldForward4D(const DTYPE* __restrict__ input, + DTYPE* __restrict__ output, int N, int C, int H, @@ -73,7 +73,7 @@ __device__ void unfoldForward4D(const TIO* input, int h = lh * stride_h - padding_h + ph * dilation_h; int w = lw * stride_w - padding_w + pw * dilation_w; - TIO x = 0; + DTYPE x = 0; if(0 <= h && h < H && 0 <= w && w < W) { long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + @@ -86,8 +86,8 @@ __device__ void unfoldForward4D(const TIO* input, output[output_idx] = x; } -extern "C" __global__ void UnfoldForward4D(const FLOAT* input, - FLOAT* output, +extern "C" __global__ void UnfoldForward4D(const FLOAT* __restrict__ input, + FLOAT* __restrict__ output, int N, int C, int H, @@ -108,30 +108,30 @@ extern "C" __global__ void UnfoldForward4D(const FLOAT* input, tensor_view_t<3> output_tv) { unfoldForward4D(input, - output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); } -template -__device__ void unfoldBackward4D(const TIO* output_grad, - TIO* input_grad, +template +__device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, + DTYPE* __restrict__ input_grad, int N, int C, int H, @@ -194,8 +194,8 @@ __device__ void unfoldBackward4D(const TIO* output_grad, input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); } -extern "C" __global__ void UnfoldBackward4D(const FLOAT* output_grad, - FLOAT* input_grad, +extern "C" __global__ void UnfoldBackward4D(const FLOAT* __restrict__ output_grad, + FLOAT* __restrict__ input_grad, int N, int C, int H, @@ -216,23 +216,23 @@ extern "C" __global__ void UnfoldBackward4D(const FLOAT* output_grad, tensor_view_t<4> input_grad_tv) { unfoldBackward4D(output_grad, - input_grad, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - output_grad_tv, - input_grad_tv); + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); } From 918a26751c88bdfaa06732ba8fa76a551e5de4f8 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Jul 2024 06:09:46 +0000 Subject: [PATCH 12/46] replace include "" with <> --- src/fold.cpp | 4 ++-- src/fold_api.cpp | 2 +- src/include/miopen/fold/invoke_params.hpp | 2 +- src/include/miopen/fold/problem_description.hpp | 2 +- src/solver/fold/fold_backward.cpp | 4 ++-- 
src/solver/fold/fold_forward.cpp | 4 ++-- src/solver/fold/unfold_backward.cpp | 4 ++-- src/solver/fold/unfold_forward.cpp | 4 ++-- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/fold.cpp b/src/fold.cpp index 470d8eb6de..a105be60b3 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/miopen.h" -#include "miopen/fold/problem_description.hpp" +#include +#include #include #include #include diff --git a/src/fold_api.cpp b/src/fold_api.cpp index f59c209785..d1bdefdfb0 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -24,7 +24,7 @@ * *******************************************************************************/ -#include "miopen/miopen.h" +#include #include #include #include diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index da89023f17..c19e83eeee 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -25,7 +25,7 @@ *******************************************************************************/ #pragma once -#include "miopen/miopen.h" +#include #include #include diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index ebaadb5386..30689e8bd9 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -26,7 +26,7 @@ #pragma once #include "miopen/errors.hpp" -#include "miopen/miopen.h" +#include #include #include #include diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 2b09ac3529..ee9316a3bf 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 0150c8b9fb..9ecf89c6e6 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 38d0812307..7b75679263 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 1aa10d9eea..35e29df8bb 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include From 4f51b6e8b3f9adf009bf9d6f5a7999ee506ec908 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Jul 2024 07:57:56 +0000 Subject: [PATCH 13/46] change all int -> int32_t, remove duplicate lines in solver --- include/miopen/miopen.h | 32 ++++----- 
src/fold.cpp | 32 ++++----- src/fold_api.cpp | 32 ++++----- src/include/miopen/fold.hpp | 32 ++++----- src/include/miopen/fold/invoke_params.hpp | 8 +-- .../miopen/fold/problem_description.hpp | 72 +++++++++---------- src/solver/fold/fold_backward.cpp | 20 +----- src/solver/fold/fold_forward.cpp | 7 +- src/solver/fold/unfold_backward.cpp | 6 +- src/solver/fold/unfold_forward.cpp | 18 +---- 10 files changed, 111 insertions(+), 148 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a019b4f7c6..3a0d7fd5dd 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6612,13 +6612,13 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /*! @brief Execute an fold backward layer * @@ -6644,13 +6644,13 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /*! @brief Execute an unfold forward layer * @@ -6676,13 +6676,13 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /*! 
@brief Execute an unfold backward layer * @@ -6708,13 +6708,13 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /** @} */ // CLOSEOUT FOLD DOXYGEN GROUP diff --git a/src/fold.cpp b/src/fold.cpp index a105be60b3..0c30529c99 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -43,13 +43,13 @@ miopenStatus_t UnfoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::UnfoldFwdProblemDescription{inputDesc, outputDesc, @@ -93,13 +93,13 @@ miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, doutputDesc, @@ -143,13 +143,13 @@ miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::FoldFwdProblemDescription{inputDesc, outputDesc, @@ -193,13 +193,13 @@ miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::FoldBwdProblemDescription{dinputDesc, doutputDesc, diff --git a/src/fold_api.cpp b/src/fold_api.cpp index d1bdefdfb0..6c15380db6 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -37,13 +37,13 @@ extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::UnfoldForward(miopen::deref(handle), @@ -68,13 +68,13 @@ extern "C" miopenStatus_t 
miopenUnfoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::UnfoldBackward(miopen::deref(handle), @@ -99,13 +99,13 @@ extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::FoldForward(miopen::deref(handle), @@ -130,13 +130,13 @@ extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::FoldBackward(miopen::deref(handle), diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 3ac7e878f7..a88e0b9b9e 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -39,13 +39,13 @@ miopenStatus_t UnfoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, @@ -53,13 +53,13 @@ miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& inputDesc, @@ -67,13 +67,13 @@ miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, @@ -81,12 +81,12 @@ miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int 
kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index c19e83eeee..20e7859b35 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -53,10 +53,10 @@ struct InvokeParams : public miopen::InvokeParams const int32_t* stride = nullptr; const int32_t* padding = nullptr; const int32_t* dilation = nullptr; - int kernel_size_size = 0; - int stride_size = 0; - int padding_size = 0; - int dilation_size = 0; + int32_t kernel_size_size = 0; + int32_t stride_size = 0; + int32_t padding_size = 0; + int32_t dilation_size = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 30689e8bd9..3bc7ae91ca 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -47,13 +47,13 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), kernel_size(kernel_size_), @@ -79,7 +79,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = inputDesc.GetSize() - 2; + int32_t spatial_dim_size = inputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -141,13 +141,13 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase TensorDescriptor inputDesc; TensorDescriptor outputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; struct UnfoldBwdProblemDescription : ProblemDescriptionBase @@ -155,13 +155,13 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase UnfoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, const TensorDescriptor& doutputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : dinputDesc(dinputDesc_), doutputDesc(doutputDesc_), kernel_size(kernel_size_), @@ -187,7 +187,7 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = 
dinputDesc.GetSize() - 2; + int32_t spatial_dim_size = dinputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -250,13 +250,13 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase TensorDescriptor dinputDesc; TensorDescriptor doutputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; struct FoldFwdProblemDescription : ProblemDescriptionBase @@ -264,13 +264,13 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), kernel_size(kernel_size_), @@ -296,7 +296,7 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = outputDesc.GetSize() - 2; + int32_t spatial_dim_size = outputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -358,13 +358,13 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase TensorDescriptor inputDesc; TensorDescriptor outputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; struct FoldBwdProblemDescription : ProblemDescriptionBase @@ -372,13 +372,13 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, const TensorDescriptor& doutputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : dinputDesc(dinputDesc_), doutputDesc(doutputDesc_), kernel_size(kernel_size_), @@ -404,7 +404,7 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = doutputDesc.GetSize() - 2; + int32_t spatial_dim_size = doutputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -467,13 +467,13 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase TensorDescriptor dinputDesc; TensorDescriptor doutputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - 
const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; } // namespace fold diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index ee9316a3bf..0e5c4b0824 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -61,7 +61,7 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte const int32_t N = static_cast(output_grad_dims[0]); const int32_t C = static_cast(output_grad_dims[1]); - int spatial_dim_size = output_grad_dims.size() - 2; + int32_t spatial_dim_size = output_grad_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) @@ -105,7 +105,7 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -115,22 +115,6 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_grad_dims = deref(params.dinputDesc).GetLengths(); auto output_grad_dims = deref(params.doutputDesc).GetLengths(); - int spatial_dim_size = output_grad_dims.size() - 2; - const int32_t N = static_cast(output_grad_dims[0]); - const int32_t C = static_cast(output_grad_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) - { - P *= params.kernel_size[i]; - int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / - params.stride[i] + - 1; - L *= l; - ls.push_back(l); - } - int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; int32_t stride_h = params.stride[0]; diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 9ecf89c6e6..1ec6e9f4ab 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include #include #include #include @@ -94,7 +95,7 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -104,9 +105,7 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_dims = deref(params.inputDesc).GetLengths(); auto output_dims = deref(params.outputDesc).GetLengths(); - int spatial_dim_size = output_dims.size() - 2; - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + int32_t spatial_dim_size = output_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 7b75679263..c673c4497a 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -94,7 +94,7 @@ ConvSolution 
UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C, H, W](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -105,8 +105,6 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con auto output_grad_dims = deref(params.doutputDesc).GetLengths(); int spatial_dim_size = input_grad_dims.size() - 2; - const int32_t N = static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) @@ -130,8 +128,6 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con int32_t dilation_w = params.dilation[1]; int32_t LH = ls[0]; int32_t LW = ls[1]; - int32_t H = static_cast(input_grad_dims[2]); - int32_t W = static_cast(input_grad_dims[3]); kernel(params.doutput, params.dinput, diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 35e29df8bb..68402e6a2c 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -105,7 +105,7 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -115,22 +115,6 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con auto input_dims = deref(params.inputDesc).GetLengths(); auto output_dims = deref(params.outputDesc).GetLengths(); - int spatial_dim_size = input_dims.size() - 2; - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) - { - P *= params.kernel_size[i]; - int32_t l = (static_cast(input_dims[i + 2]) + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / - params.stride[i] + - 1; - L *= l; - ls.push_back(l); - } - int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; int32_t stride_h = params.stride[0]; From e726fc1d1f9ddbabb2a4e0f3d9504277ccc9c34b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Jul 2024 07:59:56 +0000 Subject: [PATCH 14/46] githook format --- src/include/miopen/fold/invoke_params.hpp | 8 ++++---- src/solver/fold/fold_backward.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index 20e7859b35..246ccfb401 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -53,10 +53,10 @@ struct InvokeParams : public miopen::InvokeParams const int32_t* stride = nullptr; const int32_t* padding = nullptr; const int32_t* dilation = nullptr; - int32_t kernel_size_size = 0; - int32_t stride_size = 0; - int32_t padding_size = 0; - int32_t dilation_size = 0; + int32_t kernel_size_size = 0; + int32_t stride_size = 0; + int32_t padding_size = 0; + int32_t dilation_size = 0; std::size_t 
GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 0e5c4b0824..1fe957d408 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -59,8 +59,8 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_grad_dims = problem.GetDinputDesc().GetLengths(); auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); - const int32_t N = static_cast(output_grad_dims[0]); - const int32_t C = static_cast(output_grad_dims[1]); + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); int32_t spatial_dim_size = output_grad_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls;
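Note: for orientation, a rough sketch of how the host side packages arguments into this InvokeParams before handing them to the solver invoker. The field names are the ones shown in the hunk above; the InvokeType member and the overall pattern are assumptions modeled on other MIOpen operators, not a quote of src/fold.cpp.

#include <miopen/fold/invoke_params.hpp>

// Hedged sketch: building the fold-backward invoke params (assumed pattern).
miopen::fold::InvokeParams MakeFoldBwdParams(const miopen::TensorDescriptor& dinputDesc,
                                             miopen::Data_t dinput,
                                             const miopen::TensorDescriptor& doutputDesc,
                                             miopen::ConstData_t doutput,
                                             const int32_t* kernel_size,
                                             int32_t kernel_size_size,
                                             const int32_t* stride,
                                             int32_t stride_size,
                                             const int32_t* padding,
                                             int32_t padding_size,
                                             const int32_t* dilation,
                                             int32_t dilation_size)
{
    auto params        = miopen::fold::InvokeParams{};
    params.type        = miopen::InvokeType::Run; // assumed base-class member
    params.dinputDesc  = &dinputDesc;             // solvers read these back via deref()
    params.doutputDesc = &doutputDesc;
    params.dinput      = dinput;                  // device buffers
    params.doutput     = doutput;
    params.kernel_size = kernel_size;             // raw int32_t* plus explicit lengths
    params.stride      = stride;
    params.padding     = padding;
    params.dilation    = dilation;
    params.kernel_size_size = kernel_size_size;
    params.stride_size      = stride_size;
    params.padding_size     = padding_size;
    params.dilation_size    = dilation_size;
    return params;
}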
From 8ee286160f08a51be318531311032182c8cb3a12 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 04:54:47 +0000 Subject: [PATCH 15/46] remove useless if else in problem description --- .../miopen/fold/problem_description.hpp | 60 ------------------- 1 file changed, 60 deletions(-) diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 3bc7ae91ca..0e3ef29d4b 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -83,11 +83,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = inputDesc.GetLengths(); const int32_t N = static_cast(input_dims[0]); @@ -109,11 +105,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase auto output_dims = outputDesc.GetLengths(); if(output_dims != output_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output dimension"); -#else - return false; -#endif } return true; } @@ -122,12 +114,8 @@ { if(inputDesc.GetType() != outputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input tensor and output tensor have mismatched types."); -#else - return false; -#endif } return true; } @@ -181,21 +169,13 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase { if(dinputDesc.GetSize() != 4) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input gradient tensor should be 4D."); -#else - return false; -#endif } int32_t spatial_dim_size = dinputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = dinputDesc.GetLengths(); const int32_t N = static_cast(input_dims[0]); @@ -217,11 +197,7 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase auto output_dims = doutputDesc.GetLengths(); if(output_dims != output_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output gradient dimension"); -#else - return false; -#endif } return true; } @@ -230,13 +206,9 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase { if(dinputDesc.GetType() != doutputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW( miopenStatusBadParm, "Unfold: The input gradient tensor and output gradient tensor have mismatched types."); -#else - return false; -#endif } return true; } @@ -290,21 +262,13 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase { if(outputDesc.GetSize() != 4) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: The output tensor should be 4D."); -#else - return false; -#endif } int32_t spatial_dim_size = outputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = inputDesc.GetLengths(); auto output_dims = outputDesc.GetLengths(); @@ -326,11 +290,7 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase static_cast(N), static_cast(C * P), static_cast(L)}; if(input_dims != input_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input dimension"); -#else - return false; -#endif } return true; } @@ -339,12 +299,8 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase { if(inputDesc.GetType() != outputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: The input tensor and output tensor have mismatched types."); -#else - return false; -#endif } return true; } @@ -398,21 +354,13 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase { if(doutputDesc.GetSize() != 4) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: The output gradient tensor should be 4D."); -#else - return false; -#endif } int32_t spatial_dim_size = doutputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = dinputDesc.GetLengths(); auto output_dims = doutputDesc.GetLengths(); @@ -434,11 +382,7 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase static_cast(N), static_cast(C * P), static_cast(L)}; if(input_dims != input_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input gradient dimension"); -#else - return false; -#endif } return true; } @@ -447,13 +391,9 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase { if(dinputDesc.GetType() != doutputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW( miopenStatusBadParm, "Fold: The input gradient tensor and output gradient tensor have mismatched types."); -#else - return false; -#endif } return true; }
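Note: to make these checks concrete, take the unfold driver defaults: input (N, C, H, W) = (2, 5, 3, 4) with kernel_size (2, 3), stride (1, 1), padding (0, 0), dilation (1, 1). Then P = 2 * 3 = 6, l_h = (3 - 1 - 1) / 1 + 1 = 2 and l_w = (4 - 2 - 1) / 1 + 1 = 2, hence L = 4, so the only output descriptor UnfoldFwdProblemDescription accepts is (N, C * P, L) = (2, 30, 4). After this change an ill-shaped descriptor fails with miopenStatusBadParm in every build configuration, where release builds previously just returned false silently.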
From f3dea16ca398ffa36db95f75d5ef000ad2b74721 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 08:36:42 +0000 Subject: [PATCH 16/46] add more tensor_layout_t constructors and update kernel to use get_tensor_view_idx --- src/kernels/MIOpenUnfold.cpp | 23 ++++++++--------------- src/kernels/tensor_view.hpp | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 84a272ef80..100d8d7c42 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -76,14 +76,11 @@ __device__ void unfoldForward4D(const DTYPE* __restrict__ input, DTYPE x = 0; if(0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + - input_tv.stride[0] * n; - x = input[input_idx]; + tensor_layout_t<4> input_layout(input_tv, n, c, h, w); + x = input[input_tv.get_tensor_view_idx(input_layout)]; } - - long output_idx = - output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - output[output_idx] = x; + tensor_layout_t<3> output_layout(output_tv, n, c * P + p, l); + output[output_tv.get_tensor_view_idx(output_layout)] = x; } extern "C" __global__ void UnfoldForward4D(const FLOAT* __restrict__ input, @@ -182,16 +179,12 @@ __device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, continue; if(lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + - output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + - output_grad_tv.stride[0] * n; - sum += CVT_FLOAT2ACCUM(output_grad[output_grad_idx]); + tensor_layout_t<3> output_grad_layout(output_grad_tv, n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); + sum += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); } } - - long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + - input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; - input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); + tensor_layout_t<4> input_grad_layout(input_grad_tv, n, c, h , w); + input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); } extern "C" __global__ void UnfoldBackward4D(const FLOAT* __restrict__ output_grad, diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..b62bb5ef33 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,40 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + } + + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + uint64_t layout[N]; };
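Note: a small usage sketch of the constructors added above. offset_of is a hypothetical helper, not part of the patch, and get_tensor_view_idx is assumed to return the linear element offset exactly as the rewritten kernel code uses it.

// Addressing element (n, c, h, w) of a strided 4D view; equivalent to the
// hand-written arithmetic the kernel hunks above delete:
//   tv.stride[0] * n + tv.stride[1] * c + tv.stride[2] * h + tv.stride[3] * w
__device__ uint64_t offset_of(const tensor_view_t<4>& tv,
                              uint64_t n, uint64_t c, uint64_t h, uint64_t w)
{
    tensor_layout_t<4> where(tv, n, c, h, w); // resolves to the N == 4 constructor
    return tv.get_tensor_view_idx(where);
}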
From 4a832966d63c9699f2751354aae0e1e16b4c5147 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 08:37:34 +0000 Subject: [PATCH 17/46] githook format --- src/kernels/MIOpenUnfold.cpp | 8 +++++--- src/kernels/tensor_view.hpp | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 100d8d7c42..24eebcc80f 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -179,11 +179,13 @@ __device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, continue; if(lw < 0 || LW <= lw) continue; - tensor_layout_t<3> output_grad_layout(output_grad_tv, n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); - sum += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); + tensor_layout_t<3> output_grad_layout( + output_grad_tv, n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); + sum += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); } } - tensor_layout_t<4> input_grad_layout(input_grad_tv, n, c, h , w); + tensor_layout_t<4> input_grad_layout(input_grad_tv, n, c, h, w); input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index b62bb5ef33..abaa052142 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,7 +72,12 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + constexpr tensor_layout_t(const tensor_view_t& tensor_view, + uint64_t n, + uint64_t c, + uint64_t d, + uint64_t h, + uint64_t w) { static_assert(N == 5); layout[0] = n; @@ -82,7 +87,8 @@ struct tensor_layout_t layout[4] = w; } - constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t h, uint64_t w) + constexpr tensor_layout_t( + const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t h, uint64_t w) { static_assert(N == 4); layout[0] = n; @@ -91,14 +97,17 @@ struct tensor_layout_t layout[3] = w; } - constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t h, uint64_t w) + constexpr tensor_layout_t(const tensor_view_t& tensor_view, + uint64_t n, + uint64_t h, + uint64_t w) { static_assert(N == 3); layout[0] = n; layout[1] = h; layout[2] = w; } - + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t w) { static_assert(N == 2); From 299117b4e20374a40ffc03bd0aba9a0c4e764903 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 08:39:49 +0000 Subject: [PATCH 18/46] remove {} --- src/solver/fold/unfold_forward.cpp | 56 ++++++++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 68402e6a2c..be8692f111 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -75,35 +75,33 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con ls.push_back(l); } - { - auto kernel = KernelInfo{}; - kernel.kernel_file = "MIOpenUnfold.cpp"; - kernel.kernel_name = 
"UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { From e76095e8fc1771a82978af13018b538b396877d1 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 08:43:14 +0000 Subject: [PATCH 19/46] update code as comments --- docs/reference/index.rst | 2 +- driver/CMakeLists.txt | 2 +- driver/fold_driver.hpp | 6 ++-- driver/unfold_driver.hpp | 6 ++-- src/CMakeLists.txt | 4 +-- src/fold/problem_description.cpp | 8 ----- src/solver/fold/fold_backward.cpp | 56 +++++++++++++++---------------- test/gtest/fold.cpp | 32 ++++++++++++++---- test/gtest/fold.hpp | 3 -- test/gtest/unfold.cpp | 30 +++++++++++++---- 10 files changed, 87 insertions(+), 62 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c6ab9521b7..cd1cfee6d2 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -32,4 +32,4 @@ The MIOpen API library is structured as follows: * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`Argmax<./argmax>` (experimental) - * :doc:`Fold <./fold>` (experimental) + * :doc:`Fold <./group__fold>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index c115cf435f..a51a8ec859 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -51,8 +51,8 @@ add_executable(MIOpenDriver dm_softmax.cpp dm_sum.cpp dm_tensorop.cpp - dm_unfold.cpp dm_fold.cpp + dm_unfold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp index 1468a77e0d..c034beeaee 100644 --- a/driver/fold_driver.hpp +++ b/driver/fold_driver.hpp @@ -133,7 +133,7 @@ int FoldDriver::ParseCmdLineArgs(int argc, char* argv[]) template int FoldDriver::GetandSetData() { - std::vector input_length = GetTensorLengthsFromCmdLine(); + std::vector input_length = inflags.GetValueTensor("DimLengths").lengths; output_size = GetVectorInt32tFromCmdLine("outputSize"); kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); @@ -161,8 +161,8 @@ int FoldDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", 'F', "1", "Run Fold Forward (Default=1) or both Forward and Backward (0)", "int"); - inflags.AddInputFlag( - "DimLengths", 'D', "3,12,12", "The dimensional lengths of the input tensor", "string"); + inflags.AddTensorFlag( + "DimLengths", 'D', "3x12x12", "The dimensional lengths of the input tensor"); inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=2,3)", "str"); inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride 
(Default=1,1)", "str"); diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index e9a3665eaa..8e4be26f3f 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -132,7 +132,7 @@ int UnfoldDriver::ParseCmdLineArgs(int argc, char* argv[]) template int UnfoldDriver::GetandSetData() { - std::vector input_length = GetTensorLengthsFromCmdLine(); + std::vector input_length = inflags.GetValueTensor("DimLengths").lengths; kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); stride = GetVectorInt32tFromCmdLine("stride"); @@ -167,8 +167,8 @@ int UnfoldDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); - inflags.AddInputFlag( - "DimLengths", 'D', "2,5,3,4", "The dimensional lengths of the input tensor", "string"); + inflags.AddTensorFlag( + "DimLengths", 'D', "2x5x3x4", "The dimensional lengths of the input tensor"); inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bd057795a3..6ed6638122 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -258,10 +258,10 @@ set( MIOpen_Source solver/conv_winoRxS.cpp solver/conv_winoRxS_fused.cpp solver/fft.cpp - solver/fold/unfold_forward.cpp - solver/fold/unfold_backward.cpp solver/fold/fold_forward.cpp solver/fold/fold_backward.cpp + solver/fold/unfold_forward.cpp + solver/fold/unfold_backward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index ce34de1a16..7cf628c170 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -36,7 +36,6 @@ namespace fold { NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = inputDesc.GetType(); - auto output_dtype = outputDesc.GetType(); auto size = inputDesc.GetElementSize(); auto in_dims = inputDesc.GetLengths(); @@ -44,7 +43,6 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const ss << "Unfold_fwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_dims"; for(auto val : in_dims) @@ -62,7 +60,6 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = dinputDesc.GetType(); - auto output_dtype = doutputDesc.GetType(); auto size = dinputDesc.GetElementSize(); auto in_dims = dinputDesc.GetLengths(); @@ -70,7 +67,6 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const ss << "Unfold_bwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_grad_dims"; for(auto val : in_dims) @@ -88,7 +84,6 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = inputDesc.GetType(); - auto output_dtype = outputDesc.GetType(); auto size = inputDesc.GetElementSize(); auto in_dims = inputDesc.GetLengths(); auto out_dims = outputDesc.GetLengths(); @@ -97,7 +92,6 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const ss << "Fold_fwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_dims"; for(auto val : in_dims) @@ -120,7 +114,6 @@ 
NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = dinputDesc.GetType(); - auto output_dtype = doutputDesc.GetType(); auto size = dinputDesc.GetElementSize(); auto in_dims = dinputDesc.GetLengths(); auto out_dims = doutputDesc.GetLengths(); @@ -129,7 +122,6 @@ NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const ss << "Fold_bwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_grad_dims"; for(auto val : in_dims) diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 1fe957d408..d07362ace3 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -75,35 +75,33 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte ls.push_back(l); } - { - auto kernel = KernelInfo{}; - kernel.kernel_file = "MIOpenUnfold.cpp"; - kernel.kernel_name = "UnfoldForward4D"; - - const auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, - {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, - {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, - {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - }; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - - size_t xlocalsize = LOCAL_SIZE; - size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); - size_t ylocalsize = 1; - size_t ygridsize = 1; - size_t zlocalsize = 1; - size_t zgridsize = 1; - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 6bd24d931f..5d96199515 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -29,9 +29,21 @@ #include "tensor_holder.hpp" #include +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace fold { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + struct FoldForwardTestFloat32 : FoldFwdTest { }; @@ -56,10 +68,13 @@ struct FoldBackwardTestBFloat16 : FoldBwdTest { }; }; // namespace fold + using namespace 
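Note: FoldBwd above deliberately builds its kernel from MIOpenUnfold.cpp with kernel_name UnfoldForward4D — fold backward is computed by running unfold forward on the incoming gradient. This reflects the adjoint relationship between the two operators, sketched here informally:

// Fold sums overlapping patches back into an image; unfold extracts them.
// Mathematically (not a statement about which kernel each solver launches):
//   grad_input of Fold(x)   == UnfoldForward(grad_output)
//   grad_input of Unfold(x) == FoldForward(grad_output)   // overlaps summed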
fold; + TEST_P(FoldForwardTestFloat32, FoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -76,7 +91,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestFloat16, FoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -93,7 +109,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestBFloat16, FoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -110,7 +127,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -127,7 +145,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -144,7 +163,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 7e71c5ce2f..0f9f9e040b 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -30,9 +30,6 @@ #include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" -#include -#include -#include #include #include #include diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index b97c96d567..22a67c4657 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -29,9 +29,21 @@ #include "tensor_holder.hpp" #include +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace unfold { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + struct UnfoldForwardTestFloat32 : UnfoldFwdTest { }; @@ -59,7 +71,8 @@ struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest using namespace unfold; TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -76,7 +89,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -93,7 +107,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == 
"--float"))) { RunTest(); Verify(); @@ -110,7 +125,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -127,7 +143,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -144,7 +161,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); From 526f7728430b21ba072e37c218d67b2ebef49126 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 08:44:04 +0000 Subject: [PATCH 20/46] githook format --- src/fold/problem_description.cpp | 28 ++++++++++++++-------------- test/gtest/fold.cpp | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index 7cf628c170..a59b460ed7 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -35,9 +35,9 @@ namespace fold { NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = inputDesc.GetType(); - auto size = inputDesc.GetElementSize(); - auto in_dims = inputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); std::ostringstream ss; @@ -59,9 +59,9 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = dinputDesc.GetType(); - auto size = dinputDesc.GetElementSize(); - auto in_dims = dinputDesc.GetLengths(); + auto input_dtype = dinputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); std::ostringstream ss; @@ -83,10 +83,10 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = inputDesc.GetType(); - auto size = inputDesc.GetElementSize(); - auto in_dims = inputDesc.GetLengths(); - auto out_dims = outputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); std::ostringstream ss; @@ -113,10 +113,10 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = dinputDesc.GetType(); - auto size = dinputDesc.GetElementSize(); - auto in_dims = dinputDesc.GetLengths(); - auto out_dims = doutputDesc.GetLengths(); + auto input_dtype = dinputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); std::ostringstream ss; diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 5d96199515..7368c6920e 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -33,7 +33,7 @@ 
MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace fold { - + std::string GetFloatArg() { const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); From 2a3d2b0565d0aec02064ef1c9c94373ad66def20 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 08:46:15 +0000 Subject: [PATCH 21/46] cpu_fold -> cpu_unfold --- test/{cpu_fold.hpp => cpu_unfold.hpp} | 0 test/gtest/fold.hpp | 2 +- test/gtest/unfold.hpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename test/{cpu_fold.hpp => cpu_unfold.hpp} (100%) diff --git a/test/cpu_fold.hpp b/test/cpu_unfold.hpp similarity index 100% rename from test/cpu_fold.hpp rename to test/cpu_unfold.hpp diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 0f9f9e040b..a92b09d6a7 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ #include "../driver/tensor_driver.hpp" -#include "cpu_fold.hpp" +#include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" #include "random.hpp" diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 3a839024b2..51d0c9cf30 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ #include "../driver/tensor_driver.hpp" -#include "cpu_fold.hpp" +#include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" #include "random.hpp" From 27e26c35b5d2fedee5bee62202e300f4d2665d40 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 09:08:46 +0000 Subject: [PATCH 22/46] update code as comments --- driver/mloUnfoldHost.hpp | 6 +++--- test/cpu_unfold.hpp | 6 +++--- test/gtest/fold.hpp | 4 +++- test/gtest/unfold.hpp | 4 +++- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index fcfd5f4a6b..6178946bd1 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -64,7 +64,7 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, L *= l; ls.push_back(l); } - [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -72,7 +72,7 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, int32_t padding_w = padding[1]; int32_t dilation_h = dilation[0]; int32_t dilation_w = dilation[1]; - [[maybe_unused]] int32_t LH = ls[0]; + int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -124,7 +124,7 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, int spatial_dim_size = input_size - 2; const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - [[maybe_unused]] int32_t P = 1, L = 1; + int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp index 373cc30917..6a3b5f3d2e 100644 --- a/test/cpu_unfold.hpp +++ b/test/cpu_unfold.hpp @@ -68,7 +68,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, ls.push_back(l); } - [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -76,7 +76,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t padding_w = padding[1]; int32_t 
dilation_h = dilation[0]; int32_t dilation_w = dilation[1]; - [[maybe_unused]] int32_t LH = ls[0]; + int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -129,7 +129,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - [[maybe_unused]] int32_t P = 1, L = 1; + int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index a92b09d6a7..02d9e42e17 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -113,7 +113,8 @@ struct FoldTestCase }; std::vector FoldTestConfigs() -{ // n c d h w padding +{ + // clang-format: off return { {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, @@ -123,6 +124,7 @@ std::vector FoldTestConfigs() {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, }; + // clang-format: on } template diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 51d0c9cf30..4bb790b1f5 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -112,7 +112,8 @@ struct UnfoldTestCase }; std::vector UnfoldTestConfigs() -{ // n c d h w padding +{ + // clang-format: off return { {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true}, {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true}, @@ -121,6 +122,7 @@ std::vector UnfoldTestConfigs() {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, }; + // clang-format: on } template From 366e350ebfef0c3722f81a8e8a400648454628ab Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 09:09:33 +0000 Subject: [PATCH 23/46] githook format --- driver/mloUnfoldHost.hpp | 30 +++++++++++++++--------------- test/cpu_unfold.hpp | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 6178946bd1..3334cd55fe 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -65,18 +65,18 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, ls.push_back(l); } int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; @@ -120,10 +120,10 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); auto input_size = 
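To make the ls computation in these hosts concrete, here is the count worked through for the first entry of UnfoldTestConfigs above:

// Per spatial dim: l = (size + 2*padding - dilation*(kernel-1) - 1)/stride + 1.
// For (N,C,H,W) = (2,5,3,4), kernel (2,3), stride (1,1), padding (0,0), dilation (1,1):
//   LH = (3 + 0 - 1*(2-1) - 1)/1 + 1 = 2
//   LW = (4 + 0 - 1*(3-1) - 1)/1 + 1 = 2,   L = LH*LW = 4,   P = kh*kw = 6
// so unfold maps (2,5,3,4) to (N, C*P, L) = (2, 30, 4), and the solvers launch
// AlignUp(N * C * P * L, LOCAL_SIZE) work-items over that flattened index space.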
miopen::deref(dinputDesc).GetSize(); - const int LOCAL_SIZE = 256; - int spatial_dim_size = input_size - 2; - const int32_t N = static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp index 6a3b5f3d2e..46f7552083 100644 --- a/test/cpu_unfold.hpp +++ b/test/cpu_unfold.hpp @@ -69,18 +69,18 @@ void cpu_unfold_fwd_4d(tensor input_tensor, } int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; From 2b3bd1f1d8602274aefdfbcb501b9751d4ccf806 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 24 Jul 2024 09:26:07 +0000 Subject: [PATCH 24/46] githook format --- driver/driver.hpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 4b1c831ad9..a22e10e572 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,14 +169,15 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " - "fold[bfp16|fp16]\n"); + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " + "pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,9 +207,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adam" && arg != "adamfp16" && arg != "ampadam" && 
arg != "reduceextreme" && arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && - arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "unfold" && arg != "unfoldfp16" && - arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && - arg != "--version") + arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "unfold" && + arg != "unfoldfp16" && arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && + arg != "foldbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); From 5c506b4269a16b480645c5c917d3df70d9dc6de7 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 29 Jul 2024 08:26:20 +0000 Subject: [PATCH 25/46] Update gtest code --- test/cpu_unfold.hpp | 9 +++------ test/gtest/fold.cpp | 26 +++++++++++++------------- test/gtest/unfold.cpp | 26 +++++++++++++------------- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp index 46f7552083..201917d4e0 100644 --- a/test/cpu_unfold.hpp +++ b/test/cpu_unfold.hpp @@ -43,7 +43,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, { auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); - auto input_size = input_tensor.desc.GetSize(); + auto input_size = input_tensor.desc.GetNumDims(); auto input_dims = input_tensor.desc.GetLengths(); auto input = input_tensor.data.data(); @@ -68,7 +68,6 @@ void cpu_unfold_fwd_4d(tensor input_tensor, ls.push_back(l); } - int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -76,7 +75,6 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t padding_w = padding[1]; int32_t dilation_h = dilation[0]; int32_t dilation_w = dilation[1]; - int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -117,7 +115,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, { auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); - auto input_size = ref_dinput_tensor.desc.GetSize(); + auto input_size = ref_dinput_tensor.desc.GetNumDims(); auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); auto input_grad = ref_dinput_tensor.data.data(); @@ -129,7 +127,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - int32_t P = 1, L = 1; + int32_t P = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { @@ -138,7 +136,6 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; - L *= l; ls.push_back(l); } diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 7368c6920e..15ae2572c4 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -36,7 +36,7 @@ namespace fold { std::string GetFloatArg() { - const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); if(tmp.empty()) { return ""; @@ -73,8 +73,8 @@ using namespace fold; TEST_P(FoldForwardTestFloat32, FoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + 
if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -91,8 +91,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestFloat16, FoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -109,8 +109,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestBFloat16, FoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); @@ -127,8 +127,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -145,8 +145,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -163,8 +163,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index 22a67c4657..dc7cbdce38 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -36,7 +36,7 @@ namespace unfold { std::string GetFloatArg() { - const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); if(tmp.empty()) { return ""; @@ -71,8 +71,8 @@ struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest using namespace unfold; TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -89,8 +89,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -107,8 +107,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); @@ -125,8 +125,8 @@ 
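Note on the gating introduced here: with MIOPEN_TEST_ALL unset every suite runs, while under a full MIOPEN_TEST_ALL=1 sweep each suite runs only when MIOPEN_TEST_FLOAT_ARG matches its dtype (--float, --half, --bfloat16), so one sweep exercises each precision exactly once. An illustrative invocation (the test binary name is an assumption):

MIOPEN_TEST_ALL=1 MIOPEN_TEST_FLOAT_ARG=--half ./bin/test_unfold --gtest_filter='*Unfold*'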
INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -143,8 +143,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -161,8 +161,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); From e2631d8bc488bcfee0d3af508075fcea6afb097c Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 29 Jul 2024 09:17:33 +0000 Subject: [PATCH 26/46] githook format --- driver/driver.hpp | 19 ++++++++++--------- driver/mloUnfoldHost.hpp | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 0bace22ab1..902389b977 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,13 +169,14 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " + "pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " "fold[bfp16|fp16], getitem[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -207,8 +208,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "unfold" && arg != "unfoldfp16" && arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && - arg != "foldbfp16" && arg != "getitem" && - arg != "getitemfp16" && arg != "getitembfp16" && arg != "--version") + arg != "foldbfp16" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 
f2e1d3f6e6..6204ea2bd9 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -121,7 +121,7 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, int spatial_dim_size = input_size - 2; const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - int32_t P = 1; + int32_t P = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { From 7bb583b51b954f1b65f1177edea21e9178ece556 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 31 Jul 2024 08:28:38 +0000 Subject: [PATCH 27/46] git hook format --- src/kernels/tensor_view.hpp | 10 ++-------- src/solver/fold/fold_backward.cpp | 5 +++-- src/solver/fold/fold_forward.cpp | 5 +++-- src/solver/fold/unfold_backward.cpp | 5 +++-- src/solver/fold/unfold_forward.cpp | 5 +++-- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 1e6491fadf..1b29099c2b 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,11 +72,7 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, - uint64_t c, - uint64_t d, - uint64_t h, - uint64_t w) + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) { static_assert(N == 5); layout[0] = n; @@ -95,9 +91,7 @@ struct tensor_layout_t layout[3] = w; } - constexpr tensor_layout_t(uint64_t n, - uint64_t h, - uint64_t w) + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) { static_assert(N == 3); layout[0] = n; diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 6d0e0ab021..b952e5375c 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -42,8 +42,9 @@ namespace solver { namespace fold { -bool FoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const +bool FoldBwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const { return true; } diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 63f6130761..17fb11180c 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -43,8 +43,9 @@ namespace solver { namespace fold { -bool FoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const +bool FoldFwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const { return true; } diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 02b44a5339..da11969c64 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -42,8 +42,9 @@ namespace solver { namespace fold { -bool UnfoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const +bool UnfoldBwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const { return true; } diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index d0e3f53e8c..54e39809d6 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -42,8 +42,9 @@ namespace 
solver { namespace fold { -bool UnfoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const +bool UnfoldFwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const { return true; } From a6256e752b3cdf4372c5a6473cb17c897e57bf70 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 31 Jul 2024 10:42:15 +0000 Subject: [PATCH 28/46] add MIOPEN_INTERNALS_EXPORT --- src/include/miopen/fold.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 25f4aefa8d..454e067c5e 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -23,8 +23,8 @@ * SOFTWARE. * *******************************************************************************/ -#ifndef MIOPEN_INSTANCE_NORM_HPP_ -#define MIOPEN_INSTANCE_NORM_HPP_ +#ifndef MIOPEN_FOLD_HPP_ +#define MIOPEN_FOLD_HPP_ #include @@ -33,7 +33,7 @@ namespace miopen { struct Handle; struct TensorDescriptor; -miopenStatus_t UnfoldForward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, const TensorDescriptor& outputDesc, @@ -47,7 +47,7 @@ miopenStatus_t UnfoldForward(Handle& handle, const int32_t* dilation, int32_t dilation_size); -miopenStatus_t UnfoldBackward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, Data_t dinput, const TensorDescriptor& doutputDesc, @@ -61,7 +61,7 @@ miopenStatus_t UnfoldBackward(Handle& handle, const int32_t* dilation, int32_t dilation_size); -miopenStatus_t FoldForward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, const TensorDescriptor& outputDesc, @@ -75,7 +75,7 @@ miopenStatus_t FoldForward(Handle& handle, const int32_t* dilation, int32_t dilation_size); -miopenStatus_t FoldBackward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, Data_t dinput, const TensorDescriptor& doutputDesc, @@ -89,4 +89,4 @@ miopenStatus_t FoldBackward(Handle& handle, const int32_t* dilation, int32_t dilation_size); } // namespace miopen -#endif // MIOPEN_INSTANCE_NORM_HPP_ +#endif // MIOPEN_FOLD_HPP_ From d359c1f64946135113709e8d0c6ba8665e50cb71 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 31 Jul 2024 10:42:37 +0000 Subject: [PATCH 29/46] githook format --- src/include/miopen/fold.hpp | 96 ++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 454e067c5e..9a8b46bb56 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -34,59 +34,59 @@ struct Handle; struct TensorDescriptor; MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* 
kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); } // namespace miopen #endif // MIOPEN_FOLD_HPP_ From 1652dc4def449cba1bc4b3c28adb6574b4175a84 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 02:58:25 +0000 Subject: [PATCH 30/46] resolve conflict --- driver/driver.hpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 4b328682e4..8307602c79 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,25 +169,13 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); -<<<<<<< HEAD - printf( - "Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " - "fold[bfp16|fp16], getitem[bfp16|fp16]\n"); -======= 
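Given the exported declarations above, a host-side call might look like the following sketch (handle, descriptor, and buffer setup omitted; all names besides the API itself are placeholders):

// Sketch only: drive unfold forward through the internal C++ API of fold.hpp.
// Assumes <vector> is included and handle/inputDesc/input/outputDesc/output exist.
std::vector<int32_t> kernel_size{2, 3}, stride{1, 1}, padding{0, 0}, dilation{1, 1};
miopen::UnfoldForward(handle,
                      inputDesc,  input,   // (N, C, H, W)
                      outputDesc, output,  // (N, C*kh*kw, L)
                      kernel_size.data(), 2,
                      stride.data(),      2,
                      padding.data(),     2,
                      dilation.data(),    2);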
printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); ->>>>>>> origin + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } From 1f6e4a2762ddceb0c5a57d512b7ad2aaaff6aa45 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 02:59:15 +0000 Subject: [PATCH 31/46] githook format --- driver/driver.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 8307602c79..ddb6667323 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } From 2f9bce7af944b3ca5b03f565aeb69dbf1caae5fe Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 04:56:36 +0000 Subject: [PATCH 32/46] fix git merge dup --- src/solver.cpp | 30 ++++++------------------------ test/gtest/fold.cpp | 2 -- test/gtest/unfold.cpp | 2 -- 3 files changed, 6 insertions(+), 28 deletions(-) diff --git a/src/solver.cpp b/src/solver.cpp index 6839a5f122..47686cd789 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -654,11 +654,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); - Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); - Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); - Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); - Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId()); - + Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); @@ -676,25 +672,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Primitive::Fusion, fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); - - Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); - Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); - Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); - - Register(registry, ++id, Primitive::Mha, mha::MhaForward{}.SolverDbId()); - Register(registry, ++id, Primitive::Mha, mha::MhaBackward{}.SolverDbId()); - - Register(registry, ++id, Primitive::Cat, cat::CatForward{}.SolverDbId()); - Register(registry, ++id, 
Primitive::Adam, adam::Adam{}.SolverDbId()); - Register(registry, ++id, Primitive::Item, getitem::GetitemBackward{}.SolverDbId()); - - Register(registry, ++id, Primitive::Adam, adam::TransformersAdamW{}.SolverDbId()); - - Register(registry, - ++id, - Primitive::Fusion, - fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), - miopenConvolutionAlgoWinograd); + + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 15ae2572c4..a07d0ea8d8 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -25,8 +25,6 @@ *******************************************************************************/ #include "fold.hpp" -#include "miopen/bfloat16.hpp" -#include "tensor_holder.hpp" #include MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index dc7cbdce38..0c523b1b7f 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -25,8 +25,6 @@ *******************************************************************************/ #include "unfold.hpp" -#include "miopen/bfloat16.hpp" -#include "tensor_holder.hpp" #include MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) From fa6f15a010a1168f31cf6b1f12c291dce2ae8160 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 04:57:15 +0000 Subject: [PATCH 33/46] githook format --- src/solver.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver.cpp b/src/solver.cpp index 47686cd789..282a0930b3 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -654,7 +654,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); - + Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); @@ -672,7 +672,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Primitive::Fusion, fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); - + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); From 45ed5c12ab5ceb7c1ccfe4565d953ed4176a8603 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 03:47:35 +0000 Subject: [PATCH 34/46] update tensor_view and kernel code --- src/include/miopen/tensor_view_utils.hpp | 2 +- src/kernels/MIOpenUnfold.cpp | 8 ++--- src/kernels/tensor_view.hpp | 38 +++++------------------- 3 files changed, 13 insertions(+), 35 deletions(-) diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 4a7c0b51ad..4ac9196a8d 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -38,7 +38,7 @@ inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) auto dims = Desc.GetLengths(); auto strides = Desc.GetStrides(); - 
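Note: moving the fold/unfold registrations to the tail of SolverRegistrar is not cosmetic. Solver ids are assigned by the ++id running counter in registration order, so inserting entries mid-function would renumber every solver registered after them and invalidate previously stored ids — hence the "New solvers should be added to the end of the function!" rule stated above.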
tensor_view_t tensor_view; + tensor_view_t tensor_view{}; for(size_t i = 0; i < N; ++i) { if(i < dims.size()) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index a1c8cfd9f4..0e3f33e2f6 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -76,10 +76,10 @@ __device__ void unfoldForward4D(const DTYPE* __restrict__ input, DTYPE x = 0; if(0 <= h && h < H && 0 <= w && w < W) { - tensor_layout_t<4> input_layout(n, c, h, w); + tensor_layout_t<4> input_layout({n, c, h, w}); x = input[input_tv.get_tensor_view_idx(input_layout)]; } - tensor_layout_t<3> output_layout(n, c * P + p, l); + tensor_layout_t<3> output_layout({n, c * P + p, l}); output[output_tv.get_tensor_view_idx(output_layout)] = x; } @@ -180,12 +180,12 @@ __device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, if(lw < 0 || LW <= lw) continue; tensor_layout_t<3> output_grad_layout( - n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); + {n, c * P + (ph * kernel_size_w + pw), lh * LW + lw}); sum += CVT_FLOAT2ACCUM( output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); } } - tensor_layout_t<4> input_grad_layout(n, c, h, w); + tensor_layout_t<4> input_grad_layout({n, c, h, w}); input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 1b29099c2b..ecc075ac12 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -72,38 +74,14 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) - { - static_assert(N == 5); - layout[0] = n; - layout[1] = c; - layout[2] = d; - layout[3] = h; - layout[4] = w; - } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + constexpr tensor_layout_t(std::initializer_list layout_) { - static_assert(N == 4); - layout[0] = n; - layout[1] = c; - layout[2] = h; - layout[3] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) - { - static_assert(N == 3); - layout[0] = n; - layout[1] = h; - layout[2] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t w) - { - static_assert(N == 2); - layout[0] = n; - layout[1] = w; + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } } uint64_t layout[N]; From ba2020be6cb6b7cb0868e77adab34d72844ec29a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 03:48:02 +0000 Subject: [PATCH 35/46] githook format --- src/kernels/tensor_view.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index ecc075ac12..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -74,7 +74,6 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(std::initializer_list layout_) { static_assert(N > 0); From 87edcbd014915137ed737f784819e4a80b782e3b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 08:48:51 +0000 Subject: [PATCH 36/46] remove duplicate miopen ops and update doc --- docs/reference/index.rst | 2 +- include/miopen/miopen.h | 701 --------------------------------------- 2 files changed, 1 insertion(+), 702 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index be56db13f9..a4cc9470a1 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -33,6 +33,6 @@ The 
MIOpen API library is structured as follows: * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`SGD <../doxygen/html/group___s_g_d>` (experimental) * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) - * :doc:`Fold <./group__fold>` (experimental) + * :doc:`Fold <./group___f_o_l_d>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ee3585cbf7..7eab6a77c5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7621,707 +7621,6 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API -#ifdef MIOPEN_BETA_API -// GetItem APIs -/** @addtogroup getitem - * - * @{ - */ -/*! @brief Helper function to query the minimum workspace size required by the getitem call - * - * @param [in] handle MIOpen Handle - * @param [in] indexCount Number of input tensor indexs - * @param [in] indexDescs Tensor descriptor of input tensor indexs - * @param [out] sizeInBytes Pointer to data to return the minimum workspace size - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenGetGetitemWorkspaceSize(miopenHandle_t handle, - uint32_t indexCount, - const miopenTensorDescriptor_t* indexDescs, - size_t* sizeInBytes); - -/*! @brief Execute a getitem backward layer - * - * Backward of getitem for tensor indexing, slicing, masking. - * - * @param [in] handle MIOpen handle - * @param [in] workspace Address of the allocated workspace data - * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data - * @param [in] dyDesc Tensor descriptor of input tensor dy - * @param [in] dy Source data tensor dy - * @param [in] indexCount Number of input tensor indexs - * @param [in] indexDescs Tensor descriptor of input tensor indexs(All indexs same - * size) - * @param [in] indexs Source data tensor indexs - * @param [in] dxDesc Tensor descriptor of output tensor dx - * @param [out] dx Data tensor dx(It must be initialized to 0) - * @param [in] errorDesc Tensor descriptor of output tensor error - * @param [out] error Data tensor error(It must be initialized to 0) - * @param [in] dimCount Number of dimensions - * @param [in] dims Dimensions - * @param [in] sliceCount Number of slices - * @param [in] slices Slices - * @param [in] offset Offset of output tensor dx - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, - void* workspace, - size_t workspaceSizeInBytes, - const miopenTensorDescriptor_t dyDesc, - const void* dy, - uint32_t indexCount, - const miopenTensorDescriptor_t* indexDescs, - const void* const* indexs, - const miopenTensorDescriptor_t dxDesc, - void* dx, - const miopenTensorDescriptor_t errorDesc, - void* error, - uint32_t dimCount, - const int32_t* dims, - uint32_t sliceCount, - const int32_t* slices, - uint32_t offset); - -/** @} */ -// CLOSEOUT GETITEM DOXYGEN GROUP -#endif // MIOPEN_BETA_API - -#ifdef MIOPEN_BETA_API -// FusedAdam APIs -/** @addtogroup SGD - * - * @{ - */ -/*! @brief Perform Fused Adam optimization for a single tensor (Adaptive Moment Estimation). - * - * This function implements the Fused Adam optimization algorithm. Adam, short for Adaptive Moment - * Estimation, extends the RMSProp optimizer. 
It combines the advantages of AdaGrad and RMSProp by - * adaptively adjusting learning rates for each parameter using the first and second moments of - * gradients. Fused Adam optimization efficiently combines multiple operations into a single kernel, - * reducing memory access overhead and improving performance. - * - * Additionally, Fused Adam can be utilized in both adam w and Automatic Mixed Precision (AMP), - * enabling accelerated model training and reduced memory consumption. AMP supports FP16 - * computation, optimizing model calculations using a mixture of FP32 and FP16 precision to enhance - * training speed. When utilizing AMP, FoundInf, ScaleGrad, and step tensors should be employed. In - * AMP mode, the execution of Adam is determined based on the FoundInf value. State Step accepts - * both int values and int tensors. If a Step tensor is employed, the step received as an int is - * disregarded, and if Adam is executed, the step tensor is incremented by 1. - * - * @code - * // Execute Adam - * miopenFusedAdam(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused maxExpAvgSqDesc because amsgrad is false - * NULL, - * NULL, // Unused stateStep Tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute AdamW - * miopenFusedAdam(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused maxExpAvgSqDesc because amsgrad is false - * NULL, - * NULL, // Unused stateStep Tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * true, // adamw - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute AMP Adam - * miopenFusedAdam(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused maxExpAvgSqDesc because amsgrad is false - * NULL, - * stateStepDesc, - * stateStep, - * -1, // Ignore step value because stateStep Tensor is used - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * gradScaleDesc, - * gradScale, - * foundInfDesc, - * foundInf); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramDesc Tensor descriptor for the input parameter tensor (input) - * @param param Input parameter tensor (input) - * @param gradDesc Tensor descriptor for the input gradient tensor (input) - * @param grad Input gradient tensor (input) - * @param expAvgDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvg Input exponential moving average tensor (input) - * @param expAvgSqDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSq Input exponential moving average squared tensor (input) - * @param maxExpAvgSqDesc Tensor descriptor for the input maximum exponential moving average - * squared tensor. Used when amsgrad is true (input, optional) - * @param maxExpAvgSq Input maximum exponential moving average squared tensor. 
Used when - * amsgrad is true (input, optional) - * @param stateStepDesc Tensor descriptor for the input state step tensor (input) - * @param stateStep Input state step tensor (input) - * @param state_step Input state step. used when the step tensor is null (input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input) - * @param maximize Flag indicating whether to maximize the objective with respect to the - * parameters (input) - * @param adamw If true, the operation becomes AdamW (input) - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating the presence of inf or NaN in gradients. If true, - * skips operation and step update (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenFusedAdam(miopenHandle_t handle, - const miopenTensorDescriptor_t paramDesc, - void* param, - const miopenTensorDescriptor_t gradDesc, - const void* grad, - const miopenTensorDescriptor_t expAvgDesc, - void* expAvg, - const miopenTensorDescriptor_t expAvgSqDesc, - void* expAvgSq, - const miopenTensorDescriptor_t maxExpAvgSqDesc, - void* maxExpAvgSq, - const miopenTensorDescriptor_t stateStepDesc, - void* stateStep, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const bool amsgrad, - const bool maximize, - const bool adamw, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/*! @brief Execute single tensor Adam optimization and receive the result in a separate output - * tensor. - * - * This function is equivalent to miopenFusedAdam but receives the result in a separate output - * tensor. 
- * @see miopenFusedAdam - * - * @code - * // Execute Adam - * miopenFusedAdamWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * NULL, // Unused paramOutFloat16 tensor because is not amp - * NULL, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * NULL, // Unused maxExpAvgSqIn tensor because amsgrad is false - * NULL, - * NULL, // Unused maxExpAvgSqOut tensor because amsgrad is false - * NULL, - * NULL, // Unused stateStepIn tensor because use step integer argument - * NULL, - * NULL, // Unused stateStepOut tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute Amp Adam - * miopenFusedAdamWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * paramOutFloat16Desc, // paramOutFloat16 tensor is optional in amp - * paramOutFloat16, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * NULL, // Unused maxExpAvgSqIn tensor because amsgrad is false - * NULL, - * NULL, // Unused maxExpAvgSqOut tensor because amsgrad is false - * NULL, - * stateStepInDesc, - * stateStepIn, - * stateStepOutDesc, - * stateStepOut - * -1, // Ignore step value because stateStep Tensor is used - * lr, beta1, beta2, weight_decay, eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * gradScaleDesc, - * gradScale, - * foundInfDesc, - * foundInf); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramInDesc Tensor descriptor for the input parameter tensor (input) - * @param paramIn Input parameter tensor (input) - * @param paramOutDesc Tensor descriptor for the output parameter tensor (input) - * @param paramOut Output parameter tensor (output) - * @param paramOutFloat16Desc Tensor descriptor for the output parameter tensor float16 (input, - * optional) - * @param paramOutFloat16 Output parameter tensor (output, optional) - * @param gradInDesc Tensor descriptor for the input gradient tensor (input) - * @param gradIn Input gradient tensor (input) - * @param expAvgInDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvgIn Input exponential moving average tensor (input) - * @param expAvgOutDesc Tensor descriptor for the output exponential moving average tensor - * (input) - * @param expAvgOut Output exponential moving average tensor (output) - * @param expAvgSqInDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSqIn Input exponential moving average squared tensor (input) - * @param expAvgSqOutDesc Tensor descriptor for the output exponential moving average squared - * tensor (input) - * @param expAvgSqOut Output exponential moving average squared tensor (output) - * @param maxExpAvgSqInDesc Tensor descriptor for the input maximum exponential moving average - * squared tensor. Used when amsgrad is true (input, optional) - * @param maxExpAvgSqIn Input maximum exponential moving average squared tensor. 
Used when - * amsgrad is true (input, optional) - * @param maxExpAvgSqOutDesc Tensor descriptor for the output maximum exponential moving average - * squared tensor. Used when amsgrad is true (input, optional) - * @param maxExpAvgSqOut Output maximum exponential moving average squared tensor. Used when - * amsgrad is true (output, optional) - * @param stateStepInDesc Tensor descriptor for the input state step tensor (input, optional) - * @param stateStepIn Input state step tensor (input, optional) - * @param stateStepOutDesc Tensor descriptor for the output state step tensor (input, optional) - * @param stateStepOut Output state step tensor that stores the updated step value. (output, - * optional) - * @param state_step Input state step, It is used when the step tensor is null. (input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input) - * @param maximize Flag indicating whether to maximize the objective with respect to the - * parameters (input) - * @param adamw If it is true, the operation becomes AdamW (input) - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating presence of inf or nan in gradients. If true, skips - * operation and step update. (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenFusedAdamWithOutput(miopenHandle_t handle, - const miopenTensorDescriptor_t paramInDesc, - void* paramIn, - const miopenTensorDescriptor_t paramOutDesc, - void* paramOut, - const miopenTensorDescriptor_t paramOutFloat16Desc, - void* paramOutFloat16, - const miopenTensorDescriptor_t gradInDesc, - const void* gradIn, - const miopenTensorDescriptor_t expAvgInDesc, - void* expAvgIn, - const miopenTensorDescriptor_t expAvgOutDesc, - void* expAvgOut, - const miopenTensorDescriptor_t expAvgSqInDesc, - void* expAvgSqIn, - const miopenTensorDescriptor_t expAvgSqOutDesc, - void* expAvgSqOut, - const miopenTensorDescriptor_t maxExpAvgSqInDesc, - void* maxExpAvgSqIn, - const miopenTensorDescriptor_t maxExpAvgSqOutDesc, - void* maxExpAvgSqOut, - const miopenTensorDescriptor_t stateStepInDesc, - void* stateStepIn, - const miopenTensorDescriptor_t stateStepOutDesc, - void* stateStepOut, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const bool amsgrad, - const bool maximize, - const bool adamw, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/** @} */ -// CLOSEOUT SGD DOXYGEN GROUP -#endif // MIOPEN_BETA_API - -#ifdef MIOPEN_BETA_API -// TransformersAdamW APIs -/** @addtogroup SGD - * - * @{ - */ -/*! @brief Implements Adam algorithm with weight decay fix as introduced in - * Decoupled Weight Decay Regularization. - * This is the fused kernel version of AdamW included in the Hugging Face Transformers module. 
- * - * @see miopenFusedAdam - * - * @code - * // Execute Adam - * miopenTransformersAdamW(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused stateStep Tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * true, // correct_bias - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute AMP Adam - * miopenTransformersAdamW(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * stateStepDesc, - * stateStep, - * -1, // Ignore step value because stateStep Tensor is used - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * true, // correct_bias - * gradScaleDesc, - * gradScale, - * foundInfDesc, - * foundInf); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramDesc Tensor descriptor for the input parameter tensor (input) - * @param param Input parameter tensor (input) - * @param gradDesc Tensor descriptor for the input gradient tensor (input) - * @param grad Input gradient tensor (input) - * @param expAvgDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvg Input exponential moving average tensor (input) - * @param expAvgSqDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSq Input exponential moving average squared tensor (input) - * @param stateStepDesc Tensor descriptor for the input state step tensor (input) - * @param stateStep Input state step tensor (input) - * @param state_step Input state step. used when the step tensor is null (input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param correct_bias Whether or not to correct bias in Adam (for instance, in Bert TF - * repository they use False). - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating the presence of inf or NaN in gradients. If true, - * skips operation and step update (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenTransformersAdamW(miopenHandle_t handle, - const miopenTensorDescriptor_t paramDesc, - void* param, - const miopenTensorDescriptor_t gradDesc, - const void* grad, - const miopenTensorDescriptor_t expAvgDesc, - void* expAvg, - const miopenTensorDescriptor_t expAvgSqDesc, - void* expAvgSq, - const miopenTensorDescriptor_t stateStepDesc, - void* stateStep, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const bool correct_bias, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/*! @brief Execute single tensor Adam optimization and receive the result in a separate output - * tensor. 
- * - * This function is equivalent to miopenTransformersAdam but receives the result in a separate - * output tensor. - * @see miopenTransformersAdamW - * @see miopenFusedAdamWithOutput - * - * @code - * // Execute Adam - * miopenTransformersAdamWWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * NULL, // Unused paramOutFloat16 tensor because is not amp - * NULL, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * NULL, // Unused stateStepIn tensor because use step int - * NULL, - * NULL, // Unused stateStepOut tensor because use step int - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * -1, // step_size - * true, // correct_bias - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute Amp Adam - * miopenTransformersAdamWWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * paramOutFloat16Desc, // optional in amp - * paramOutFloat16, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * stateStepInDesc, - * stateStepIn, - * stateStepOutDesc, - * stateStepOut - * -1, // Ignore step value because stateStep Tensor is used - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * -1, // step_size - * true, // correct_bias - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramInDesc Tensor descriptor for the input parameter tensor (input) - * @param paramIn Input parameter tensor (input) - * @param paramOutDesc Tensor descriptor for the output parameter tensor (input) - * @param paramOut Output parameter tensor (output) - * @param paramOutFloat16Desc Tensor descriptor for the output parameter tensor float16 (input, - * optional) - * @param paramOutFloat16 Output parameter tensor (output, optional) - * @param gradInDesc Tensor descriptor for the input gradient tensor (input) - * @param gradIn Input gradient tensor (input) - * @param expAvgInDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvgIn Input exponential moving average tensor (input) - * @param expAvgOutDesc Tensor descriptor for the output exponential moving average tensor - * (input) - * @param expAvgOut Output exponential moving average tensor (output) - * @param expAvgSqInDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSqIn Input exponential moving average squared tensor (input) - * @param expAvgSqOutDesc Tensor descriptor for the output exponential moving average squared - * tensor (input) - * @param expAvgSqOut Output exponential moving average squared tensor (output) - * @param stateStepInDesc Tensor descriptor for the input state step tensor (input, optional) - * @param stateStepIn Input state step tensor (input, optional) - * @param stateStepOutDesc Tensor descriptor for the output state step tensor (input, optional) - * @param stateStepOut Output state step tensor that stores the updated step value. (output, - * optional) - * @param state_step Input state step, It is used when the step tensor is null. 
(input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param step_size Pre-calculated step_size, used for performance enhancement (input) - * @param correct_bias Whether or not to correct bias in Adam (for instance, in Bert TF - * repository they use False) (input) - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating presence of inf or nan in gradients. If true, skips - * operation and step update. (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenTransformersAdamWWithOutput(miopenHandle_t handle, - const miopenTensorDescriptor_t paramInDesc, - void* paramIn, - const miopenTensorDescriptor_t paramOutDesc, - void* paramOut, - const miopenTensorDescriptor_t paramOutFloat16Desc, - void* paramOutFloat16, - const miopenTensorDescriptor_t gradInDesc, - const void* gradIn, - const miopenTensorDescriptor_t expAvgInDesc, - void* expAvgIn, - const miopenTensorDescriptor_t expAvgOutDesc, - void* expAvgOut, - const miopenTensorDescriptor_t expAvgSqInDesc, - void* expAvgSqIn, - const miopenTensorDescriptor_t expAvgSqOutDesc, - void* expAvgSqOut, - const miopenTensorDescriptor_t stateStepInDesc, - void* stateStepIn, - const miopenTensorDescriptor_t stateStepOutDesc, - void* stateStepOut, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const float step_size, - const bool correct_bias, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/** @} */ -// CLOSEOUT SGD DOXYGEN GROUP -#endif // MIOPEN_BETA_API - #ifdef MIOPEN_BETA_API // Fold APIs /** @addtogroup FOLD From db7b9a8ff3ca7ef99076129f2c8fc5fbb4e04e7a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 09:33:50 +0000 Subject: [PATCH 37/46] update spacing --- include/miopen/miopen.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 7eab6a77c5..fb0f7006db 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7643,7 +7643,7 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the * neighborhood (input) * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t + * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t inputDesc, @@ -7675,7 +7675,7 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the neighborhood (input) * @param dilation_size Size of the dilation array (input) -* @return miopenStatus_t +* @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t dinputDesc, @@ -7707,7 +7707,7 @@ MIOPEN_EXPORT 
miopenStatus_t miopenFoldBackward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the * neighborhood (input) * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t + * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t inputDesc, @@ -7739,7 +7739,7 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the neighborhood (input) * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t + * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t dinputDesc, From 857db5c5bfc3775a8574e0f7cfbf429137a7f98a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 03:54:39 +0000 Subject: [PATCH 38/46] empty commit From 0da1cc6a4fc4c38b027ac64bc14ca118de2c44b5 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 21:39:56 +0000 Subject: [PATCH 39/46] update gtest syntax --- test/gtest/fold.cpp | 48 ++++++++++++++++++++--------------------- test/gtest/unfold.cpp | 50 ++++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index a07d0ea8d8..4de7f26e6a 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -42,34 +42,34 @@ std::string GetFloatArg() return tmp; } -struct FoldForwardTestFloat32 : FoldFwdTest +struct GPU_Fold_fwd_FP32 : FoldFwdTest { }; -struct FoldForwardTestFloat16 : FoldFwdTest +struct GPU_Fold_fwd_FP16 : FoldFwdTest { }; -struct FoldForwardTestBFloat16 : FoldFwdTest +struct GPU_Fold_fwd_BFP16 : FoldFwdTest { }; -struct FoldBackwardTestFloat32 : FoldBwdTest +struct GPU_Fold_bwd_FP32 : FoldBwdTest { }; -struct FoldBackwardTestFloat16 : FoldBwdTest +struct GPU_Fold_bwd_FP16 : FoldBwdTest { }; -struct FoldBackwardTestBFloat16 : FoldBwdTest +struct GPU_Fold_bwd_BFP16 : FoldBwdTest { }; }; // namespace fold using namespace fold; -TEST_P(FoldForwardTestFloat32, FoldForwardTest) +TEST_P(GPU_Fold_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -83,11 +83,11 @@ TEST_P(FoldForwardTestFloat32, FoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, - FoldForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_fwd_FP32, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldForwardTestFloat16, FoldForwardTest) +TEST_P(GPU_Fold_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -101,11 +101,11 @@ TEST_P(FoldForwardTestFloat16, FoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, - FoldForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_fwd_FP16, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldForwardTestBFloat16, FoldForwardTest) +TEST_P(GPU_Fold_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -119,11 +119,11 @@ TEST_P(FoldForwardTestBFloat16, FoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, - FoldForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_fwd_BFP16, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) +TEST_P(GPU_Fold_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || 
(env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -137,11 +137,11 @@ TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, - FoldBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_bwd_FP32, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) +TEST_P(GPU_Fold_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -155,11 +155,11 @@ TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, - FoldBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_bwd_FP16, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) +TEST_P(GPU_Fold_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -173,6 +173,6 @@ TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, - FoldBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_bwd_BFP16, testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index 0c523b1b7f..8573e1573b 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -42,32 +42,34 @@ std::string GetFloatArg() return tmp; } -struct UnfoldForwardTestFloat32 : UnfoldFwdTest +struct GPU_Unfold_fwd_FP32 : UnfoldFwdTest { }; -struct UnfoldForwardTestFloat16 : UnfoldFwdTest +struct GPU_Unfold_fwd_FP16 : UnfoldFwdTest { }; -struct UnfoldForwardTestBFloat16 : UnfoldFwdTest +struct GPU_Unfold_fwd_BFP16 : UnfoldFwdTest { }; -struct UnfoldBackwardTestFloat32 : UnfoldBwdTest +struct GPU_Unfold_bwd_FP32 : UnfoldBwdTest { }; -struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +struct GPU_Unfold_bwd_FP16 : UnfoldBwdTest { }; -struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +struct GPU_Unfold_bwd_BFP16 : UnfoldBwdTest { }; }; // namespace unfold + using namespace unfold; -TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) + +TEST_P(GPU_Unfold_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -81,11 +83,11 @@ TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_fwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) +TEST_P(GPU_Unfold_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -99,11 +101,11 @@ TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_fwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) +TEST_P(GPU_Unfold_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -117,11 +119,11 @@ TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_fwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +TEST_P(GPU_Unfold_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && 
env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -135,11 +137,11 @@ TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_bwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +TEST_P(GPU_Unfold_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -153,11 +155,11 @@ TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_bwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +TEST_P(GPU_Unfold_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -171,6 +173,6 @@ TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_bwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); From 879c5c77b675c2bd03944f9912104a677ffbd2b9 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 21:41:48 +0000 Subject: [PATCH 40/46] githook format --- test/gtest/fold.cpp | 24 ++++++------------------ test/gtest/unfold.cpp | 24 ++++++------------------ 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 4de7f26e6a..b3868bb0ec 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -83,9 +83,7 @@ TEST_P(GPU_Fold_fwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_fwd_FP32, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_FP32, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_fwd_FP16, Test) { @@ -101,9 +99,7 @@ TEST_P(GPU_Fold_fwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_fwd_FP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_FP16, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_fwd_BFP16, Test) { @@ -119,9 +115,7 @@ TEST_P(GPU_Fold_fwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_fwd_BFP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_BFP16, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_bwd_FP32, Test) { @@ -137,9 +131,7 @@ TEST_P(GPU_Fold_bwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_bwd_FP32, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_FP32, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_bwd_FP16, Test) { @@ -155,9 +147,7 @@ TEST_P(GPU_Fold_bwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_bwd_FP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_FP16, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_bwd_BFP16, Test) { @@ -173,6 +163,4 @@ TEST_P(GPU_Fold_bwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_bwd_BFP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_BFP16, testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index 8573e1573b..a5aead6d10 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -83,9 +83,7 @@ TEST_P(GPU_Unfold_fwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - 
GPU_Unfold_fwd_FP32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_fwd_FP16, Test) { @@ -101,9 +99,7 @@ TEST_P(GPU_Unfold_fwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_fwd_FP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_fwd_BFP16, Test) { @@ -119,9 +115,7 @@ TEST_P(GPU_Unfold_fwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_fwd_BFP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_bwd_FP32, Test) { @@ -137,9 +131,7 @@ TEST_P(GPU_Unfold_bwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_bwd_FP32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_bwd_FP16, Test) { @@ -155,9 +147,7 @@ TEST_P(GPU_Unfold_bwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_bwd_FP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_bwd_BFP16, Test) { @@ -173,6 +163,4 @@ TEST_P(GPU_Unfold_bwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_bwd_BFP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); From 7d07012d12a93ea0c6b0482efe661f10e62a6367 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 12 Aug 2024 03:07:48 +0000 Subject: [PATCH 41/46] githook format --- driver/driver.hpp | 23 ++++++++++++----------- src/solver.cpp | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index c37f3be25a..844101230a 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,14 +169,15 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], unfold[bfp16|fp16], " - "fold[bfp16|fp16]\n"); + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -208,9 +209,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != 
"transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "unfold" && - arg != "unfoldfp16" && arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && - arg != "foldbfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "unfold" && arg != "unfoldfp16" && + arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/src/solver.cpp b/src/solver.cpp index 7ee13a51f9..ae443f29d5 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -676,7 +676,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); - + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); From 85c1ee04b6737e6ac02251ae836b2a4c4f3cb101 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 16:40:04 +0000 Subject: [PATCH 42/46] add not contiguous test cases for fold and unfold --- test/gtest/fold.hpp | 22 ++++++++++++++-------- test/gtest/unfold.hpp | 15 ++++++++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 02d9e42e17..9c7a9874eb 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" @@ -123,6 +122,13 @@ std::vector FoldTestConfigs() {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 12}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 13}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, false}, }; // clang-format: on } @@ -197,13 +203,10 @@ struct FoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
if(std::is_same::value) tolerance *= 8.0; - for(int i = 0; i < 10; ++i) - { - std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] - << std::endl; - } auto error_output = miopen::rms_range(outputHost, output); - EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" + ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); + + EXPECT_LT(error_output, tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } FoldTestCase config; @@ -289,10 +292,13 @@ struct FoldBwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_dinput = miopen::rms_range(dinputHost, dinput); - EXPECT_TRUE(error_dinput < tolerance) + ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); + + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" << error_dinput << "}, Tolerance: " << tolerance; } + FoldTestCase config; tensor dinput; diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 4bb790b1f5..2abb1daca4 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" @@ -32,7 +31,6 @@ #include "verify.hpp" #include #include -#include #include #include #include @@ -121,6 +119,12 @@ std::vector UnfoldTestConfigs() {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true}, {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, + {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, false}, + {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, false}, + {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, false}, }; // clang-format: on } @@ -204,9 +208,12 @@ struct UnfoldFwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_output = miopen::rms_range(outputHost, output); + ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } + UnfoldTestCase config; tensor input; @@ -297,7 +304,9 @@ struct UnfoldBwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_dinput = miopen::rms_range(dinputHost, dinput); - EXPECT_TRUE(error_dinput < tolerance) + ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); + + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" << error_dinput << "}, Tolerance: " << tolerance; } From 66e5dcb4b6c10fdb5dcb7647b94b52c95b1240f1 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 16:40:48 +0000 Subject: [PATCH 43/46] githook format --- test/gtest/fold.hpp | 9 ++++----- test/gtest/unfold.hpp | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 9c7a9874eb..3c63862fca 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -207,7 +207,7 @@ struct FoldFwdTest : public ::testing::TestWithParam 
ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); EXPECT_LT(error_output, tolerance) << "Error forward output beyond tolerance Error: {" - << error_output << "}, Tolerance: " << tolerance; + << error_output << "}, Tolerance: " << tolerance; } FoldTestCase config; @@ -294,11 +294,10 @@ struct FoldBwdTest : public ::testing::TestWithParam auto error_dinput = miopen::rms_range(dinputHost, dinput); ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); - EXPECT_LT(error_dinput, tolerance) - << "Error backward input_grad beyond tolerance Error: {" << error_dinput - << "}, Tolerance: " << tolerance; + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; } - + FoldTestCase config; tensor dinput; diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 2abb1daca4..63d859f7f0 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -209,7 +209,7 @@ struct UnfoldFwdTest : public ::testing::TestWithParam tolerance *= 8.0; auto error_output = miopen::rms_range(outputHost, output); ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); - + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } @@ -306,9 +306,8 @@ struct UnfoldBwdTest : public ::testing::TestWithParam auto error_dinput = miopen::rms_range(dinputHost, dinput); ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); - EXPECT_LT(error_dinput, tolerance) - << "Error backward input_grad beyond tolerance Error: {" << error_dinput - << "}, Tolerance: " << tolerance; + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; } UnfoldTestCase config; From 918091d242ad9cad4eedd5696c425b6009d7b38a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 17:01:39 +0000 Subject: [PATCH 44/46] remove /*context*/ for solver --- src/solver/fold/fold_backward.cpp | 2 +- src/solver/fold/fold_forward.cpp | 2 +- src/solver/fold/unfold_backward.cpp | 2 +- src/solver/fold/unfold_forward.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index b952e5375c..edfa5649a0 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -43,7 +43,7 @@ namespace solver { namespace fold { bool FoldBwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const { return true; diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 17fb11180c..585a21a0e2 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -44,7 +44,7 @@ namespace solver { namespace fold { bool FoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const { return true; diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index da11969c64..b9b49c2799 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -43,7 +43,7 @@ namespace solver { namespace fold { bool UnfoldBwd::IsApplicable( - [[maybe_unused]] 
const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const { return true; diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 54e39809d6..154c51c2fc 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -43,7 +43,7 @@ namespace solver { namespace fold { bool UnfoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const { return true; From 901d7b32448be557eeca4d3001ea648a71b477f0 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 17:02:32 +0000 Subject: [PATCH 45/46] remove gen_one --- test/gtest/fold.hpp | 2 -- test/gtest/unfold.hpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 3c63862fca..e97758f720 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -146,7 +146,6 @@ struct FoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; input = tensor{in_dims, in_strides}.generate(gen_value); const int32_t N = static_cast(in_dims[0]); @@ -233,7 +232,6 @@ struct FoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; dinput = tensor{in_dims, in_strides}.generate(gen_zero); dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 63d859f7f0..78941443e9 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -142,7 +142,6 @@ struct UnfoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; input = tensor{in_dims, in_strides}.generate(gen_value); @@ -238,7 +237,6 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; dinput = tensor{in_dims, in_strides}.generate(gen_zero); dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); From 89562483bfa20bc5e769b4633b1ccb26e1b8c2b4 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 17:02:52 +0000 Subject: [PATCH 46/46] githook format --- test/gtest/fold.hpp | 16 ++++++++-------- test/gtest/unfold.hpp | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index e97758f720..f43f30f346 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -145,11 +145,11 @@ struct FoldFwdTest : public ::testing::TestWithParam std::vector in_dims = config.GetInput(); std::vector in_strides = config.ComputeStrides(in_dims); - auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    input          = tensor<T>{in_dims, in_strides}.generate(gen_value);
-    const int32_t N = static_cast<int32_t>(in_dims[0]);
-    int32_t C       = static_cast<int32_t>(in_dims[1]);
+    auto gen_value  = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+    auto gen_zero   = [&](auto...) { return 0; };
+    input           = tensor<T>{in_dims, in_strides}.generate(gen_value);
+    const int32_t N = static_cast<int32_t>(in_dims[0]);
+    int32_t C       = static_cast<int32_t>(in_dims[1]);
     for(int32_t i : config.kernelSize)
     {
         C = C / i;
@@ -232,9 +232,9 @@ struct FoldBwdTest : public ::testing::TestWithParam<FoldTestCase>
     std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
 
     auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    dinput         = tensor<T>{in_dims, in_strides}.generate(gen_zero);
-    dinputHost     = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    auto gen_zero   = [&](auto...) { return 0; };
+    dinput          = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    dinputHost      = tensor<T>{in_dims, in_strides}.generate(gen_zero);
     const int32_t N = static_cast<int32_t>(in_dims[0]);
     int32_t C       = static_cast<int32_t>(in_dims[1]);
diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp
index 78941443e9..2631722400 100644
--- a/test/gtest/unfold.hpp
+++ b/test/gtest/unfold.hpp
@@ -142,8 +142,8 @@ struct UnfoldFwdTest : public ::testing::TestWithParam<UnfoldTestCase>
     std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
 
     auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    input          = tensor<T>{in_dims, in_strides}.generate(gen_value);
+    auto gen_zero = [&](auto...) { return 0; };
+    input         = tensor<T>{in_dims, in_strides}.generate(gen_value);
     int spatial_dim_size = in_dims.size() - 2;
     const int32_t N      = static_cast<int32_t>(in_dims[0]);
@@ -237,9 +237,9 @@ struct UnfoldBwdTest : public ::testing::TestWithParam<UnfoldTestCase>
     std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
 
     auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    dinput         = tensor<T>{in_dims, in_strides}.generate(gen_zero);
-    dinputHost     = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    auto gen_zero = [&](auto...) { return 0; };
+    dinput        = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    dinputHost    = tensor<T>{in_dims, in_strides}.generate(gen_zero);
     int spatial_dim_size = in_dims.size() - 2;
     const int32_t N      = static_cast<int32_t>(in_dims[0]);
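Across the drivers, kernels, and test fixtures in this series, the unfold output shape follows the usual im2col rule: a (N, C, H, W) input maps to (N, C * prod(kernel_size), L), with one column per sliding-window position. A self-contained sketch of that arithmetic, mirroring the P and L computations in the unfold kernels and fixtures above (the function name and signature are illustrative, not part of the MIOpen API):

    #include <cstdint>
    #include <vector>

    std::vector<size_t> UnfoldOutputDims(const std::vector<size_t>& in, // {N, C, H, W, ...}
                                         const std::vector<int32_t>& kernel,
                                         const std::vector<int32_t>& stride,
                                         const std::vector<int32_t>& padding,
                                         const std::vector<int32_t>& dilation)
    {
        size_t P = 1; // elements per sliding block
        size_t L = 1; // number of block positions
        for(size_t i = 0; i < kernel.size(); ++i)
        {
            P *= static_cast<size_t>(kernel[i]);
            const int32_t spatial = static_cast<int32_t>(in[i + 2]);
            // Standard sliding-window count per spatial dimension.
            const int32_t l =
                (spatial + 2 * padding[i] - dilation[i] * (kernel[i] - 1) - 1) / stride[i] + 1;
            L *= static_cast<size_t>(l);
        }
        return {in[0], in[1] * P, L};
    }

For the first UnfoldTestConfigs entry, a {2, 5, 3, 4} input with kernel {2, 3}, unit stride and dilation, and no padding, this gives P = 6, L = 2 * 2, i.e. an output of {2, 30, 4}; fold is the inverse mapping and reuses the same arithmetic to size its input.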
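The Verify steps gate on miopen::rms_range: the pass criterion is an RMS distance between the host reference and the GPU result, with the tolerance widened by 8x for bfloat16 because its mantissa is 3 bits (a factor of 2^3) shorter than fp16's. A stand-in sketch of a normalized RMS criterion of this kind, for illustration only; it is not the exact miopen::rms_range implementation:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scale-free RMS error: root-mean-square of elementwise differences,
    // normalized by the largest reference magnitude.
    double RmsError(const std::vector<double>& ref, const std::vector<double>& got)
    {
        double sumSq = 0.0;
        double mag   = 0.0;
        for(std::size_t i = 0; i < ref.size(); ++i)
        {
            const double d = ref[i] - got[i];
            sumSq += d * d;
            mag = std::max(mag, std::fabs(ref[i]));
        }
        const double rms = std::sqrt(sumSq / static_cast<double>(ref.size()));
        return rms / std::max(mag, 1e-38); // guard against an all-zero reference
    }

The ASSERT_EQ on miopen::range_distance added in patch 42 runs before this comparison so that a shape mismatch fails loudly instead of skewing the RMS value.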