From d4f50a2324e9294e5cff5291a66412ba9fad7e8d Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Mon, 5 Aug 2024 15:06:17 +0700 Subject: [PATCH 01/28] resolve conflict --- .githooks/post-checkout | 3 + .githooks/post-commit | 3 + .githooks/post-merge | 3 + .githooks/pre-push | 3 + docs/reference/index.rst | 1 + driver/CMakeLists.txt | 1 + driver/dm_sigmoid_focal_loss.cpp | 41 + driver/driver.hpp | 10 +- driver/sigmoid_focal_loss_driver.hpp | 777 ++++++++++++++++++ include/miopen/miopen.h | 92 +++ rocfft_r2c_ex.cpp | 317 +++++++ src/CMakeLists.txt | 10 + src/include/miopen/sigmoid_focal_loss.hpp | 71 ++ .../miopen/sigmoidfocalloss/invoke_params.hpp | 79 ++ .../sigmoidfocalloss/problem_description.hpp | 118 +++ .../miopen/sigmoidfocalloss/solvers.hpp | 121 +++ src/include/miopen/sigmoidfocalloss/utils.hpp | 49 ++ src/include/miopen/solver_id.hpp | 3 +- src/include/miopen/tensor_view_utils.hpp | 11 +- src/kernels/MIOpenLossSum.cpp | 56 ++ src/kernels/MIOpenSigmoidFocalLoss.cpp | 329 ++++++++ src/kernels/warp_shuffle.hpp | 72 ++ src/sigmoid_focal_loss.cpp | 170 ++++ src/sigmoid_focal_loss_api.cpp | 192 +++++ src/sigmoidfocalloss/problem_description.cpp | 88 ++ src/solver.cpp | 12 + .../backward_reduce_sigmoid_focal_loss.cpp | 119 +++ .../backward_unreduce_sigmoid_focal_loss.cpp | 113 +++ .../forward_reduce_sigmoid_focal_loss.cpp | 186 +++++ .../forward_unreduce_sigmoid_focal_loss.cpp | 107 +++ test/cpu_sigmoid_focal_loss.hpp | 238 ++++++ test/gtest/sigmoid_focal_loss.cpp | 325 ++++++++ test/gtest/sigmoid_focal_loss.hpp | 489 +++++++++++ 33 files changed, 4202 insertions(+), 7 deletions(-) create mode 100755 .githooks/post-checkout create mode 100755 .githooks/post-commit create mode 100755 .githooks/post-merge create mode 100755 .githooks/pre-push create mode 100644 driver/dm_sigmoid_focal_loss.cpp create mode 100644 driver/sigmoid_focal_loss_driver.hpp create mode 100644 rocfft_r2c_ex.cpp create mode 100644 src/include/miopen/sigmoid_focal_loss.hpp create mode 100644 
src/include/miopen/sigmoidfocalloss/invoke_params.hpp create mode 100644 src/include/miopen/sigmoidfocalloss/problem_description.hpp create mode 100644 src/include/miopen/sigmoidfocalloss/solvers.hpp create mode 100644 src/include/miopen/sigmoidfocalloss/utils.hpp create mode 100644 src/kernels/MIOpenLossSum.cpp create mode 100644 src/kernels/MIOpenSigmoidFocalLoss.cpp create mode 100644 src/kernels/warp_shuffle.hpp create mode 100644 src/sigmoid_focal_loss.cpp create mode 100644 src/sigmoid_focal_loss_api.cpp create mode 100644 src/sigmoidfocalloss/problem_description.cpp create mode 100644 src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp create mode 100644 src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp create mode 100644 src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp create mode 100644 src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp create mode 100644 test/cpu_sigmoid_focal_loss.hpp create mode 100644 test/gtest/sigmoid_focal_loss.cpp create mode 100644 test/gtest/sigmoid_focal_loss.hpp diff --git a/.githooks/post-checkout b/.githooks/post-checkout new file mode 100755 index 0000000000..ca7fcb4008 --- /dev/null +++ b/.githooks/post-checkout @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit new file mode 100755 index 0000000000..52b339cb3f --- /dev/null +++ b/.githooks/post-commit @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge new file mode 100755 index 0000000000..a912e667aa --- /dev/null +++ b/.githooks/post-merge @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs post-merge "$@" diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 0000000000..0f0089bc25 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs pre-push "$@" diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 90e29ffaa9..2387ef1be8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,3 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`SigmoidFocalLoss <../doxygen/html/group__loss_function>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index cd663eb8b4..c8763f0c7b 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -52,6 +52,7 @@ add_executable(MIOpenDriver dm_reduceextreme.cpp dm_reducecalculation.cpp dm_rnn.cpp + dm_sigmoid_focal_loss.cpp dm_softmax.cpp dm_t5layernorm.cpp dm_tensorop.cpp diff --git a/driver/dm_sigmoid_focal_loss.cpp b/driver/dm_sigmoid_focal_loss.cpp new file mode 100644 index 0000000000..001f2964b5 --- /dev/null +++ b/driver/dm_sigmoid_focal_loss.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "registry_driver_maker.hpp" +#include "sigmoid_focal_loss_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "sigmoidfocalloss") + return new SigmoidFocalLossDriver(); + else if(base_arg == "sigmoidfocallossfp16") + return new SigmoidFocalLossDriver(); + else if(base_arg == "sigmoidfocallossbfp16") + return new SigmoidFocalLossDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index b23df690d1..749ee16a17 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], " + "sigmoidfocalloss[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -205,8 +206,11 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && - arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version") + arg != "getitemfp16" && arg != "getitembfp16" && arg != "transformersadamwfp16" && + arg != "transformersampadamw" && arg != "reducecalculation" && + arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && + arg != "sigmoidfocalloss" && arg != "sigmoidfocallossfp16" && + arg != "sigmoidfocallossbfp16" && arg != "--version") { 
printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp new file mode 100644 index 0000000000..6c739a3911 --- /dev/null +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -0,0 +1,777 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once +#include "InputFlags.hpp" +#include "driver.hpp" +#include +#include +#include +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "random.hpp" +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> +#include +#include +#include + +template +void mloSigmoidFocalLossUnreducedFwdRunHost(TIO* input, + miopenTensorDescriptor_t inputDesc, + TIO* target, + miopenTensorDescriptor_t targetDesc, + TIO* outputHost, + miopenTensorDescriptor_t outputDesc, + float alpha = 0.25, + float gamma = 2) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + size_t inputSize = miopen::deref(inputDesc).GetElementSize(); + + for(size_t id = 0; id < inputSize; ++id) + { + tensor_layout_t<5> idx(input_tv, id); + + float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + + float sig = 1 / (1 + exp(-i)); + float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); + float sigT = sig * t + (1 - sig) * (1 - t); + float loss = ceLoss * pow(1 - sigT, gamma); + + if(alpha >= 0) + { + float alphaT = alpha * t + (1 - alpha) * (1 - t); + loss = alphaT * loss; + } + + outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss); + } +} + +template +void mloSigmoidFocalLossUnreducedBwdRunHost(TIO* input, + miopenTensorDescriptor_t inputDesc, + TIO* target, + miopenTensorDescriptor_t targetDesc, + TIO* doutput, + miopenTensorDescriptor_t doutputDesc, + TIO* dinput, + miopenTensorDescriptor_t dinputDesc, + TIO* dtarget, + miopenTensorDescriptor_t dtargetDesc, + float alpha = 0.25, + float gamma = 2) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto target_tv = 
miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc)); + size_t inputSize = miopen::deref(inputDesc).GetElementSize(); + + for(size_t id = 0; id < inputSize; ++id) + { + tensor_layout_t<5> idx(input_tv, id); + + float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); + + float p = 1 / (1 + exp(-i)); + float ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + float pT = p * t + (1 - p) * (1 - t); + float powPt = pow(1 - pT, gamma); + float alpha_t = alpha * t + (1 - alpha) * (1 - t); + + if(dinput) + { + float dpdi = exp(-i) / pow(1 + exp(-i), 2); + float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + float dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + + // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di + float dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + float grad = dO * dLdi; + + if(alpha >= 0) + { + grad *= alpha_t; + } + dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); + } + + if(dtarget) + { + float dcelossdt = -log(p) + log(1 - p); + float dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt + float dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + float gradTarget = dO * dLdt; + + if(alpha >= 0) + { + // d(alpha_t * L)/dt = alpha_t * dL/dt + dalpha_t/dt * L, chained with upstream dO + gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt); + } + dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); + } + } +} + +template +void mloSigmoidFocalLossFwdRunHost(TIO* input, + miopenTensorDescriptor_t inputDesc, + TIO* target, +
miopenTensorDescriptor_t targetDesc, + TIO* workspace, + TIO* ref_output, + float alpha = 0.25, + float gamma = 2, + float divisor = 1) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); + size_t inputSize = miopen::deref(inputDesc).GetElementSize(); + + for(size_t id = 0; id < inputSize; ++id) + { + tensor_layout_t<5> idx(input_tv, id); + + float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + + float sig = 1 / (1 + exp(-i)); + float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); + float sigT = sig * t + (1 - sig) * (1 - t); + float loss = ceLoss * pow(1 - sigT, gamma); + + if(alpha >= 0) + { + float alphaT = alpha * t + (1 - alpha) * (1 - t); + loss = alphaT * loss; + } + + workspace[id] = static_cast(loss / divisor); + } + + // Reduce loss + const int local_size = 256; + int offset_a = 0; + int offset_b = inputSize; + size_t _size = inputSize; + do + { + for(int i = 0; i < _size; i += local_size) + { + TIO shared[local_size]; + for(int j = 0; j < local_size; ++j) + shared[j] = i + j < _size ? 
workspace[offset_a + i + j] : 0.0f; + for(int offset = local_size / 2; offset > 0; offset >>= 1) + for(int j = 0; j < offset; ++j) + shared[j] += shared[j + offset]; + if(_size <= local_size) + ref_output[0] = shared[0]; + else + workspace[offset_b + i / local_size] = shared[0]; + } + std::swap(offset_a, offset_b); + _size = (_size + local_size - 1) / local_size; + } while(_size > 1); +} + +template +void mloSigmoidFocalLossBwdRunHost(TIO* input, + miopenTensorDescriptor_t inputDesc, + TIO* target, + miopenTensorDescriptor_t targetDesc, + TIO* doutput, + miopenTensorDescriptor_t doutputDesc, + TIO* dinput, + miopenTensorDescriptor_t dinputDesc, + TIO* dtarget, + miopenTensorDescriptor_t dtargetDesc, + float alpha = 0.25, + float gamma = 2, + float divisor = 1) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc)); + + size_t inputSize = miopen::deref(inputDesc).GetElementSize(); + + tensor_layout_t<5> doIdx(input_tv, 0); + + for(size_t id = 0; id < inputSize; ++id) + { + tensor_layout_t<5> idx(input_tv, id); + + float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); + + float p = 1 / (1 + exp(-i)); + float ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + float pT = p * t + (1 - p) * (1 - t); + float powPt = pow(1 - pT, gamma); + float alpha_t = alpha * t + (1 - alpha) * (1 - t); + + if(dinput) + { + float dpdi = exp(-i) / pow(1 + exp(-i), 2); + float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + float dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + 
+ // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di + float dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + float grad = dO * dLdi; + + if(alpha >= 0) + { + grad *= alpha_t; + } + grad /= divisor; + dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); + } + + if(dtarget) + { + float dcelossdt = -log(p) + log(1 - p); + float dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt + float dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + float gradTarget = dO * dLdt; + + if(alpha >= 0) + { + // d(alpha_t * L)/dt = alpha_t * dL/dt + dalpha_t/dt * L, chained with upstream dO + gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt); + } + gradTarget /= divisor; + dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); + } + } +} + +template +class SigmoidFocalLossDriver : public Driver +{ +public: + SigmoidFocalLossDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&targetDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&doutputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&dtargetDesc); + + data_type = miopen_type{}; + } + + std::vector ComputeStrides(std::vector input); + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + int VerifyBackward() override; + int VerifyForward() override; + ~SigmoidFocalLossDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(targetDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(doutputDesc); + miopenDestroyTensorDescriptor(dinputDesc); +
miopenDestroyTensorDescriptor(dtargetDesc); + } + +private: + InputFlags inflags; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t targetDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + miopenTensorDescriptor_t dtargetDesc; + + std::unique_ptr input_dev; + std::unique_ptr target_dev; + std::unique_ptr output_dev; + std::unique_ptr doutput_dev; + std::unique_ptr dinput_dev; + std::unique_ptr dtarget_dev; + std::unique_ptr workspace_dev; + + std::vector input; + std::vector target; + std::vector output; + std::vector outputHost; + std::vector doutput; + std::vector dinput; + std::vector dinputHost; + std::vector dtarget; + std::vector dtargetHost; + std::vector workspace; + + float alpha; + float gamma; + float divisor; + bool isContiguous; + bool isTargetGradientComputed; + miopenLossReductionMode_t reduction; + + size_t workSpaceSizeInBytes; +}; + +template +int SigmoidFocalLossDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::GetandSetData() +{ + auto inDims = inflags.GetValueTensor("dim-lengths").lengths; + alpha = inflags.GetValueDouble("alpha"); + gamma = inflags.GetValueDouble("gamma"); + isContiguous = inflags.GetValueInt("is-contiguous") == 1 ? true : false; + isTargetGradientComputed = inflags.GetValueInt("target-gradient") == 1 ? 
true : false; + reduction = static_cast(inflags.GetValueInt("reduction")); + + std::vector inStride = ComputeStrides(inDims); + + SetTensorNd(inputDesc, inDims, inStride, data_type); + SetTensorNd(targetDesc, inDims, inStride, data_type); + SetTensorNd(doutputDesc, inDims, data_type); + SetTensorNd(dinputDesc, inDims, data_type); + + if(isTargetGradientComputed) + { + SetTensorNd(dtargetDesc, inDims, data_type); + } + else + { + std::vector dtargetDim(1); + dtargetDim[0] = 1; + SetTensorNd(dtargetDesc, dtargetDim, data_type); + } + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + SetTensorNd(outputDesc, inDims, data_type); + } + else + { + std::vector outDim(1); + outDim[0] = 1; + SetTensorNd(outputDesc, outDim, data_type); + divisor = 1; + if(reduction == MIOPEN_LOSS_REDUCTION_MEAN) + { + divisor = miopen::deref(inputDesc).GetElementSize(); + } + } + + return 0; +} + +// Equivalent to: tensor.transpose(0, -1).contiguous().transpose(0, -1) in case contiguous = False +template +std::vector SigmoidFocalLossDriver::ComputeStrides(std::vector inputDim) +{ + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +int SigmoidFocalLossDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward (Default=1)", "int"); + inflags.AddTensorFlag( + "dim-lengths", 'D', "256x4x2", "The dimensional lengths of the input tensor"); + inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int"); + inflags.AddInputFlag( + "reduction", 'R', "0", "reduction mode: 0(default) - unreduced, 1 - sum, 2 -mean", "int"); + inflags.AddInputFlag("alpha", 'A', "0.25", "Alpha (Default=0.25)", "float"); + inflags.AddInputFlag("gamma", 'G', "2", "Gamma (Default=2)", "float"); +
inflags.AddInputFlag( + "target-gradient", 'T', "0", "Is target gradient computed (Default=0)", "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = miopen::deref(inputDesc).GetElementSize(); + size_t target_sz = miopen::deref(targetDesc).GetElementSize(); + size_t out_sz = miopen::deref(outputDesc).GetElementSize(); + size_t dO_sz = miopen::deref(doutputDesc).GetElementSize(); + size_t dI_sz = miopen::deref(dinputDesc).GetElementSize(); + size_t dT_sz = miopen::deref(dtargetDesc).GetElementSize(); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(TIO))); + target_dev = std::unique_ptr(new GPUMem(ctx, target_sz, sizeof(TIO))); + output_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(TIO))); + doutput_dev = std::unique_ptr(new GPUMem(ctx, dO_sz, sizeof(TIO))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, dI_sz, sizeof(TIO))); + dtarget_dev = std::unique_ptr(new GPUMem(ctx, dT_sz, sizeof(TIO))); + + miopenGetSigmoidFocalLossForwardWorkspaceSize( + handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes); + workspace_dev = + std::unique_ptr(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(TIO), sizeof(TIO))); + + input = std::vector(in_sz, static_cast(0)); + target = std::vector(target_sz, static_cast(0)); + output = std::vector(out_sz, static_cast(0)); + outputHost = std::vector(out_sz, static_cast(0)); + doutput = std::vector(dO_sz, static_cast(0)); + dinput = std::vector(dI_sz, static_cast(0)); + dinputHost = std::vector(dI_sz, static_cast(0)); + dtarget = std::vector(dT_sz, 
static_cast(0)); + dtargetHost = std::vector(dT_sz, static_cast(0)); + workspace = std::vector(workSpaceSizeInBytes / sizeof(TIO), static_cast(0)); + + for(int i = 0; i < in_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + target[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + } + for(int i = 0; i < dO_sz; ++i) + { + doutput[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + } + + fill(output.begin(), output.end(), static_cast(0)); + fill(dinput.begin(), dinput.end(), static_cast(0)); + fill(dtarget.begin(), dtarget.end(), static_cast(0)); + + if(input_dev->ToGPU(GetStream(), input.data()) != 0) + std::cerr << "Error copying (in) to GPU, size: " << input_dev->GetSize() << std::endl; + + if(target_dev->ToGPU(GetStream(), target.data()) != 0) + std::cerr << "Error copying (in) to GPU, size: " << target_dev->GetSize() << std::endl; + + if(output_dev->ToGPU(GetStream(), output.data()) != 0) + std::cerr << "Error copying (out) to GPU, size: " << output_dev->GetSize() << std::endl; + + if(doutput_dev->ToGPU(GetStream(), doutput.data()) != 0) + std::cerr << "Error copying (dO) to GPU, size: " << doutput_dev->GetSize() << std::endl; + + if(dinput_dev->ToGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dI) to GPU, size: " << dinput_dev->GetSize() << std::endl; + + if(dtarget_dev->ToGPU(GetStream(), dtarget.data()) != 0) + std::cerr << "Error copying (dT) to GPU, size: " << dtarget_dev->GetSize() << std::endl; + + if(workspace_dev->ToGPU(GetStream(), workspace.data()) != 0) + std::cerr << "Error copying (dI) to GPU, size: " << workspace_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenSigmoidFocalLossForward(GetHandle(), + workspace_dev->GetMem(), + workSpaceSizeInBytes, + 
inputDesc, + input_dev->GetMem(), + targetDesc, + target_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + alpha, + gamma, + reduction); + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Sigmoid Focal Loss Fwd Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Sigmoid Focal Loss Fwd Elapsed: " << kernel_average_time + << " ms" << std::endl; + } + + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::RunForwardCPU() +{ + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + mloSigmoidFocalLossUnreducedFwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + outputHost.data(), + outputDesc, + alpha, + gamma); + } + else + { + mloSigmoidFocalLossFwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + workspace.data(), + outputHost.data(), + alpha, + gamma, + divisor); + } + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + void* p_dtarget = nullptr; + if(isTargetGradientComputed) + { + p_dtarget = dtarget_dev->GetMem(); + } + + miopenSigmoidFocalLossBackward(GetHandle(), + inputDesc, + input_dev->GetMem(), + targetDesc, + target_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + dinputDesc, + dinput_dev->GetMem(), + dtargetDesc, + p_dtarget, + alpha, + gamma, + reduction); + + 
float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Sigmoid Focal Loss Bwd Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Sigmoid Focal Loss Bwd Elapsed: " << kernel_average_time + << " ms" << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dI_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + if(isTargetGradientComputed && dtarget_dev->FromGPU(GetStream(), dtarget.data()) != 0) + std::cerr << "Error copying (dT_dev) from GPU, size: " << dtarget_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::RunBackwardCPU() +{ + TIO* p_dtarget = nullptr; + if(isTargetGradientComputed) + { + p_dtarget = dtargetHost.data(); + } + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + + mloSigmoidFocalLossUnreducedBwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + doutput.data(), + doutputDesc, + dinputHost.data(), + dinputDesc, + p_dtarget, + dtargetDesc, + alpha, + gamma); + } + else + { + mloSigmoidFocalLossBwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + doutput.data(), + doutputDesc, + dinputHost.data(), + dinputDesc, + p_dtarget, + dtargetDesc, + alpha, + gamma, + divisor); + } + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::VerifyForward() +{ + RunForwardCPU(); + + double tolerance = std::numeric_limits::epsilon() * 10; + auto error = miopen::rms_range(outputHost, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward " << reduction << " Sigmoid Focal Loss 
FAILED: " << error << " > " + << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward " << reduction << " Sigmoid Focal Loss Verifies OK on CPU reference (" + << error << "< " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int SigmoidFocalLossDriver::VerifyBackward() +{ + RunBackwardCPU(); + + double tolerance = std::numeric_limits::epsilon() * 10; + auto dinputError = miopen::rms_range(dinputHost, dinput); + auto dtargetError = miopen::rms_range(dtargetHost, dtarget); + + if(!std::isfinite(dinputError) || dinputError > tolerance) + { + std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dinputError + << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else if(isTargetGradientComputed && (!std::isfinite(dtargetError) || dtargetError > tolerance)) + { + std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dtargetError + << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Backward " << reduction + << " Sigmoid Focal Loss Verifies OK on CPU reference (dinput: " << dinputError + << ", dtarget: " << dtargetError << "< " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 3b9bbeccc1..c983f92619 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -5004,6 +5004,98 @@ MIOPEN_EXPORT miopenStatus_t miopenCTCLoss(miopenHandle_t handle, void* workSpace, size_t workSpaceSize); +#ifdef MIOPEN_BETA_API + +typedef enum +{ + MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */ + MIOPEN_LOSS_REDUCTION_SUM = 1, /*!< output tensor elements are summed up */ + MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with total + number of elements to get mean value */ +} miopenLossReductionMode_t; + +/*! 
@brief Helper function to query the minimum workspace size required by the sigmoid focal loss + * call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param targetDesc Tensor descriptor for target tensor (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param reduction Reduction (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t targetDesc, + miopenTensorDescriptor_t outputDesc, + miopenLossReductionMode_t reduction, + size_t* sizeInBytes); + +/*! @brief Execute a SigmoidFocalLoss forward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param targetDesc Tensor descriptor for target tensor (input) + * @param target Data tensor target (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param alpha Alpha (input) + * @param gamma Gamma (input) + * @param reduction Reduction (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t outputDesc, + void* output, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + +/*! 
@brief Execute a SigmoidFocalLoss backward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param targetDesc Tensor descriptor for target tensor (input) + * @param target Data tensor target (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dtargetDesc Tensor descriptor for target gradient (input) + * @param dtarget Gradient of target (output) + * @param alpha Alpha (input) + * @param gamma Gamma (input) + * @param reduction Reduction (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dtargetDesc, + void* dtarget, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); +#endif + /** @} */ // CLOSEOUT LossFunction DOXYGEN GROUP diff --git a/rocfft_r2c_ex.cpp b/rocfft_r2c_ex.cpp new file mode 100644 index 0000000000..8c17fac21b --- /dev/null +++ b/rocfft_r2c_ex.cpp @@ -0,0 +1,317 @@ +// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../../shared/CLI11.hpp" +#include "examplekernels.h" +#include "exampleutils.h" +#include + +int main(int argc, char* argv[]) +{ + std::cout << "rocfft double-precision real/complex transform\n" << std::endl; + + // Length of transform: + std::vector length = {8}; + + // Gpu device id: + size_t deviceId = 0; + + // Command-line options: + CLI::App app{"rocfft sample command line options"}; + app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); + CLI::Option* opt_outofplace = + app.add_flag("-o, --outofplace", "Perform an out-of-place transform"); + CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform"); + app.add_option( + "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)"); + + try + { + app.parse(argc, argv); + } + catch(const CLI::ParseError& e) + { + return app.exit(e); + } + + // Placeness for the transform + if(rocfft_setup() != rocfft_status_success) + throw std::runtime_error("rocfft_setup failed."); + const rocfft_result_placement place = + *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace; + const bool inplace = place == rocfft_placement_inplace; + + // Direction of transform + const rocfft_transform_type direction = + *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward; + const bool forward = direction == rocfft_transform_type_real_forward; + + // Set up the strides and buffer size for the real values: + std::vector rstride = {1}; + for(unsigned int i = 1; i < length.size(); ++i) + { + // In-place transforms need space for two extra real values in the contiguous + // direction. + auto val = (length[i - 1] + ((inplace && i == 1) ? 
2 : 0)) * rstride[i - 1]; + rstride.push_back(val); + } + // NB: not tight, but hey + const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1]; + std::vector rdata(real_size); // host storage + + // The complex data length is half + 1 of the real data length in the contiguous + // dimensions. Since rocFFT is column-major, this is the first index. + std::vector clength = length; + clength[0] = clength[0] / 2 + 1; + std::vector cstride = {1}; + for(unsigned int i = 1; i < clength.size(); ++i) + { + cstride.push_back(clength[i - 1] * cstride[i - 1]); + } + const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1]; + std::vector cdata(complex_size); // host storage + + // Based on the direction, we set the input and output parameters appropriately. + const size_t isize = forward ? real_size : complex_size; + const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex)); + const std::vector ilength = forward ? length : clength; + const std::vector istride = forward ? rstride : cstride; + + const size_t osize = forward ? complex_size : real_size; + const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double)); + const std::vector olength = forward ? clength : length; + const std::vector ostride = forward ? 
cstride : rstride; + + // Print information about the transform: + std::cout << "direction: "; + if(forward) + std::cout << "forward\n"; + else + std::cout << "inverse\n"; + std::cout << "length:"; + for(const auto i : length) + std::cout << " " << i; + std::cout << "\n"; + if(inplace) + std::cout << "in-place transform\n"; + else + std::cout << "out-of-place transform\n"; + std::cout << "deviceID: " << deviceId << "\n"; + std::cout << "input length:"; + for(auto i : ilength) + std::cout << " " << i; + std::cout << "\n"; + std::cout << "input buffer stride:"; + for(auto i : istride) + std::cout << " " << i; + std::cout << "\n"; + std::cout << "input buffer size: " << ibytes << "\n"; + + std::cout << "output length:"; + for(auto i : olength) + std::cout << " " << i; + std::cout << "\n"; + std::cout << "output buffer stride:"; + for(auto i : ostride) + std::cout << " " << i; + std::cout << "\n"; + std::cout << "output buffer size: " << obytes << "\n"; + std::cout << std::endl; + + // Set the device: + if(hipSetDevice(deviceId) != hipSuccess) + throw std::runtime_error("hipSetDevice failed."); + + // Create HIP device object and initialize data + // Kernels are provided in examplekernels.h + void* gpu_in = nullptr; + hipError_t hip_status = hipMalloc(&gpu_in, inplace ? 
std::max(ibytes, obytes) : ibytes); + if(hip_status != hipSuccess) + throw std::runtime_error("device error"); + + if(forward) + { + initreal_cm(length, istride, gpu_in); + } + else + { + init_hermitiancomplex_cm(length, ilength, istride, gpu_in); + } + + // Print the input: + std::cout << "input:\n"; + if(forward) + { + hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + printbuffer_cm(rdata, ilength, istride, 1, isize); + } + else + { + hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + printbuffer_cm(cdata, ilength, istride, 1, isize); + + // Check that the buffer is Hermitian symmetric: + check_symmetry_cm(cdata, length, istride, 1, isize); + } + + // rocfft_status can be used to capture API status info + rocfft_status rc = rocfft_status_success; + + // Create a description struct to set data layout: + rocfft_plan_description gpu_description = nullptr; + rc = rocfft_plan_description_create(&gpu_description); + if(rc != rocfft_status_success) + throw std::runtime_error("failed to create plan description"); + + rc = rocfft_plan_description_set_data_layout( + gpu_description, + // input data format: + forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved, + // output data format: + forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real, + nullptr, + nullptr, + istride.size(), // input stride length + istride.data(), // input stride data + 0, // input batch distance + ostride.size(), // output stride length + ostride.data(), // output stride data + 0); // output batch distance + if(rc != rocfft_status_success) + throw std::runtime_error("failed to set data layout"); + + // We can also pass "nullptr" instead of a description; rocFFT will use reasonable + // default parameters. 
If the data isn't contiguous, we need to set strides, etc, + // using the description. + + // Create the FFT plan: + rocfft_plan gpu_plan = nullptr; + rc = rocfft_plan_create(&gpu_plan, + place, + direction, + rocfft_precision_double, + length.size(), // Dimension + length.data(), // lengths + 1, // Number of transforms + gpu_description); // Description + if(rc != rocfft_status_success) + throw std::runtime_error("failed to create plan"); + + // Get the execution info for the fft plan (in particular, work memory requirements): + rocfft_execution_info planinfo = nullptr; + rc = rocfft_execution_info_create(&planinfo); + if(rc != rocfft_status_success) + throw std::runtime_error("failed to create execution info"); + + size_t workbuffersize = 0; + rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); + if(rc != rocfft_status_success) + throw std::runtime_error("failed to get work buffer size"); + + // If the transform requires work memory, allocate a work buffer: + void* wbuffer = nullptr; + if(workbuffersize > 0) + { + hip_status = hipMalloc(&wbuffer, workbuffersize); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMalloc failed"); + + rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); + if(rc != rocfft_status_success) + throw std::runtime_error("failed to set work buffer"); + } + + // If the transform is out-of-place, allocate the output buffer as well: + void* gpu_out = inplace ? 
gpu_in : nullptr; + if(!inplace) + { + hip_status = hipMalloc(&gpu_out, obytes); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMalloc failed"); + } + + // Execute the GPU transform: + rc = rocfft_execute(gpu_plan, // plan + (void**)&gpu_in, // in_buffer + (void**)&gpu_out, // out_buffer + planinfo); // execution info + if(rc != rocfft_status_success) + throw std::runtime_error("failed to execute"); + + // Get the output from the device and print to cout: + std::cout << "output:\n"; + if(forward) + { + hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + printbuffer_cm(cdata, olength, ostride, 1, osize); + } + else + { + hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); + if(hip_status != hipSuccess) + throw std::runtime_error("hipMemcpy failed."); + printbuffer_cm(rdata, olength, ostride, 1, osize); + } + + // Clean up: free GPU memory: + if(hipFree(gpu_in) != hipSuccess) + throw std::runtime_error("hipFree failed."); + + if(!inplace) + { + if(hipFree(gpu_out) != hipSuccess) + throw std::runtime_error("hipFree failed."); + } + if(wbuffer != nullptr) + { + if(hipFree(wbuffer) != hipSuccess) + throw std::runtime_error("hipFree failed."); + } + + // Clean up: destroy plans: + if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) + throw std::runtime_error("rocfft_execution_info_destroy failed."); + planinfo = nullptr; + if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_description_destroy failed."); + gpu_description = nullptr; + if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) + throw std::runtime_error("rocfft_plan_destroy failed."); + gpu_plan = nullptr; + + rocfft_cleanup(); + return 0; +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77acf3f7d3..2f4d54976a 100644 --- a/src/CMakeLists.txt +++ 
b/src/CMakeLists.txt @@ -180,6 +180,8 @@ set( MIOpen_Source rnn/Solutions/bwd_s_stream.cpp rnn/Solutions/bwd_multi_stream.cpp scalar.cpp + sigmoidfocalloss/problem_description.cpp + sigmoid_focal_loss_api.cpp softmax.cpp softmax_api.cpp softmax/problem_description.cpp @@ -305,6 +307,10 @@ set( MIOpen_Source solver/reduce/forward_min.cpp solver/reduce/forward_prod.cpp solver/reduce/forward_sum.cpp + solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp + solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp + solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp + solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp subbuffers.cpp @@ -461,6 +467,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/stride_array.hpp kernels/tensor_view.hpp kernels/utilities.inc + kernels/warp_shuffle.hpp kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc @@ -503,6 +510,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl kernels/MIOpenNeuron.cl + kernels/MIOpenLossSum.cpp kernels/MIOpenPooling.cl kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl @@ -548,6 +556,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/gcnAsmBNBwdTrainSpatial.s kernels/MIOpenTensorKernels.cl kernels/MIOpenTensorKernelsHip.cpp + kernels/MIOpenSigmoidFocalLoss.cpp kernels/MIOpenSubTensorOpWithScalarKernel.cl kernels/MIOpenSubTensorOpWithSubTensorKernel.cl kernels/MIOpenSubTensorOpWithCastTensorKernel.cl @@ -656,6 +665,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN 
ocl/fusionopbiasbnactivocl.cpp reducecalculation.cpp reduceextreme.cpp + sigmoid_focal_loss.cpp transformers_adam_w.cpp ${PROJECT_BINARY_DIR}/db_path.cpp ) diff --git a/src/include/miopen/sigmoid_focal_loss.hpp b/src/include/miopen/sigmoid_focal_loss.hpp new file mode 100644 index 0000000000..07d3e32d61 --- /dev/null +++ b/src/include/miopen/sigmoid_focal_loss.hpp @@ -0,0 +1,71 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_SIGMOID_FOCAL_LOSS_HPP_ +#define MIOPEN_SIGMOID_FOCAL_LOSS_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +size_t GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& targetDesc, + const TensorDescriptor& outputDesc, + miopenLossReductionMode_t reduction); + +miopenStatus_t SigmoidFocalLossForward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& targetDesc, + ConstData_t target, + const TensorDescriptor& outputDesc, + Data_t output, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + +miopenStatus_t SigmoidFocalLossBackward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& targetDesc, + ConstData_t target, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dtargetDesc, + Data_t dtarget, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + +} // namespace miopen +#endif // MIOPEN_SIGMOID_FOCAL_LOSS_HPP_ diff --git a/src/include/miopen/sigmoidfocalloss/invoke_params.hpp b/src/include/miopen/sigmoidfocalloss/invoke_params.hpp new file mode 100644 index 0000000000..e2801cead2 --- /dev/null +++ b/src/include/miopen/sigmoidfocalloss/invoke_params.hpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { + +namespace sigmoidfocalloss { + +struct SigmoidFocalLossInvokeParams : public miopen::InvokeParams +{ + SigmoidFocalLossInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* targetDesc = nullptr; + + ConstData_t input = nullptr; + ConstData_t target = nullptr; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + float alpha = 0.25; + float gamma = 2.0f; + miopenLossReductionMode_t reduction = MIOPEN_LOSS_REDUCTION_NONE; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +struct FwdInvokeParams : SigmoidFocalLossInvokeParams +{ + FwdInvokeParams() = default; + + const TensorDescriptor* outputDesc = nullptr; + Data_t output = nullptr; +}; + +struct BwdInvokeParams : SigmoidFocalLossInvokeParams +{ + BwdInvokeParams() = default; + + const TensorDescriptor* doutputDesc = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* dtargetDesc = nullptr; + + ConstData_t doutput = nullptr; + Data_t dinput = nullptr; // gradient of input: an output of the backward call, so mutable + Data_t dtarget = nullptr; // gradient of target: an output of the backward call, so mutable +}; + +} // namespace sigmoidfocalloss + +} // namespace miopen diff --git a/src/include/miopen/sigmoidfocalloss/problem_description.hpp b/src/include/miopen/sigmoidfocalloss/problem_description.hpp new file mode 100644 index 0000000000..3590b5c3d4 --- /dev/null +++ b/src/include/miopen/sigmoidfocalloss/problem_description.hpp @@ -0,0 +1,118 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace sigmoidfocalloss { + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +struct SigmoidFocalLossProblemDescription : ProblemDescriptionBase +{ + SigmoidFocalLossProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const miopenLossReductionMode_t reduction_) + : inputDesc(inputDesc_), targetDesc(targetDesc_), reduction(reduction_) + { + if(!checkSameLength(inputDesc, targetDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "SigmoidFocalLoss: Input, target tensor sizes do not match."); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetTargetDesc() const { return targetDesc; } + +public: + TensorDescriptor inputDesc; + TensorDescriptor targetDesc; + miopenLossReductionMode_t reduction; +}; + +struct SigmoidFocalLossFwdProblemDescription : SigmoidFocalLossProblemDescription +{ + SigmoidFocalLossFwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& outputDesc_, + const miopenLossReductionMode_t reduction_) + : SigmoidFocalLossProblemDescription(inputDesc_, targetDesc_, reduction_), + outputDesc(outputDesc_) + { + miopenDataType_t dtype = inputDesc.GetType(); + if(dtype != targetDesc.GetType() || dtype != outputDesc.GetType()) + MIOPEN_THROW(miopenStatusBadParm, + "SigmoidFocalLoss: Input, target, output tensor type do not match."); + } + + NetworkConfig MakeNetworkConfig() const override; + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + +public: + TensorDescriptor outputDesc; +}; + +struct SigmoidFocalLossBwdProblemDescription : SigmoidFocalLossProblemDescription +{ + SigmoidFocalLossBwdProblemDescription(const TensorDescriptor& inputDesc_, + const 
TensorDescriptor& targetDesc_, + const TensorDescriptor& doutputDesc_, + const TensorDescriptor& dinputDesc_, + const TensorDescriptor& dtargetDesc_, + const miopenLossReductionMode_t reduction_) + : SigmoidFocalLossProblemDescription(inputDesc_, targetDesc_, reduction_), + doutputDesc(doutputDesc_), + dinputDesc(dinputDesc_), + dtargetDesc(dtargetDesc_) + { + miopenDataType_t dtype = inputDesc.GetType(); + if(dtype != targetDesc.GetType() || dtype != doutputDesc.GetType() || + dtype != dinputDesc.GetType() || dtype != dtargetDesc.GetType()) + MIOPEN_THROW(miopenStatusBadParm, + "SigmoidFocalLoss: Input, target, doutput, dinput, dtarget tensor type do " + "not match."); + } + + NetworkConfig MakeNetworkConfig() const override; + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDtargetDesc() const { return dtargetDesc; } + +public: + TensorDescriptor doutputDesc; + TensorDescriptor dinputDesc; + TensorDescriptor dtargetDesc; +}; + +} // namespace sigmoidfocalloss + +} // namespace miopen diff --git a/src/include/miopen/sigmoidfocalloss/solvers.hpp b/src/include/miopen/sigmoidfocalloss/solvers.hpp new file mode 100644 index 0000000000..992ad5a9d6 --- /dev/null +++ b/src/include/miopen/sigmoidfocalloss/solvers.hpp @@ -0,0 +1,121 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace sigmoidfocalloss { + +using SigmoidFocalLossFwdSolverBase = + NonTunableSolverBase; + +struct SigmoidFocalLossFwd final : SigmoidFocalLossFwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& + problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& + problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) + const override; + + bool MayNeedWorkspace() const override { return true; } +}; + +using SigmoidFocalLossBwdSolverBase = + NonTunableSolverBase; + +struct SigmoidFocalLossBwd final : SigmoidFocalLossBwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& + problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& + problem) const override; +}; + +using SigmoidFocalLossUnreducedFwdSolverBase = + NonTunableSolverBase; + +struct SigmoidFocalLossUnreducedFwd final : SigmoidFocalLossUnreducedFwdSolverBase +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& + problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const 
miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& + problem) const override; +}; + +using SigmoidFocalLossUnreducedBwdSolverBase = + NonTunableSolverBase; + +struct SigmoidFocalLossUnreducedBwd final : SigmoidFocalLossUnreducedBwdSolverBase +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& + problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& + problem) const override; +}; + +} // namespace sigmoidfocalloss + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/sigmoidfocalloss/utils.hpp b/src/include/miopen/sigmoidfocalloss/utils.hpp new file mode 100644 index 0000000000..0dddceea7e --- /dev/null +++ b/src/include/miopen/sigmoidfocalloss/utils.hpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + miopen::KernelBuildParameters build_params) { /* Builds a HIP KernelInfo: pads the local and grid size vectors with 1 until each has at least three entries, then rounds every grid dimension up with AlignUp against the matching local size. */ + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(size_t i = 0; i < localsize.size(); ++i) /* unsigned index: avoids a signed/unsigned comparison against size() */ + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return miopen::solver::KernelInfo{build_params.GenerateFor(miopen::kbp::HIP{}), + localsize, + gridsize, + kernel_file, + kernel_name}; +}; diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 81c15f6bea..9f79cefc6b 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -59,7 +59,8 @@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + Loss }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..77a9b6ddae 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -27,8 +27,8 @@ #ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_ #define MIOPEN_TENSOR_VIEW_UTIL_HPP_ -#include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { @@ -38,10 +38,15 @@ inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) auto dims = Desc.GetLengths(); auto strides = Desc.GetStrides(); - tensor_view_t tensor_view; + tensor_view_t tensor_view{}; for(size_t i = 0; i < N; ++i) { - if(i < dims.size()) + if(dims.empty()) + { + tensor_view.stride[i] = 0; + 
tensor_view.size[i] = 0; + } + else if(i < dims.size()) { tensor_view.stride[i] = strides[i]; tensor_view.size[i] = dims[i]; diff --git a/src/kernels/MIOpenLossSum.cpp b/src/kernels/MIOpenLossSum.cpp new file mode 100644 index 0000000000..08d3a656f6 --- /dev/null +++ b/src/kernels/MIOpenLossSum.cpp @@ -0,0 +1,56 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "warp_shuffle.hpp" + +#ifndef IN_OUT_TYPE +#define IN_OUT_TYPE float +#endif + +template +__device__ void losssum(const TIO* input, TIO* output, size_t N) /* Block-level partial sum: each thread loads input[gid] (or 0 when gid >= N), the block's values are combined with block_reduce_sum, and thread 0 stores the result at output[blockIdx.x]. */ +{ + auto gid = blockIdx.x * blockDim.x + threadIdx.x; + + FLOAT_ACCUM val = gid < N ? CVT_FLOAT2ACCUM(input[gid]) : static_cast(0.0f); + val = block_reduce_sum(val); + + if(threadIdx.x == 0) + output[blockIdx.x] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void +LossSum(const IN_OUT_TYPE* __restrict__ input, IN_OUT_TYPE* __restrict__ output, size_t N) /* Kernel entry point; forwards to losssum for the element type selected by IN_OUT_TYPE. */ +{ + // instantiate the kernel + losssum(input, output, N); +} diff --git a/src/kernels/MIOpenSigmoidFocalLoss.cpp b/src/kernels/MIOpenSigmoidFocalLoss.cpp new file mode 100644 index 0000000000..75c25c0e42 --- /dev/null +++ b/src/kernels/MIOpenSigmoidFocalLoss.cpp @@ -0,0 +1,329 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef IN_OUT_TYPE +#define IN_OUT_TYPE float +#endif + +#ifndef CVT_ACCUM2FLOAT +#define CVT_ACCUM2FLOAT(x) (float_to_bfloat16(x)) +#endif + +#ifndef CVT_FLOAT2ACCUM +#define CVT_FLOAT2ACCUM(x) (bfloat16_to_float(x)) +#endif + +template +__device__ void sigmoidFocalLossFwd(const TIO* input, + TIO* target, + TIO* workspace, + float alpha, + float gamma, + float divisor, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv) /* Per-element focal loss for the reduced path: p = sigmoid(input), loss = ce(p, t) * (1 - p_t)^gamma, weighted by alpha_t only when alpha >= 0; loss / divisor is stored at workspace[gid]. Threads whose leading index falls outside input_tv.size[0] return early. */ +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> idx(input_tv, gid); + if(idx.layout[0] >= input_tv.size[0]) + return; + + FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]); + FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); + + FLOAT_ACCUM p = 1 / (1 + exp(-i)); + FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); + FLOAT_ACCUM loss = ceLoss * pow(1 - pT, gamma); + + if(alpha >= 0) + { + FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t); + loss = alpha_t * loss; + } + + workspace[gid] = CVT_ACCUM2FLOAT(loss / divisor); +} + +extern "C" __global__ void SigmoidFocalLossFwd(const IN_OUT_TYPE* input, + IN_OUT_TYPE* target, + IN_OUT_TYPE* workspace, + float alpha, + float gamma, + float divisor, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv) +{ + sigmoidFocalLossFwd( + input, target, workspace, alpha, gamma, divisor, input_tv, target_tv); +} + +template +__device__ void sigmoidFocalLossBwd(const TIO* 
input, + const TIO* target, + const TIO* doutput, + TIO* dinput, + TIO* dtarget, + float alpha, + float gamma, + float divisor, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv, + tensor_view_t<5> doutput_tv, + tensor_view_t<5> dinput_tv, + tensor_view_t<5> dtarget_tv) /* Gradients for the reduced loss: every element reads the single upstream value at doutput index 0 and the results are divided by divisor. dinput and dtarget are each written only when their pointer is non-null. */ +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> idx(input_tv, gid); + tensor_layout_t<5> doIdx(doutput_tv, 0); + if(idx.layout[0] >= input_tv.size[0]) + return; + + FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]); + FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); + FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); + + FLOAT_ACCUM p = 1 / (1 + exp(-i)); + FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); + FLOAT_ACCUM powPt = pow(1 - pT, gamma); + FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t); + + if(dinput) + { + FLOAT_ACCUM dpdi = exp(-i) / pow(1 + exp(-i), 2); + // dceloss/di = dceloss/dp * dp/di + FLOAT_ACCUM dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + // dpowt/di = dpowt/dpT * dpT/dp * dp/di + FLOAT_ACCUM dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + + // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di + FLOAT_ACCUM dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + FLOAT_ACCUM grad = dO * dLdi; + + if(alpha >= 0) + { + grad *= alpha_t; + } + grad /= divisor; + dinput[dinput_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(grad); + } + + if(dtarget) + { + FLOAT_ACCUM dcelossdt = -log(p) + log(1 - p); + FLOAT_ACCUM dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt + FLOAT_ACCUM dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + FLOAT_ACCUM gradTarget = dO * dLdt; + + if(alpha >= 0) + { + // alpha_t * dL/dt + dalpha_t/dt * dL + gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * 
ceLoss * powPt); /* fix: keep the upstream gradient dO on the alpha-weighted derivative, matching the alpha < 0 path and dinput */ + } + gradTarget /= divisor; + dtarget[dtarget_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(gradTarget); + } +} + +extern "C" __global__ void SigmoidFocalLossBwd(const IN_OUT_TYPE* input, + IN_OUT_TYPE* target, + IN_OUT_TYPE* doutput, + IN_OUT_TYPE* dinput, + IN_OUT_TYPE* dtarget, + float alpha, + float gamma, + float divisor, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv, + tensor_view_t<5> doutput_tv, + tensor_view_t<5> dinput_tv, + tensor_view_t<5> dtarget_tv) +{ + sigmoidFocalLossBwd(input, + target, + doutput, + dinput, + dtarget, + alpha, + gamma, + divisor, + input_tv, + target_tv, + doutput_tv, + dinput_tv, + dtarget_tv); +} + +template +__device__ void sigmoidFocalLossUnreducedFwd(const TIO* input, + TIO* target, + TIO* output, + float alpha, + float gamma, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv, + tensor_view_t<5> output_tv) /* Same per-element loss as sigmoidFocalLossFwd, but without a divisor and written through output_tv at the element's own index. */ +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> idx(input_tv, gid); + if(idx.layout[0] >= input_tv.size[0]) + return; + + FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]); + FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); + + FLOAT_ACCUM p = 1 / (1 + exp(-i)); + FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); + FLOAT_ACCUM loss = ceLoss * pow(1 - pT, gamma); + + if(alpha >= 0) + { + FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t); + loss = alpha_t * loss; + } + + output[output_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss); +} + +extern "C" __global__ void SigmoidFocalLossUnreducedFwd(const IN_OUT_TYPE* input, + IN_OUT_TYPE* target, + IN_OUT_TYPE* output, + float alpha, + float gamma, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv, + tensor_view_t<5> output_tv) +{ + sigmoidFocalLossUnreducedFwd( + input, target, output, alpha, gamma, input_tv, target_tv, output_tv); +} + +template +__device__ void 
sigmoidFocalLossUnreducedBwd(const TIO* input, + const TIO* target, + const TIO* doutput, + TIO* dinput, + TIO* dtarget, + float alpha, + float gamma, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv, + tensor_view_t<5> doutput_tv, + tensor_view_t<5> dinput_tv, + tensor_view_t<5> dtarget_tv) /* Gradients for the unreduced loss: each element reads its own upstream value through doutput_tv and no divisor is applied. dinput and dtarget are each written only when their pointer is non-null. */ +{ + size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> idx(input_tv, gid); + if(idx.layout[0] >= input_tv.size[0]) + return; + + FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]); + FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); + FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(idx)]); + + FLOAT_ACCUM p = 1 / (1 + exp(-i)); + FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); + FLOAT_ACCUM powPt = pow(1 - pT, gamma); + FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t); + + if(dinput) + { + FLOAT_ACCUM dpdi = exp(-i) / pow(1 + exp(-i), 2); + // dceloss/di = dceloss/dp * dp/di + FLOAT_ACCUM dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + // dpowt/di = dpowt/dpT * dpT/dp * dp/di + FLOAT_ACCUM dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + + // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di + FLOAT_ACCUM dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + FLOAT_ACCUM grad = dO * dLdi; + + if(alpha >= 0) + { + grad *= alpha_t; + } + dinput[dinput_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(grad); + } + + if(dtarget) + { + FLOAT_ACCUM dcelossdt = -log(p) + log(1 - p); + FLOAT_ACCUM dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt + FLOAT_ACCUM dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + FLOAT_ACCUM gradTarget = dO * dLdt; + + if(alpha >= 0) + { + // alpha_t * dL/dt + dalpha_t/dt * dL + gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt); /* fix: keep the upstream gradient dO on the alpha-weighted derivative, matching the alpha < 0 path and dinput */ + } + 
dtarget[dtarget_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(gradTarget); + } +} + +extern "C" __global__ void SigmoidFocalLossUnreducedBwd(const IN_OUT_TYPE* input, + IN_OUT_TYPE* target, + IN_OUT_TYPE* doutput, + IN_OUT_TYPE* dinput, + IN_OUT_TYPE* dtarget, + float alpha, + float gamma, + tensor_view_t<5> input_tv, + tensor_view_t<5> target_tv, + tensor_view_t<5> doutput_tv, + tensor_view_t<5> dinput_tv, + tensor_view_t<5> dtarget_tv) +{ + sigmoidFocalLossUnreducedBwd(input, + target, + doutput, + dinput, + dtarget, + alpha, + gamma, + input_tv, + target_tv, + doutput_tv, + dinput_tv, + dtarget_tv); +} diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp new file mode 100644 index 0000000000..ebd5861976 --- /dev/null +++ b/src/kernels/warp_shuffle.hpp @@ -0,0 +1,72 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" + +#ifndef REDUCE_SIZE +#define REDUCE_SIZE 256 +#endif + +__device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val) /* Accumulates val across a warp using __shfl_down with offsets 32, 16, 8, 4, 2, 1, each step enabled only when warpSize is large enough. block_reduce_sum consumes the value held by lane 0. */ +{ + if(warpSize >= 64) + val += __shfl_down(val, 32); + if(warpSize >= 32) + val += __shfl_down(val, 16); + if(warpSize >= 16) + val += __shfl_down(val, 8); + if(warpSize >= 8) + val += __shfl_down(val, 4); + if(warpSize >= 4) + val += __shfl_down(val, 2); + if(warpSize >= 2) + val += __shfl_down(val, 1); + return val; +} + +__device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val) /* Two-stage block sum: each warp reduces its values, lane 0 of every warp stores the partial sum in a __shared__ buffer of REDUCE_SIZE / warpSize entries, and after __syncthreads() warp 0 reduces those partial sums. NOTE(review): the buffer size assumes blockDim.x <= REDUCE_SIZE - confirm at the launch sites. */ +{ + static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize]; + auto lane = threadIdx.x % warpSize; + auto wid = threadIdx.x / warpSize; + + val = warp_reduce_sum(val); + + if(lane == 0) + shared[wid] = val; + __syncthreads(); + + val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0; + if(wid == 0) + val = warp_reduce_sum(val); + + return val; +} diff --git a/src/sigmoid_focal_loss.cpp b/src/sigmoid_focal_loss.cpp new file mode 100644 index 0000000000..e1123a799c --- /dev/null +++ b/src/sigmoid_focal_loss.cpp @@ -0,0 +1,170 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +size_t GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& targetDesc, + const TensorDescriptor& outputDesc, + miopenLossReductionMode_t reduction) /* Returns 0 for MIOPEN_LOSS_REDUCTION_NONE; otherwise asks the solver container for workspace sizes and returns the first one, or static_cast(-1) when the container reports none. */ +{ + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + return 0; + } + + auto ctx = ExecutionContext{&handle}; + const auto problem = sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription{ + inputDesc, targetDesc, outputDesc, reduction}; + + const auto algo = AlgorithmName{"SigmoidFocalLossFwd"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? 
static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t SigmoidFocalLossForward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& targetDesc, + ConstData_t target, + const TensorDescriptor& outputDesc, + Data_t output, + float alpha, + float gamma, + miopenLossReductionMode_t reduction) /* Packs the arguments into FwdInvokeParams and executes the primitive under the "SigmoidFocalLossUnreducedFwd" algorithm name for MIOPEN_LOSS_REDUCTION_NONE, or "SigmoidFocalLossFwd" for any other reduction mode. Always returns miopenStatusSuccess when ExecutePrimitive returns normally. */ +{ + const auto problem = sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription{ + inputDesc, targetDesc, outputDesc, reduction}; + + const auto invoke_params = [&]() { + auto tmp = sigmoidfocalloss::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.target = target; + tmp.output = output; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + tmp.alpha = alpha; + tmp.gamma = gamma; + tmp.reduction = reduction; + return tmp; + }(); + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + const auto algo = AlgorithmName{"SigmoidFocalLossUnreducedFwd"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + } + else + { + const auto algo = AlgorithmName{"SigmoidFocalLossFwd"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + } + + return miopenStatusSuccess; +} + +miopenStatus_t SigmoidFocalLossBackward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& targetDesc, + ConstData_t target, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dtargetDesc, + Data_t dtarget, + float alpha, + float gamma, + const miopenLossReductionMode_t reduction) /* Backward counterpart: packs the arguments into BwdInvokeParams and executes under "SigmoidFocalLossUnreducedBwd" for MIOPEN_LOSS_REDUCTION_NONE, or "SigmoidFocalLossBwd" otherwise. No workspace is passed. */ +{ + const auto problem = sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription{ + inputDesc, targetDesc, doutputDesc, dinputDesc, dtargetDesc, 
reduction}; + + const auto invoke_params = [&]() { + auto tmp = sigmoidfocalloss::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinputDesc = &dinputDesc; + tmp.dtargetDesc = &dtargetDesc; + tmp.input = input; + tmp.target = target; + tmp.doutput = doutput; + tmp.dinput = dinput; + tmp.dtarget = dtarget; + tmp.alpha = alpha; + tmp.gamma = gamma; + tmp.reduction = reduction; + return tmp; + }(); + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + const auto algo = AlgorithmName{"SigmoidFocalLossUnreducedBwd"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + } + else + { + const auto algo = AlgorithmName{"SigmoidFocalLossBwd"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + } + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/sigmoid_focal_loss_api.cpp b/src/sigmoid_focal_loss_api.cpp new file mode 100644 index 0000000000..2cc511bb28 --- /dev/null +++ b/src/sigmoid_focal_loss_api.cpp @@ -0,0 +1,192 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) /* Streams the vector as a brace-enclosed, comma-separated list, e.g. {1,2,3}; an empty vector prints {}. */ +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdSigmoidFocalLoss(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t targetDesc, + bool is_fwd) /* When command logging is enabled, builds a driver command line (name chosen from the input dtype, then -n, -T, -Si, -St and -F with 1 for forward / 2 for backward) and logs it. For a dtype other than half, float or bfloat16 no command name is emitted. */ +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(inputDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "sigmoidFocalLossfp16"; + } + else if(dtype == miopenFloat) + { + ss << "sigmoidFocalLossfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "sigmoidFocalLossbfp16"; + } + + MIOPEN_LOG_FUNCTION(inputDesc, targetDesc); + ss << " -n " << miopen::deref(inputDesc).GetLengths()[0]; + ss << " -T " << miopen::deref(inputDesc).GetLengths(); + ss << " -Si " << miopen::deref(inputDesc).GetStrides(); + ss << " -St " << miopen::deref(targetDesc).GetStrides(); + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t targetDesc, + const miopenTensorDescriptor_t outputDesc, + miopenLossReductionMode_t reduction, + size_t* sizeInBytes) /* C entry point: stores the result of miopen::GetSigmoidFocalLossForwardWorkspaceSize in *sizeInBytes; the call is wrapped in miopen::try_, whose result is returned as the status. */ +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, targetDesc, outputDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = + miopen::GetSigmoidFocalLossForwardWorkspaceSize(miopen::deref(handle), + miopen::deref(inputDesc), + miopen::deref(targetDesc), + miopen::deref(outputDesc), + reduction); + }); +} + +extern "C" miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t targetDesc, + const void* target, + const miopenTensorDescriptor_t outputDesc, + void* output, + const float alpha, + const float gamma, + const miopenLossReductionMode_t reduction) /* C entry point for the forward pass: logs the call and the driver command, then dereferences the handles/descriptors and forwards everything to miopen::SigmoidFocalLossForward inside miopen::try_. */ +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + inputDesc, + input, + targetDesc, + target, + outputDesc, + output, + alpha, + gamma, + reduction); + + LogCmdSigmoidFocalLoss(inputDesc, targetDesc, true); + + return miopen::try_([&] { + miopen::SigmoidFocalLossForward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(targetDesc), + DataCast(target), + miopen::deref(outputDesc), + DataCast(output), + alpha, + gamma, + reduction); + }); +} + +extern "C" miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dtargetDesc, 
+ void* dtarget, + float alpha, + float gamma, + const miopenLossReductionMode_t reduction) /* C entry point for the backward pass: logs the call and the driver command, then forwards the dereferenced arguments to miopen::SigmoidFocalLossBackward inside miopen::try_. */ +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + targetDesc, + target, + doutputDesc, + doutput, + dinputDesc, + dinput, + dtargetDesc, + dtarget, + alpha, + gamma, + reduction); + + LogCmdSigmoidFocalLoss(inputDesc, targetDesc, false); + + return miopen::try_([&] { + miopen::SigmoidFocalLossBackward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(targetDesc), + DataCast(target), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dtargetDesc), + DataCast(dtarget), + alpha, + gamma, + reduction); + }); +} diff --git a/src/sigmoidfocalloss/problem_description.cpp b/src/sigmoidfocalloss/problem_description.cpp new file mode 100644 index 0000000000..825df9286e --- /dev/null +++ b/src/sigmoidfocalloss/problem_description.cpp @@ -0,0 +1,88 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace sigmoidfocalloss { + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y) /* True when both descriptors have the same number of dimensions and identical lengths in every dimension; strides are not compared. */ +{ + if(x.GetNumDims() != y.GetNumDims()) + return false; + for(int32_t i = 0; i < x.GetNumDims(); ++i) + { + if(x.GetLengths()[i] != y.GetLengths()[i]) + return false; + } + return true; +} + +NetworkConfig SigmoidFocalLossBwdProblemDescription::MakeNetworkConfig() const /* Builds the backward config key from the reduction mode, input and target dtypes, number of input dimensions and input element count. */ +{ + auto input_dtype = inputDesc.GetType(); + auto target_dtype = targetDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto dim_num = inputDesc.GetNumDims(); + + std::ostringstream ss; + + ss << "sfl_bwd"; + ss << "reduction" << reduction; + ss << "i_dtype" << input_dtype; + ss << "t_dtype" << target_dtype; + ss << "dim_num" << dim_num; + ss << "size" << size; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig SigmoidFocalLossFwdProblemDescription::MakeNetworkConfig() const /* Forward counterpart: same fields as the backward key, prefixed with "sfl_fwd". */ +{ + auto input_dtype = inputDesc.GetType(); + auto target_dtype = targetDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto dim_num = inputDesc.GetNumDims(); + + std::ostringstream ss; + + ss << "sfl_fwd"; + ss << "reduction" << reduction; + ss << "i_dtype" << input_dtype; + ss << "t_dtype" << target_dtype; + ss << "dim_num" << dim_num; + ss << "size" << size; + + return NetworkConfig{ss.str()}; +} + +} // namespace sigmoidfocalloss + +} // namespace miopen diff --git a/src/solver.cpp b/src/solver.cpp index 6b451ca498..91def5d6eb 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -673,6 +674,17 @@ inline 
SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); + Register(registry, + ++id, + Primitive::Loss, + sigmoidfocalloss::SigmoidFocalLossUnreducedFwd{}.SolverDbId()); + Register(registry, + ++id, + Primitive::Loss, + sigmoidfocalloss::SigmoidFocalLossUnreducedBwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossBwd{}.SolverDbId()); + // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp new file mode 100644 index 0000000000..4e5046da49 --- /dev/null +++ b/src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp @@ -0,0 +1,119 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Applicable when the input tensor has at most 5 dimensions, which is the rank
+// of the get_inner_expanded_tv<5> views built by the invoker below.
+bool SigmoidFocalLossBwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    if(problem.GetInputDesc().GetNumDims() > 5)
+        return false;
+    return true;
+}
+
+// Builds the solution for the reduced backward pass: a single
+// "SigmoidFocalLossBwd" kernel from MIOpenSigmoidFocalLoss.cpp with a global
+// size equal to the number of input elements. The invoker passes
+// divisor = input element count for MIOPEN_LOSS_REDUCTION_MEAN and 1 otherwise.
+ConvSolution SigmoidFocalLossBwd::GetSolution(
+    const ExecutionContext& context,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetDinputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE must follow the target tensor's own data type; the
+        // fallback previously used the input's type for non-bfloat16 targets.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE},
+                                                         {problem.GetInputDesc().GetElementSize()},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossBwd",
+                                                         build_params));
+
+    result.invoker_factory = [](const std::vector& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo();
+            auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc));
+            auto target_tv = get_inner_expanded_tv<5>(deref(params.targetDesc));
+            auto doutput_tv = get_inner_expanded_tv<5>(deref(params.doutputDesc));
+            auto dinput_tv = get_inner_expanded_tv<5>(deref(params.dinputDesc));
+            auto dtarget_tv = get_inner_expanded_tv<5>(deref(params.dtargetDesc));
+            // Mean reduction divides by the number of input elements.
+            float divisor = 1;
+            if(params.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+            {
+                divisor = deref(params.inputDesc).GetElementSize();
+            }
+
+            kernel(params.input,
+                   params.target,
+                   params.doutput,
+                   params.dinput,
+                   params.dtarget,
+                   params.alpha,
+                   params.gamma,
+                   divisor,
+                   input_tv,
+                   target_tv,
+                   doutput_tv,
+                   dinput_tv,
+                   dtarget_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..8d34198d73
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,113 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Applicable when the input tensor has at most 5 dimensions, which is the rank
+// of the get_inner_expanded_tv<5> views built by the invoker below.
+bool SigmoidFocalLossUnreducedBwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    if(problem.GetInputDesc().GetNumDims() > 5)
+        return false;
+    return true;
+}
+
+// Builds the solution for the unreduced backward pass: a single
+// "SigmoidFocalLossUnreducedBwd" kernel from MIOpenSigmoidFocalLoss.cpp with a
+// global size equal to the number of input elements. No divisor is passed.
+ConvSolution SigmoidFocalLossUnreducedBwd::GetSolution(
+    const ExecutionContext& context,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetDinputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE must follow the target tensor's own data type; the
+        // fallback previously used the input's type for non-bfloat16 targets.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE},
+                                                         {problem.GetInputDesc().GetElementSize()},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossUnreducedBwd",
+                                                         build_params));
+
+    result.invoker_factory = [](const std::vector& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo();
+            auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc));
+            auto target_tv = get_inner_expanded_tv<5>(deref(params.targetDesc));
+            auto doutput_tv = get_inner_expanded_tv<5>(deref(params.doutputDesc));
+            auto dinput_tv = get_inner_expanded_tv<5>(deref(params.dinputDesc));
+            auto dtarget_tv = get_inner_expanded_tv<5>(deref(params.dtargetDesc));
+
+            kernel(params.input,
+                   params.target,
+                   params.doutput,
+                   params.dinput,
+                   params.dtarget,
+                   params.alpha,
+                   params.gamma,
+                   input_tv,
+                   target_tv,
+                   doutput_tv,
+                   dinput_tv,
+                   dtarget_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..f7daa8b84c
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,186 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+#define LOCAL_SIZE_REDUCE_FWD 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Applicable when the input tensor has at most 5 dimensions, which is the rank
+// of the get_inner_expanded_tv<5> views built by the invoker below.
+bool SigmoidFocalLossFwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    if(problem.GetInputDesc().GetNumDims() > 5)
+        return false;
+    return true;
+}
+
+// Builds the solution for the reduced forward pass. kernels[0] is the
+// "SigmoidFocalLossFwd" loss kernel, which writes per-element results into the
+// workspace; kernels[1..] are "LossSum" passes, each shrinking the element
+// count by a factor of LOCAL_SIZE_REDUCE_FWD (rounded up), with the last pass
+// writing to params.output.
+ConvSolution SigmoidFocalLossFwd::GetSolution(
+    const ExecutionContext& context,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    std::ignore = context;
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetOutputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+    auto size = problem.GetInputDesc().GetElementSize();
+
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE must follow the target tensor's own data type; the
+        // fallback previously used the input's type for non-bfloat16 targets.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    /* Prepare params for loss kernel */
+    result.construction_params.push_back(make_hip_kernel(
+        {LOCAL_SIZE}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params));
+
+    /* Prepare params for reduce kernels */
+    // One LossSum launch per reduction level; at least one is always queued.
+    auto _size = size;
+    do
+    {
+        result.construction_params.push_back(make_hip_kernel(
+            {LOCAL_SIZE_REDUCE_FWD}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
+        _size = AlignUp(_size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
+    } while(_size > 1);
+
+    result.invoker_factory = [](const std::vector& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) params = raw_params.CastTo();
+            auto size = deref(params.inputDesc).GetElementSize();
+
+            auto elapsed = 0.f;
+            HipEventPtr start;
+            HipEventPtr stop;
+
+            // Per-kernel profiling is switched off while the kernels run; the
+            // whole sequence is timed with a start/stop event pair instead.
+            bool resetProfilingState = false;
+            if(handle_.IsProfilingEnabled())
+            {
+                resetProfilingState = true;
+                handle_.EnableProfiling(false);
+                start = miopen::make_hip_event();
+                stop = miopen::make_hip_event();
+                hipEventRecord(start.get(), handle_.GetStream());
+            }
+
+            /* Execute loss kernel */
+            {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc));
+                auto target_tv = get_inner_expanded_tv<5>(deref(params.targetDesc));
+                // Mean reduction divides by the number of input elements.
+                float divisor = 1;
+                if(params.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+                {
+                    divisor = size;
+                }
+
+                kernel(params.input,
+                       params.target,
+                       params.workspace,
+                       params.alpha,
+                       params.gamma,
+                       divisor,
+                       input_tv,
+                       target_tv);
+            }
+
+            /* Execute reduce kernels */
+            // reduceOut starts right after the first (input element count)
+            // entries of the workspace; the two regions are swapped after
+            // every intermediate pass.
+            auto reduceIn = params.workspace;
+            auto reduceOut =
+                static_cast(static_cast(params.workspace) +
+                            deref(params.inputDesc).GetElementSize() *
+                                get_data_size(deref(params.outputDesc).GetType()));
+            for(int i = 1; i < kernels.size(); ++i)
+            {
+                decltype(auto) kernel = handle_.Run(kernels[i]);
+                if(i + 1 != kernels.size())
+                {
+                    kernel(reduceIn, reduceOut, size);
+                    std::swap(reduceIn, reduceOut);
+                }
+                else
+                {
+                    // Last pass writes the final value to the output buffer.
+                    kernel(reduceIn, params.output, size);
+                }
+                size = AlignUp(size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
+            }
+
+            if(resetProfilingState)
+            {
+                handle_.EnableProfiling(true);
+            }
+
+            if(handle_.IsProfilingEnabled())
+            {
+                hipEventRecord(stop.get(), handle_.GetStream());
+                hipEventSynchronize(stop.get());
+                hipEventElapsedTime(&elapsed, start.get(), stop.get());
+                handle_.ResetKernelTime();
+                handle_.AccumKernelTime(elapsed);
+            };
+        };
+    };
+
+    return result;
+}
+
+// Workspace holds one value per input element plus one value per
+// LOCAL_SIZE_REDUCE_FWD-sized chunk (rounded up), each of the output data type's
+// size.
+std::size_t SigmoidFocalLossFwd::GetWorkspaceSize(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    size_t inputElements = problem.GetInputDesc().GetElementSize();
+    size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE_FWD - 1) / LOCAL_SIZE_REDUCE_FWD;
+    size_t res =
+        (inputElements + reduceElements) * get_data_size(problem.GetOutputDesc().GetType());
+
+    return res;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..91e8b48e49
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,107 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Applicable when the input tensor has at most 5 dimensions, which is the rank
+// of the get_inner_expanded_tv<5> views built by the invoker below.
+bool SigmoidFocalLossUnreducedFwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    if(problem.GetInputDesc().GetNumDims() > 5)
+        return false;
+    return true;
+}
+
+// Builds the solution for the unreduced forward pass: a single
+// "SigmoidFocalLossUnreducedFwd" kernel from MIOpenSigmoidFocalLoss.cpp with a
+// global size equal to the number of input elements.
+ConvSolution SigmoidFocalLossUnreducedFwd::GetSolution(
+    const ExecutionContext& context,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetOutputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE must follow the target tensor's own data type; the
+        // fallback previously used the input's type for non-bfloat16 targets.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE},
+                                                         {problem.GetInputDesc().GetElementSize()},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossUnreducedFwd",
+                                                         build_params));
+
+    result.invoker_factory = [](const std::vector& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo();
+            auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc));
+            auto target_tv = get_inner_expanded_tv<5>(deref(params.targetDesc));
+            auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc));
+
+            kernel(params.input,
+                   params.target,
+                   params.output,
+                   params.alpha,
+                   params.gamma,
+                   input_tv,
+                   target_tv,
+                   output_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/test/cpu_sigmoid_focal_loss.hpp b/test/cpu_sigmoid_focal_loss.hpp
new file mode 100644
index 0000000000..3b13b955e3
--- /dev/null
+++ b/test/cpu_sigmoid_focal_loss.hpp
@@ -0,0 +1,238 @@
+#pragma once
+
+#include "tensor_holder.hpp"
+#include "tensor_view.hpp"
+#include
+#include
+
+// Host reference for the unreduced forward pass. For every input element:
+//   sig  = sigmoid(input)
+//   ce   = -(t * log(sig) + (1 - t) * log(1 - sig))
+//   loss = ce * (1 - (sig * t + (1 - sig) * (1 - t)))^gamma
+// and, when alpha >= 0, loss is scaled by alpha * t + (1 - alpha) * (1 - t).
+// Results are written to outputHost through its own tensor view.
+template
+void cpu_sigmoid_focal_loss_unreduced_forward(tensor input,
+                                              tensor target,
+                                              tensor& outputHost,
+                                              float alpha = 0.25,
+                                              float gamma = 2)
+{
+    auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto output_tv = miopen::get_inner_expanded_tv<5>(outputHost.desc);
+    size_t inputSize = input.desc.GetElementSize();
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]);
+        float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]);
+
+        float sig = 1 / (1 + std::exp(-i));
+        float ceLoss = -(t * std::log(sig) + (1 - t) * std::log(1 - sig));
+        float sigT = sig * t + (1 - sig) * (1 - t);
+        float loss = ceLoss * std::pow(1 - sigT, gamma);
+
+        if(alpha >= 0)
+        {
+            float alphaT = alpha * t + (1 - alpha) * (1 - t);
+            loss = alphaT * loss;
+        }
+
+        outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss);
+    }
+}
+
+// Host reference for the unreduced backward pass. doutput is read per element
+// (same logical index as input). dinput and dtarget are only written when their
+// data buffers are non-empty.
+template
+void cpu_sigmoid_focal_loss_unreduced_backward(tensor input,
+                                               tensor target,
+                                               tensor doutput,
+                                               tensor& dinput,
+                                               tensor& dtarget,
+                                               float alpha = 0.25,
+                                               float gamma = 2)
+{
+    auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto doutput_tv = miopen::get_inner_expanded_tv<5>(doutput.desc);
+    auto dinput_tv = miopen::get_inner_expanded_tv<5>(dinput.desc);
+    auto dtarget_tv = miopen::get_inner_expanded_tv<5>(dtarget.desc);
+    size_t inputSize = input.desc.GetElementSize();
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]);
+        float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]);
+        float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]);
+
+        float p = 1 / (1 + std::exp(-i));
+        float ceLoss = -(t * std::log(p) + (1 - t) * std::log(1 - p));
+        float pT = p * t + (1 - p) * (1 - t);
+        float powPt = std::pow(1 - pT, gamma);
+        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+        if(dinput.data.size() > 0)
+        {
+            float dpdi = std::exp(-i) / std::pow(1 + std::exp(-i), 2);
+            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            float dpowptdi = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+            // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            float grad = dO * dLdi;
+
+            if(alpha >= 0)
+            {
+                grad *= alpha_t;
+            }
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad);
+        }
+
+        if(dtarget.data.size() > 0)
+        {
+            float dcelossdt = -std::log(p) + std::log(1 - p);
+            float dpowptdt = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+            float dLdt = dcelossdt * powPt + ceLoss * dpowptdt;
+            float gradTarget = dO * dLdt;
+
+            if(alpha >= 0)
+            {
+                // d(alpha_t * L)/dt = alpha_t * dL/dt + dalpha_t/dt * L, and the
+                // whole expression is scaled by the upstream gradient dO (it was
+                // previously dropped on this branch).
+                // NOTE(review): the GPU kernel is not visible here; confirm it
+                // applies dO the same way.
+                gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt);
+            }
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget);
+        }
+    }
+}
+
+// Host reference for the reduced forward pass. Step 1 stores loss / divisor for
+// every element at workspace[id]. Step 2 sums the values in chunks of 256,
+// ping-ponging between workspace offsets 0 and inputSize; the pass that handles
+// at most 256 values writes its sum to outputHost[0].
+template
+void cpu_sigmoid_focal_loss_forward(tensor input,
+                                    tensor target,
+                                    tensor& workspace,
+                                    tensor& outputHost,
+                                    float alpha = 0.25,
+                                    float gamma = 2,
+                                    float divisor = 1)
+{
+    auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc);
+    size_t inputSize = input.desc.GetElementSize();
+    // float reduction_float;
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]);
+        float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]);
+
+        float sig = 1 / (1 + std::exp(-i));
+        float ceLoss = -(t * std::log(sig) + (1 - t) * std::log(1 - sig));
+        float sigT = sig * t + (1 - sig) * (1 - t);
+        float loss = ceLoss * std::pow(1 - sigT, gamma);
+
+        if(alpha >= 0)
+        {
+            float alphaT = alpha * t + (1 - alpha) * (1 - t);
+            loss = alphaT * loss;
+        }
+        // reduction_float += (loss / divisor);
+
+        workspace[id] = static_cast(loss / divisor);
+    }
+    // std::cout << "Reduction result in float" << reduction_float << " " << divisor << std::endl;
+
+    // Reduce loss
+    const int local_size = 256;
+    int offset_a = 0;
+    int offset_b = inputSize;
+    size_t _size = inputSize;
+    do
+    {
+        for(int i = 0; i < _size; i += local_size)
+        {
+            // Out-of-range slots are zero-filled so the halving sum is exact.
+            TIO shared[local_size];
+            for(int j = 0; j < local_size; ++j)
+                shared[j] = i + j < _size ? workspace[offset_a + i + j] : 0.0f;
+            for(int offset = local_size / 2; offset > 0; offset >>= 1)
+                for(int j = 0; j < offset; ++j)
+                    shared[j] += shared[j + offset];
+            if(_size <= local_size)
+                outputHost[0] = shared[0];
+            else
+                workspace[offset_b + i / local_size] = shared[0];
+        }
+        std::swap(offset_a, offset_b);
+        _size = (_size + local_size - 1) / local_size;
+    } while(_size > 1);
+}
+
+// Host reference for the reduced backward pass. The upstream gradient is read
+// once per element from doutput at logical index 0, and every gradient is
+// divided by divisor. dinput and dtarget are only written when their data
+// buffers are non-empty.
+template
+void cpu_sigmoid_focal_loss_backward(tensor input,
+                                     tensor target,
+                                     tensor doutput,
+                                     tensor& dinput,
+                                     tensor& dtarget,
+                                     float alpha = 0.25,
+                                     float gamma = 2,
+                                     float divisor = 1)
+{
+    auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto doutput_tv = miopen::get_inner_expanded_tv<5>(doutput.desc);
+    auto dinput_tv = miopen::get_inner_expanded_tv<5>(dinput.desc);
+    auto dtarget_tv = miopen::get_inner_expanded_tv<5>(dtarget.desc);
+
+    size_t inputSize = input.desc.GetElementSize();
+
+    tensor_layout_t<5> doIdx(input_tv, 0);
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]);
+        float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]);
+        float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
+
+        float p = 1 / (1 + std::exp(-i));
+        float ceLoss = -(t * std::log(p) + (1 - t) * std::log(1 - p));
+        float pT = p * t + (1 - p) * (1 - t);
+        float powPt = std::pow(1 - pT, gamma);
+        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+        if(dinput.data.size() > 0)
+        {
+            float dpdi = std::exp(-i) / std::pow(1 + std::exp(-i), 2);
+            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            float dpowptdi = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+            // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            float grad = dO * dLdi;
+
+            if(alpha >= 0)
+            {
+                grad *= alpha_t;
+            }
+            grad /= divisor;
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad);
+        }
+
+        if(dtarget.data.size() > 0)
+        {
+            float dcelossdt = -std::log(p) + std::log(1 - p);
+            float dpowptdt = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+            float dLdt = dcelossdt * powPt + ceLoss * dpowptdt;
+            float gradTarget = dO * dLdt;
+
+            if(alpha >= 0)
+            {
+                // d(alpha_t * L)/dt = alpha_t * dL/dt + dalpha_t/dt * L, and the
+                // whole expression is scaled by the upstream gradient dO (it was
+                // previously dropped on this branch).
+                // NOTE(review): the GPU kernel is not visible here; confirm it
+                // applies dO the same way.
+                gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt);
+            }
+            gradTarget /= divisor;
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget);
+        }
+    }
+}
diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..f2f6ec5d17
--- /dev/null
+++ b/test/gtest/sigmoid_focal_loss.cpp
@@ -0,0 +1,325 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "sigmoid_focal_loss.hpp"
+#include "miopen/bfloat16.hpp"
+#include "tensor_holder.hpp"
+#include
+
+// Each macro below enables one group of tests further down in this file.
+#define TEST_FWD_REDUCED
+#define TEST_BWD_REDUCED
+#define TEST_FWD_UNREDUCED
+#define TEST_BWD_UNREDUCED
+
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace sigmoidfocalloss {
+
+// Returns the value of MIOPEN_TEST_FLOAT_ARG (an empty string when it is empty).
+std::string GetFloatArg()
+{
+    const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG);
+    if(tmp.empty())
+    {
+        return "";
+    }
+    return tmp;
+}
+
+// One empty fixture per (pass, data type) combination; each derives from the
+// matching test base declared in sigmoid_focal_loss.hpp.
+// NOTE(review): the base classes' template arguments are not visible in this
+// view - presumably float / half / bfloat16 as the names suggest; confirm.
+struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest
+{
+};
+
+struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest
+{
+};
+
+struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest
+{
+};
+
+struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest
+{
+};
+
+struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest
+{
+};
+
+struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest
+{
+};
+
+struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest
+{
+};
+
+struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest
+{
+};
+
+struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest
+{
+};
+
+struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest
+{
+};
+
+struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest
+{
+};
+
+struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest
+{
+};
+}; // namespace sigmoidfocalloss
+
+using namespace sigmoidfocalloss;
+
+// Every TEST_P below follows the same pattern: when the MIOPEN_TEST_ALL /
+// MIOPEN_TEST_FLOAT_ARG condition holds it calls RunTest() then Verify(),
+// otherwise it reports GTEST_SKIP(). Each fixture is instantiated over
+// SigmoidFocalLossTestConfigs().
+
+// Reduced forward tests.
+#ifdef TEST_FWD_REDUCED
+TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
+
+// Reduced backward tests.
+#ifdef TEST_BWD_REDUCED
+TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
+
+// Unreduced forward tests.
+#ifdef TEST_FWD_UNREDUCED
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
+
+// Unreduced backward tests.
+#ifdef TEST_BWD_UNREDUCED
+TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
new file mode 100644
index 0000000000..7443b7a94a
--- /dev/null
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -0,0 +1,489 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "cpu_sigmoid_focal_loss.hpp" +#include "get_handle.hpp" +#include "miopen/allocator.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +struct SigmoidFocalLossTestCase +{ + std::vector dims; + bool isContiguous; + float alpha; + float gamma; + miopenLossReductionMode_t reduction; + friend std::ostream& operator<<(std::ostream& os, const SigmoidFocalLossTestCase& tc) + { + os << "dims: "; + for(auto dim : tc.dims) + { + os << dim << " "; + } + return os << "is_contiguous: " << tc.isContiguous << " alpha: " << tc.alpha + << " gamma: " << tc.gamma; + } + + std::vector GetDims() const { return dims; } + + SigmoidFocalLossTestCase() {} + + SigmoidFocalLossTestCase(std::vector dim_, + bool isContiguous_ = true, + miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE, + float alpha_ = 0.25, + float gamma_ = 2) + : dims(dim_), + isContiguous(isContiguous_), + alpha(alpha_), + gamma(gamma_), + reduction(reduction_) + { + } + + std::vector ComputeStrides(std::vector inputDim) const + { + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + +inline std::vector SigmoidFocalLossTestConfigs() +{ + return { + SigmoidFocalLossTestCase({4000}), // 1D cont + SigmoidFocalLossTestCase({100, 500}), // 2D cont + SigmoidFocalLossTestCase({100, 500}, false), // 2D non-cont + SigmoidFocalLossTestCase({10, 20, 200}), // 3D cont + SigmoidFocalLossTestCase({10, 20, 200}, false), // 3D non-cont + SigmoidFocalLossTestCase({8, 3, 20, 100}), // 4D cont + SigmoidFocalLossTestCase({8, 3, 20, 100}, false), // 4D non-cont + SigmoidFocalLossTestCase({2, 2, 3, 4, 100}), // 
5D cont + SigmoidFocalLossTestCase({2, 2, 3, 4, 100}, false), // 5D non-cont + }; +} + +template +struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + auto in_dims = config.GetDims(); + auto in_strides = config.ComputeStrides(in_dims); + + auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + input = tensor{in_dims, in_strides}.generate(in_gen_value); + + auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + target = tensor{in_dims, in_strides}.generate(tar_gen_value); + + output = tensor{in_dims}; + std::fill(output.begin(), output.end(), 0); + + outputHost = tensor{in_dims}; + std::fill(outputHost.begin(), outputHost.end(), 0); + + input_dev = handle.Write(input.data); + target_dev = handle.Write(target.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::SigmoidFocalLossForward(handle, + nullptr, + 0, + input.desc, + input_dev.get(), + target.desc, + target_dev.get(), + output.desc, + output_dev.get(), + config.alpha, + config.gamma, + config.reduction); + cpu_sigmoid_focal_loss_unreduced_forward(input, target, outputHost, config.alpha); + + EXPECT_EQ(status, miopenStatusSuccess); + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(outputHost, output); + + EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output)); + EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error + << ", Thresholdx10: " << threshold * 10; + } + SigmoidFocalLossTestCase config; + + tensor input; + tensor target; + tensor output; + + tensor outputHost; + + miopen::Allocator::ManageDataPtr input_dev; + 
miopen::Allocator::ManageDataPtr target_dev; + miopen::Allocator::ManageDataPtr output_dev; +}; + +template +struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + auto in_dims = config.GetDims(); + auto in_strides = config.ComputeStrides(in_dims); + auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + input = tensor{in_dims, in_strides}.generate(in_gen_value); + + auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + target = tensor{in_dims, in_strides}.generate(tar_gen_value); + + auto dOut_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + dOutput = tensor{in_dims, in_strides}.generate(dOut_gen_value); + + dInput = tensor{in_dims}; + std::fill(dInput.begin(), dInput.end(), 0); + + dInputHost = tensor{in_dims}; + std::fill(dInputHost.begin(), dInputHost.end(), 0); + + dTarget = tensor{in_dims}; + std::fill(dTarget.begin(), dTarget.end(), 0); + + dTargetHost = tensor{in_dims}; + std::fill(dTargetHost.begin(), dTargetHost.end(), 0); + + input_dev = handle.Write(input.data); + target_dev = handle.Write(target.data); + dOutput_dev = handle.Write(dOutput.data); + dInput_dev = handle.Write(dInput.data); + dTarget_dev = handle.Write(dTarget.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + status = miopen::SigmoidFocalLossBackward(handle, + input.desc, + input_dev.get(), + target.desc, + target_dev.get(), + dOutput.desc, + dOutput_dev.get(), + dInput.desc, + dInput_dev.get(), + dTarget.desc, + dTarget_dev.get(), + config.alpha, + config.gamma, + config.reduction); + cpu_sigmoid_focal_loss_unreduced_backward( + input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma); + + EXPECT_EQ(status, miopenStatusSuccess); + + dInput.data = handle.Read(dInput_dev, dInput.data.size()); + dTarget.data = 
handle.Read(dTarget_dev, dTarget.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto dInputError = miopen::rms_range(dInputHost, dInput); + + EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput)); + EXPECT_TRUE(dInputError < threshold * 10) + << "dInput error output beyond tolerance Error: " << dInputError + << ", Thresholdx10: " << threshold * 10; + + auto dTargetError = miopen::rms_range(dTargetHost, dTarget); + + EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget)); + EXPECT_TRUE(dTargetError < threshold * 10) + << "dTarget error output beyond tolerance Error: " << dTargetError + << ", Thresholdx10: " << threshold * 10; + } + SigmoidFocalLossTestCase config; + + tensor input; + tensor target; + tensor dOutput; + tensor dInput; + tensor dTarget; + + tensor dInputHost; + tensor dTargetHost; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr target_dev; + miopen::Allocator::ManageDataPtr dOutput_dev; + miopen::Allocator::ManageDataPtr dInput_dev; + miopen::Allocator::ManageDataPtr dTarget_dev; +}; + +template +struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + config.reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1)); + + auto in_dims = config.GetDims(); + auto in_strides = config.ComputeStrides(in_dims); + + auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 20); }; + input = tensor{in_dims, in_strides}.generate(in_gen_value); + + auto tar_gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(0.1, 20); }; + target = tensor{in_dims, in_strides}.generate(tar_gen_value); + + size_t workspaceSizeBytes = miopen::GetSigmoidFocalLossForwardWorkspaceSize( + handle, input.desc, target.desc, output.desc, config.reduction); + size_t workspaceElements = workspaceSizeBytes / sizeof(TIO); + + workspace = tensor(workspaceElements); + std::fill(workspace.begin(), workspace.end(), 0); + + output = tensor(1); + std::fill(output.begin(), output.end(), 0); + + outputHost = tensor(1); + std::fill(outputHost.begin(), outputHost.end(), 0); + + divisor = 1; + if(config.reduction == MIOPEN_LOSS_REDUCTION_MEAN) + { + divisor *= input.desc.GetElementSize(); + } + + input_dev = handle.Write(input.data); + target_dev = handle.Write(target.data); + workspace_dev = handle.Write(workspace.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + status = miopen::SigmoidFocalLossForward(handle, + workspace_dev.get(), + workspace.GetDataByteSize(), + input.desc, + input_dev.get(), + target.desc, + target_dev.get(), + output.desc, + output_dev.get(), + config.alpha, + config.gamma, + config.reduction); + cpu_sigmoid_focal_loss_forward( + input, target, workspace, outputHost, config.alpha, config.gamma, divisor); + + EXPECT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(outputHost, output); + + EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output)); + EXPECT_TRUE(error < threshold * 10) + << "Error output beyond tolerance Error: " << error + << ", Thresholdx10: " << threshold * 10 << " Reduction: " << config.reduction; + } + SigmoidFocalLossTestCase config; + + tensor input; + tensor target; + tensor workspace; + tensor output; + + tensor outputHost; + + 
miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr target_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + miopen::Allocator::ManageDataPtr output_dev; + + float divisor; +}; + +template +struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + auto in_dims = config.GetDims(); + auto in_strides = config.ComputeStrides(in_dims); + + config.reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1)); + + auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + input = tensor{in_dims, in_strides}.generate(in_gen_value); + + auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(0.1, 50); }; + target = tensor{in_dims, in_strides}.generate(tar_gen_value); + + dOutput = tensor(1); + dOutput[0] = prng::gen_descreet_uniform_sign(0.1, 50); + + dInput = tensor{in_dims}; + std::fill(dInput.begin(), dInput.end(), 0); + + dInputHost = tensor{in_dims}; + std::fill(dInputHost.begin(), dInputHost.end(), 0); + + dTarget = tensor{in_dims}; + std::fill(dTarget.begin(), dTarget.end(), 0); + + dTargetHost = tensor{in_dims}; + std::fill(dTargetHost.begin(), dTargetHost.end(), 0); + + divisor = 1; + if(config.reduction == MIOPEN_LOSS_REDUCTION_MEAN) + { + divisor *= input.desc.GetElementSize(); + } + input_dev = handle.Write(input.data); + target_dev = handle.Write(target.data); + dOutput_dev = handle.Write(dOutput.data); + dInput_dev = handle.Write(dInput.data); + dTarget_dev = handle.Write(dTarget.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + status = miopen::SigmoidFocalLossBackward(handle, + input.desc, + input_dev.get(), + target.desc, + target_dev.get(), + dOutput.desc, + dOutput_dev.get(), + dInput.desc, + dInput_dev.get(), + dTarget.desc, + dTarget_dev.get(), + config.alpha, + config.gamma, + config.reduction); + 
cpu_sigmoid_focal_loss_backward( + input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma, divisor); + + EXPECT_EQ(status, miopenStatusSuccess); + + dInput.data = handle.Read(dInput_dev, dInput.data.size()); + dTarget.data = handle.Read(dTarget_dev, dTarget.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto dInputError = miopen::rms_range(dInputHost, dInput); + + EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput)); + EXPECT_TRUE(dInputError < threshold * 10) + << "dInput error output beyond tolerance Error: " << dInputError + << ", Thresholdx10: " << threshold * 10; + + auto dTargetError = miopen::rms_range(dTargetHost, dTarget); + + EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget)); + EXPECT_TRUE(dTargetError < threshold * 10) + << "dTarget error output beyond tolerance Error: " << dTargetError + << ", Thresholdx10: " << threshold * 10; + } + SigmoidFocalLossTestCase config; + + tensor input; + tensor target; + tensor dOutput; + tensor dInput; + tensor dTarget; + + tensor dInputHost; + tensor dTargetHost; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr target_dev; + miopen::Allocator::ManageDataPtr dOutput_dev; + miopen::Allocator::ManageDataPtr dInput_dev; + miopen::Allocator::ManageDataPtr dTarget_dev; + + float divisor; +}; From 7a6dfa4fd626f685dad0a3bba48b14cb56ae1f66 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 24 Jul 2024 15:31:37 +0700 Subject: [PATCH 02/28] remove githooks --- .githooks/install | 7 ------- .githooks/post-checkout | 3 --- .githooks/post-commit | 3 --- .githooks/post-merge | 3 --- .githooks/pre-commit | 43 ----------------------------------------- .githooks/pre-push | 3 --- 6 files changed, 62 deletions(-) delete mode 100755 .githooks/install delete mode 100755 .githooks/post-checkout delete mode 100755 .githooks/post-commit delete mode 100755 .githooks/post-merge 
delete mode 100755 .githooks/pre-commit delete mode 100755 .githooks/pre-push diff --git a/.githooks/install b/.githooks/install deleted file mode 100755 index 52fec83a2f..0000000000 --- a/.githooks/install +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -cd $(git rev-parse --git-dir) - -echo "Installing hooks..." -ln -s ../.githooks hooks -echo "Done!" diff --git a/.githooks/post-checkout b/.githooks/post-checkout deleted file mode 100755 index ca7fcb4008..0000000000 --- a/.githooks/post-checkout +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit deleted file mode 100755 index 52b339cb3f..0000000000 --- a/.githooks/post-commit +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge deleted file mode 100755 index a912e667aa..0000000000 --- a/.githooks/post-merge +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-merge "$@" diff --git a/.githooks/pre-commit b/.githooks/pre-commit deleted file mode 100755 index e166dadd03..0000000000 --- a/.githooks/pre-commit +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/sh -# -# This pre-commit hook checks if any versions of clang-format -# are installed, and if so, uses the installed version to format -# the staged changes. - -base=clang-format-12 -format="" - -# Redirect output to stderr. -exec 1>&2 - - # check if clang-format is installed -type "$base" >/dev/null 2>&1 && format="$base" - -# no versions of clang-format are installed -if [ -z "$format" ] -then - echo "$base is not installed. Pre-commit hook will not be executed." - exit 0 -fi - -# Do everything from top - level -cd $(git rev-parse --show-toplevel) - -if git rev-parse --verify HEAD >/dev/null 2>&1 -then - against=HEAD -else - # Initial commit: diff against an empty tree object - against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 -fi - -# do the formatting -for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') -do - if [ -e "$file" ] - then - echo "$format $file" - "$format" -i -style=file "$file" - fi -done - diff --git a/.githooks/pre-push b/.githooks/pre-push deleted file mode 100755 index 0f0089bc25..0000000000 --- a/.githooks/pre-push +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs pre-push "$@" From 42aafe37e6769e1979ec5074f9ccdbedfa76a08b Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 24 Jul 2024 15:33:42 +0700 Subject: [PATCH 03/28] add .githooks --- .githooks/install | 7 +++++++ .githooks/post-checkout | 3 +++ .githooks/post-commit | 3 +++ .githooks/post-merge | 3 +++ .githooks/pre-commit | 43 +++++++++++++++++++++++++++++++++++++++++ .githooks/pre-push | 3 +++ 6 files changed, 62 insertions(+) create mode 100755 .githooks/install create mode 100755 .githooks/post-checkout create mode 100755 .githooks/post-commit create mode 100755 .githooks/post-merge create mode 100755 .githooks/pre-commit create mode 100755 .githooks/pre-push diff --git a/.githooks/install b/.githooks/install new file mode 100755 index 0000000000..52fec83a2f --- /dev/null +++ b/.githooks/install @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +cd $(git rev-parse --git-dir) + +echo "Installing hooks..." +ln -s ../.githooks hooks +echo "Done!" diff --git a/.githooks/post-checkout b/.githooks/post-checkout new file mode 100755 index 0000000000..ca7fcb4008 --- /dev/null +++ b/.githooks/post-checkout @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit new file mode 100755 index 0000000000..52b339cb3f --- /dev/null +++ b/.githooks/post-commit @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge new file mode 100755 index 0000000000..a912e667aa --- /dev/null +++ b/.githooks/post-merge @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs post-merge "$@" diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100755 index 0000000000..e166dadd03 --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,43 @@ +#!/bin/sh +# +# This pre-commit hook checks if any versions of clang-format +# are installed, and if so, uses the installed version to format +# the staged changes. + +base=clang-format-12 +format="" + +# Redirect output to stderr. +exec 1>&2 + + # check if clang-format is installed +type "$base" >/dev/null 2>&1 && format="$base" + +# no versions of clang-format are installed +if [ -z "$format" ] +then + echo "$base is not installed. Pre-commit hook will not be executed." 
+ exit 0 +fi + +# Do everything from top - level +cd $(git rev-parse --show-toplevel) + +if git rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 +fi + +# do the formatting +for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') +do + if [ -e "$file" ] + then + echo "$format $file" + "$format" -i -style=file "$file" + fi +done + diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 0000000000..0f0089bc25 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } +git lfs pre-push "$@" From 605542b9ebfdb8b7378b8d2a860cbe7af8eb117f Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 24 Jul 2024 15:36:52 +0700 Subject: [PATCH 04/28] add githooks --- .githooks/post-checkout | 3 --- .githooks/post-commit | 3 --- .githooks/post-merge | 3 --- .githooks/pre-push | 3 --- 4 files changed, 12 deletions(-) delete mode 100755 .githooks/post-checkout delete mode 100755 .githooks/post-commit delete mode 100755 .githooks/post-merge delete mode 100755 .githooks/pre-push diff --git a/.githooks/post-checkout b/.githooks/post-checkout deleted file mode 100755 index ca7fcb4008..0000000000 --- a/.githooks/post-checkout +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit deleted file mode 100755 index 52b339cb3f..0000000000 --- a/.githooks/post-commit +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge deleted file mode 100755 index a912e667aa..0000000000 --- a/.githooks/post-merge +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs post-merge "$@" diff --git a/.githooks/pre-push b/.githooks/pre-push deleted file mode 100755 index 0f0089bc25..0000000000 --- a/.githooks/pre-push +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. 
If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; } -git lfs pre-push "$@" From 0a6dfa2c151864cc5cbe5682c82eb1f857569d73 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 24 Jul 2024 17:43:32 +0700 Subject: [PATCH 05/28] add Tcheck type in driver --- driver/dm_sigmoid_focal_loss.cpp | 6 +- driver/sigmoid_focal_loss_driver.hpp | 375 ++++++++++++++------------- 2 files changed, 198 insertions(+), 183 deletions(-) diff --git a/driver/dm_sigmoid_focal_loss.cpp b/driver/dm_sigmoid_focal_loss.cpp index 001f2964b5..3ec7e9ac31 100644 --- a/driver/dm_sigmoid_focal_loss.cpp +++ b/driver/dm_sigmoid_focal_loss.cpp @@ -30,11 +30,11 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "sigmoidfocalloss") - return new SigmoidFocalLossDriver(); + return new SigmoidFocalLossDriver(); else if(base_arg == "sigmoidfocallossfp16") - return new SigmoidFocalLossDriver(); + return new SigmoidFocalLossDriver(); else if(base_arg == "sigmoidfocallossbfp16") - return new SigmoidFocalLossDriver(); + return new SigmoidFocalLossDriver(); return nullptr; } diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index 6c739a3911..8bdc350b2f 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -32,19 +32,17 @@ #include #include "tensor_driver.hpp" #include "timer.hpp" -#include "random.hpp" #include <../test/tensor_holder.hpp> #include <../test/verify.hpp> #include -#include #include -template -void mloSigmoidFocalLossUnreducedFwdRunHost(TIO* input, +template +void mloSigmoidFocalLossUnreducedFwdRunHost(Tgpu* input, miopenTensorDescriptor_t inputDesc, - TIO* target, + Tgpu* target, miopenTensorDescriptor_t targetDesc, - TIO* outputHost, + Tcheck* outputHost, miopenTensorDescriptor_t outputDesc, float alpha = 0.25, float gamma = 2) @@ -58,34 +56,34 @@ void 
mloSigmoidFocalLossUnreducedFwdRunHost(TIO* input, { tensor_layout_t<5> idx(input_tv, id); - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - float sig = 1 / (1 + exp(-i)); - float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); - float sigT = sig * t + (1 - sig) * (1 - t); - float loss = ceLoss * pow(1 - sigT, gamma); + Tcheck sig = 1 / (1 + exp(-i)); + Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); + Tcheck sigT = sig * t + (1 - sig) * (1 - t); + Tcheck loss = ceLoss * pow(1 - sigT, gamma); if(alpha >= 0) { - float alphaT = alpha * t + (1 - alpha) * (1 - t); - loss = alphaT * loss; + Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t); + loss = alphaT * loss; } - outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss); + outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss); } } -template -void mloSigmoidFocalLossUnreducedBwdRunHost(TIO* input, +template +void mloSigmoidFocalLossUnreducedBwdRunHost(Tgpu* input, miopenTensorDescriptor_t inputDesc, - TIO* target, + Tgpu* target, miopenTensorDescriptor_t targetDesc, - TIO* doutput, + Tgpu* doutput, miopenTensorDescriptor_t doutputDesc, - TIO* dinput, + Tcheck* dinput, miopenTensorDescriptor_t dinputDesc, - TIO* dtarget, + Tcheck* dtarget, miopenTensorDescriptor_t dtargetDesc, float alpha = 0.25, float gamma = 2) @@ -101,58 +99,58 @@ void mloSigmoidFocalLossUnreducedBwdRunHost(TIO* input, { tensor_layout_t<5> idx(input_tv, id); - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); + Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + Tcheck t = 
static_cast(target[target_tv.get_tensor_view_idx(idx)]); + Tcheck dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); - float p = 1 / (1 + exp(-i)); - float ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); - float pT = p * t + (1 - p) * (1 - t); - float powPt = pow(1 - pT, gamma); - float alpha_t = alpha * t + (1 - alpha) * (1 - t); + Tcheck p = 1 / (1 + exp(-i)); + Tcheck ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + Tcheck pT = p * t + (1 - p) * (1 - t); + Tcheck powPt = pow(1 - pT, gamma); + Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t); if(dinput) { - float dpdi = exp(-i) / pow(1 + exp(-i), 2); - float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; - float dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + Tcheck dpdi = exp(-i) / pow(1 + exp(-i), 2); + Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + Tcheck dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di - float dLdi = dcelossdi * powPt + ceLoss * dpowptdi; - float grad = dO * dLdi; + Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + Tcheck grad = dO * dLdi; if(alpha >= 0) { grad *= alpha_t; } - dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); + dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); } if(dtarget) { - float dcelossdt = -log(p) + log(1 - p); - float dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + Tcheck dcelossdt = -log(p) + log(1 - p); + Tcheck dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt - float dLdt = dcelossdt * powPt + ceLoss * dpowptdt; - float gradTarget = dO * dLdt; + Tcheck dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + Tcheck gradTarget = dO * dLdt; if(alpha >= 0) { // alpha_t * dL/dt + dalpha_t/dt * dL gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; } - dtarget[dtarget_tv.get_tensor_view_idx(idx)] = 
static_cast(gradTarget); + dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); } } } -template -void mloSigmoidFocalLossFwdRunHost(TIO* input, +template +void mloSigmoidFocalLossFwdRunHost(Tgpu* input, miopenTensorDescriptor_t inputDesc, - TIO* target, + Tgpu* target, miopenTensorDescriptor_t targetDesc, - TIO* workspace, - TIO* ref_output, + Tcheck* workspaceHost, + Tcheck* outputHost, float alpha = 0.25, float gamma = 2, float divisor = 1) @@ -165,21 +163,21 @@ void mloSigmoidFocalLossFwdRunHost(TIO* input, { tensor_layout_t<5> idx(input_tv, id); - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - float sig = 1 / (1 + exp(-i)); - float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); - float sigT = sig * t + (1 - sig) * (1 - t); - float loss = ceLoss * pow(1 - sigT, gamma); + Tcheck sig = 1 / (1 + exp(-i)); + Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); + Tcheck sigT = sig * t + (1 - sig) * (1 - t); + Tcheck loss = ceLoss * pow(1 - sigT, gamma); if(alpha >= 0) { - float alphaT = alpha * t + (1 - alpha) * (1 - t); - loss = alphaT * loss; + Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t); + loss = alphaT * loss; } - workspace[id] = static_cast(loss / divisor); + workspaceHost[id] = static_cast(loss / divisor); } // Reduce loss @@ -191,32 +189,32 @@ void mloSigmoidFocalLossFwdRunHost(TIO* input, { for(int i = 0; i < _size; i += local_size) { - TIO shared[local_size]; + Tcheck shared[local_size]; for(int j = 0; j < local_size; ++j) - shared[j] = i + j < _size ? workspace[offset_a + i + j] : 0.0f; + shared[j] = i + j < _size ? 
workspaceHost[offset_a + i + j] : 0.0f; for(int offset = local_size / 2; offset > 0; offset >>= 1) for(int j = 0; j < offset; ++j) shared[j] += shared[j + offset]; if(_size <= local_size) - ref_output[0] = shared[0]; + outputHost[0] = shared[0]; else - workspace[offset_b + i / local_size] = shared[0]; + workspaceHost[offset_b + i / local_size] = shared[0]; } std::swap(offset_a, offset_b); _size = (_size + local_size - 1) / local_size; } while(_size > 1); } -template -void mloSigmoidFocalLossBwdRunHost(TIO* input, +template +void mloSigmoidFocalLossBwdRunHost(Tgpu* input, miopenTensorDescriptor_t inputDesc, - TIO* target, + Tgpu* target, miopenTensorDescriptor_t targetDesc, - TIO* doutput, + Tgpu* doutput, miopenTensorDescriptor_t doutputDesc, - TIO* dinput, + Tcheck* dinput, miopenTensorDescriptor_t dinputDesc, - TIO* dtarget, + Tcheck* dtarget, miopenTensorDescriptor_t dtargetDesc, float alpha = 0.25, float gamma = 2, @@ -236,41 +234,41 @@ void mloSigmoidFocalLossBwdRunHost(TIO* input, { tensor_layout_t<5> idx(input_tv, id); - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); + Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + Tcheck dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); - float p = 1 / (1 + exp(-i)); - float ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); - float pT = p * t + (1 - p) * (1 - t); - float powPt = pow(1 - pT, gamma); - float alpha_t = alpha * t + (1 - alpha) * (1 - t); + Tcheck p = 1 / (1 + exp(-i)); + Tcheck ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + Tcheck pT = p * t + (1 - p) * (1 - t); + Tcheck powPt = pow(1 - pT, gamma); + Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t); if(dinput) { - float dpdi = exp(-i) / pow(1 + exp(-i), 2); - float dcelossdi = (-t / p + (1 - t) / 
(1 - p)) * dpdi; - float dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + Tcheck dpdi = exp(-i) / pow(1 + exp(-i), 2); + Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + Tcheck dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di - float dLdi = dcelossdi * powPt + ceLoss * dpowptdi; - float grad = dO * dLdi; + Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + Tcheck grad = dO * dLdi; if(alpha >= 0) { grad *= alpha_t; } grad /= divisor; - dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); + dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); } if(dtarget) { - float dcelossdt = -log(p) + log(1 - p); - float dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + Tcheck dcelossdt = -log(p) + log(1 - p); + Tcheck dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt - float dLdt = dcelossdt * powPt + ceLoss * dpowptdt; - float gradTarget = dO * dLdt; + Tcheck dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + Tcheck gradTarget = dO * dLdt; if(alpha >= 0) { @@ -278,12 +276,12 @@ void mloSigmoidFocalLossBwdRunHost(TIO* input, gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; } gradTarget /= divisor; - dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); + dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); } } } -template +template class SigmoidFocalLossDriver : public Driver { public: @@ -296,7 +294,7 @@ class SigmoidFocalLossDriver : public Driver miopenCreateTensorDescriptor(&dinputDesc); miopenCreateTensorDescriptor(&dtargetDesc); - data_type = miopen_type{}; + data_type = miopen_type{}; } std::vector ComputeStrides(std::vector input); @@ -314,6 +312,7 @@ class SigmoidFocalLossDriver : public Driver int RunBackwardGPU() override; int RunBackwardCPU(); + Tcheck GetTolerance(); int 
VerifyBackward() override; int VerifyForward() override; ~SigmoidFocalLossDriver() override @@ -344,16 +343,17 @@ class SigmoidFocalLossDriver : public Driver std::unique_ptr dtarget_dev; std::unique_ptr workspace_dev; - std::vector input; - std::vector target; - std::vector output; - std::vector outputHost; - std::vector doutput; - std::vector dinput; - std::vector dinputHost; - std::vector dtarget; - std::vector dtargetHost; - std::vector workspace; + std::vector input; + std::vector target; + std::vector output; + std::vector outputHost; + std::vector doutput; + std::vector dinput; + std::vector dinputHost; + std::vector dtarget; + std::vector dtargetHost; + std::vector workspace; + std::vector workspaceHost; float alpha; float gamma; @@ -365,8 +365,8 @@ class SigmoidFocalLossDriver : public Driver size_t workSpaceSizeInBytes; }; -template -int SigmoidFocalLossDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int SigmoidFocalLossDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -377,8 +377,8 @@ int SigmoidFocalLossDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::GetandSetData() +template +int SigmoidFocalLossDriver::GetandSetData() { auto inDims = inflags.GetValueTensor("dim-lengths").lengths; alpha = inflags.GetValueDouble("alpha"); @@ -425,8 +425,8 @@ int SigmoidFocalLossDriver::GetandSetData() } // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False -template -std::vector SigmoidFocalLossDriver::ComputeStrides(std::vector inputDim) +template +std::vector SigmoidFocalLossDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); @@ -439,8 +439,8 @@ std::vector SigmoidFocalLossDriver::ComputeStrides(std::vector in return strides; } -template -int SigmoidFocalLossDriver::AddCmdLineArgs() +template +int SigmoidFocalLossDriver::AddCmdLineArgs() { 
inflags.AddInputFlag("forw", 'F', "1", "Run only Forward (Default=1)", "int"); inflags.AddTensorFlag( @@ -461,8 +461,8 @@ int SigmoidFocalLossDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::AllocateBuffersAndCopy() +template +int SigmoidFocalLossDriver::AllocateBuffersAndCopy() { size_t in_sz = miopen::deref(inputDesc).GetElementSize(); size_t target_sz = miopen::deref(targetDesc).GetElementSize(); @@ -473,42 +473,44 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() uint32_t ctx = 0; - input_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(TIO))); - target_dev = std::unique_ptr(new GPUMem(ctx, target_sz, sizeof(TIO))); - output_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(TIO))); - doutput_dev = std::unique_ptr(new GPUMem(ctx, dO_sz, sizeof(TIO))); - dinput_dev = std::unique_ptr(new GPUMem(ctx, dI_sz, sizeof(TIO))); - dtarget_dev = std::unique_ptr(new GPUMem(ctx, dT_sz, sizeof(TIO))); + input_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + target_dev = std::unique_ptr(new GPUMem(ctx, target_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + doutput_dev = std::unique_ptr(new GPUMem(ctx, dO_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, dI_sz, sizeof(Tgpu))); + dtarget_dev = std::unique_ptr(new GPUMem(ctx, dT_sz, sizeof(Tgpu))); miopenGetSigmoidFocalLossForwardWorkspaceSize( handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes); workspace_dev = - std::unique_ptr(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(TIO), sizeof(TIO))); - - input = std::vector(in_sz, static_cast(0)); - target = std::vector(target_sz, static_cast(0)); - output = std::vector(out_sz, static_cast(0)); - outputHost = std::vector(out_sz, static_cast(0)); - doutput = std::vector(dO_sz, static_cast(0)); - dinput = std::vector(dI_sz, static_cast(0)); - dinputHost = std::vector(dI_sz, static_cast(0)); - dtarget = std::vector(dT_sz, 
static_cast(0)); - dtargetHost = std::vector(dT_sz, static_cast(0)); - workspace = std::vector(workSpaceSizeInBytes / sizeof(TIO), static_cast(0)); + std::unique_ptr(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu))); + + input = std::vector(in_sz, static_cast(0)); + target = std::vector(target_sz, static_cast(0)); + output = std::vector(out_sz, static_cast(0)); + outputHost = std::vector(out_sz, static_cast(0)); + doutput = std::vector(dO_sz, static_cast(0)); + dinput = std::vector(dI_sz, static_cast(0)); + dinputHost = std::vector(dI_sz, static_cast(0)); + dtarget = std::vector(dT_sz, static_cast(0)); + dtargetHost = std::vector(dT_sz, static_cast(0)); + size_t workSpaceElems = workSpaceSizeInBytes / sizeof(Tgpu); + workspace = std::vector(workSpaceElems, static_cast(0)); + workspaceHost = std::vector(workSpaceElems, static_cast(0)); for(int i = 0; i < in_sz; i++) { - input[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); - target[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + input[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + target[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); } for(int i = 0; i < dO_sz; ++i) { - doutput[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + doutput[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); } - fill(output.begin(), output.end(), static_cast(0)); - fill(dinput.begin(), dinput.end(), static_cast(0)); - fill(dtarget.begin(), dtarget.end(), static_cast(0)); + fill(output.begin(), output.end(), static_cast(0)); + fill(dinput.begin(), dinput.end(), static_cast(0)); + fill(dtarget.begin(), dtarget.end(), static_cast(0)); if(input_dev->ToGPU(GetStream(), input.data()) != 0) std::cerr << "Error copying (in) to GPU, size: " << input_dev->GetSize() << std::endl; @@ -534,8 +536,8 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::RunForwardGPU() +template +int 
SigmoidFocalLossDriver::RunForwardGPU() { float kernel_total_time = 0; float kernel_first_time = 0; @@ -585,38 +587,38 @@ int SigmoidFocalLossDriver::RunForwardGPU() return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::RunForwardCPU() +template +int SigmoidFocalLossDriver::RunForwardCPU() { if(reduction == MIOPEN_LOSS_REDUCTION_NONE) { - mloSigmoidFocalLossUnreducedFwdRunHost(input.data(), + mloSigmoidFocalLossUnreducedFwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + outputHost.data(), + outputDesc, + alpha, + gamma); + } + else + { + mloSigmoidFocalLossFwdRunHost(input.data(), inputDesc, target.data(), targetDesc, + workspaceHost.data(), outputHost.data(), - outputDesc, alpha, - gamma); - } - else - { - mloSigmoidFocalLossFwdRunHost(input.data(), - inputDesc, - target.data(), - targetDesc, - workspace.data(), - outputHost.data(), - alpha, - gamma, - divisor); + gamma, + divisor); } return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::RunBackwardGPU() +template +int SigmoidFocalLossDriver::RunBackwardGPU() { float kernel_total_time = 0; float kernel_first_time = 0; @@ -678,10 +680,10 @@ int SigmoidFocalLossDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::RunBackwardCPU() +template +int SigmoidFocalLossDriver::RunBackwardCPU() { - TIO* p_dtarget = nullptr; + Tcheck* p_dtarget = nullptr; if(isTargetGradientComputed) { p_dtarget = dtargetHost.data(); @@ -689,7 +691,22 @@ int SigmoidFocalLossDriver::RunBackwardCPU() if(reduction == MIOPEN_LOSS_REDUCTION_NONE) { - mloSigmoidFocalLossUnreducedBwdRunHost(input.data(), + mloSigmoidFocalLossUnreducedBwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + doutput.data(), + doutputDesc, + dinputHost.data(), + dinputDesc, + p_dtarget, + dtargetDesc, + alpha, + gamma); + } + else + { + mloSigmoidFocalLossBwdRunHost(input.data(), inputDesc, target.data(), targetDesc, @@ -700,35 +717,33 @@ int 
SigmoidFocalLossDriver::RunBackwardCPU() p_dtarget, dtargetDesc, alpha, - gamma); - } - else - { - mloSigmoidFocalLossBwdRunHost(input.data(), - inputDesc, - target.data(), - targetDesc, - doutput.data(), - doutputDesc, - dinputHost.data(), - dinputDesc, - p_dtarget, - dtargetDesc, - alpha, - gamma, - divisor); + gamma, + divisor); } return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::VerifyForward() +template +Tcheck SigmoidFocalLossDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int SigmoidFocalLossDriver::VerifyForward() { RunForwardCPU(); - double tolerance = std::numeric_limits::epsilon() * 10; - auto error = miopen::rms_range(outputHost, output); + const Tcheck tolerance = GetTolerance(); + auto error = miopen::rms_range(outputHost, output); if(!std::isfinite(error) || error > tolerance) { @@ -745,14 +760,14 @@ int SigmoidFocalLossDriver::VerifyForward() return miopenStatusSuccess; } -template -int SigmoidFocalLossDriver::VerifyBackward() +template +int SigmoidFocalLossDriver::VerifyBackward() { RunBackwardCPU(); - double tolerance = std::numeric_limits::epsilon() * 10; - auto dinputError = miopen::rms_range(dinputHost, dinput); - auto dtargetError = miopen::rms_range(dtargetHost, dtarget); + const Tcheck tolerance = GetTolerance(); + auto dinputError = miopen::rms_range(dinputHost, dinput); + auto dtargetError = miopen::rms_range(dtargetHost, dtarget); if(!std::isfinite(dinputError) || dinputError > tolerance) { From 9d75374633b228f45fbe89eae40ac39e1d5dbe56 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 24 Jul 2024 18:19:37 +0700 Subject: [PATCH 06/28] fix cppcheck err --- .../miopen/solver/implicitgemm_ck_util.hpp | 26 +++++++++---------- 
1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index ff25d5f622..abdd171227 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -680,7 +680,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, internal::MakeTaggedTransposeInstances( result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des); - result.invoker_factory = [split_k = split_k, + result.invoker_factory = [split_k, ck_args = std::move(ck_args), sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}, input1_tr_inst = std::move(_input1_tr_inst), @@ -689,7 +689,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, output_init_tr_inst = std::move(_output_init_tr_inst), ck_buff_des = _ck_buff_des](const std::vector& kernels) mutable { - return [split_k = split_k, + return [split_k, kernels, ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr), @@ -697,8 +697,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, input2_tr_inst = std::move(input2_tr_inst), output_tr_inst = std::move(output_tr_inst), output_init_tr_inst = std::move(output_init_tr_inst), - ck_buff_des = ck_buff_des](const Handle& handle, - const AnyInvokeParams& primitive_parameters) mutable { + ck_buff_des](const Handle& handle, + const AnyInvokeParams& primitive_parameters) mutable { handle.ResetKernelTime(); const auto& data_ctx = primitive_parameters.CastTo(); @@ -826,17 +826,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&, [[maybe_unused]] bool should_allocated_wrw_buffer = ShouldAllocateWorkSpaceBufferForWRW(problem); - result.invoker_factory = [split_k = split_k, - ck_args = CKArgsType{problem}, - alpha_beta_case = alpha_beta_case, - should_allocated_wrw_buffer = should_allocated_wrw_buffer, + result.invoker_factory = [split_k, + ck_args = 
CKArgsType{problem}, + alpha_beta_case, + should_allocated_wrw_buffer, sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}]( const std::vector&) mutable { - return [split_k = split_k, - ck_args = std::move(ck_args), - alpha_beta_case = alpha_beta_case, - should_allocated_wrw_buffer = should_allocated_wrw_buffer, - sh_conv_ptr = std::move(sh_conv_ptr)]( + return [split_k, + ck_args = std::move(ck_args), + alpha_beta_case, + should_allocated_wrw_buffer, + sh_conv_ptr = std::move(sh_conv_ptr)]( const Handle& handle, const AnyInvokeParams& primitive_parameters) { const auto& data_ctx = primitive_parameters.CastTo(); std::unique_ptr argument_ptr; From f91144c69e0ab87533a056d7cb3d4d9f940505c0 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Fri, 26 Jul 2024 19:00:38 +0700 Subject: [PATCH 07/28] add MIOPEN_INTERNALS_EXPORT --- src/include/miopen/sigmoid_focal_loss.hpp | 62 ++++++++++++----------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/src/include/miopen/sigmoid_focal_loss.hpp b/src/include/miopen/sigmoid_focal_loss.hpp index 07d3e32d61..cbb6dff65d 100644 --- a/src/include/miopen/sigmoid_focal_loss.hpp +++ b/src/include/miopen/sigmoid_focal_loss.hpp @@ -33,39 +33,41 @@ namespace miopen { struct Handle; struct TensorDescriptor; -size_t GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle, - const TensorDescriptor& inputDesc, - const TensorDescriptor& targetDesc, - const TensorDescriptor& outputDesc, - miopenLossReductionMode_t reduction); - -miopenStatus_t SigmoidFocalLossForward(Handle& handle, - Data_t workspace, - size_t workspaceSizeInBytes, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& targetDesc, - ConstData_t target, - const TensorDescriptor& outputDesc, - Data_t output, - float alpha, - float gamma, - miopenLossReductionMode_t reduction); - -miopenStatus_t SigmoidFocalLossBackward(Handle& handle, +MIOPEN_INTERNALS_EXPORT size_t +GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle, const 
TensorDescriptor& inputDesc, - ConstData_t input, const TensorDescriptor& targetDesc, - ConstData_t target, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& dtargetDesc, - Data_t dtarget, - float alpha, - float gamma, + const TensorDescriptor& outputDesc, miopenLossReductionMode_t reduction); +MIOPEN_INTERNALS_EXPORT miopenStatus_t SigmoidFocalLossForward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& targetDesc, + ConstData_t target, + const TensorDescriptor& outputDesc, + Data_t output, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t +SigmoidFocalLossBackward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& targetDesc, + ConstData_t target, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dtargetDesc, + Data_t dtarget, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + } // namespace miopen #endif // MIOPEN_SIGMOID_FOCAL_LOSS_HPP_ From 35c6ee696d216581078976e089ca6d58ee8e4252 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 30 Jul 2024 10:54:38 +0700 Subject: [PATCH 08/28] change gtest naming format following new convention --- driver/sigmoid_focal_loss_driver.hpp | 4 - .../forward_reduce_sigmoid_focal_loss.cpp | 5 +- test/gtest/sigmoid_focal_loss.cpp | 111 ++++++++---------- 3 files changed, 52 insertions(+), 68 deletions(-) diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index 8bdc350b2f..603d1777e5 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -508,10 +508,6 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() doutput[i] = 
prng::gen_A_to_B(static_cast(-2), static_cast(2)); } - fill(output.begin(), output.end(), static_cast(0)); - fill(dinput.begin(), dinput.end(), static_cast(0)); - fill(dtarget.begin(), dtarget.end(), static_cast(0)); - if(input_dev->ToGPU(GetStream(), input.data()) != 0) std::cerr << "Error copying (in) to GPU, size: " << input_dev->GetSize() << std::endl; diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp index f7daa8b84c..a9abcf2e96 100644 --- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp +++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp @@ -97,7 +97,8 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( HipEventPtr stop; bool resetProfilingState = false; - if(handle_.IsProfilingEnabled()) + const bool profiling = handle_.IsProfilingEnabled(); + if(profiling) { resetProfilingState = true; handle_.EnableProfiling(false); @@ -153,7 +154,7 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( handle_.EnableProfiling(true); } - if(handle_.IsProfilingEnabled()) + if(profiling) { hipEventRecord(stop.get(), handle_.GetStream()); hipEventSynchronize(stop.get()); diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp index f2f6ec5d17..fa90ceb218 100644 --- a/test/gtest/sigmoid_focal_loss.cpp +++ b/test/gtest/sigmoid_focal_loss.cpp @@ -25,15 +25,10 @@ *******************************************************************************/ #include "sigmoid_focal_loss.hpp" -#include "miopen/bfloat16.hpp" #include "tensor_holder.hpp" +#include #include -#define TEST_FWD_REDUCED -#define TEST_BWD_REDUCED -#define TEST_FWD_UNREDUCED -#define TEST_BWD_UNREDUCED - MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) @@ -49,59 +44,58 @@ std::string GetFloatArg() return tmp; } -struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest +struct GPU_SigmoidFocalLoss_fwd_FP32 : 
SigmoidFocalLossFwdTest { }; -struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest +struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest { }; -struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest +struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest { }; -struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest +struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest { }; -struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest +struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest { }; -struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest +struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest { }; -struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest +struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest { }; -struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest +struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest { }; -struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest +struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest { }; -struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest +struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest { }; -struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest +struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest { }; -struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest +struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest { }; }; // namespace sigmoidfocalloss using namespace sigmoidfocalloss; -#ifdef TEST_FWD_REDUCED -TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest) +TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test) { 
if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -115,11 +109,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, - SigmoidFocalLossForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_fwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest) +TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -133,11 +127,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, - SigmoidFocalLossForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_fwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest) +TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -151,13 +145,11 @@ TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, - SigmoidFocalLossForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_fwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif -#ifdef TEST_BWD_REDUCED -TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest) +TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -171,11 +163,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, - SigmoidFocalLossBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + 
GPU_SigmoidFocalLoss_bwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest) +TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -189,11 +181,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, - SigmoidFocalLossBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_bwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest) +TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -207,13 +199,11 @@ TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, - SigmoidFocalLossBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_bwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif -#ifdef TEST_FWD_UNREDUCED -TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -227,11 +217,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedFor } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, - SigmoidFocalLossUnreducedForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_fwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || 
(env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -245,11 +235,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedFor } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, - SigmoidFocalLossUnreducedForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_fwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -263,13 +253,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedFo } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, - SigmoidFocalLossUnreducedForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_fwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif -#ifdef TEST_BWD_UNREDUCED -TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -283,11 +271,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBa } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, - SigmoidFocalLossUnreducedBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_bwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -301,11 +289,11 @@ 
TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBa } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, - SigmoidFocalLossUnreducedBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_bwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -319,7 +307,6 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedB } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, - SigmoidFocalLossUnreducedBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_bwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif From 2b16cdb1c4538cbef3e5279c885708fd564344b0 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 30 Jul 2024 11:48:20 +0700 Subject: [PATCH 09/28] update drive random bound --- driver/sigmoid_focal_loss_driver.hpp | 15 ++++++++++++--- .../forward_reduce_sigmoid_focal_loss.cpp | 14 ++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index 603d1777e5..7b51e38fd4 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -498,14 +498,23 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() workspace = std::vector(workSpaceElems, static_cast(0)); workspaceHost = std::vector(workSpaceElems, static_cast(0)); + float randomBound = 2; + // For half, the random bound is smaller to avoid half overflow + if(data_type == miopenHalf && reduction != MIOPEN_LOSS_REDUCTION_NONE) + { + randomBound = 0.5; + } for(int i = 0; i < in_sz; i++) { - input[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); - 
target[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + input[i] = + prng::gen_A_to_B(static_cast(-randomBound), static_cast(randomBound)); + target[i] = + prng::gen_A_to_B(static_cast(-randomBound), static_cast(randomBound)); } for(int i = 0; i < dO_sz; ++i) { - doutput[i] = prng::gen_A_to_B(static_cast(-2), static_cast(2)); + doutput[i] = + prng::gen_A_to_B(static_cast(-randomBound), static_cast(randomBound)); } if(input_dev->ToGPU(GetStream(), input.data()) != 0) diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp index a9abcf2e96..d3f874251f 100644 --- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp +++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp @@ -96,11 +96,9 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( HipEventPtr start; HipEventPtr stop; - bool resetProfilingState = false; - const bool profiling = handle_.IsProfilingEnabled(); + const bool profiling = handle_.IsProfilingEnabled(); if(profiling) { - resetProfilingState = true; handle_.EnableProfiling(false); start = miopen::make_hip_event(); stop = miopen::make_hip_event(); @@ -149,18 +147,18 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( size = AlignUp(size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD; } - if(resetProfilingState) - { - handle_.EnableProfiling(true); - } - if(profiling) { hipEventRecord(stop.get(), handle_.GetStream()); hipEventSynchronize(stop.get()); hipEventElapsedTime(&elapsed, start.get(), stop.get()); + + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); handle_.ResetKernelTime(); handle_.AccumKernelTime(elapsed); + + handle_.EnableProfiling(true); }; }; }; From ee5952a8db21d5a2ec712d924d359272346f71ba Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 30 Jul 2024 13:22:53 +0700 Subject: [PATCH 10/28] try revert back unit-test file to check pipeline --- test/gtest/sigmoid_focal_loss.cpp | 111 
+++++++++++++++++------------- 1 file changed, 62 insertions(+), 49 deletions(-) diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp index fa90ceb218..f2f6ec5d17 100644 --- a/test/gtest/sigmoid_focal_loss.cpp +++ b/test/gtest/sigmoid_focal_loss.cpp @@ -25,10 +25,15 @@ *******************************************************************************/ #include "sigmoid_focal_loss.hpp" +#include "miopen/bfloat16.hpp" #include "tensor_holder.hpp" -#include #include +#define TEST_FWD_REDUCED +#define TEST_BWD_REDUCED +#define TEST_FWD_UNREDUCED +#define TEST_BWD_UNREDUCED + MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) @@ -44,58 +49,59 @@ std::string GetFloatArg() return tmp; } -struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest +struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest { }; -struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest +struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest { }; -struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest +struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest { }; -struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest +struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest { }; -struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest +struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest { }; -struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest +struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest { }; -struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest +struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest { }; -struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest +struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest { }; -struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : 
SigmoidFocalLossUnreducedFwdTest +struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest { }; -struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest +struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest { }; -struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest +struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest { }; -struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest +struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest { }; }; // namespace sigmoidfocalloss using namespace sigmoidfocalloss; -TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test) +#ifdef TEST_FWD_REDUCED +TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -109,11 +115,11 @@ TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLoss_fwd_FP32, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, + SigmoidFocalLossForwardTestFloat32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test) +TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -127,11 +133,11 @@ TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLoss_fwd_FP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, + SigmoidFocalLossForwardTestFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test) +TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ 
-145,11 +151,13 @@ TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLoss_fwd_BFP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, + SigmoidFocalLossForwardTestBFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); +#endif -TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test) +#ifdef TEST_BWD_REDUCED +TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -163,11 +171,11 @@ TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLoss_bwd_FP32, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, + SigmoidFocalLossBackwardTestFloat32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test) +TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -181,11 +189,11 @@ TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLoss_bwd_FP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, + SigmoidFocalLossBackwardTestFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test) +TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -199,11 +207,13 @@ TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLoss_bwd_BFP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, + SigmoidFocalLossBackwardTestBFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); +#endif -TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test) +#ifdef TEST_FWD_UNREDUCED 
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -217,11 +227,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLossUnreduced_fwd_FP32, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, + SigmoidFocalLossUnreducedForwardTestFloat32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test) +TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -235,11 +245,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLossUnreduced_fwd_FP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, + SigmoidFocalLossUnreducedForwardTestFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test) +TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -253,11 +263,13 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLossUnreduced_fwd_BFP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, + SigmoidFocalLossUnreducedForwardTestBFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); +#endif -TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test) +#ifdef TEST_BWD_UNREDUCED +TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -271,11 +283,11 @@ 
TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLossUnreduced_bwd_FP32, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, + SigmoidFocalLossUnreducedBackwardTestFloat32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test) +TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -289,11 +301,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLossUnreduced_bwd_FP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, + SigmoidFocalLossUnreducedBackwardTestFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test) +TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -307,6 +319,7 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_SigmoidFocalLossUnreduced_bwd_BFP16, +INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, + SigmoidFocalLossUnreducedBackwardTestBFloat16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); +#endif From 4fcf6897edf2e4d0459beb7239cad653d7a35252 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 30 Jul 2024 14:55:11 +0700 Subject: [PATCH 11/28] try __hip_ds_swizzlef_N --- src/kernels/warp_shuffle.hpp | 63 ++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp index ebd5861976..5caf7d2ec4 100644 --- a/src/kernels/warp_shuffle.hpp +++ b/src/kernels/warp_shuffle.hpp @@ -52,21 +52,78 @@ __device__ FLOAT_ACCUM 
warp_reduce_sum(FLOAT_ACCUM val) return val; } +template +__forceinline__ __device__ float reductionFullWarp(float reduced_val, uint32_t laneId) +{ + static_assert(WARP_SIZE != 0, "WARP_SIZEmust not be 0"); + static_assert((SWIZZLE_SIZE & (SWIZZLE_SIZE - 1)) == 0, + "WARP_SIZE and SWIZZLE must be a power of 2"); + + if constexpr(SWIZZLE_SIZE == 1) + return reduced_val; + + reduced_val = reductionFullWarp> 1)>(reduced_val, laneId); + + constexpr uint32_t warp_msk = (WARP_SIZE - 1); + + float tmp; + if constexpr(SWIZZLE_SIZE >= 64) + { + // swizzle can handle only 32 lanes, switching to bpermute + uint32_t idx = laneId ^ (SWIZZLE_SIZE >> 1); + + idx = idx >= ((laneId + WARP_SIZE) & ~warp_msk) ? laneId : idx; + int itmp = + __builtin_amdgcn_ds_bpermute(static_cast(idx << 2), __float_as_int(reduced_val)); + tmp = __int_as_float(itmp); + } + else + { + // butterfly reduction based on __shfl_xor + // swizzle () + constexpr uint32_t xor_off = 10; + // constexpr uint32_t or_off = 5; + constexpr uint32_t and_off = 0; + + constexpr uint32_t field_msk = 0x1f; + + constexpr uint32_t and_msk = warp_msk & field_msk; + // constexpr uint32_t or_msk = 0; + constexpr uint32_t xor_msk = (SWIZZLE_SIZE >> 1) & field_msk; + + // clang tidy does not like that (or_msk << or_off) is zero + // and cliams that it's redundant, but it's required for + // __hip_ds_swizzlef_N reference. Menawhile swizzle_op generation + // must be a part of hip intrinsics, because it depends on ISA + // like __hip_ds_swizzlef_N + // For some reason NILINT doesn't work. 
+ // NOLINTBEGIN + constexpr uint32_t swizzle_op = + (xor_msk << xor_off) /* | (or_msk << or_off) */ | (and_msk << and_off); + // NOLINTEND + + tmp = __hip_ds_swizzlef_N(reduced_val); + } + + return tmp + reduced_val; +}; + __device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val) { static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize]; auto lane = threadIdx.x % warpSize; auto wid = threadIdx.x / warpSize; - val = warp_reduce_sum(val); + val = reductionFullWarp(val, lane); if(lane == 0) shared[wid] = val; __syncthreads(); val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0; - if(wid == 0) - val = warp_reduce_sum(val); + // if(wid == 0) + // val = warp_reduce_sum(val); + val = reductionFullWarp(val, lane); return val; } From debd5301a9e1af5a6ccbc0782be555ddb9ba9bcd Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 30 Jul 2024 15:17:29 +0700 Subject: [PATCH 12/28] change unit-test format --- test/gtest/sigmoid_focal_loss.cpp | 111 +++++++++++++----------------- 1 file changed, 49 insertions(+), 62 deletions(-) diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp index f2f6ec5d17..fa90ceb218 100644 --- a/test/gtest/sigmoid_focal_loss.cpp +++ b/test/gtest/sigmoid_focal_loss.cpp @@ -25,15 +25,10 @@ *******************************************************************************/ #include "sigmoid_focal_loss.hpp" -#include "miopen/bfloat16.hpp" #include "tensor_holder.hpp" +#include #include -#define TEST_FWD_REDUCED -#define TEST_BWD_REDUCED -#define TEST_FWD_UNREDUCED -#define TEST_BWD_UNREDUCED - MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) @@ -49,59 +44,58 @@ std::string GetFloatArg() return tmp; } -struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest +struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest { }; -struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest +struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest { }; 
-struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest +struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest { }; -struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest +struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest { }; -struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest +struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest { }; -struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest +struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest { }; -struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest +struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest { }; -struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest +struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest { }; -struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest +struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest { }; -struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest +struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest { }; -struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest +struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest { }; -struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest +struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest { }; }; // namespace sigmoidfocalloss using namespace sigmoidfocalloss; -#ifdef TEST_FWD_REDUCED -TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest) +TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -115,11 +109,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat32, 
SigmoidFocalLossForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, - SigmoidFocalLossForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_fwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest) +TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -133,11 +127,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, - SigmoidFocalLossForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_fwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest) +TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -151,13 +145,11 @@ TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet, - SigmoidFocalLossForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_fwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif -#ifdef TEST_BWD_REDUCED -TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest) +TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -171,11 +163,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, - SigmoidFocalLossBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_bwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest) 
+TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -189,11 +181,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, - SigmoidFocalLossBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_bwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest) +TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -207,13 +199,11 @@ TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet, - SigmoidFocalLossBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLoss_bwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif -#ifdef TEST_FWD_UNREDUCED -TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -227,11 +217,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedFor } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, - SigmoidFocalLossUnreducedForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_fwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -245,11 +235,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, 
SigmoidFocalLossUnreducedFor } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, - SigmoidFocalLossUnreducedForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_fwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -263,13 +253,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedFo } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet, - SigmoidFocalLossUnreducedForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_fwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif -#ifdef TEST_BWD_UNREDUCED -TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -283,11 +271,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBa } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, - SigmoidFocalLossUnreducedBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_bwd_FP32, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -301,11 +289,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBa } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, - SigmoidFocalLossUnreducedBackwardTestFloat16, 
+INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_bwd_FP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest) +TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -319,7 +307,6 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedB } }; -INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet, - SigmoidFocalLossUnreducedBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_SigmoidFocalLossUnreduced_bwd_BFP16, testing::ValuesIn(SigmoidFocalLossTestConfigs())); -#endif From 182ea0b86a794d0de1b5b416e1c25576ed7b4f59 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 31 Jul 2024 13:26:31 +0700 Subject: [PATCH 13/28] use MultiBufferWorkspaceTraits --- .../miopen/sigmoidfocalloss/solvers.hpp | 3 ++ .../forward_reduce_sigmoid_focal_loss.cpp | 30 +++++++++++-------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/include/miopen/sigmoidfocalloss/solvers.hpp b/src/include/miopen/sigmoidfocalloss/solvers.hpp index 992ad5a9d6..9cb3bd15e8 100644 --- a/src/include/miopen/sigmoidfocalloss/solvers.hpp +++ b/src/include/miopen/sigmoidfocalloss/solvers.hpp @@ -50,6 +50,9 @@ struct SigmoidFocalLossFwd final : SigmoidFocalLossFwdSolverBase const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const override; + MultiBufferWorkspaceTraits GetMultiBufferWorkspaceTraits( + const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const; + std::size_t GetWorkspaceSize(const ExecutionContext& context, const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp index d3f874251f..f1a37fc54f 
100644 --- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp +++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/buffer_info.hpp" #include #include #include @@ -36,7 +37,7 @@ #include #define LOCAL_SIZE 256 -#define LOCAL_SIZE_REDUCE_FWD 256 +#define LOCAL_SIZE_REDUCE 256 namespace miopen { @@ -83,11 +84,11 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( do { result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_REDUCE_FWD}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params)); - _size = AlignUp(_size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD; + {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params)); + _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE; } while(_size > 1); - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [this, problem](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) params = raw_params.CastTo(); auto size = deref(params.inputDesc).GetElementSize(); @@ -127,11 +128,11 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( } /* Execute reduce kernels */ + auto wt = GetMultiBufferWorkspaceTraits(problem); auto reduceIn = params.workspace; auto reduceOut = - static_cast(static_cast(params.workspace) + - deref(params.inputDesc).GetElementSize() * - get_data_size(deref(params.outputDesc).GetType())); + static_cast(static_cast(params.workspace) + wt.GetOffset(1)); + for(int i = 1; i < kernels.size(); ++i) { decltype(auto) kernel = handle_.Run(kernels[i]); @@ -144,7 +145,7 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( { kernel(reduceIn, params.output, size); } - size = AlignUp(size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD; + size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE; } if(profiling) @@ -169,13 +170,18 @@ ConvSolution 
SigmoidFocalLossFwd::GetSolution( std::size_t SigmoidFocalLossFwd::GetWorkspaceSize( const ExecutionContext& /*context*/, const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const +{ + return GetMultiBufferWorkspaceTraits(problem).GetSize(); +} + +MultiBufferWorkspaceTraits SigmoidFocalLossFwd::GetMultiBufferWorkspaceTraits( + const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const { size_t inputElements = problem.GetInputDesc().GetElementSize(); - size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE_FWD - 1) / LOCAL_SIZE_REDUCE_FWD; - size_t res = - (inputElements + reduceElements) * get_data_size(problem.GetOutputDesc().GetType()); + size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE; + size_t elementSize = get_data_size(problem.GetOutputDesc().GetType()); - return res; + return MultiBufferWorkspaceTraits{inputElements * elementSize, reduceElements * elementSize}; } } // namespace sigmoidfocalloss From 19c6390ea9fcb1363969612624a01a586722853d Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Mon, 5 Aug 2024 17:33:54 +0700 Subject: [PATCH 14/28] remove redundant files --- driver/driver.hpp | 3 +- rocfft_r2c_ex.cpp | 317 ---------------------------------------------- 2 files changed, 1 insertion(+), 319 deletions(-) delete mode 100644 rocfft_r2c_ex.cpp diff --git a/driver/driver.hpp b/driver/driver.hpp index 749ee16a17..1c4e59c371 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -206,8 +206,7 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && - arg != "getitemfp16" && arg != "getitembfp16" && arg != "transformersadamwfp16" && - arg != "transformersampadamw" && arg != "reducecalculation" && + arg != "getitemfp16" && arg 
!= "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "sigmoidfocalloss" && arg != "sigmoidfocallossfp16" && arg != "sigmoidfocallossbfp16" && arg != "--version") diff --git a/rocfft_r2c_ex.cpp b/rocfft_r2c_ex.cpp deleted file mode 100644 index 8c17fac21b..0000000000 --- a/rocfft_r2c_ex.cpp +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
- -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "../../../shared/CLI11.hpp" -#include "examplekernels.h" -#include "exampleutils.h" -#include - -int main(int argc, char* argv[]) -{ - std::cout << "rocfft double-precision real/complex transform\n" << std::endl; - - // Length of transform: - std::vector length = {8}; - - // Gpu device id: - size_t deviceId = 0; - - // Command-line options: - CLI::App app{"rocfft sample command line options"}; - app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); - CLI::Option* opt_outofplace = - app.add_flag("-o, --outofplace", "Perform an out-of-place transform"); - CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform"); - app.add_option( - "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)"); - - try - { - app.parse(argc, argv); - } - catch(const CLI::ParseError& e) - { - return app.exit(e); - } - - // Placeness for the transform - if(rocfft_setup() != rocfft_status_success) - throw std::runtime_error("rocfft_setup failed."); - const rocfft_result_placement place = - *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace; - const bool inplace = place == rocfft_placement_inplace; - - // Direction of transform - const rocfft_transform_type direction = - *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward; - const bool forward = direction == rocfft_transform_type_real_forward; - - // Set up the strides and buffer size for the real values: - std::vector rstride = {1}; - for(unsigned int i = 1; i < length.size(); ++i) - { - // In-place transforms need space for two extra real values in the contiguous - // direction. - auto val = (length[i - 1] + ((inplace && i == 1) ? 
2 : 0)) * rstride[i - 1]; - rstride.push_back(val); - } - // NB: not tight, but hey - const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1]; - std::vector rdata(real_size); // host storage - - // The complex data length is half + 1 of the real data length in the contiguous - // dimensions. Since rocFFT is column-major, this is the first index. - std::vector clength = length; - clength[0] = clength[0] / 2 + 1; - std::vector cstride = {1}; - for(unsigned int i = 1; i < clength.size(); ++i) - { - cstride.push_back(clength[i - 1] * cstride[i - 1]); - } - const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1]; - std::vector cdata(complex_size); // host storage - - // Based on the direction, we set the input and output parameters appropriately. - const size_t isize = forward ? real_size : complex_size; - const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex)); - const std::vector ilength = forward ? length : clength; - const std::vector istride = forward ? rstride : cstride; - - const size_t osize = forward ? complex_size : real_size; - const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double)); - const std::vector olength = forward ? clength : length; - const std::vector ostride = forward ? 
cstride : rstride; - - // Print information about the transform: - std::cout << "direction: "; - if(forward) - std::cout << "forward\n"; - else - std::cout << "inverse\n"; - std::cout << "length:"; - for(const auto i : length) - std::cout << " " << i; - std::cout << "\n"; - if(inplace) - std::cout << "in-place transform\n"; - else - std::cout << "out-of-place transform\n"; - std::cout << "deviceID: " << deviceId << "\n"; - std::cout << "input length:"; - for(auto i : ilength) - std::cout << " " << i; - std::cout << "\n"; - std::cout << "input buffer stride:"; - for(auto i : istride) - std::cout << " " << i; - std::cout << "\n"; - std::cout << "input buffer size: " << ibytes << "\n"; - - std::cout << "output length:"; - for(auto i : olength) - std::cout << " " << i; - std::cout << "\n"; - std::cout << "output buffer stride:"; - for(auto i : ostride) - std::cout << " " << i; - std::cout << "\n"; - std::cout << "output buffer size: " << obytes << "\n"; - std::cout << std::endl; - - // Set the device: - if(hipSetDevice(deviceId) != hipSuccess) - throw std::runtime_error("hipSetDevice failed."); - - // Create HIP device object and initialize data - // Kernels are provided in examplekernels.h - void* gpu_in = nullptr; - hipError_t hip_status = hipMalloc(&gpu_in, inplace ? 
std::max(ibytes, obytes) : ibytes); - if(hip_status != hipSuccess) - throw std::runtime_error("device error"); - - if(forward) - { - initreal_cm(length, istride, gpu_in); - } - else - { - init_hermitiancomplex_cm(length, ilength, istride, gpu_in); - } - - // Print the input: - std::cout << "input:\n"; - if(forward) - { - hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); - if(hip_status != hipSuccess) - throw std::runtime_error("hipMemcpy failed."); - printbuffer_cm(rdata, ilength, istride, 1, isize); - } - else - { - hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); - if(hip_status != hipSuccess) - throw std::runtime_error("hipMemcpy failed."); - printbuffer_cm(cdata, ilength, istride, 1, isize); - - // Check that the buffer is Hermitian symmetric: - check_symmetry_cm(cdata, length, istride, 1, isize); - } - - // rocfft_status can be used to capture API status info - rocfft_status rc = rocfft_status_success; - - // Create the a descrition struct to set data layout: - rocfft_plan_description gpu_description = nullptr; - rc = rocfft_plan_description_create(&gpu_description); - if(rc != rocfft_status_success) - throw std::runtime_error("failed to create plan description"); - - rc = rocfft_plan_description_set_data_layout( - gpu_description, - // input data format: - forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved, - // output data format: - forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real, - nullptr, - nullptr, - istride.size(), // input stride length - istride.data(), // input stride data - 0, // input batch distance - ostride.size(), // output stride length - ostride.data(), // output stride data - 0); // ouptut batch distance - if(rc != rocfft_status_success) - throw std::runtime_error("failed to set data layout"); - - // We can also pass "nullptr" instead of a description; rocFFT will use reasonable - // default parameters. 
If the data isn't contiguous, we need to set strides, etc, - // using the description. - - // Create the FFT plan: - rocfft_plan gpu_plan = nullptr; - rc = rocfft_plan_create(&gpu_plan, - place, - direction, - rocfft_precision_double, - length.size(), // Dimension - length.data(), // lengths - 1, // Number of transforms - gpu_description); // Description - if(rc != rocfft_status_success) - throw std::runtime_error("failed to create plan"); - - // Get the execution info for the fft plan (in particular, work memory requirements): - rocfft_execution_info planinfo = nullptr; - rc = rocfft_execution_info_create(&planinfo); - if(rc != rocfft_status_success) - throw std::runtime_error("failed to create execution info"); - - size_t workbuffersize = 0; - rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); - if(rc != rocfft_status_success) - throw std::runtime_error("failed to get work buffer size"); - - // If the transform requires work memory, allocate a work buffer: - void* wbuffer = nullptr; - if(workbuffersize > 0) - { - hip_status = hipMalloc(&wbuffer, workbuffersize); - if(hip_status != hipSuccess) - throw std::runtime_error("hipMalloc failed"); - - rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); - if(rc != rocfft_status_success) - throw std::runtime_error("failed to set work buffer"); - } - - // If the transform is out-of-place, allocate the output buffer as well: - void* gpu_out = inplace ? 
gpu_in : nullptr; - if(!inplace) - { - hip_status = hipMalloc(&gpu_out, obytes); - if(hip_status != hipSuccess) - throw std::runtime_error("hipMalloc failed"); - } - - // Execute the GPU transform: - rc = rocfft_execute(gpu_plan, // plan - (void**)&gpu_in, // in_buffer - (void**)&gpu_out, // out_buffer - planinfo); // execution info - if(rc != rocfft_status_success) - throw std::runtime_error("failed to execute"); - - // Get the output from the device and print to cout: - std::cout << "output:\n"; - if(forward) - { - hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); - if(hip_status != hipSuccess) - throw std::runtime_error("hipMemcpy failed."); - printbuffer_cm(cdata, olength, ostride, 1, osize); - } - else - { - hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); - if(hip_status != hipSuccess) - throw std::runtime_error("hipMemcpy failed."); - printbuffer_cm(rdata, olength, ostride, 1, osize); - } - - // Clean up: free GPU memory: - if(hipFree(gpu_in) != hipSuccess) - throw std::runtime_error("hipFree failed."); - - if(!inplace) - { - if(hipFree(gpu_out) != hipSuccess) - throw std::runtime_error("hipFree failed."); - } - if(wbuffer != nullptr) - { - if(hipFree(wbuffer) != hipSuccess) - throw std::runtime_error("hipFree failed."); - } - - // Clean up: destroy plans: - if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) - throw std::runtime_error("rocfft_execution_info_destroy failed."); - planinfo = nullptr; - if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) - throw std::runtime_error("rocfft_plan_description_destroy failed."); - gpu_description = nullptr; - if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) - throw std::runtime_error("rocfft_plan_destroy failed."); - gpu_plan = nullptr; - - rocfft_cleanup(); - return 0; -} From ae2ee253d3e2fa531c5e611846dae5e5ebdcc07b Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Mon, 5 Aug 2024 21:22:36 +0700 Subject: 
[PATCH 15/28] rollback src/include/miopen/solver/implicitgemm_ck_util.hpp --- .../miopen/solver/implicitgemm_ck_util.hpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index abdd171227..ff25d5f622 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -680,7 +680,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, internal::MakeTaggedTransposeInstances( result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des); - result.invoker_factory = [split_k, + result.invoker_factory = [split_k = split_k, ck_args = std::move(ck_args), sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}, input1_tr_inst = std::move(_input1_tr_inst), @@ -689,7 +689,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, output_init_tr_inst = std::move(_output_init_tr_inst), ck_buff_des = _ck_buff_des](const std::vector& kernels) mutable { - return [split_k, + return [split_k = split_k, kernels, ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr), @@ -697,8 +697,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, input2_tr_inst = std::move(input2_tr_inst), output_tr_inst = std::move(output_tr_inst), output_init_tr_inst = std::move(output_init_tr_inst), - ck_buff_des](const Handle& handle, - const AnyInvokeParams& primitive_parameters) mutable { + ck_buff_des = ck_buff_des](const Handle& handle, + const AnyInvokeParams& primitive_parameters) mutable { handle.ResetKernelTime(); const auto& data_ctx = primitive_parameters.CastTo(); @@ -826,17 +826,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&, [[maybe_unused]] bool should_allocated_wrw_buffer = ShouldAllocateWorkSpaceBufferForWRW(problem); - result.invoker_factory = [split_k, - ck_args = CKArgsType{problem}, - alpha_beta_case, - 
should_allocated_wrw_buffer, + result.invoker_factory = [split_k = split_k, + ck_args = CKArgsType{problem}, + alpha_beta_case = alpha_beta_case, + should_allocated_wrw_buffer = should_allocated_wrw_buffer, sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}]( const std::vector&) mutable { - return [split_k, - ck_args = std::move(ck_args), - alpha_beta_case, - should_allocated_wrw_buffer, - sh_conv_ptr = std::move(sh_conv_ptr)]( + return [split_k = split_k, + ck_args = std::move(ck_args), + alpha_beta_case = alpha_beta_case, + should_allocated_wrw_buffer = should_allocated_wrw_buffer, + sh_conv_ptr = std::move(sh_conv_ptr)]( const Handle& handle, const AnyInvokeParams& primitive_parameters) { const auto& data_ctx = primitive_parameters.CastTo(); std::unique_ptr argument_ptr; From c1c602c0f6f18846c2c30fd1ce7119b3737a1bde Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 6 Aug 2024 11:25:22 +0700 Subject: [PATCH 16/28] revert warp_shuffle using shlf_down --- src/kernels/warp_shuffle.hpp | 68 +++++------------------------------- 1 file changed, 8 insertions(+), 60 deletions(-) diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp index 5caf7d2ec4..c1b53ea565 100644 --- a/src/kernels/warp_shuffle.hpp +++ b/src/kernels/warp_shuffle.hpp @@ -24,6 +24,9 @@ * *******************************************************************************/ +#ifndef GUARD_WARP_SHUFFLE_HPP +#define GUARD_WARP_SHUFFLE_HPP + #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include @@ -52,78 +55,23 @@ __device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val) return val; } -template -__forceinline__ __device__ float reductionFullWarp(float reduced_val, uint32_t laneId) -{ - static_assert(WARP_SIZE != 0, "WARP_SIZEmust not be 0"); - static_assert((SWIZZLE_SIZE & (SWIZZLE_SIZE - 1)) == 0, - "WARP_SIZE and SWIZZLE must be a power of 2"); - - if constexpr(SWIZZLE_SIZE == 1) - return reduced_val; - - reduced_val = reductionFullWarp> 1)>(reduced_val, laneId); - - constexpr 
uint32_t warp_msk = (WARP_SIZE - 1); - - float tmp; - if constexpr(SWIZZLE_SIZE >= 64) - { - // swizzle can handle only 32 lanes, switching to bpermute - uint32_t idx = laneId ^ (SWIZZLE_SIZE >> 1); - - idx = idx >= ((laneId + WARP_SIZE) & ~warp_msk) ? laneId : idx; - int itmp = - __builtin_amdgcn_ds_bpermute(static_cast(idx << 2), __float_as_int(reduced_val)); - tmp = __int_as_float(itmp); - } - else - { - // butterfly reduction based on __shfl_xor - // swizzle () - constexpr uint32_t xor_off = 10; - // constexpr uint32_t or_off = 5; - constexpr uint32_t and_off = 0; - - constexpr uint32_t field_msk = 0x1f; - - constexpr uint32_t and_msk = warp_msk & field_msk; - // constexpr uint32_t or_msk = 0; - constexpr uint32_t xor_msk = (SWIZZLE_SIZE >> 1) & field_msk; - - // clang tidy does not like that (or_msk << or_off) is zero - // and cliams that it's redundant, but it's required for - // __hip_ds_swizzlef_N reference. Menawhile swizzle_op generation - // must be a part of hip intrinsics, because it depends on ISA - // like __hip_ds_swizzlef_N - // For some reason NILINT doesn't work. - // NOLINTBEGIN - constexpr uint32_t swizzle_op = - (xor_msk << xor_off) /* | (or_msk << or_off) */ | (and_msk << and_off); - // NOLINTEND - - tmp = __hip_ds_swizzlef_N(reduced_val); - } - - return tmp + reduced_val; -}; - __device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val) { static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize]; auto lane = threadIdx.x % warpSize; auto wid = threadIdx.x / warpSize; - val = reductionFullWarp(val, lane); + val = warp_reduce_sum(val); if(lane == 0) shared[wid] = val; __syncthreads(); val = threadIdx.x < REDUCE_SIZE / warpSize ? 
shared[lane] : 0; - // if(wid == 0) - // val = warp_reduce_sum(val); - val = reductionFullWarp(val, lane); + if(wid == 0) + val = warp_reduce_sum(val); return val; } + +#endif // GUARD_WARP_SHUFFLE_HPP From edcd7e7018a4745c58b0d4a48a8c479c0f6d9e64 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 6 Aug 2024 16:41:54 +0700 Subject: [PATCH 17/28] include header in .cpp file --- src/include/miopen/sigmoid_focal_loss.hpp | 1 + src/sigmoid_focal_loss.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/include/miopen/sigmoid_focal_loss.hpp b/src/include/miopen/sigmoid_focal_loss.hpp index cbb6dff65d..353c8b479a 100644 --- a/src/include/miopen/sigmoid_focal_loss.hpp +++ b/src/include/miopen/sigmoid_focal_loss.hpp @@ -26,6 +26,7 @@ #ifndef MIOPEN_SIGMOID_FOCAL_LOSS_HPP_ #define MIOPEN_SIGMOID_FOCAL_LOSS_HPP_ +#include #include namespace miopen { diff --git a/src/sigmoid_focal_loss.cpp b/src/sigmoid_focal_loss.cpp index e1123a799c..3858f0a918 100644 --- a/src/sigmoid_focal_loss.cpp +++ b/src/sigmoid_focal_loss.cpp @@ -25,6 +25,7 @@ *******************************************************************************/ #include +#include #include #include #include From 091aa5b8b8345cb8ffcb7a80ef42745703d53ad1 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Thu, 8 Aug 2024 16:20:04 +0700 Subject: [PATCH 18/28] merge duplicate code to validate in CPU and driver --- driver/mloSigmoidFocalLossHost.hpp | 163 +++++++++++++ driver/sigmoid_focal_loss_driver.hpp | 327 +++------------------------ test/cpu_sigmoid_focal_loss.hpp | 146 +++--------- test/gtest/sigmoid_focal_loss.hpp | 36 ++- 4 files changed, 252 insertions(+), 420 deletions(-) create mode 100644 driver/mloSigmoidFocalLossHost.hpp diff --git a/driver/mloSigmoidFocalLossHost.hpp b/driver/mloSigmoidFocalLossHost.hpp new file mode 100644 index 0000000000..555c0b4e88 --- /dev/null +++ b/driver/mloSigmoidFocalLossHost.hpp @@ -0,0 +1,163 @@ +#include +#include + +template +void mloSigmoidFocalLossFwdRunHost(Tgpu* input, + 
miopenTensorDescriptor_t inputDesc, + Tgpu* target, + miopenTensorDescriptor_t targetDesc, + Tcheck* outputHost, + miopenTensorDescriptor_t outputDesc, + Tcheck* workspaceHost, + float alpha, + float gamma, + miopenLossReductionMode_t reduction, + float divisor) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + size_t inputSize = miopen::deref(inputDesc).GetElementSize(); + + for(size_t id = 0; id < inputSize; ++id) + { + tensor_layout_t<5> idx(input_tv, id); + + Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + + Tcheck sig = 1 / (1 + exp(-i)); + Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); + Tcheck sigT = sig * t + (1 - sig) * (1 - t); + Tcheck loss = ceLoss * pow(1 - sigT, gamma); + + if(alpha >= 0) + { + Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t); + loss = alphaT * loss; + } + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + outputHost[output_tv.get_tensor_view_idx(idx)] = loss; + } + else + { + workspaceHost[id] = static_cast(loss / divisor); + } + } + + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + return; + + // Reduce loss + const int local_size = 256; + int offset_a = 0; + int offset_b = inputSize; + size_t _size = inputSize; + do + { + for(int i = 0; i < _size; i += local_size) + { + Tcheck shared[local_size]; + for(int j = 0; j < local_size; ++j) + shared[j] = i + j < _size ? 
workspaceHost[offset_a + i + j] : 0.0f; + for(int offset = local_size / 2; offset > 0; offset >>= 1) + for(int j = 0; j < offset; ++j) + shared[j] += shared[j + offset]; + if(_size <= local_size) + outputHost[0] = shared[0]; + else + workspaceHost[offset_b + i / local_size] = shared[0]; + } + std::swap(offset_a, offset_b); + _size = (_size + local_size - 1) / local_size; + } while(_size > 1); +} + +template +void mloSigmoidFocalLossBwdRunHost(Tgpu* input, + miopenTensorDescriptor_t inputDesc, + Tgpu* target, + miopenTensorDescriptor_t targetDesc, + Tgpu* doutput, + miopenTensorDescriptor_t doutputDesc, + Tcheck* dinput, + miopenTensorDescriptor_t dinputDesc, + Tcheck* dtarget, + miopenTensorDescriptor_t dtargetDesc, + float alpha, + float gamma, + miopenLossReductionMode_t reduction, + float divisor) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc)); + + size_t inputSize = miopen::deref(inputDesc).GetElementSize(); + + tensor_layout_t<5> doIdx(input_tv, 0); + Tcheck dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); + + for(size_t id = 0; id < inputSize; ++id) + { + tensor_layout_t<5> idx(input_tv, id); + + Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); + } + + Tcheck p = 1 / (1 + exp(-i)); + Tcheck ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); + Tcheck pT = p * t + (1 - p) * (1 - t); + Tcheck powPt = pow(1 - pT, gamma); + Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t); + + if(dinput) + { + Tcheck 
dpdi = exp(-i) / pow(1 + exp(-i), 2); + Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; + Tcheck dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; + + // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di + Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi; + Tcheck grad = dO * dLdi; + + if(alpha >= 0) + { + grad *= alpha_t; + } + if(reduction != MIOPEN_LOSS_REDUCTION_NONE) + { + grad /= divisor; + } + dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); + } + + if(dtarget) + { + Tcheck dcelossdt = -log(p) + log(1 - p); + Tcheck dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); + // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt + Tcheck dLdt = dcelossdt * powPt + ceLoss * dpowptdt; + Tcheck gradTarget = dO * dLdt; + + if(alpha >= 0) + { + // alpha_t * dL/dt + dalpha_t/dt * dL + gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; + } + if(reduction != MIOPEN_LOSS_REDUCTION_NONE) + { + gradTarget /= divisor; + } + dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); + } + } +} diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index 7b51e38fd4..a6ee50fbd1 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -28,259 +28,15 @@ #include "InputFlags.hpp" #include "driver.hpp" #include -#include #include #include "tensor_driver.hpp" #include "timer.hpp" +#include "mloSigmoidFocalLossHost.hpp" #include <../test/tensor_holder.hpp> #include <../test/verify.hpp> #include #include -template -void mloSigmoidFocalLossUnreducedFwdRunHost(Tgpu* input, - miopenTensorDescriptor_t inputDesc, - Tgpu* target, - miopenTensorDescriptor_t targetDesc, - Tcheck* outputHost, - miopenTensorDescriptor_t outputDesc, - float alpha = 0.25, - float gamma = 2) -{ - auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); - auto target_tv = 
miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); - auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - size_t inputSize = miopen::deref(inputDesc).GetElementSize(); - - for(size_t id = 0; id < inputSize; ++id) - { - tensor_layout_t<5> idx(input_tv, id); - - Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - - Tcheck sig = 1 / (1 + exp(-i)); - Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); - Tcheck sigT = sig * t + (1 - sig) * (1 - t); - Tcheck loss = ceLoss * pow(1 - sigT, gamma); - - if(alpha >= 0) - { - Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t); - loss = alphaT * loss; - } - - outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss); - } -} - -template -void mloSigmoidFocalLossUnreducedBwdRunHost(Tgpu* input, - miopenTensorDescriptor_t inputDesc, - Tgpu* target, - miopenTensorDescriptor_t targetDesc, - Tgpu* doutput, - miopenTensorDescriptor_t doutputDesc, - Tcheck* dinput, - miopenTensorDescriptor_t dinputDesc, - Tcheck* dtarget, - miopenTensorDescriptor_t dtargetDesc, - float alpha = 0.25, - float gamma = 2) -{ - auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); - auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); - auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); - auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); - auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc)); - size_t inputSize = miopen::deref(inputDesc).GetElementSize(); - - for(size_t id = 0; id < inputSize; ++id) - { - tensor_layout_t<5> idx(input_tv, id); - - Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - Tcheck dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); - - Tcheck p = 1 / (1 + exp(-i)); - Tcheck 
ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); - Tcheck pT = p * t + (1 - p) * (1 - t); - Tcheck powPt = pow(1 - pT, gamma); - Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t); - - if(dinput) - { - Tcheck dpdi = exp(-i) / pow(1 + exp(-i), 2); - Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; - Tcheck dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; - - // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di - Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi; - Tcheck grad = dO * dLdi; - - if(alpha >= 0) - { - grad *= alpha_t; - } - dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); - } - - if(dtarget) - { - Tcheck dcelossdt = -log(p) + log(1 - p); - Tcheck dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); - // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt - Tcheck dLdt = dcelossdt * powPt + ceLoss * dpowptdt; - Tcheck gradTarget = dO * dLdt; - - if(alpha >= 0) - { - // alpha_t * dL/dt + dalpha_t/dt * dL - gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; - } - dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); - } - } -} - -template -void mloSigmoidFocalLossFwdRunHost(Tgpu* input, - miopenTensorDescriptor_t inputDesc, - Tgpu* target, - miopenTensorDescriptor_t targetDesc, - Tcheck* workspaceHost, - Tcheck* outputHost, - float alpha = 0.25, - float gamma = 2, - float divisor = 1) -{ - auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); - auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); - size_t inputSize = miopen::deref(inputDesc).GetElementSize(); - - for(size_t id = 0; id < inputSize; ++id) - { - tensor_layout_t<5> idx(input_tv, id); - - Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - Tcheck t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - - Tcheck sig = 1 / (1 + exp(-i)); - Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig)); - Tcheck sigT = 
sig * t + (1 - sig) * (1 - t); - Tcheck loss = ceLoss * pow(1 - sigT, gamma); - - if(alpha >= 0) - { - Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t); - loss = alphaT * loss; - } - - workspaceHost[id] = static_cast(loss / divisor); - } - - // Reduce loss - const int local_size = 256; - int offset_a = 0; - int offset_b = inputSize; - size_t _size = inputSize; - do - { - for(int i = 0; i < _size; i += local_size) - { - Tcheck shared[local_size]; - for(int j = 0; j < local_size; ++j) - shared[j] = i + j < _size ? workspaceHost[offset_a + i + j] : 0.0f; - for(int offset = local_size / 2; offset > 0; offset >>= 1) - for(int j = 0; j < offset; ++j) - shared[j] += shared[j + offset]; - if(_size <= local_size) - outputHost[0] = shared[0]; - else - workspaceHost[offset_b + i / local_size] = shared[0]; - } - std::swap(offset_a, offset_b); - _size = (_size + local_size - 1) / local_size; - } while(_size > 1); -} - -template -void mloSigmoidFocalLossBwdRunHost(Tgpu* input, - miopenTensorDescriptor_t inputDesc, - Tgpu* target, - miopenTensorDescriptor_t targetDesc, - Tgpu* doutput, - miopenTensorDescriptor_t doutputDesc, - Tcheck* dinput, - miopenTensorDescriptor_t dinputDesc, - Tcheck* dtarget, - miopenTensorDescriptor_t dtargetDesc, - float alpha = 0.25, - float gamma = 2, - float divisor = 1) -{ - auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); - auto target_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc)); - auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); - auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); - auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc)); - - size_t inputSize = miopen::deref(inputDesc).GetElementSize(); - - tensor_layout_t<5> doIdx(input_tv, 0); - - for(size_t id = 0; id < inputSize; ++id) - { - tensor_layout_t<5> idx(input_tv, id); - - Tcheck i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - Tcheck t = 
static_cast(target[target_tv.get_tensor_view_idx(idx)]); - Tcheck dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); - - Tcheck p = 1 / (1 + exp(-i)); - Tcheck ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); - Tcheck pT = p * t + (1 - p) * (1 - t); - Tcheck powPt = pow(1 - pT, gamma); - Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t); - - if(dinput) - { - Tcheck dpdi = exp(-i) / pow(1 + exp(-i), 2); - Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; - Tcheck dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; - - // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di - Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi; - Tcheck grad = dO * dLdi; - - if(alpha >= 0) - { - grad *= alpha_t; - } - grad /= divisor; - dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); - } - - if(dtarget) - { - Tcheck dcelossdt = -log(p) + log(1 - p); - Tcheck dpowptdt = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p); - // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt - Tcheck dLdt = dcelossdt * powPt + ceLoss * dpowptdt; - Tcheck gradTarget = dO * dLdt; - - if(alpha >= 0) - { - // alpha_t * dL/dt + dalpha_t/dt * dL - gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; - } - gradTarget /= divisor; - dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); - } - } -} - template class SigmoidFocalLossDriver : public Driver { @@ -595,30 +351,17 @@ int SigmoidFocalLossDriver::RunForwardGPU() template int SigmoidFocalLossDriver::RunForwardCPU() { - if(reduction == MIOPEN_LOSS_REDUCTION_NONE) - { - mloSigmoidFocalLossUnreducedFwdRunHost(input.data(), - inputDesc, - target.data(), - targetDesc, - outputHost.data(), - outputDesc, - alpha, - gamma); - } - else - { - mloSigmoidFocalLossFwdRunHost(input.data(), - inputDesc, - target.data(), - targetDesc, - workspaceHost.data(), - outputHost.data(), - alpha, - gamma, - divisor); - } - + 
mloSigmoidFocalLossFwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + outputHost.data(), + outputDesc, + workspaceHost.data(), + alpha, + gamma, + reduction, + divisor); return miopenStatusSuccess; } @@ -693,38 +436,20 @@ int SigmoidFocalLossDriver::RunBackwardCPU() { p_dtarget = dtargetHost.data(); } - if(reduction == MIOPEN_LOSS_REDUCTION_NONE) - { - - mloSigmoidFocalLossUnreducedBwdRunHost(input.data(), - inputDesc, - target.data(), - targetDesc, - doutput.data(), - doutputDesc, - dinputHost.data(), - dinputDesc, - p_dtarget, - dtargetDesc, - alpha, - gamma); - } - else - { - mloSigmoidFocalLossBwdRunHost(input.data(), - inputDesc, - target.data(), - targetDesc, - doutput.data(), - doutputDesc, - dinputHost.data(), - dinputDesc, - p_dtarget, - dtargetDesc, - alpha, - gamma, - divisor); - } + mloSigmoidFocalLossBwdRunHost(input.data(), + inputDesc, + target.data(), + targetDesc, + doutput.data(), + doutputDesc, + dinputHost.data(), + dinputDesc, + p_dtarget, + dtargetDesc, + alpha, + gamma, + reduction, + divisor); return miopenStatusSuccess; } diff --git a/test/cpu_sigmoid_focal_loss.hpp b/test/cpu_sigmoid_focal_loss.hpp index 3b13b955e3..f1df613b54 100644 --- a/test/cpu_sigmoid_focal_loss.hpp +++ b/test/cpu_sigmoid_focal_loss.hpp @@ -1,16 +1,20 @@ #pragma once +#include "miopen/miopen.h" #include "tensor_holder.hpp" #include "tensor_view.hpp" #include #include template -void cpu_sigmoid_focal_loss_unreduced_forward(tensor input, - tensor target, - tensor& outputHost, - float alpha = 0.25, - float gamma = 2) +void cpu_sigmoid_focal_loss_forward(tensor input, + tensor target, + tensor& workspace, + tensor& outputHost, + float alpha, + float gamma, + miopenLossReductionMode_t reduction, + float divisor) { auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc); @@ -35,111 +39,18 @@ void cpu_sigmoid_focal_loss_unreduced_forward(tensor input, loss = alphaT * loss; } - 
outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss); - } -} - -template -void cpu_sigmoid_focal_loss_unreduced_backward(tensor input, - tensor target, - tensor doutput, - tensor& dinput, - tensor& dtarget, - float alpha = 0.25, - float gamma = 2) -{ - auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); - auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc); - auto doutput_tv = miopen::get_inner_expanded_tv<5>(doutput.desc); - auto dinput_tv = miopen::get_inner_expanded_tv<5>(dinput.desc); - auto dtarget_tv = miopen::get_inner_expanded_tv<5>(dtarget.desc); - size_t inputSize = input.desc.GetElementSize(); - - for(size_t id = 0; id < inputSize; ++id) - { - tensor_layout_t<5> idx(input_tv, id); - - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); - - float p = 1 / (1 + std::exp(-i)); - float ceLoss = -(t * std::log(p) + (1 - t) * std::log(1 - p)); - float pT = p * t + (1 - p) * (1 - t); - float powPt = std::pow(1 - pT, gamma); - float alpha_t = alpha * t + (1 - alpha) * (1 - t); - - if(dinput.data.size() > 0) + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) { - float dpdi = std::exp(-i) / std::pow(1 + std::exp(-i), 2); - float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi; - float dpowptdi = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi; - - // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di - float dLdi = dcelossdi * powPt + ceLoss * dpowptdi; - float grad = dO * dLdi; - - if(alpha >= 0) - { - grad *= alpha_t; - } - dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); + outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast(loss); } - - if(dtarget.data.size() > 0) + else { - float dcelossdt = -std::log(p) + std::log(1 - p); - float dpowptdt = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * p); - // L = ce_loss * pow_pt => dL/dt 
= dceloss/dt * pow_pt + ce_loss * dpowpt/dt - float dLdt = dcelossdt * powPt + ceLoss * dpowptdt; - float gradTarget = dO * dLdt; - - if(alpha >= 0) - { - // alpha_t * dL/dt + dalpha_t/dt * dL - gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; - } - dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); + workspace[id] = static_cast(loss / divisor); } } -} -template -void cpu_sigmoid_focal_loss_forward(tensor input, - tensor target, - tensor& workspace, - tensor& outputHost, - float alpha = 0.25, - float gamma = 2, - float divisor = 1) -{ - auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); - auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc); - size_t inputSize = input.desc.GetElementSize(); - // float reduction_float; - - for(size_t id = 0; id < inputSize; ++id) - { - tensor_layout_t<5> idx(input_tv, id); - - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - - float sig = 1 / (1 + std::exp(-i)); - float ceLoss = -(t * std::log(sig) + (1 - t) * std::log(1 - sig)); - float sigT = sig * t + (1 - sig) * (1 - t); - float loss = ceLoss * std::pow(1 - sigT, gamma); - - if(alpha >= 0) - { - float alphaT = alpha * t + (1 - alpha) * (1 - t); - loss = alphaT * loss; - } - // reduction_float += (loss / divisor); - - workspace[id] = static_cast(loss / divisor); - } - // std::cout << "Reduction result in float" << reduction_float << " " << divisor << std::endl; + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + return; // Reduce loss const int local_size = 256; @@ -172,9 +83,10 @@ void cpu_sigmoid_focal_loss_backward(tensor input, tensor doutput, tensor& dinput, tensor& dtarget, - float alpha = 0.25, - float gamma = 2, - float divisor = 1) + float alpha, + float gamma, + miopenLossReductionMode_t reduction, + float divisor) { auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto target_tv = 
miopen::get_inner_expanded_tv<5>(target.desc); @@ -185,14 +97,18 @@ void cpu_sigmoid_focal_loss_backward(tensor input, size_t inputSize = input.desc.GetElementSize(); tensor_layout_t<5> doIdx(input_tv, 0); + float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); for(size_t id = 0; id < inputSize; ++id) { tensor_layout_t<5> idx(input_tv, id); - float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); - float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); - float dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); + float i = static_cast(input[input_tv.get_tensor_view_idx(idx)]); + float t = static_cast(target[target_tv.get_tensor_view_idx(idx)]); + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + dO = static_cast(doutput[doutput_tv.get_tensor_view_idx(idx)]); + } float p = 1 / (1 + std::exp(-i)); float ceLoss = -(t * std::log(p) + (1 - t) * std::log(1 - p)); @@ -214,7 +130,10 @@ void cpu_sigmoid_focal_loss_backward(tensor input, { grad *= alpha_t; } - grad /= divisor; + if(reduction != MIOPEN_LOSS_REDUCTION_NONE) + { + grad /= divisor; + } dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast(grad); } @@ -231,7 +150,10 @@ void cpu_sigmoid_focal_loss_backward(tensor input, // alpha_t * dL/dt + dalpha_t/dt * dL gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt; } - gradTarget /= divisor; + if(reduction != MIOPEN_LOSS_REDUCTION_NONE) + { + gradTarget /= divisor; + } dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast(gradTarget); } } diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp index 7443b7a94a..e612b2f86b 100644 --- a/test/gtest/sigmoid_focal_loss.hpp +++ b/test/gtest/sigmoid_focal_loss.hpp @@ -143,7 +143,9 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam(input, target, outputHost, config.alpha); + tensor workspace; + cpu_sigmoid_focal_loss_forward( + input, target, workspace, outputHost, config.alpha, config.gamma, 
config.reduction, 1); EXPECT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -231,8 +233,15 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam( - input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma); + cpu_sigmoid_focal_loss_backward(input, + target, + dOutput, + dInputHost, + dTargetHost, + config.alpha, + config.gamma, + config.reduction, + 1); EXPECT_EQ(status, miopenStatusSuccess); @@ -339,8 +348,14 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam( - input, target, workspace, outputHost, config.alpha, config.gamma, divisor); + cpu_sigmoid_focal_loss_forward(input, + target, + workspace, + outputHost, + config.alpha, + config.gamma, + config.reduction, + divisor); EXPECT_EQ(status, miopenStatusSuccess); @@ -441,8 +456,15 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam( - input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma, divisor); + cpu_sigmoid_focal_loss_backward(input, + target, + dOutput, + dInputHost, + dTargetHost, + config.alpha, + config.gamma, + config.reduction, + divisor); EXPECT_EQ(status, miopenStatusSuccess); From cf3bcc0da219274a658f0c4e9a6f347a2ac18c37 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Thu, 8 Aug 2024 17:10:50 +0700 Subject: [PATCH 19/28] remove param reduction in test config --- include/miopen/miopen.h | 2 +- test/gtest/sigmoid_focal_loss.hpp | 61 +++++++++++++++---------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index c983f92619..e36f80814d 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -1603,7 +1603,7 @@ miopenConvolutionBackwardWeightsGetSolution(miopenHandle_t handle, * as part of the * miopenConvSolution_t struct. 
* - * @param handle MIOpen handle (input) + * @param handle MIOpen handle (input * @param dyDesc Tensor descriptor for data tensor dy (input) * @param xDesc Tensor descriptor for data tensor x (input) * @param convDesc Convolution layer descriptor (input) diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp index e612b2f86b..7f2c1314b3 100644 --- a/test/gtest/sigmoid_focal_loss.hpp +++ b/test/gtest/sigmoid_focal_loss.hpp @@ -39,7 +39,6 @@ struct SigmoidFocalLossTestCase bool isContiguous; float alpha; float gamma; - miopenLossReductionMode_t reduction; friend std::ostream& operator<<(std::ostream& os, const SigmoidFocalLossTestCase& tc) { os << "dims: "; @@ -56,15 +55,11 @@ struct SigmoidFocalLossTestCase SigmoidFocalLossTestCase() {} SigmoidFocalLossTestCase(std::vector dim_, - bool isContiguous_ = true, - miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE, - float alpha_ = 0.25, - float gamma_ = 2) - : dims(dim_), - isContiguous(isContiguous_), - alpha(alpha_), - gamma(gamma_), - reduction(reduction_) + bool isContiguous_ = true, + // miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE, + float alpha_ = 0.25, + float gamma_ = 2) + : dims(dim_), isContiguous(isContiguous_), alpha(alpha_), gamma(gamma_) { } @@ -94,6 +89,10 @@ inline std::vector SigmoidFocalLossTestConfigs() SigmoidFocalLossTestCase({8, 3, 20, 100}, false), // 4D non-cont SigmoidFocalLossTestCase({2, 2, 3, 4, 100}), // 5D cont SigmoidFocalLossTestCase({2, 2, 3, 4, 100}, false), // 5D non-cont + SigmoidFocalLossTestCase({10}, + true, + 0.6, + 3), // 5D non-cont, custom alpha, gamma }; } @@ -105,6 +104,7 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam workspace; cpu_sigmoid_focal_loss_forward( - input, target, workspace, outputHost, config.alpha, config.gamma, config.reduction, 1); + input, target, workspace, outputHost, config.alpha, config.gamma, reduction, 1); EXPECT_EQ(status, miopenStatusSuccess); output.data 
= handle.Read(output_dev, output.data.size()); @@ -162,6 +162,7 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam input; tensor target; @@ -182,6 +183,7 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam(input, target, dOutput, @@ -240,7 +242,7 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam input; tensor target; @@ -294,7 +297,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam{in_dims, in_strides}.generate(tar_gen_value); size_t workspaceSizeBytes = miopen::GetSigmoidFocalLossForwardWorkspaceSize( - handle, input.desc, target.desc, output.desc, config.reduction); + handle, input.desc, target.desc, output.desc, reduction); size_t workspaceElements = workspaceSizeBytes / sizeof(TIO); workspace = tensor(workspaceElements); @@ -319,7 +322,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam(input, - target, - workspace, - outputHost, - config.alpha, - config.gamma, - config.reduction, - divisor); + reduction); + cpu_sigmoid_focal_loss_forward( + input, target, workspace, outputHost, config.alpha, config.gamma, reduction, divisor); EXPECT_EQ(status, miopenStatusSuccess); @@ -371,9 +368,10 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam input; tensor target; @@ -401,7 +399,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam(0.1, 50); }; input = tensor{in_dims, in_strides}.generate(in_gen_value); @@ -425,7 +423,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam(input, target, dOutput, @@ -463,7 +461,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam input; tensor target; From a41a00587df26d87eba1b7d167484090769adf5e Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Mon, 19 Aug 2024 15:46:20 +0700 Subject: [PATCH 20/28] change verify algo in CPU to naive accumulate in reduce kernels --- driver/mloSigmoidFocalLossHost.hpp | 30 +------------- driver/sigmoid_focal_loss_driver.hpp | 33 
++++++++++------ test/cpu_sigmoid_focal_loss.hpp | 59 +++++++++++++--------------- test/gtest/sigmoid_focal_loss.hpp | 44 ++++++++++----------- 4 files changed, 72 insertions(+), 94 deletions(-) diff --git a/driver/mloSigmoidFocalLossHost.hpp b/driver/mloSigmoidFocalLossHost.hpp index 555c0b4e88..2f77cd10ee 100644 --- a/driver/mloSigmoidFocalLossHost.hpp +++ b/driver/mloSigmoidFocalLossHost.hpp @@ -8,7 +8,6 @@ void mloSigmoidFocalLossFwdRunHost(Tgpu* input, miopenTensorDescriptor_t targetDesc, Tcheck* outputHost, miopenTensorDescriptor_t outputDesc, - Tcheck* workspaceHost, float alpha, float gamma, miopenLossReductionMode_t reduction, @@ -43,36 +42,9 @@ void mloSigmoidFocalLossFwdRunHost(Tgpu* input, } else { - workspaceHost[id] = static_cast(loss / divisor); + outputHost[0] += static_cast(loss / divisor); } } - - if(reduction == MIOPEN_LOSS_REDUCTION_NONE) - return; - - // Reduce loss - const int local_size = 256; - int offset_a = 0; - int offset_b = inputSize; - size_t _size = inputSize; - do - { - for(int i = 0; i < _size; i += local_size) - { - Tcheck shared[local_size]; - for(int j = 0; j < local_size; ++j) - shared[j] = i + j < _size ? 
workspaceHost[offset_a + i + j] : 0.0f; - for(int offset = local_size / 2; offset > 0; offset >>= 1) - for(int j = 0; j < offset; ++j) - shared[j] += shared[j + offset]; - if(_size <= local_size) - outputHost[0] = shared[0]; - else - workspaceHost[offset_b + i / local_size] = shared[0]; - } - std::swap(offset_a, offset_b); - _size = (_size + local_size - 1) / local_size; - } while(_size > 1); } template diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index a6ee50fbd1..1e14efd548 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -37,6 +37,8 @@ #include #include +const float MAX_FP16 = 65504; + template class SigmoidFocalLossDriver : public Driver { @@ -109,7 +111,6 @@ class SigmoidFocalLossDriver : public Driver std::vector dtarget; std::vector dtargetHost; std::vector workspace; - std::vector workspaceHost; float alpha; float gamma; @@ -252,7 +253,6 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() dtargetHost = std::vector(dT_sz, static_cast(0)); size_t workSpaceElems = workSpaceSizeInBytes / sizeof(Tgpu); workspace = std::vector(workSpaceElems, static_cast(0)); - workspaceHost = std::vector(workSpaceElems, static_cast(0)); float randomBound = 2; // For half, the random bound is smaller to avoid half overflow @@ -357,7 +357,6 @@ int SigmoidFocalLossDriver::RunForwardCPU() targetDesc, outputHost.data(), outputDesc, - workspaceHost.data(), alpha, gamma, reduction, @@ -457,13 +456,19 @@ int SigmoidFocalLossDriver::RunBackwardCPU() template Tcheck SigmoidFocalLossDriver::GetTolerance() { - // Computation error of fp16 is ~2^13 (=8192) bigger than - // the one of fp32 because mantissa is shorter by 13 bits. - auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + Tcheck tolerance; + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + } + else + { + tolerance = std::is_same::value ? 1.0e-2 : 8.2e-1; + } - // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. - if(std::is_same::value) - tolerance *= 8.0; return tolerance; } @@ -472,6 +477,12 @@ int SigmoidFocalLossDriver::VerifyForward() { RunForwardCPU(); + if(miopen::deref(inputDesc).GetType() == miopenHalf && + reduction != MIOPEN_LOSS_REDUCTION_NONE && abs(outputHost[0]) > MAX_FP16) + { + std::cout << "Float16 overflow - CPU output: " << outputHost[0] << std::endl; + } + const Tcheck tolerance = GetTolerance(); auto error = miopen::rms_range(outputHost, output); @@ -503,13 +514,13 @@ int SigmoidFocalLossDriver::VerifyBackward() { std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dinputError << " > " << tolerance << std::endl; - return EC_VerifyFwd; + return EC_VerifyBwd; } else if(isTargetGradientComputed && (!std::isfinite(dtargetError) || dtargetError > tolerance)) { std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dtargetError << " > " << tolerance << std::endl; - return EC_VerifyFwd; + return EC_VerifyBwd; } else { diff --git a/test/cpu_sigmoid_focal_loss.hpp b/test/cpu_sigmoid_focal_loss.hpp index f1df613b54..fe21c94e27 100644 --- a/test/cpu_sigmoid_focal_loss.hpp +++ b/test/cpu_sigmoid_focal_loss.hpp @@ -9,17 +9,17 @@ template void cpu_sigmoid_focal_loss_forward(tensor input, tensor target, - tensor& workspace, tensor& outputHost, float alpha, float gamma, miopenLossReductionMode_t reduction, float divisor) { - auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); - auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc); - auto output_tv = miopen::get_inner_expanded_tv<5>(outputHost.desc); - size_t inputSize = input.desc.GetElementSize(); + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto target_tv = miopen::get_inner_expanded_tv<5>(target.desc); + auto output_tv = 
miopen::get_inner_expanded_tv<5>(outputHost.desc); + size_t inputSize = input.desc.GetElementSize(); + float outputFloat = 0; for(size_t id = 0; id < inputSize; ++id) { @@ -45,36 +45,14 @@ void cpu_sigmoid_focal_loss_forward(tensor input, } else { - workspace[id] = static_cast(loss / divisor); + outputFloat += loss / divisor; } } - if(reduction == MIOPEN_LOSS_REDUCTION_NONE) - return; - - // Reduce loss - const int local_size = 256; - int offset_a = 0; - int offset_b = inputSize; - size_t _size = inputSize; - do + if(reduction != MIOPEN_LOSS_REDUCTION_NONE) { - for(int i = 0; i < _size; i += local_size) - { - TIO shared[local_size]; - for(int j = 0; j < local_size; ++j) - shared[j] = i + j < _size ? workspace[offset_a + i + j] : 0.0f; - for(int offset = local_size / 2; offset > 0; offset >>= 1) - for(int j = 0; j < offset; ++j) - shared[j] += shared[j + offset]; - if(_size <= local_size) - outputHost[0] = shared[0]; - else - workspace[offset_b + i / local_size] = shared[0]; - } - std::swap(offset_a, offset_b); - _size = (_size + local_size - 1) / local_size; - } while(_size > 1); + outputHost[0] = static_cast(outputFloat); + } } template @@ -158,3 +136,22 @@ void cpu_sigmoid_focal_loss_backward(tensor input, } } } + +template +float get_tolerance(miopenLossReductionMode_t reduction) +{ + float tolerance; + if(reduction == MIOPEN_LOSS_REDUCTION_NONE) + { + tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + } + else + { + tolerance = std::is_same::value ? 
1.0e-2 : 8.2e-1; + } + + return tolerance; +} diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp index 7f2c1314b3..db8c0aa7ca 100644 --- a/test/gtest/sigmoid_focal_loss.hpp +++ b/test/gtest/sigmoid_focal_loss.hpp @@ -56,9 +56,8 @@ struct SigmoidFocalLossTestCase SigmoidFocalLossTestCase(std::vector dim_, bool isContiguous_ = true, - // miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE, - float alpha_ = 0.25, - float gamma_ = 2) + float alpha_ = 0.25, + float gamma_ = 2) : dims(dim_), isContiguous(isContiguous_), alpha(alpha_), gamma(gamma_) { } @@ -143,9 +142,8 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam workspace; cpu_sigmoid_focal_loss_forward( - input, target, workspace, outputHost, config.alpha, config.gamma, reduction, 1); + input, target, outputHost, config.alpha, config.gamma, reduction, 1); EXPECT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -153,13 +151,13 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam::epsilon(); + double threshold = get_tolerance(reduction); auto error = miopen::rms_range(outputHost, output); EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output)); - EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error - << ", Thresholdx10: " << threshold * 10; + EXPECT_TRUE(error < threshold) + << "Error output beyond tolerance Error: " << error << ", Threshold: " << threshold; } SigmoidFocalLossTestCase config; miopenLossReductionMode_t reduction; @@ -253,21 +251,21 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam::epsilon(); + double threshold = get_tolerance(reduction); auto dInputError = miopen::rms_range(dInputHost, dInput); EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput)); - EXPECT_TRUE(dInputError < threshold * 10) + EXPECT_TRUE(dInputError < threshold) << "dInput 
error output beyond tolerance Error: " << dInputError - << ", Thresholdx10: " << threshold * 10; + << ", Threshold: " << threshold; auto dTargetError = miopen::rms_range(dTargetHost, dTarget); EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget)); - EXPECT_TRUE(dTargetError < threshold * 10) + EXPECT_TRUE(dTargetError < threshold) << "dTarget error output beyond tolerance Error: " << dTargetError - << ", Thresholdx10: " << threshold * 10; + << ", Threshold: " << threshold; } SigmoidFocalLossTestCase config; miopenLossReductionMode_t reduction; @@ -352,7 +350,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam( - input, target, workspace, outputHost, config.alpha, config.gamma, reduction, divisor); + input, target, outputHost, config.alpha, config.gamma, reduction, divisor); EXPECT_EQ(status, miopenStatusSuccess); @@ -361,14 +359,14 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam::epsilon(); + double threshold = get_tolerance(reduction); auto error = miopen::rms_range(outputHost, output); EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output)); - EXPECT_TRUE(error < threshold * 10) - << "Error output beyond tolerance Error: " << error - << ", Thresholdx10: " << threshold * 10 << " Reduction: " << reduction; + EXPECT_TRUE(error < threshold) + << "Error output beyond tolerance Error: " << error << ", Threshold: " << threshold + << " Reduction: " << reduction; } SigmoidFocalLossTestCase config; miopenLossReductionMode_t reduction; @@ -472,21 +470,21 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam::epsilon(); + double threshold = get_tolerance(reduction); auto dInputError = miopen::rms_range(dInputHost, dInput); EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput)); - EXPECT_TRUE(dInputError < threshold * 10) + EXPECT_TRUE(dInputError < threshold) << "dInput error output beyond tolerance Error: " << dInputError - << ", 
Thresholdx10: " << threshold * 10; + << ", Threshold: " << threshold; auto dTargetError = miopen::rms_range(dTargetHost, dTarget); EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget)); - EXPECT_TRUE(dTargetError < threshold * 10) + EXPECT_TRUE(dTargetError < threshold) << "dTarget error output beyond tolerance Error: " << dTargetError - << ", Thresholdx10: " << threshold * 10; + << ", Threshold: " << threshold; } SigmoidFocalLossTestCase config; miopenLossReductionMode_t reduction; From afc738f1f492dc3d444c38cf91f1f94941943f80 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 27 Aug 2024 13:20:50 +0700 Subject: [PATCH 21/28] merge code with reduce kernel used in develop branch --- driver/sigmoid_focal_loss_driver.hpp | 11 +-- src/CMakeLists.txt | 2 - src/kernels/MIOpenLossSum.cpp | 56 -------------- src/kernels/MIOpenReduceSum.cpp | 8 +- src/kernels/MIOpenSigmoidFocalLoss.cpp | 6 +- src/kernels/warp_shuffle.hpp | 77 ------------------- .../forward_reduce_sigmoid_focal_loss.cpp | 21 +++-- test/gtest/sigmoid_focal_loss.hpp | 7 +- 8 files changed, 27 insertions(+), 161 deletions(-) delete mode 100644 src/kernels/MIOpenLossSum.cpp delete mode 100644 src/kernels/warp_shuffle.hpp diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index 1e14efd548..23071d7f97 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -110,7 +110,6 @@ class SigmoidFocalLossDriver : public Driver std::vector dinputHost; std::vector dtarget; std::vector dtargetHost; - std::vector workspace; float alpha; float gamma; @@ -239,8 +238,9 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() miopenGetSigmoidFocalLossForwardWorkspaceSize( handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes); - workspace_dev = - std::unique_ptr(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu))); + // workspace_dev = + // std::unique_ptr(new GPUMem(ctx, 
workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu))); + workspace_dev = std::make_unique(ctx, workSpaceSizeInBytes, sizeof(std::byte)); input = std::vector(in_sz, static_cast(0)); target = std::vector(target_sz, static_cast(0)); @@ -251,8 +251,6 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() dinputHost = std::vector(dI_sz, static_cast(0)); dtarget = std::vector(dT_sz, static_cast(0)); dtargetHost = std::vector(dT_sz, static_cast(0)); - size_t workSpaceElems = workSpaceSizeInBytes / sizeof(Tgpu); - workspace = std::vector(workSpaceElems, static_cast(0)); float randomBound = 2; // For half, the random bound is smaller to avoid half overflow @@ -291,9 +289,6 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() if(dtarget_dev->ToGPU(GetStream(), dtarget.data()) != 0) std::cerr << "Error copying (dT) to GPU, size: " << dtarget_dev->GetSize() << std::endl; - if(workspace_dev->ToGPU(GetStream(), workspace.data()) != 0) - std::cerr << "Error copying (dI) to GPU, size: " << workspace_dev->GetSize() << std::endl; - return miopenStatusSuccess; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c931fa14ed..35496f87a7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -478,7 +478,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/stride_array.hpp kernels/tensor_view.hpp kernels/utilities.inc - kernels/warp_shuffle.hpp kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc @@ -521,7 +520,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenLayerNorm.cpp kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl - kernels/MIOpenLossSum.cpp kernels/MIOpenNeuron.cl kernels/MIOpenPReLU.cpp kernels/MIOpenPooling.cl diff --git a/src/kernels/MIOpenLossSum.cpp 
b/src/kernels/MIOpenLossSum.cpp deleted file mode 100644 index 08d3a656f6..0000000000 --- a/src/kernels/MIOpenLossSum.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include -#include -#endif - -#include "float_types.h" -#include "warp_shuffle.hpp" - -#ifndef IN_OUT_TYPE -#define IN_OUT_TYPE float -#endif - -template -__device__ void losssum(const TIO* input, TIO* output, size_t N) -{ - auto gid = blockIdx.x * blockDim.x + threadIdx.x; - - FLOAT_ACCUM val = gid < N ? 
CVT_FLOAT2ACCUM(input[gid]) : static_cast(0.0f); - val = block_reduce_sum(val); - - if(threadIdx.x == 0) - output[blockIdx.x] = CVT_ACCUM2FLOAT(val); -} - -extern "C" __global__ void -LossSum(const IN_OUT_TYPE* __restrict__ input, IN_OUT_TYPE* __restrict__ output, size_t N) -{ - // instantiate the kernel - losssum(input, output, N); -} diff --git a/src/kernels/MIOpenReduceSum.cpp b/src/kernels/MIOpenReduceSum.cpp index 5ed52008bf..a7213acc38 100644 --- a/src/kernels/MIOpenReduceSum.cpp +++ b/src/kernels/MIOpenReduceSum.cpp @@ -47,12 +47,12 @@ ReduceSum(const FLOAT_ACCUM* input, TO* output, uint64_t N, tensor_view_t<1> out } extern "C" __global__ void ReduceSum(const FLOAT_ACCUM* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, + FLOAT* __restrict__ output, uint64_t N, tensor_view_t<1> output_tv) { // instantiate the kernel - ReduceSum(input, output, N, output_tv); + ReduceSum(input, output, N, output_tv); } extern "C" __global__ void ReduceSumFLOATACCUM(const FLOAT_ACCUM* __restrict__ input, @@ -93,12 +93,12 @@ __device__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input, } extern "C" __global__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input, - OUTPUT_TYPE* __restrict__ output, + FLOAT* __restrict__ output, uint64_t output_numel, uint64_t inner_size, uint64_t outer_size, tensor_view_t<1> output_tv) { // instantiate the kernel - Reduce1dSum(input, output, output_numel, inner_size, outer_size, output_tv); + Reduce1dSum(input, output, output_numel, inner_size, outer_size, output_tv); } diff --git a/src/kernels/MIOpenSigmoidFocalLoss.cpp b/src/kernels/MIOpenSigmoidFocalLoss.cpp index 75c25c0e42..b8f3630e8d 100644 --- a/src/kernels/MIOpenSigmoidFocalLoss.cpp +++ b/src/kernels/MIOpenSigmoidFocalLoss.cpp @@ -47,7 +47,7 @@ template __device__ void sigmoidFocalLossFwd(const TIO* input, TIO* target, - TIO* workspace, + FLOAT_ACCUM* workspace, float alpha, float gamma, float divisor, @@ -74,12 +74,12 @@ __device__ void sigmoidFocalLossFwd(const TIO* input, 
loss = alpha_t * loss; } - workspace[gid] = CVT_ACCUM2FLOAT(loss / divisor); + workspace[gid] = loss / divisor; } extern "C" __global__ void SigmoidFocalLossFwd(const IN_OUT_TYPE* input, IN_OUT_TYPE* target, - IN_OUT_TYPE* workspace, + FLOAT_ACCUM* workspace, float alpha, float gamma, float divisor, diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp deleted file mode 100644 index c1b53ea565..0000000000 --- a/src/kernels/warp_shuffle.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2024 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#ifndef GUARD_WARP_SHUFFLE_HPP -#define GUARD_WARP_SHUFFLE_HPP - -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include -#include -#endif - -#include "float_types.h" - -#ifndef REDUCE_SIZE -#define REDUCE_SIZE 256 -#endif - -__device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val) -{ - if(warpSize >= 64) - val += __shfl_down(val, 32); - if(warpSize >= 32) - val += __shfl_down(val, 16); - if(warpSize >= 16) - val += __shfl_down(val, 8); - if(warpSize >= 8) - val += __shfl_down(val, 4); - if(warpSize >= 4) - val += __shfl_down(val, 2); - if(warpSize >= 2) - val += __shfl_down(val, 1); - return val; -} - -__device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val) -{ - static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize]; - auto lane = threadIdx.x % warpSize; - auto wid = threadIdx.x / warpSize; - - val = warp_reduce_sum(val); - - if(lane == 0) - shared[wid] = val; - __syncthreads(); - - val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0; - if(wid == 0) - val = warp_reduce_sum(val); - - return val; -} - -#endif // GUARD_WARP_SHUFFLE_HPP diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp index f1a37fc54f..beacf73263 100644 --- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp +++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp @@ -36,7 +36,7 @@ #include #include -#define LOCAL_SIZE 256 +#define LOCAL_SIZE_SIGMOIDFOCALLOSS 256 #define LOCAL_SIZE_REDUCE 256 namespace miopen { @@ -72,21 +72,25 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, {"TARGET_TYPE", target_dtype == "bfloat16" ? 
"ushort" : in_dtype}, - {"LOCAL_SIZE", LOCAL_SIZE}, + {"REDUCE_SIZE", LOCAL_SIZE_REDUCE}, }; /* Prepare params for loss kernel */ result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params)); + {LOCAL_SIZE_SIGMOIDFOCALLOSS}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params)); /* Prepare params for reduce kernels */ auto _size = size; - do + while(_size > LOCAL_SIZE_REDUCE) { result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params)); + {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSumFLOATACCUM", build_params)); + // {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params)); _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE; - } while(_size > 1); + } + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); result.invoker_factory = [this, problem](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { @@ -143,7 +147,8 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( } else { - kernel(reduceIn, params.output, size); + auto output_tv = get_inner_expanded_tv<1>(deref(params.outputDesc)); + kernel(reduceIn, params.output, size, output_tv); } size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE; } @@ -179,7 +184,7 @@ MultiBufferWorkspaceTraits SigmoidFocalLossFwd::GetMultiBufferWorkspaceTraits( { size_t inputElements = problem.GetInputDesc().GetElementSize(); size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE; - size_t elementSize = get_data_size(problem.GetOutputDesc().GetType()); + size_t elementSize = get_data_size(miopenFloat); return MultiBufferWorkspaceTraits{inputElements * elementSize, reduceElements * elementSize}; } diff --git a/test/gtest/sigmoid_focal_loss.hpp 
b/test/gtest/sigmoid_focal_loss.hpp index db8c0aa7ca..77023364b3 100644 --- a/test/gtest/sigmoid_focal_loss.hpp +++ b/test/gtest/sigmoid_focal_loss.hpp @@ -79,6 +79,7 @@ struct SigmoidFocalLossTestCase inline std::vector SigmoidFocalLossTestConfigs() { return { + SigmoidFocalLossTestCase({1}), // 1D cont SigmoidFocalLossTestCase({4000}), // 1D cont SigmoidFocalLossTestCase({100, 500}), // 2D cont SigmoidFocalLossTestCase({100, 500}, false), // 2D non-cont @@ -308,9 +309,9 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam(workspaceElements); + workspace = tensor(workspaceElements); std::fill(workspace.begin(), workspace.end(), 0); output = tensor(1); @@ -373,7 +374,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam input; tensor target; - tensor workspace; + tensor workspace; tensor output; tensor outputHost; From 449a51d9a07b9cc3c8058eb1cb07f6e7bcedce83 Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Tue, 27 Aug 2024 13:22:39 +0700 Subject: [PATCH 22/28] apply clang-format --- driver/sigmoid_focal_loss_driver.hpp | 20 ++++++++--------- .../forward_reduce_sigmoid_focal_loss.cpp | 22 ++++++++++++------- test/gtest/sigmoid_focal_loss.hpp | 2 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp index 23071d7f97..188336af62 100644 --- a/driver/sigmoid_focal_loss_driver.hpp +++ b/driver/sigmoid_focal_loss_driver.hpp @@ -238,19 +238,17 @@ int SigmoidFocalLossDriver::AllocateBuffersAndCopy() miopenGetSigmoidFocalLossForwardWorkspaceSize( handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes); - // workspace_dev = - // std::unique_ptr(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu))); workspace_dev = std::make_unique(ctx, workSpaceSizeInBytes, sizeof(std::byte)); - input = std::vector(in_sz, static_cast(0)); - target = std::vector(target_sz, static_cast(0)); - output = std::vector(out_sz, static_cast(0)); - 
outputHost = std::vector(out_sz, static_cast(0)); - doutput = std::vector(dO_sz, static_cast(0)); - dinput = std::vector(dI_sz, static_cast(0)); - dinputHost = std::vector(dI_sz, static_cast(0)); - dtarget = std::vector(dT_sz, static_cast(0)); - dtargetHost = std::vector(dT_sz, static_cast(0)); + input = std::vector(in_sz, static_cast(0)); + target = std::vector(target_sz, static_cast(0)); + output = std::vector(out_sz, static_cast(0)); + outputHost = std::vector(out_sz, static_cast(0)); + doutput = std::vector(dO_sz, static_cast(0)); + dinput = std::vector(dI_sz, static_cast(0)); + dinputHost = std::vector(dI_sz, static_cast(0)); + dtarget = std::vector(dT_sz, static_cast(0)); + dtargetHost = std::vector(dT_sz, static_cast(0)); float randomBound = 2; // For half, the random bound is smaller to avoid half overflow diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp index beacf73263..5af00b9701 100644 --- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp +++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp @@ -76,21 +76,27 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( }; /* Prepare params for loss kernel */ - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_SIGMOIDFOCALLOSS}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_SIGMOIDFOCALLOSS}, + {size}, + "MIOpenSigmoidFocalLoss.cpp", + "SigmoidFocalLossFwd", + build_params)); /* Prepare params for reduce kernels */ auto _size = size; while(_size > LOCAL_SIZE_REDUCE) { - result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSumFLOATACCUM", build_params)); - // {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params)); + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_REDUCE}, 
+ {_size}, + "MIOpenReduceSum.cpp", + "ReduceSumFLOATACCUM", + build_params)); + // {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params)); _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE; - } + } result.construction_params.push_back(make_hip_kernel( - {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); + {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); result.invoker_factory = [this, problem](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { @@ -147,7 +153,7 @@ ConvSolution SigmoidFocalLossFwd::GetSolution( } else { - auto output_tv = get_inner_expanded_tv<1>(deref(params.outputDesc)); + auto output_tv = get_inner_expanded_tv<1>(deref(params.outputDesc)); kernel(reduceIn, params.output, size, output_tv); } size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE; diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp index 77023364b3..ab59893e52 100644 --- a/test/gtest/sigmoid_focal_loss.hpp +++ b/test/gtest/sigmoid_focal_loss.hpp @@ -79,7 +79,7 @@ struct SigmoidFocalLossTestCase inline std::vector SigmoidFocalLossTestConfigs() { return { - SigmoidFocalLossTestCase({1}), // 1D cont + SigmoidFocalLossTestCase({1}), // 1D cont SigmoidFocalLossTestCase({4000}), // 1D cont SigmoidFocalLossTestCase({100, 500}), // 2D cont SigmoidFocalLossTestCase({100, 500}, false), // 2D non-cont From 446322c8b3dda9b1be619db08d45bd88ae84178e Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Wed, 28 Aug 2024 10:17:39 +0700 Subject: [PATCH 23/28] fix implicitgemm_ck_util.hpp --- include/miopen/miopen.h | 2 +- .../miopen/solver/implicitgemm_ck_util.hpp | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 82907855ab..923906df56 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -1605,7 +1605,7 @@ 
miopenConvolutionBackwardWeightsGetSolution(miopenHandle_t handle, * as part of the * miopenConvSolution_t struct. * - * @param handle MIOpen handle (input + * @param handle MIOpen handle (input) * @param dyDesc Tensor descriptor for data tensor dy (input) * @param xDesc Tensor descriptor for data tensor x (input) * @param convDesc Convolution layer descriptor (input) diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index e6cceaef0f..64665b2af2 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -680,7 +680,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, internal::MakeTaggedTransposeInstances( result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des); - result.invoker_factory = [split_k = split_k, + result.invoker_factory = [split_k, ck_args = std::move(ck_args), sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}, input1_tr_inst = std::move(_input1_tr_inst), @@ -689,7 +689,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, output_init_tr_inst = std::move(_output_init_tr_inst), ck_buff_des = _ck_buff_des](const std::vector& kernels) mutable { - return [split_k = split_k, + return [split_k, kernels, ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr), @@ -697,8 +697,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, input2_tr_inst = std::move(input2_tr_inst), output_tr_inst = std::move(output_tr_inst), output_init_tr_inst = std::move(output_init_tr_inst), - ck_buff_des = ck_buff_des](const Handle& handle, - const AnyInvokeParams& primitive_parameters) mutable { + ck_buff_des](const Handle& handle, + const AnyInvokeParams& primitive_parameters) mutable { handle.ResetKernelTime(); const auto& data_ctx = primitive_parameters.CastTo(); @@ -826,17 +826,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&, [[maybe_unused]] bool 
should_allocated_wrw_buffer = ShouldAllocateWorkSpaceBufferForWRW(problem); - result.invoker_factory = [split_k = split_k, - ck_args = CKArgsType{problem}, - alpha_beta_case = alpha_beta_case, - should_allocated_wrw_buffer = should_allocated_wrw_buffer, + result.invoker_factory = [split_k, + ck_args = CKArgsType{problem}, + alpha_beta_case, + should_allocated_wrw_buffer, sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}]( const std::vector&) mutable { - return [split_k = split_k, - ck_args = std::move(ck_args), - alpha_beta_case = alpha_beta_case, - should_allocated_wrw_buffer = should_allocated_wrw_buffer, - sh_conv_ptr = std::move(sh_conv_ptr)]( + return [split_k, + ck_args = std::move(ck_args), + alpha_beta_case, + should_allocated_wrw_buffer, + sh_conv_ptr = std::move(sh_conv_ptr)]( const Handle& handle, const AnyInvokeParams& primitive_parameters) { const auto& data_ctx = primitive_parameters.CastTo(); std::unique_ptr argument_ptr; From 41b9300368a3f31201711d7bef0f37702045ac6f Mon Sep 17 00:00:00 2001 From: BuiChiTrung Date: Sat, 31 Aug 2024 11:29:00 +0700 Subject: [PATCH 24/28] add comment to kernel --- src/kernels/MIOpenSigmoidFocalLoss.cpp | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/kernels/MIOpenSigmoidFocalLoss.cpp b/src/kernels/MIOpenSigmoidFocalLoss.cpp index b8f3630e8d..d12335a5f7 100644 --- a/src/kernels/MIOpenSigmoidFocalLoss.cpp +++ b/src/kernels/MIOpenSigmoidFocalLoss.cpp @@ -54,6 +54,12 @@ __device__ void sigmoidFocalLossFwd(const TIO* input, tensor_view_t<5> input_tv, tensor_view_t<5> target_tv) { + /* + Dim: input = target = workspace = {N, C, D, H, W}. + Each thread handles one element of the input and target tensors. + Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}. + Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}.
+ */ size_t gid = threadIdx.x + blockIdx.x * blockDim.x; tensor_layout_t<5> idx(input_tv, gid); @@ -63,6 +69,7 @@ __device__ void sigmoidFocalLossFwd(const TIO* input, FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]); FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); + /* The formula follows torchvision package: torchvision/ops/focal_loss.py */ FLOAT_ACCUM p = 1 / (1 + exp(-i)); FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); @@ -105,6 +112,12 @@ __device__ void sigmoidFocalLossBwd(const TIO* input, tensor_view_t<5> dinput_tv, tensor_view_t<5> dtarget_tv) { + /* + Dim: input = target = doutput = dinput = dtarget = {N, C, D, H, W}. + Each thread handles one element of the input, target and doutput tensors. + Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}. + Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}. + */ size_t gid = threadIdx.x + blockIdx.x * blockDim.x; tensor_layout_t<5> idx(input_tv, gid); @@ -116,6 +129,7 @@ __device__ void sigmoidFocalLossBwd(const TIO* input, FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(doIdx)]); + /* The formula is derived by taking the gradient of the forward formula */ FLOAT_ACCUM p = 1 / (1 + exp(-i)); FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); @@ -199,6 +213,12 @@ __device__ void sigmoidFocalLossUnreducedFwd(const TIO* input, tensor_view_t<5> target_tv, tensor_view_t<5> output_tv) { + /* + Dim: input = target = output = {N, C, D, H, W}. + Each thread handles one element of the input and target tensors. + Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}. + Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}.
+ */ size_t gid = threadIdx.x + blockIdx.x * blockDim.x; tensor_layout_t<5> idx(input_tv, gid); @@ -208,6 +228,7 @@ __device__ void sigmoidFocalLossUnreducedFwd(const TIO* input, FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]); FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); + /* The formula follows torchvision package: torchvision/ops/focal_loss.py */ FLOAT_ACCUM p = 1 / (1 + exp(-i)); FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); @@ -249,6 +270,12 @@ __device__ void sigmoidFocalLossUnreducedBwd(const TIO* input, tensor_view_t<5> dinput_tv, tensor_view_t<5> dtarget_tv) { + /* + Dim: input = target = doutput = dinput = dtarget = {N, C, D, H, W}. + Each thread handles one element of the input, target and doutput tensors. + Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}. + Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}. + */ size_t gid = threadIdx.x + blockIdx.x * blockDim.x; tensor_layout_t<5> idx(input_tv, gid); @@ -259,6 +286,7 @@ __device__ void sigmoidFocalLossUnreducedBwd(const TIO* input, FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]); FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(idx)]); + /* The formula is derived by taking the gradient of the forward formula */ FLOAT_ACCUM p = 1 / (1 + exp(-i)); FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p)); FLOAT_ACCUM pT = p * t + (1 - p) * (1 - t); From d2f2dd133eca00d86f02192e6b4179d5bd59c0b2 Mon Sep 17 00:00:00 2001 From: long10024070 Date: Tue, 5 Nov 2024 04:01:36 +0000 Subject: [PATCH 25/28] undo ck changed --- .../miopen/solver/implicitgemm_ck_util.hpp | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index 64665b2af2..cf3e53a53e 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++
b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -376,9 +377,10 @@ class TransposeInstance Run(handle, kernels, out_ptr, buf_handle.get()); } - void ZeroOutBuffer() + void ZeroOutBuffer(const Handle& handle) { - [[maybe_unused]] auto status = hipMemsetAsync(buf_handle.get(), 0, tensor_sz); + [[maybe_unused]] auto status = + hipMemsetAsync(buf_handle.get(), 0, tensor_sz, handle.GetStream()); assert(status == hipSuccess); } @@ -600,7 +602,8 @@ inline bool CKWrwRequireWorkspace( size_t K_per_group = K / G; return (alpha_beta_case == BILINEAR || alpha_beta_case == SCALE) || - (data_type == miopenHalf && (is_odd(C_per_group) || is_odd(K_per_group))); + ((data_type == miopenHalf || data_type == miopenBFloat16) && + (is_odd(C_per_group) || is_odd(K_per_group))); } /// \todo move to a cpp file @@ -680,7 +683,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, internal::MakeTaggedTransposeInstances( result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des); - result.invoker_factory = [split_k, + result.invoker_factory = [split_k = split_k, ck_args = std::move(ck_args), sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}, input1_tr_inst = std::move(_input1_tr_inst), @@ -689,7 +692,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, output_init_tr_inst = std::move(_output_init_tr_inst), ck_buff_des = _ck_buff_des](const std::vector& kernels) mutable { - return [split_k, + return [split_k = split_k, kernels, ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr), @@ -697,8 +700,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, input2_tr_inst = std::move(input2_tr_inst), output_tr_inst = std::move(output_tr_inst), output_init_tr_inst = std::move(output_init_tr_inst), - ck_buff_des](const Handle& handle, - const AnyInvokeParams& primitive_parameters) mutable { + ck_buff_des = ck_buff_des](const Handle& handle, + 
const AnyInvokeParams& primitive_parameters) mutable { handle.ResetKernelTime(); const auto& data_ctx = primitive_parameters.CastTo(); @@ -734,7 +737,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx, /// \todo: Will need SetTensor() to properly zero out non-packed tensors if(output_tr_inst.GetConvOperandTag() == internal::ConvOperandTag::Weights) { - output_tr_inst.ZeroOutBuffer(); + output_tr_inst.ZeroOutBuffer(handle); } std::array tr_ptrs = { @@ -826,17 +829,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&, [[maybe_unused]] bool should_allocated_wrw_buffer = ShouldAllocateWorkSpaceBufferForWRW(problem); - result.invoker_factory = [split_k, - ck_args = CKArgsType{problem}, - alpha_beta_case, - should_allocated_wrw_buffer, + result.invoker_factory = [split_k = split_k, + ck_args = CKArgsType{problem}, + alpha_beta_case = alpha_beta_case, + should_allocated_wrw_buffer = should_allocated_wrw_buffer, sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}]( const std::vector&) mutable { - return [split_k, - ck_args = std::move(ck_args), - alpha_beta_case, - should_allocated_wrw_buffer, - sh_conv_ptr = std::move(sh_conv_ptr)]( + return [split_k = split_k, + ck_args = std::move(ck_args), + alpha_beta_case = alpha_beta_case, + should_allocated_wrw_buffer = should_allocated_wrw_buffer, + sh_conv_ptr = std::move(sh_conv_ptr)]( const Handle& handle, const AnyInvokeParams& primitive_parameters) { const auto& data_ctx = primitive_parameters.CastTo(); std::unique_ptr argument_ptr; From 4e4e1fcb2dc3f6069d01271752cccf37cca764e2 Mon Sep 17 00:00:00 2001 From: long10024070 Date: Tue, 5 Nov 2024 04:01:51 +0000 Subject: [PATCH 26/28] code shorten --- test/gtest/sigmoid_focal_loss.cpp | 219 +++++------------------------- 1 file changed, 36 insertions(+), 183 deletions(-) diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp index fa90ceb218..48982ee8db 100644 --- a/test/gtest/sigmoid_focal_loss.cpp +++ 
b/test/gtest/sigmoid_focal_loss.cpp @@ -25,88 +25,29 @@ *******************************************************************************/ #include "sigmoid_focal_loss.hpp" -#include "tensor_holder.hpp" #include -#include - -MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) -MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace sigmoidfocalloss { - -std::string GetFloatArg() -{ - const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); - if(tmp.empty()) - { - return ""; - } - return tmp; -} - -struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest -{ -}; - -struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest -{ -}; - -struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest -{ -}; - -struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest -{ -}; - -struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest -{ -}; - -struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest -{ -}; - -struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest -{ -}; - -struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest -{ -}; - -struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest -{ -}; - -struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest -{ -}; - -struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest -{ -}; - -struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest -{ -}; +using GPU_SigmoidFocalLoss_fwd_FP32 = SigmoidFocalLossFwdTest; +using GPU_SigmoidFocalLoss_fwd_FP16 = SigmoidFocalLossFwdTest; +using GPU_SigmoidFocalLoss_fwd_BFP16 = SigmoidFocalLossFwdTest; +using GPU_SigmoidFocalLoss_bwd_FP32 = SigmoidFocalLossBwdTest; +using GPU_SigmoidFocalLoss_bwd_FP16 = SigmoidFocalLossBwdTest; +using GPU_SigmoidFocalLoss_bwd_BFP16 = SigmoidFocalLossBwdTest; +using GPU_SigmoidFocalLossUnreduced_fwd_FP32 = SigmoidFocalLossUnreducedFwdTest; +using GPU_SigmoidFocalLossUnreduced_fwd_FP16 = 
SigmoidFocalLossUnreducedFwdTest; +using GPU_SigmoidFocalLossUnreduced_fwd_BFP16 = SigmoidFocalLossUnreducedFwdTest; +using GPU_SigmoidFocalLossUnreduced_bwd_FP32 = SigmoidFocalLossUnreducedBwdTest; +using GPU_SigmoidFocalLossUnreduced_bwd_FP16 = SigmoidFocalLossUnreducedBwdTest; +using GPU_SigmoidFocalLossUnreduced_bwd_BFP16 = SigmoidFocalLossUnreducedBwdTest; }; // namespace sigmoidfocalloss using namespace sigmoidfocalloss; TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -115,16 +56,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -133,16 +66,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -151,16 +76,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -169,16 +86,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); 
}; INSTANTIATE_TEST_SUITE_P(Full, @@ -187,16 +96,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -205,16 +106,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -223,16 +116,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -241,16 +126,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -259,16 +136,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -277,16 +146,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - 
GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, @@ -295,16 +156,8 @@ INSTANTIATE_TEST_SUITE_P(Full, TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Full, From a33ad209aead3dbdcd98e37df84c1ad834f979c1 Mon Sep 17 00:00:00 2001 From: long10024070 Date: Tue, 5 Nov 2024 05:06:59 +0000 Subject: [PATCH 27/28] Fig build error --- include/miopen/miopen.h | 682 +++++++++++++++--- .../miopen/sigmoidfocalloss/solvers.hpp | 1 + src/include/miopen/solver_id.hpp | 2 +- src/solver.cpp | 41 +- 4 files changed, 600 insertions(+), 126 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 923906df56..27b5ebe327 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2737,6 +2737,67 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, double epsilon, void* resultSaveMean, void* resultSaveInvVariance); +/*! @brief Execute forward training layer for batch normalization + * + * Batch normalization pass for forward training pass. + * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale + * with their descriptor. + * + * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean + * and inverse variance will not be used. + * + * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values + * for the running mean and variance will not be saved. 
+ * Running averages and variances are scaled using an exponential averaging factor: \f[ + * \mu_{old} = \mu_{new}*factor + \mu_{old}*(1-factor) + * \f] + * where \f[ + * factor=1/(1+iteration) + * \f] + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param scaleDesc Tensor descriptor for BN scaling + * @param biasVarDesc Tensor descriptor for BN bias + * @param savedMeanDesc Tensor descriptor for BN saved Mean + * @param savedVarDesc Tensor descriptor for BN saved Variance + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param expAvgFactor Exponential averaging factor (input) + * @param resultRunningMean Running average saved for inference (output) + * @param resultRunningVariance Running variance saved for inference (output) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param resultSaveMean Saved mini-batch mean for backwards pass (output) + * @param resultSaveInvVariance Saved mini-batch inverse variance for backwards pass (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasVarDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + void* bnScale, + void* bnBias, + double
expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance); /*! @brief Execute forward inference layer for batch normalization * @@ -2783,6 +2844,56 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedVariance, double epsilon); +/*! @brief Execute forward inference layer for batch normalization + * + * Batch normalization pass for forward inference pass. + * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale + * with their descriptor. + * + * If either estimatedMean, or estimatedVariance are null pointers then the values for the mean and + * variance will be calculated from input data and this calculated mean and variance will be used + * to update input values. + * If variance is zero and epsilon is also zero, this function outputs NAN values. Input epsilon + * value should always be a non-zero positive value. + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param scaleDesc Tensor descriptor for BN scaling + * @param biasDesc Tensor descriptor for BN bias + * @param estMeanDesc Tensor descriptor for BN estimated Mean + * @param estVarianceDesc Tensor descriptor for BN estimated Variance + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param estimatedMean Running average saved during forward training (input) + * @param estimatedVariance Running variance saved during forward training (input) + * @param epsilon Value to stabilize
inverse variance calculation (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon); + /*! @brief Execute backwards propagation layer for batch normalization * * Batch normalization pass for backwards propagation training pass. @@ -2838,6 +2949,68 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance); +/*! @brief Execute backwards propagation layer for batch normalization + * + * Batch normalization pass for backwards propagation training pass. + * The method for backwards propagation batch normalization. + * + * Takes in batch normalization mode bn_mode and input tensor data x, input activation tensor dy, + * output tensor dx, the learned tensors resultBNBiasDiff and resultBNScaleDiff with their + * descriptor. + * + * If BOTH savedMean, and savedVariance are not null pointers then the method will use the saved + * mean and variance calculated by the forward training phase. 
+ * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alphaDataDiff Floating point scaling factor, allocated on the host (input) + * @param betaDataDiff Floating point shift factor, allocated on the host (input) + * @param alphaParamDiff Floating point scaling factor, allocated on the host (input) + * @param betaParamDiff Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param dyDesc Tensor descriptor for output data tensor y (input) + * @param dy Data tensor y (input) + * @param dxDesc Tensor descriptor for output data tensor dx (input) + * @param dx Data delta tensor dx (output) + * @param scaleDesc Tensor descriptor for scaling descriptor (input) + * @param biasDesc Tensor descriptor for bias/shift descriptor (input) + * @param savedMeanDesc Tensor descriptor for saved Mean descriptor (input) + * @param savedVarDesc Tensor descriptor for saved Variance descriptor (input) + * , shifting, saved variance and + * mean (input) + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param resultBnScaleDiff Tensor for dscale (output) + * @param resultBnBiasDiff Tensor for dbias (output) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param savedMean Saved mini-batch mean for backwards pass (input) + * @param savedInvVariance Saved mini-batch inverse variance for backwards pass (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, +
const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance); + /** @} */ // CLOSEOUT BATCHNORM DOXYGEN GROUP @@ -2951,6 +3124,54 @@ miopenDestroyActivationDescriptor(miopenActivationDescriptor_t activDesc); /** @} */ // CLOSEOUT ACTIVATION DOXYGEN GROUP +#ifdef MIOPEN_BETA_API +/** @addtogroup activation + * + * @{ + */ + +/*! @brief Execute a GLU forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Input tensor (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Output tensor (output) + * @param dim Dimension to split the input (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGLUForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const uint32_t dim); + +/*! 
@brief Execute a GLU backward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Input tensor (input) + * @param outputGradDesc Tensor descriptor for delta output tensor (input) + * @param outputGrad Delta output tensor (input) + * @param inputGradDesc Tensor descriptor for delta input tensor (input) + * @param inputGrad Delta input tensor (output) + * @param dim Dimension to split the input (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGLUBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputGradDesc, + const void* outputGrad, + const miopenTensorDescriptor_t inputGradDesc, + void* inputGrad, + const uint32_t dim); + +/** @} */ +// CLOSEOUT ACTIVATION DOXYGEN GROUP +#endif // MIOPEN_BETA_API + // Softmax APIs /** @addtogroup softmax * @@ -5006,98 +5227,6 @@ MIOPEN_EXPORT miopenStatus_t miopenCTCLoss(miopenHandle_t handle, void* workSpace, size_t workSpaceSize); -#ifdef MIOPEN_BETA_API - -typedef enum -{ - MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */ - MIOPEN_LOSS_REDUCTION_SUM = 1, /*!< output tensor elements are summed up */ - MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with total - number of elements to get mean value */ -} miopenLossReductionMode_t; - -/*! 
@brief Helper function to query the minimum workspace size required by the sigmoid focal loss - * call - * - * @param handle MIOpen Handle (input) - * @param inputDesc Tensor descriptor for input tensor (input) - * @param targetDesc Tensor descriptor for target tensor (input) - * @param outputDesc Tensor descriptor for output tensor (input) - * @param reduction Reduction (input) - * @param sizeInBytes Pointer to data to return the minimum workspace size - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle, - miopenTensorDescriptor_t inputDesc, - miopenTensorDescriptor_t targetDesc, - miopenTensorDescriptor_t outputDesc, - miopenLossReductionMode_t reduction, - size_t* sizeInBytes); - -/*! @brief Execute a SigmoidFocalLoss forward layer - * - * @param handle MIOpen handle (input) - * @param workspace Address of the allocated workspace data (input) - * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) - * @param inputDesc Tensor descriptor for input tensor (input) - * @param input Data tensor input (input) - * @param targetDesc Tensor descriptor for target tensor (input) - * @param target Data tensor target (input) - * @param outputDesc Tensor descriptor for output tensor (input) - * @param output Data tensor output (output) - * @param alpha Alpha (input) - * @param gamma Gamma (input) - * @param reduction Reduction (input) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle, - void* workspace, - size_t workspaceSizeInBytes, - miopenTensorDescriptor_t inputDesc, - const void* input, - miopenTensorDescriptor_t targetDesc, - const void* target, - miopenTensorDescriptor_t outputDesc, - void* output, - float alpha, - float gamma, - miopenLossReductionMode_t reduction); - -/*! 
@brief Execute a SigmoidFocalLoss backward layer - * - * @param handle MIOpen handle (input) - * @param inputDesc Tensor descriptor for input tensor (input) - * @param input Data tensor input (input) - * @param targetDesc Tensor descriptor for target tensor (input) - * @param target Data tensor target (input) - * @param doutputDesc Tensor descriptor for output gradient (input) - * @param doutput Gradient of output (input) - * @param dinputDesc Tensor descriptor for input gradient (input) - * @param dinput Gradient of input (output) - * @param dtargetDesc Tensor descriptor for target gradient (input) - * @param dtarget Gradient of target (output) - * @param alpha Alpha (input) - * @param gamma Gamma (input) - * @param reduction Reduction (input) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle, - miopenTensorDescriptor_t inputDesc, - const void* input, - miopenTensorDescriptor_t targetDesc, - const void* target, - miopenTensorDescriptor_t doutputDesc, - const void* doutput, - miopenTensorDescriptor_t dinputDesc, - void* dinput, - miopenTensorDescriptor_t dtargetDesc, - void* dtarget, - float alpha, - float gamma, - miopenLossReductionMode_t reduction); -#endif - /** @} */ // CLOSEOUT LossFunction DOXYGEN GROUP @@ -5499,40 +5628,42 @@ typedef enum miopenTensorMhaAmaxDK = 33, miopenTensorMhaAmaxDV = 34, miopenTensorMhaAmaxDS = 35, + miopenTensorMhaBias = 36, #ifdef MIOPEN_BETA_API - miopenTensorActivationX = 36, - miopenTensorActivationY = 37, - miopenTensorActivationDX = 38, - miopenTensorActivationDY = 39, - miopenTensorBiasX = 40, - miopenTensorBiasY = 41, - miopenTensorBias = 42, - miopenTensorSoftmaxX = 43, - miopenTensorSoftmaxY = 44, - miopenTensorSoftmaxDX = 45, - miopenTensorSoftmaxDY = 46, - miopenTensorBatchnormX = 47, - miopenTensorBatchnormY = 48, - miopenTensorBatchnormRunningMean = 49, - miopenTensorBatchnormRunningVariance = 50, - miopenTensorBatchnormSavedMean = 51, - 
miopenTensorBatchnormSavedVariance = 52, - miopenTensorBatchnormScale = 53, - miopenTensorBatchnormScaleDiff = 54, - miopenTensorBatchnormEstimatedMean = 55, - miopenTensorBatchnormEstimatedVariance = 56, - miopenTensorBatchnormBias = 57, - miopenTensorBatchnormBiasDiff = 58, - miopenTensorBatchnormDX = 59, - miopenTensorBatchnormDY = 60, + miopenTensorActivationX = 37, + miopenTensorActivationY = 38, + miopenTensorActivationDX = 39, + miopenTensorActivationDY = 40, + miopenTensorBiasX = 41, + miopenTensorBiasY = 42, + miopenTensorBias = 43, + miopenTensorSoftmaxX = 44, + miopenTensorSoftmaxY = 45, + miopenTensorSoftmaxDX = 46, + miopenTensorSoftmaxDY = 47, + miopenTensorBatchnormX = 48, + miopenTensorBatchnormY = 49, + miopenTensorBatchnormRunningMean = 50, + miopenTensorBatchnormRunningVariance = 51, + miopenTensorBatchnormSavedMean = 52, + miopenTensorBatchnormSavedVariance = 53, + miopenTensorBatchnormScale = 54, + miopenTensorBatchnormScaleDiff = 55, + miopenTensorBatchnormEstimatedMean = 56, + miopenTensorBatchnormEstimatedVariance = 57, + miopenTensorBatchnormBias = 58, + miopenTensorBatchnormBiasDiff = 59, + miopenTensorBatchnormDX = 60, + miopenTensorBatchnormDY = 61, #endif miopenTensorArgumentIsScalar = 1U << 31, + miopenTensorMhaMask = miopenTensorArgumentIsScalar | 1, #ifdef MIOPEN_BETA_API - miopenScalarBatchnormExpAvgFactor = miopenTensorArgumentIsScalar | 1, - miopenScalarBatchnormEpsilon = miopenTensorArgumentIsScalar | 2, + miopenScalarBatchnormExpAvgFactor = miopenTensorArgumentIsScalar | 2, + miopenScalarBatchnormEpsilon = miopenTensorArgumentIsScalar | 3, #endif } miopenTensorArgumentId_t; @@ -5564,6 +5695,15 @@ MIOPEN_EXPORT miopenStatus_t miopenCreateConvProblem(miopenProblem_t* problem, * @return miopenStatus_t */ +/*! @enum miopenMhaMask_t + * Different masks for Mha. 
+ */ +typedef enum +{ + miopenMhaMaskNone = 0, + miopenMhaMaskCausal = 1, +} miopenMhaMask_t; + MIOPEN_EXPORT miopenStatus_t miopenCreateMhaProblem(miopenProblem_t* problem, miopenMhaDescriptor_t operatorDesc, miopenProblemDirection_t direction); @@ -7768,6 +7908,40 @@ MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle, void* dx); /** @} */ // CLOSEOUT ROPE DOXYGEN GROUP +// kthvalue APIs +/** @addtogroup kthvalue + * + * @{ + */ + +/*! @brief Execute a Kthvalue forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param indices Data tensor indices (output) + * @param indicesDesc Tensor descriptor for indices tensor (input) + * @param k The k-th smallest element(input) + * @param dim The dimension to find the kth value along (Default = -1)(input) + * @param keepDim Whether the output tensor has dim retained or not (Default = + * false)(input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenKthvalueForward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t outputDesc, + void* output, + miopenTensorDescriptor_t indicesDesc, + size_t* indices, + size_t k, + int32_t dim = -1, + bool keepDim = false); + +/** @} */ +// CLOSEOUT kthvalue DOXYGEN GROUP #endif // MIOPEN_BETA_API #ifdef MIOPEN_BETA_API @@ -7824,6 +7998,276 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, // CLOSEOUT RELU DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API + +/*! 
@ingroup LossFunction + * @enum miopenLossReductionMode_t + * Reduction mode for loss function + */ +typedef enum +{ + MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */ + MIOPEN_LOSS_REDUCTION_SUM = 1, /*!< output tensor elements are summed up */ + MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with total + number of elements to get mean value */ +} miopenLossReductionMode_t; + +// SoftMarginLoss APIs +/** @addtogroup LossFunction + * + * @{ + */ + +/*! @brief Helper function to query the minimum workspace size required by the +SoftMarginLossForward call + * + * @param [in] handle MIOpen Handle + * @param [in] inputDesc Tensor descriptor for input tensor + * @param [in] targetDesc Tensor descriptor for target tensor + * @param [in] outputDesc Tensor descriptor for output tensor +* @param [in] reduction Reduction mode (sum, mean). For none reduction we don't need to +use this function + * @param [out] sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetSoftMarginLossForwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t targetDesc, + miopenTensorDescriptor_t outputDesc, + miopenLossReductionMode_t reduction, + size_t* sizeInBytes); + +/*! @brief Execute a SoftMarginLoss forward layer + * + * @param [in] handle MIOpen handle + * @param [in] inputDesc Tensor descriptor for input tensor + * @param [in] input Data tensor input + * @param [in] targetDesc Tensor descriptor for target tensor + * @param [in] target Data tensor target + * @param [in] outputDesc Tensor descriptor for output tensor + * @param [out] output Data tensor output + * @param [in] reduction Reduction mode. If reduction mode is mean or sum, you must + * provide param workspace and workspaceSizeInBytes. 
Call + * miopenGetSoftMarginLossForwardWorkspaceSize to get workspaceSizeInBytes + * @param [in] workspace Address of the allocated workspace data (Default = null) + * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data (Default = 0) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossForward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t outputDesc, + void* output, + miopenLossReductionMode_t reduction, + void* workspace = nullptr, + size_t workspaceSizeInBytes = 0); + +/*! @brief Execute a SoftMarginLoss backward layer + * + * @param [in] handle MIOpen handle + * @param [in] inputDesc Tensor descriptor for input tensor + * @param [in] input Data tensor input + * @param [in] targetDesc Tensor descriptor for target tensor + * @param [in] target Data tensor target + * @param [in] doutputDesc Tensor descriptor for output gradient + * @param [in] doutput Output gradient + * @param [in] dinputDesc Tensor descriptor for input gradient + * @param [out] dinput Input gradient + * @param [in] reduction Reduction mode (none, sum, mean) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenLossReductionMode_t reduction); + +/** @} */ +// CLOSEOUT LossFunction DOXYGEN GROUP +#endif // MIOPEN_BETA_API + +#ifdef MIOPEN_BETA_API +// MultiMarginLoss APIs +/** @addtogroup LossFunction + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the +MultiMarginLoss Forward call + * + * @param [in] handle MIOpen Handle + * @param [in] inputDesc Tensor descriptor for input tensor (N, C) where N is the batch +size and C is the number of classes + * @param [in] targetDesc Tensor descriptor for target tensor, must have shape (N). Each +value is between 0 and C - 1 + * @param [in] weightDesc Tensor descriptor for weight tensor. It is a manual rescaling +weight given to each class. It has to be a Tensor of size C + * @param [in] outputDesc Tensor descriptor for output tensor. If reduction is 'none', +then it must have shape (N). Otherwise, it is a scalar + * @param [in] p Has a default value of 1. The only supported values are 1 and 2 + * @param [in] margin Has a default value of 1 + * @param [in] reduction Reduction mode (sum, mean) + * @param [out] sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetMultiMarginLossForwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t targetDesc, + miopenTensorDescriptor_t weightDesc, + miopenTensorDescriptor_t outputDesc, + long p, + float margin, + miopenLossReductionMode_t reduction, + size_t* sizeInBytes); + +/*! @brief Execute a MultiMarginLoss forward layer + * + * @param [in] handle MIOpen handle + * @param [in] inputDesc Tensor descriptor for input tensor (N, C) where N is the +batch size and C is the number of classes. + * @param [in] input Data tensor input + * @param [in] targetDesc Tensor descriptor for target tensor, must have shape (N). +Each value is between 0 and C - 1 + * @param [in] target Data tensor target + * @param [in] weightDesc Tensor descriptor for weight tensor. It is a manual +rescaling weight given to each class. 
It has to be a Tensor of size C + * @param [in] weight Data tensor weight + * @param [in] outputDesc Tensor descriptor for output tensor. If reduction is 'none', +then it must have shape (N). Otherwise, it is a scalar. + * @param [out] output Data tensor output + * @param [in] p Has a default value of 1. The only supported values are 1 +and 2 + * @param [in] margin Has a default value of 1 + * @param [in] reduction Reduction mode. If reduction mode is mean or sum, you must + * provide param workspace and workspaceSizeInBytes. Call + * miopenGetMultiMarginLossForwardWorkspaceSize to get workspaceSizeInBytes + * @param [in] workspace Address of the allocated workspace data. Set = nullptr if +reduction = 'none' + * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data. Set = 0 if +reduction = 'none' + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t weightDesc, + const void* weight, + miopenTensorDescriptor_t outputDesc, + void* output, + long p, + float margin, + miopenLossReductionMode_t reduction, + void* workspace, + size_t workspaceSizeInBytes); + +/** @} */ +// CLOSEOUT LossFunction DOXYGEN GROUP +#endif // MIOPEN_BETA_API + +#ifdef MIOPEN_BETA_API +// SigmoidFocalLoss APIs +/** @addtogroup LossFunction + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the SigmoidFocalLoss + * Forward call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param targetDesc Tensor descriptor for target tensor (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param reduction Reduction (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t targetDesc, + miopenTensorDescriptor_t outputDesc, + miopenLossReductionMode_t reduction, + size_t* sizeInBytes); + +/*! @brief Execute a SigmoidFocalLoss forward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param targetDesc Tensor descriptor for target tensor (input) + * @param target Data tensor target (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param alpha Alpha (input) + * @param gamma Gamma (input) + * @param reduction Reduction (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t outputDesc, + void* output, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + +/*! 
@brief Execute a SigmoidFocalLoss backward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param targetDesc Tensor descriptor for target tensor (input) + * @param target Data tensor target (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dtargetDesc Tensor descriptor for target gradient (input) + * @param dtarget Gradient of target (output) + * @param alpha Alpha (input) + * @param gamma Gamma (input) + * @param reduction Reduction (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t targetDesc, + const void* target, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dtargetDesc, + void* dtarget, + float alpha, + float gamma, + miopenLossReductionMode_t reduction); + +/** @} */ +// CLOSEOUT LossFunction DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/include/miopen/sigmoidfocalloss/solvers.hpp b/src/include/miopen/sigmoidfocalloss/solvers.hpp index 9cb3bd15e8..67d566c935 100644 --- a/src/include/miopen/sigmoidfocalloss/solvers.hpp +++ b/src/include/miopen/sigmoidfocalloss/solvers.hpp @@ -27,6 +27,7 @@ #include #include +#include namespace miopen { diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index a5d62b6092..3524c33451 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -32,7 +32,6 @@ #include #include -#include namespace miopen { @@ -62,6 +61,7 @@ enum class Primitive Item, RoPE, ReLU, + Kthvalue, Loss 
}; diff --git a/src/solver.cpp b/src/solver.cpp index f0b2854de7..167a085872 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -24,22 +24,25 @@ * *******************************************************************************/ -#include - #include #include #include #include +#include #include -#include #include +#include +#include +#include #include +#include +#include #include #include #include #include -#include #include +#include #include #include @@ -57,6 +60,14 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS) namespace miopen { + +namespace debug { + +// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) +bool enable_deprecated_solvers = false; + +} // namespace debug + namespace solver { std::ostream& operator<<(std::ostream& os, const KernelInfo& k) @@ -678,9 +689,24 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId()); - + + Register(registry, ++id, Primitive::Kthvalue, kthvalue::KthvalueFwd{}.SolverDbId()); + + Register(registry, ++id, Primitive::Activation, glu::GLUForward{}.SolverDbId()); + Register(registry, ++id, Primitive::Activation, glu::GLUBackward{}.SolverDbId()); + + Register(registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossForward{}.SolverDbId()); + Register( + registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossBackward{}.SolverDbId()); + + Register( + registry, ++id, Primitive::Loss, multimarginloss::MultiMarginLossForward{}.SolverDbId()); + + Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId()); + Register(registry, ++id, Primitive::Loss, @@ -692,12 +718,15 @@ inline 
SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossBwd{}.SolverDbId()); - // IMPORTANT: New solvers should be added to the end of the function! + // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white + // space between this comment and the newly registered solver(s)! } bool ThisSolverIsDeprecatedStatic::IsDisabled(const ExecutionContext& ctx) { static const bool device_is_allowed = [&]() { + if(miopen::debug::enable_deprecated_solvers) + return true; if(env::enabled(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS)) return true; const auto device = ctx.GetStream().GetTargetProperties().Name(); From 009c7853fb1ad2156d529e9dc4d6ca9bd4507016 Mon Sep 17 00:00:00 2001 From: long10024070 Date: Tue, 5 Nov 2024 10:45:26 +0000 Subject: [PATCH 28/28] fix clang tidy --- include/miopen/miopen.h | 4 ++-- src/solver.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 27b5ebe327..5524874405 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -8094,8 +8094,8 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif // MIOPEN_BETA_API -#ifdef MIOPEN_BETA_API // MultiMarginLoss APIs +#ifdef MIOPEN_BETA_API /** @addtogroup LossFunction * * @{ @@ -8176,8 +8176,8 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, // CLOSEOUT LossFunction DOXYGEN GROUP #endif // MIOPEN_BETA_API -#ifdef MIOPEN_BETA_API // SigmoidFocalLoss APIs +#ifdef MIOPEN_BETA_API /** @addtogroup LossFunction * * @{ diff --git a/src/solver.cpp b/src/solver.cpp index ddb206e4ab..167a085872 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -44,7 +44,6 @@ #include #include #include -#include #include #include