From d4f50a2324e9294e5cff5291a66412ba9fad7e8d Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Mon, 5 Aug 2024 15:06:17 +0700
Subject: [PATCH 01/28] resolve conflict

---
 .githooks/post-checkout                       |   3 +
 .githooks/post-commit                         |   3 +
 .githooks/post-merge                          |   3 +
 .githooks/pre-push                            |   3 +
 docs/reference/index.rst                      |   1 +
 driver/CMakeLists.txt                         |   1 +
 driver/dm_sigmoid_focal_loss.cpp              |  41 +
 driver/driver.hpp                             |  10 +-
 driver/sigmoid_focal_loss_driver.hpp          | 777 ++++++++++++++++++
 include/miopen/miopen.h                       |  92 +++
 rocfft_r2c_ex.cpp                             | 317 +++++++
 src/CMakeLists.txt                            |  10 +
 src/include/miopen/sigmoid_focal_loss.hpp     |  71 ++
 .../miopen/sigmoidfocalloss/invoke_params.hpp |  79 ++
 .../sigmoidfocalloss/problem_description.hpp  | 118 +++
 .../miopen/sigmoidfocalloss/solvers.hpp       | 121 +++
 src/include/miopen/sigmoidfocalloss/utils.hpp |  49 ++
 src/include/miopen/solver_id.hpp              |   3 +-
 src/include/miopen/tensor_view_utils.hpp      |  11 +-
 src/kernels/MIOpenLossSum.cpp                 |  56 ++
 src/kernels/MIOpenSigmoidFocalLoss.cpp        | 329 ++++++++
 src/kernels/warp_shuffle.hpp                  |  72 ++
 src/sigmoid_focal_loss.cpp                    | 170 ++++
 src/sigmoid_focal_loss_api.cpp                | 192 +++++
 src/sigmoidfocalloss/problem_description.cpp  |  88 ++
 src/solver.cpp                                |  12 +
 .../backward_reduce_sigmoid_focal_loss.cpp    | 119 +++
 .../backward_unreduce_sigmoid_focal_loss.cpp  | 113 +++
 .../forward_reduce_sigmoid_focal_loss.cpp     | 186 +++++
 .../forward_unreduce_sigmoid_focal_loss.cpp   | 107 +++
 test/cpu_sigmoid_focal_loss.hpp               | 238 ++++++
 test/gtest/sigmoid_focal_loss.cpp             | 325 ++++++++
 test/gtest/sigmoid_focal_loss.hpp             | 489 +++++++++++
 33 files changed, 4202 insertions(+), 7 deletions(-)
 create mode 100755 .githooks/post-checkout
 create mode 100755 .githooks/post-commit
 create mode 100755 .githooks/post-merge
 create mode 100755 .githooks/pre-push
 create mode 100644 driver/dm_sigmoid_focal_loss.cpp
 create mode 100644 driver/sigmoid_focal_loss_driver.hpp
 create mode 100644 rocfft_r2c_ex.cpp
 create mode 100644 src/include/miopen/sigmoid_focal_loss.hpp
 create mode 100644 src/include/miopen/sigmoidfocalloss/invoke_params.hpp
 create mode 100644 src/include/miopen/sigmoidfocalloss/problem_description.hpp
 create mode 100644 src/include/miopen/sigmoidfocalloss/solvers.hpp
 create mode 100644 src/include/miopen/sigmoidfocalloss/utils.hpp
 create mode 100644 src/kernels/MIOpenLossSum.cpp
 create mode 100644 src/kernels/MIOpenSigmoidFocalLoss.cpp
 create mode 100644 src/kernels/warp_shuffle.hpp
 create mode 100644 src/sigmoid_focal_loss.cpp
 create mode 100644 src/sigmoid_focal_loss_api.cpp
 create mode 100644 src/sigmoidfocalloss/problem_description.cpp
 create mode 100644 src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp
 create mode 100644 src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp
 create mode 100644 src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
 create mode 100644 src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp
 create mode 100644 test/cpu_sigmoid_focal_loss.hpp
 create mode 100644 test/gtest/sigmoid_focal_loss.cpp
 create mode 100644 test/gtest/sigmoid_focal_loss.hpp

diff --git a/.githooks/post-checkout b/.githooks/post-checkout
new file mode 100755
index 0000000000..ca7fcb4008
--- /dev/null
+++ b/.githooks/post-checkout
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs post-checkout "$@"
diff --git a/.githooks/post-commit b/.githooks/post-commit
new file mode 100755
index 0000000000..52b339cb3f
--- /dev/null
+++ b/.githooks/post-commit
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs post-commit "$@"
diff --git a/.githooks/post-merge b/.githooks/post-merge
new file mode 100755
index 0000000000..a912e667aa
--- /dev/null
+++ b/.githooks/post-merge
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs post-merge "$@"
diff --git a/.githooks/pre-push b/.githooks/pre-push
new file mode 100755
index 0000000000..0f0089bc25
--- /dev/null
+++ b/.githooks/pre-push
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs pre-push "$@"
diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index 90e29ffaa9..2387ef1be8 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -35,3 +35,4 @@ The MIOpen API library is structured as follows:
   * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental)
   * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental)
   * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental)
+  * :doc:`SigmoidFocalLoss <../doxygen/html/group__loss_function>` (experimental)
diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index cd663eb8b4..c8763f0c7b 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -52,6 +52,7 @@ add_executable(MIOpenDriver
     dm_reduceextreme.cpp
     dm_reducecalculation.cpp
     dm_rnn.cpp
+    dm_sigmoid_focal_loss.cpp
     dm_softmax.cpp
     dm_t5layernorm.cpp
     dm_tensorop.cpp
diff --git a/driver/dm_sigmoid_focal_loss.cpp b/driver/dm_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..001f2964b5
--- /dev/null
+++ b/driver/dm_sigmoid_focal_loss.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "registry_driver_maker.hpp"
+#include "sigmoid_focal_loss_driver.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "sigmoidfocalloss") // float instantiation
+        return new SigmoidFocalLossDriver<float>();
+    if(base_arg == "sigmoidfocallossfp16") // float16 instantiation
+        return new SigmoidFocalLossDriver<float16>();
+    if(base_arg == "sigmoidfocallossbfp16") // bfloat16 instantiation
+        return new SigmoidFocalLossDriver<bfloat16>();
+    return nullptr; // base_arg is not one of the names handled by this maker
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index b23df690d1..749ee16a17 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
            "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], "
            "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], "
            "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
-           "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n");
+           "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], "
+           "sigmoidfocalloss[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -205,8 +206,11 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
        arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" &&
        arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" &&
-       arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" &&
-       arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version")
+       arg != "getitemfp16" && arg != "getitembfp16" && arg != "transformersadamwfp16" &&
+       arg != "transformersampadamw" && arg != "reducecalculation" &&
+       arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" &&
+       arg != "sigmoidfocalloss" && arg != "sigmoidfocallossfp16" &&
+       arg != "sigmoidfocallossbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
new file mode 100644
index 0000000000..6c739a3911
--- /dev/null
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -0,0 +1,777 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include <miopen/errors.hpp>
+#include <miopen/tensor_view_utils.hpp>
+#include <miopen/miopen.h>
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "random.hpp"
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+template <typename TIO>
+void mloSigmoidFocalLossUnreducedFwdRunHost(TIO* input,
+                                            miopenTensorDescriptor_t inputDesc,
+                                            TIO* target,
+                                            miopenTensorDescriptor_t targetDesc,
+                                            TIO* outputHost,
+                                            miopenTensorDescriptor_t outputDesc,
+                                            float alpha = 0.25,
+                                            float gamma = 2)
+{
+    // Host reference for the unreduced forward pass; each tensor is indexed via its own 5-d view.
+    auto inView     = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto targetView = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
+    auto outView    = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc));
+    size_t numElems = miopen::deref(inputDesc).GetElementSize();
+    for(size_t linear = 0; linear < numElems; ++linear)
+    {
+        tensor_layout_t<5> pos(inView, linear);
+
+        float x = static_cast<float>(input[inView.get_tensor_view_idx(pos)]);
+        float y = static_cast<float>(target[targetView.get_tensor_view_idx(pos)]);
+
+        float prob  = 1 / (1 + exp(-x));
+        float bce   = -(y * log(prob) + (1 - y) * log(1 - prob));
+        float probT = prob * y + (1 - prob) * (1 - y);
+        float focal = bce * pow(1 - probT, gamma);
+
+        // The alpha weighting is applied only when alpha is non-negative.
+        if(alpha >= 0)
+        {
+            float weight = alpha * y + (1 - alpha) * (1 - y);
+            focal        = weight * focal;
+        }
+        outputHost[outView.get_tensor_view_idx(pos)] = static_cast<TIO>(focal);
+    }
+}
+
+template <class TIO>
+void mloSigmoidFocalLossUnreducedBwdRunHost(TIO* input,
+                                            miopenTensorDescriptor_t inputDesc,
+                                            TIO* target,
+                                            miopenTensorDescriptor_t targetDesc,
+                                            TIO* doutput,
+                                            miopenTensorDescriptor_t doutputDesc,
+                                            TIO* dinput,
+                                            miopenTensorDescriptor_t dinputDesc,
+                                            TIO* dtarget,
+                                            miopenTensorDescriptor_t dtargetDesc,
+                                            float alpha = 0.25,
+                                            float gamma = 2)
+{ // Host reference for the unreduced backward pass: fills dinput and/or dtarget per element.
+    auto input_tv    = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto target_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
+    auto doutput_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc));
+    auto dinput_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc));
+    auto dtarget_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc));
+    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i  = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
+        float t  = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
+        float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
+
+        float p       = 1 / (1 + exp(-i)); // sigmoid of the input
+        float ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
+        float pT      = p * t + (1 - p) * (1 - t);
+        float powPt   = pow(1 - pT, gamma);
+        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+        if(dinput) // input gradient is skipped when the caller passes a null dinput
+        {
+            float dpdi      = exp(-i) / pow(1 + exp(-i), 2); // derivative of p with respect to i
+            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            float dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+            // Product rule: L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            float grad = dO * dLdi;
+
+            if(alpha >= 0) // alpha weighting only applies for non-negative alpha
+            {
+                grad *= alpha_t;
+            }
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(grad);
+        }
+
+        if(dtarget) // target gradient is skipped when the caller passes a null dtarget
+        {
+            float dcelossdt = -log(p) + log(1 - p);
+            float dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            // Product rule: L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+            float dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+            float gradTarget = dO * dLdt;
+
+            if(alpha >= 0)
+            { // NOTE(review): the assignment below replaces dO * dLdt, so the dO factor is dropped;
+                // confirm against the GPU kernel. Formula: alpha_t * dL/dt + dalpha_t/dt * L
+                gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
+            }
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(gradTarget);
+        }
+    }
+}
+
+template <typename TIO>
+void mloSigmoidFocalLossFwdRunHost(TIO* input,
+                                   miopenTensorDescriptor_t inputDesc,
+                                   TIO* target,
+                                   miopenTensorDescriptor_t targetDesc,
+                                   TIO* workspace,
+                                   TIO* ref_output,
+                                   float alpha   = 0.25,
+                                   float gamma   = 2,
+                                   float divisor = 1)
+{ // Host reference for the reduced forward pass: per-element loss / divisor, then a block sum.
+    auto input_tv    = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto target_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
+    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
+        float t = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
+
+        float sig    = 1 / (1 + exp(-i));
+        float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
+        float sigT   = sig * t + (1 - sig) * (1 - t);
+        float loss   = ceLoss * pow(1 - sigT, gamma);
+
+        if(alpha >= 0)
+        {
+            float alphaT = alpha * t + (1 - alpha) * (1 - t);
+            loss         = alphaT * loss;
+        }
+
+        workspace[id] = static_cast<TIO>(loss / divisor);
+    }
+
+    // Reduce loss: sum blocks of local_size partials, alternating between workspace[0..) and
+    // workspace[inputSize..). Offsets and indices are size_t so large tensors are not truncated.
+    constexpr size_t local_size = 256;
+    size_t offset_a             = 0;
+    size_t offset_b             = inputSize;
+    size_t _size                = inputSize;
+    do
+    {
+        for(size_t i = 0; i < _size; i += local_size)
+        {
+            TIO shared[local_size];
+            for(size_t j = 0; j < local_size; ++j)
+                shared[j] = i + j < _size ? workspace[offset_a + i + j] : 0.0f;
+            for(size_t offset = local_size / 2; offset > 0; offset >>= 1)
+                for(size_t j = 0; j < offset; ++j)
+                    shared[j] += shared[j + offset];
+            if(_size <= local_size)
+                ref_output[0] = shared[0];
+            else
+                workspace[offset_b + i / local_size] = shared[0];
+        }
+        std::swap(offset_a, offset_b);
+        _size = (_size + local_size - 1) / local_size;
+    } while(_size > 1);
+}
+
+template <class TIO>
+void mloSigmoidFocalLossBwdRunHost(TIO* input,
+                                   miopenTensorDescriptor_t inputDesc,
+                                   TIO* target,
+                                   miopenTensorDescriptor_t targetDesc,
+                                   TIO* doutput,
+                                   miopenTensorDescriptor_t doutputDesc,
+                                   TIO* dinput,
+                                   miopenTensorDescriptor_t dinputDesc,
+                                   TIO* dtarget,
+                                   miopenTensorDescriptor_t dtargetDesc,
+                                   float alpha   = 0.25,
+                                   float gamma   = 2,
+                                   float divisor = 1)
+{ // Host reference for the reduced backward pass: per-element gradients scaled by 1 / divisor.
+    auto input_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto target_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
+    auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc));
+    auto dinput_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc));
+    auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc));
+
+    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
+
+    tensor_layout_t<5> doIdx(input_tv, 0); // fixed at linear index 0: one doutput entry is reused
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        float i  = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
+        float t  = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
+        float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
+
+        float p       = 1 / (1 + exp(-i)); // sigmoid of the input
+        float ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
+        float pT      = p * t + (1 - p) * (1 - t);
+        float powPt   = pow(1 - pT, gamma);
+        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+        if(dinput) // input gradient is skipped when the caller passes a null dinput
+        {
+            float dpdi      = exp(-i) / pow(1 + exp(-i), 2); // derivative of p with respect to i
+            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            float dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+            // Product rule: L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            float grad = dO * dLdi;
+
+            if(alpha >= 0) // alpha weighting only applies for non-negative alpha
+            {
+                grad *= alpha_t;
+            }
+            grad /= divisor;
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(grad);
+        }
+
+        if(dtarget) // target gradient is skipped when the caller passes a null dtarget
+        {
+            float dcelossdt = -log(p) + log(1 - p);
+            float dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            // Product rule: L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+            float dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+            float gradTarget = dO * dLdt;
+
+            if(alpha >= 0)
+            { // NOTE(review): the assignment below replaces dO * dLdt, so the dO factor is dropped;
+                // confirm against the GPU kernel. Formula: alpha_t * dL/dt + dalpha_t/dt * L
+                gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
+            }
+            gradTarget /= divisor;
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(gradTarget);
+        }
+    }
+}
+
+template <typename TIO>
+class SigmoidFocalLossDriver : public Driver // MIOpenDriver front end for sigmoid focal loss
+{
+public:
+    SigmoidFocalLossDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&targetDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&doutputDesc);
+        miopenCreateTensorDescriptor(&dinputDesc);
+        miopenCreateTensorDescriptor(&dtargetDesc);
+
+        data_type = miopen_type<TIO>{};
+    }
+
+    std::vector<int> ComputeStrides(std::vector<int> input);
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~SigmoidFocalLossDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(targetDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(doutputDesc);
+        miopenDestroyTensorDescriptor(dinputDesc);
+        miopenDestroyTensorDescriptor(dtargetDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    miopenTensorDescriptor_t inputDesc{}; // value-initialised so a failed create leaves no garbage
+    miopenTensorDescriptor_t targetDesc{};
+    miopenTensorDescriptor_t outputDesc{};
+    miopenTensorDescriptor_t doutputDesc{};
+    miopenTensorDescriptor_t dinputDesc{};
+    miopenTensorDescriptor_t dtargetDesc{};
+
+    std::unique_ptr<GPUMem> input_dev;
+    std::unique_ptr<GPUMem> target_dev;
+    std::unique_ptr<GPUMem> output_dev;
+    std::unique_ptr<GPUMem> doutput_dev;
+    std::unique_ptr<GPUMem> dinput_dev;
+    std::unique_ptr<GPUMem> dtarget_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+
+    std::vector<TIO> input;
+    std::vector<TIO> target;
+    std::vector<TIO> output;
+    std::vector<TIO> outputHost;
+    std::vector<TIO> doutput;
+    std::vector<TIO> dinput;
+    std::vector<TIO> dinputHost;
+    std::vector<TIO> dtarget;
+    std::vector<TIO> dtargetHost;
+    std::vector<TIO> workspace;
+
+    float alpha                         = 0.25f; // same values as the command-line defaults
+    float gamma                         = 2.0f;
+    float divisor                       = 1.0f; // GetandSetData never assigns this for unreduced mode
+    bool isContiguous                   = true;
+    bool isTargetGradientComputed       = false;
+    miopenLossReductionMode_t reduction = MIOPEN_LOSS_REDUCTION_NONE;
+
+    size_t workSpaceSizeInBytes = 0;
+};
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    // Profiling is switched on for the handle only when the "time" flag equals 1.
+    const bool timingRequested = inflags.GetValueInt("time") == 1;
+    if(timingRequested)
+        miopenEnableProfiling(GetHandle(), true);
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::GetandSetData()
+{
+    auto inDims              = inflags.GetValueTensor("dim-lengths").lengths;
+    alpha                    = static_cast<float>(inflags.GetValueDouble("alpha"));
+    gamma                    = static_cast<float>(inflags.GetValueDouble("gamma"));
+    isContiguous             = inflags.GetValueInt("is-contiguous") == 1;
+    isTargetGradientComputed = inflags.GetValueInt("target-gradient") == 1;
+    reduction = static_cast<miopenLossReductionMode_t>(inflags.GetValueInt("reduction"));
+    divisor   = 1; // always assigned, including the unreduced mode; mean overrides it below
+    std::vector<int> inStride = ComputeStrides(inDims);
+
+    SetTensorNd(inputDesc, inDims, inStride, data_type);
+    SetTensorNd(targetDesc, inDims, inStride, data_type);
+    SetTensorNd(doutputDesc, inDims, data_type);
+    SetTensorNd(dinputDesc, inDims, data_type);
+
+    if(isTargetGradientComputed)
+    {
+        SetTensorNd(dtargetDesc, inDims, data_type);
+    }
+    else
+    {
+        // No target gradient requested: describe dtarget as a single element.
+        std::vector<int> dtargetDim{1};
+        SetTensorNd(dtargetDesc, dtargetDim, data_type);
+    }
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        SetTensorNd(outputDesc, inDims, data_type);
+    }
+    else
+    {
+        // Reduced modes describe the output as a single element.
+        std::vector<int> outDim{1};
+        SetTensorNd(outputDesc, outDim, data_type);
+        // For mean reduction the divisor is the number of input elements.
+        if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        {
+            divisor = miopen::deref(inputDesc).GetElementSize();
+        }
+    }
+
+    return 0;
+}
+
+// Equivalent to: tensor.transpose(0, -1).contiguous().transpose(0, -1) in case contiguous = False
+template <typename TIO>
+std::vector<int> SigmoidFocalLossDriver<TIO>::ComputeStrides(std::vector<int> inputDim)
+{
+    const bool swapEnds = !isContiguous && !inputDim.empty(); // front()/back() need an element
+    if(swapEnds)
+        std::swap(inputDim.front(), inputDim.back());
+    std::vector<int> strides(inputDim.size(), 1); // innermost stride is 1; empty input stays empty
+    for(int i = static_cast<int>(inputDim.size()) - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * inputDim[i + 1];
+    if(swapEnds)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::AddCmdLineArgs()
+{ // Registers the command-line flags this driver reads; always returns miopenStatusSuccess.
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward (Default=1)", "int");
+    inflags.AddTensorFlag(
+        "dim-lengths", 'D', "256x4x2", "The dimensional lengths of the input tensor");
+    inflags.AddInputFlag("is-contiguous", 'c', "1", "is-contiguous (Default=1)", "int");
+    inflags.AddInputFlag( // value is cast to miopenLossReductionMode_t in GetandSetData
+        "reduction", 'R', "0", "reduction mode: 0(default) - unreduced, 1 - sum, 2 -mean", "int");
+    inflags.AddInputFlag("alpha", 'A', "0.25", "Alpha (Default=0.25)", "float");
+    inflags.AddInputFlag("gamma", 'G', "2", "Gamma (Default=2)", "float");
+    inflags.AddInputFlag( // when not 1, GetandSetData describes dtarget as a single element
+        "target-gradient", 'T', "0", "Is target gradient computed (Default=0)", "int");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::AllocateBuffersAndCopy()
+{
+    size_t in_sz     = miopen::deref(inputDesc).GetElementSize();
+    size_t target_sz = miopen::deref(targetDesc).GetElementSize();
+    size_t out_sz    = miopen::deref(outputDesc).GetElementSize();
+    size_t dO_sz     = miopen::deref(doutputDesc).GetElementSize();
+    size_t dI_sz     = miopen::deref(dinputDesc).GetElementSize();
+    size_t dT_sz     = miopen::deref(dtargetDesc).GetElementSize();
+
+    uint32_t ctx = 0;
+
+    input_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(TIO)));
+    target_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, target_sz, sizeof(TIO)));
+    output_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(TIO)));
+    doutput_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, dO_sz, sizeof(TIO)));
+    dinput_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, dI_sz, sizeof(TIO)));
+    dtarget_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, dT_sz, sizeof(TIO)));
+
+    workSpaceSizeInBytes = 0; // keeps the size defined if the query below does not write it
+    miopenGetSigmoidFocalLossForwardWorkspaceSize(
+        GetHandle(), inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes);
+    workspace_dev =
+        std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(TIO), sizeof(TIO)));
+
+    input       = std::vector<TIO>(in_sz, static_cast<TIO>(0));
+    target      = std::vector<TIO>(target_sz, static_cast<TIO>(0));
+    output      = std::vector<TIO>(out_sz, static_cast<TIO>(0));
+    outputHost  = std::vector<TIO>(out_sz, static_cast<TIO>(0));
+    doutput     = std::vector<TIO>(dO_sz, static_cast<TIO>(0));
+    dinput      = std::vector<TIO>(dI_sz, static_cast<TIO>(0));
+    dinputHost  = std::vector<TIO>(dI_sz, static_cast<TIO>(0));
+    dtarget     = std::vector<TIO>(dT_sz, static_cast<TIO>(0));
+    dtargetHost = std::vector<TIO>(dT_sz, static_cast<TIO>(0));
+    workspace   = std::vector<TIO>(workSpaceSizeInBytes / sizeof(TIO), static_cast<TIO>(0));
+
+    for(size_t i = 0; i < in_sz; ++i)
+    {
+        input[i]  = prng::gen_A_to_B<TIO>(static_cast<TIO>(-2), static_cast<TIO>(2));
+        target[i] = prng::gen_A_to_B<TIO>(static_cast<TIO>(-2), static_cast<TIO>(2));
+    }
+    for(size_t i = 0; i < dO_sz; ++i)
+    {
+        doutput[i] = prng::gen_A_to_B<TIO>(static_cast<TIO>(-2), static_cast<TIO>(2));
+    }
+
+    // output, dinput and dtarget were constructed zero-filled above, so the uploads below
+    // send zeroed buffers without a separate fill pass.
+
+    if(input_dev->ToGPU(GetStream(), input.data()) != 0)
+        std::cerr << "Error copying (in) to GPU, size: " << input_dev->GetSize() << std::endl;
+
+    if(target_dev->ToGPU(GetStream(), target.data()) != 0)
+        std::cerr << "Error copying (target) to GPU, size: " << target_dev->GetSize() << std::endl;
+
+    if(output_dev->ToGPU(GetStream(), output.data()) != 0)
+        std::cerr << "Error copying (out) to GPU, size: " << output_dev->GetSize() << std::endl;
+
+    if(doutput_dev->ToGPU(GetStream(), doutput.data()) != 0)
+        std::cerr << "Error copying (dO) to GPU, size: " << doutput_dev->GetSize() << std::endl;
+
+    if(dinput_dev->ToGPU(GetStream(), dinput.data()) != 0)
+        std::cerr << "Error copying (dI) to GPU, size: " << dinput_dev->GetSize() << std::endl;
+
+    if(dtarget_dev->ToGPU(GetStream(), dtarget.data()) != 0)
+        std::cerr << "Error copying (dT) to GPU, size: " << dtarget_dev->GetSize() << std::endl;
+
+    if(workspace_dev->ToGPU(GetStream(), workspace.data()) != 0)
+        std::cerr << "Error copying (workspace) to GPU, size: " << workspace_dev->GetSize() << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::RunForwardGPU()
+{
+    // Runs the forward op "iter" times, optionally prints timings, then reads back the output.
+    const int num_iters = inflags.GetValueInt("iter");
+    float first_ms      = 0;
+    float total_ms      = 0;
+
+    Timer t;
+    START_TIME
+    for(int it = 0; it < num_iters; ++it)
+    {
+        miopenSigmoidFocalLossForward(GetHandle(),
+                                      workspace_dev->GetMem(),
+                                      workSpaceSizeInBytes,
+                                      inputDesc,
+                                      input_dev->GetMem(),
+                                      targetDesc,
+                                      target_dev->GetMem(),
+                                      outputDesc,
+                                      output_dev->GetMem(),
+                                      alpha,
+                                      gamma,
+                                      reduction);
+        float elapsed_ms = 0.0;
+        miopenGetKernelTime(GetHandle(), &elapsed_ms);
+        total_ms += elapsed_ms;
+        first_ms = (it == 0) ? elapsed_ms : first_ms;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Sigmoid Focal Loss Fwd Elapsed: "
+                      << t.gettime_ms() / num_iters << " ms" << std::endl;
+
+        // With more than one iteration, the first launch is left out of the kernel-time average.
+        const float avg_ms = num_iters > 1 ? (total_ms - first_ms) / (num_iters - 1) : first_ms;
+        std::cout << "GPU Kernel Time Sigmoid Focal Loss Fwd Elapsed: " << avg_ms << " ms"
+                  << std::endl;
+    }
+
+    const bool copy_failed = output_dev->FromGPU(GetStream(), output.data()) != 0;
+    if(copy_failed)
+        std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize()
+                  << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::RunForwardCPU()
+{
+    // Host reference for the forward pass; the result is written to outputHost.
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // Reduced variant: takes the host workspace buffer and the divisor.
+        mloSigmoidFocalLossFwdRunHost<TIO>(input.data(),
+                                           inputDesc,
+                                           target.data(),
+                                           targetDesc,
+                                           workspace.data(),
+                                           outputHost.data(),
+                                           alpha,
+                                           gamma,
+                                           divisor);
+        return miopenStatusSuccess;
+    }
+
+    mloSigmoidFocalLossUnreducedFwdRunHost<TIO>(input.data(),
+                                                inputDesc,
+                                                target.data(),
+                                                targetDesc,
+                                                outputHost.data(),
+                                                outputDesc,
+                                                alpha,
+                                                gamma);
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::RunBackwardGPU()
+{
+    // Runs the backward op "iter" times, optionally prints timings, then reads back gradients.
+    const int num_iters = inflags.GetValueInt("iter");
+    float first_ms      = 0;
+    float total_ms      = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int it = 0; it < num_iters; ++it)
+    {
+        // The dtarget argument stays null unless the target gradient is requested.
+        void* dtarget_mem = isTargetGradientComputed ? dtarget_dev->GetMem() : nullptr;
+
+        miopenSigmoidFocalLossBackward(GetHandle(),
+                                       inputDesc,
+                                       input_dev->GetMem(),
+                                       targetDesc,
+                                       target_dev->GetMem(),
+                                       doutputDesc,
+                                       doutput_dev->GetMem(),
+                                       dinputDesc,
+                                       dinput_dev->GetMem(),
+                                       dtargetDesc,
+                                       dtarget_mem,
+                                       alpha,
+                                       gamma,
+                                       reduction);
+
+        float elapsed_ms = 0.0;
+        miopenGetKernelTime(GetHandle(), &elapsed_ms);
+        total_ms += elapsed_ms;
+        first_ms = (it == 0) ? elapsed_ms : first_ms;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Sigmoid Focal Loss Bwd Elapsed: "
+                      << t.gettime_ms() / num_iters << " ms" << std::endl;
+
+        // With more than one iteration, the first launch is left out of the kernel-time average.
+        const float avg_ms = num_iters > 1 ? (total_ms - first_ms) / (num_iters - 1) : first_ms;
+        std::cout << "GPU Kernel Time Sigmoid Focal Loss Bwd Elapsed: " << avg_ms << " ms"
+                  << std::endl;
+    }
+
+    const bool dinput_copy_failed = dinput_dev->FromGPU(GetStream(), dinput.data()) != 0;
+    if(dinput_copy_failed)
+        std::cerr << "Error copying (dI_dev) from GPU, size: " << dinput_dev->GetSize()
+                  << std::endl;
+
+    // dtarget is copied back only when the target gradient was computed.
+    if(isTargetGradientComputed && dtarget_dev->FromGPU(GetStream(), dtarget.data()) != 0)
+        std::cerr << "Error copying (dT_dev) from GPU, size: " << dtarget_dev->GetSize()
+                  << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::RunBackwardCPU()
+{
+    // Host reference for the backward pass. The input gradient is written to dinputHost.
+    // The target gradient is written to dtargetHost only when it is requested; otherwise a
+    // null dtarget pointer is handed to the reference implementation.
+    TIO* dtarget_ref = isTargetGradientComputed ? dtargetHost.data() : nullptr;
+
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // Reduced variant: additionally takes the divisor.
+        mloSigmoidFocalLossBwdRunHost<TIO>(input.data(),
+                                           inputDesc,
+                                           target.data(),
+                                           targetDesc,
+                                           doutput.data(),
+                                           doutputDesc,
+                                           dinputHost.data(),
+                                           dinputDesc,
+                                           dtarget_ref,
+                                           dtargetDesc,
+                                           alpha,
+                                           gamma,
+                                           divisor);
+        return miopenStatusSuccess;
+    }
+
+    // Unreduced variant.
+    mloSigmoidFocalLossUnreducedBwdRunHost<TIO>(input.data(),
+                                                inputDesc,
+                                                target.data(),
+                                                targetDesc,
+                                                doutput.data(),
+                                                doutputDesc,
+                                                dinputHost.data(),
+                                                dinputDesc,
+                                                dtarget_ref,
+                                                dtargetDesc,
+                                                alpha,
+                                                gamma);
+
+    return miopenStatusSuccess;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::VerifyForward()
+{
+    // Compute the host reference into outputHost, then compare it against the GPU output.
+    RunForwardCPU();
+
+    const double threshold = std::numeric_limits<TIO>::epsilon() * 10;
+    const auto rms_error   = miopen::rms_range(outputHost, output);
+    const bool passed      = std::isfinite(rms_error) && !(rms_error > threshold);
+
+    if(passed)
+    {
+        std::cout << "Forward " << reduction << " Sigmoid Focal Loss Verifies OK on CPU reference ("
+                  << rms_error << "< " << threshold << ')' << std::endl;
+        return miopenStatusSuccess;
+    }
+
+    // A non-finite error, or one above the threshold, is a forward verification failure.
+    std::cout << "Forward " << reduction << " Sigmoid Focal Loss FAILED: " << rms_error << " > "
+              << threshold << std::endl;
+    return EC_VerifyFwd;
+}
+
+template <typename TIO>
+int SigmoidFocalLossDriver<TIO>::VerifyBackward()
+{
+    // Compares the GPU gradients (dinput, dtarget) with the host reference produced by
+    // RunBackwardCPU(). Returns EC_VerifyBwd on a mismatch, miopenStatusSuccess otherwise.
+    RunBackwardCPU();
+
+    double tolerance  = std::numeric_limits<TIO>::epsilon() * 10;
+    auto dinputError  = miopen::rms_range(dinputHost, dinput);
+    auto dtargetError = miopen::rms_range(dtargetHost, dtarget);
+
+    if(!std::isfinite(dinputError) || dinputError > tolerance)
+    {
+        std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dinputError
+                  << " > " << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+    // The dtarget error only counts when the target gradient was actually computed.
+    if(isTargetGradientComputed && (!std::isfinite(dtargetError) || dtargetError > tolerance))
+    {
+        std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dtargetError
+                  << " > " << tolerance << std::endl;
+        return EC_VerifyBwd;
+    }
+
+    std::cout << "Backward " << reduction
+              << " Sigmoid Focal Loss Verifies OK on CPU reference (dinput: " << dinputError
+              << ", dtarget: " << dtargetError << "< " << tolerance << ')' << std::endl;
+    return miopenStatusSuccess;
+}
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 3b9bbeccc1..c983f92619 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -5004,6 +5004,98 @@ MIOPEN_EXPORT miopenStatus_t miopenCTCLoss(miopenHandle_t handle,
                                            void* workSpace,
                                            size_t workSpaceSize);
 
+#ifdef MIOPEN_BETA_API
+
+typedef enum
+{
+    MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */
+    MIOPEN_LOSS_REDUCTION_SUM  = 1, /*!< output tensor elements are summed up */
+    MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided by the
+                                       total number of elements to get the mean value */
+} miopenLossReductionMode_t;
+
+/*! @brief Helper function to query the minimum workspace size required by the sigmoid focal loss
+ * call
+ *
+ * @param handle                   MIOpen Handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param targetDesc               Tensor descriptor for target tensor (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param reduction                Reduction (input)
+ * @param sizeInBytes              Pointer to data to return the minimum workspace size (output)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle,
+                                              miopenTensorDescriptor_t inputDesc,
+                                              miopenTensorDescriptor_t targetDesc,
+                                              miopenTensorDescriptor_t outputDesc,
+                                              miopenLossReductionMode_t reduction,
+                                              size_t* sizeInBytes);
+
+/*! @brief Execute a SigmoidFocalLoss forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param workspace                Address of the allocated workspace data (input)
+ * @param workspaceSizeInBytes     Size in bytes of the allocated workspace data (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Data tensor input (input)
+ * @param targetDesc               Tensor descriptor for target tensor (input)
+ * @param target                   Data tensor target (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param output                   Data tensor output (output)
+ * @param alpha                    Alpha (input)
+ * @param gamma                    Gamma (input)
+ * @param reduction                Reduction (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle,
+                                                           void* workspace,
+                                                           size_t workspaceSizeInBytes,
+                                                           miopenTensorDescriptor_t inputDesc,
+                                                           const void* input,
+                                                           miopenTensorDescriptor_t targetDesc,
+                                                           const void* target,
+                                                           miopenTensorDescriptor_t outputDesc,
+                                                           void* output,
+                                                           float alpha,
+                                                           float gamma,
+                                                           miopenLossReductionMode_t reduction);
+
+/*! @brief Execute a SigmoidFocalLoss backward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Data tensor input (input)
+ * @param targetDesc               Tensor descriptor for target tensor (input)
+ * @param target                   Data tensor target (input)
+ * @param doutputDesc              Tensor descriptor for output gradient (input)
+ * @param doutput                  Gradient of output (input)
+ * @param dinputDesc               Tensor descriptor for input gradient (input)
+ * @param dinput                   Gradient of input (output)
+ * @param dtargetDesc              Tensor descriptor for target gradient (input)
+ * @param dtarget                  Gradient of target (output)
+ * @param alpha                    Alpha (input)
+ * @param gamma                    Gamma (input)
+ * @param reduction                Reduction (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle,
+                                                            miopenTensorDescriptor_t inputDesc,
+                                                            const void* input,
+                                                            miopenTensorDescriptor_t targetDesc,
+                                                            const void* target,
+                                                            miopenTensorDescriptor_t doutputDesc,
+                                                            const void* doutput,
+                                                            miopenTensorDescriptor_t dinputDesc,
+                                                            void* dinput,
+                                                            miopenTensorDescriptor_t dtargetDesc,
+                                                            void* dtarget,
+                                                            float alpha,
+                                                            float gamma,
+                                                            miopenLossReductionMode_t reduction);
+#endif
+
 /** @} */
 // CLOSEOUT LossFunction DOXYGEN GROUP
 
diff --git a/rocfft_r2c_ex.cpp b/rocfft_r2c_ex.cpp
new file mode 100644
index 0000000000..8c17fac21b
--- /dev/null
+++ b/rocfft_r2c_ex.cpp
@@ -0,0 +1,317 @@
+// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <complex>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_vector_types.h>
+#include <rocfft/rocfft.h>
+
+#include "../../../shared/CLI11.hpp"
+#include "examplekernels.h"
+#include "exampleutils.h"
+#include <stdexcept>
+
+// Example driver: runs one double-precision real/complex rocFFT transform (forward by default,
+// inverse with -i; in-place by default, out-of-place with -o) and prints the input and output.
+// Any HIP/rocFFT failure throws std::runtime_error; device buffers are not freed on that path.
+int main(int argc, char* argv[])
+{
+    std::cout << "rocfft double-precision real/complex transform\n" << std::endl;
+
+    // Length of transform:
+    std::vector<size_t> length = {8};
+
+    // Gpu device id:
+    size_t deviceId = 0;
+
+    // Command-line options:
+    CLI::App app{"rocfft sample command line options"};
+    app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
+    CLI::Option* opt_outofplace =
+        app.add_flag("-o, --outofplace", "Perform an out-of-place transform");
+    CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform");
+    app.add_option(
+        "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)");
+
+    try
+    {
+        app.parse(argc, argv);
+    }
+    catch(const CLI::ParseError& e)
+    {
+        return app.exit(e);
+    }
+
+    // Initialize rocFFT, then select the placeness for the transform:
+    if(rocfft_setup() != rocfft_status_success)
+        throw std::runtime_error("rocfft_setup failed.");
+    const rocfft_result_placement place =
+        *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace;
+    const bool inplace = place == rocfft_placement_inplace;
+
+    // Direction of transform
+    const rocfft_transform_type direction =
+        *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward;
+    const bool forward = direction == rocfft_transform_type_real_forward;
+
+    // Set up the strides and buffer size for the real values:
+    std::vector<size_t> rstride = {1};
+    for(unsigned int i = 1; i < length.size(); ++i)
+    {
+        // In-place transforms need space for two extra real values in the contiguous
+        // direction.
+        auto val = (length[i - 1] + ((inplace && i == 1) ? 2 : 0)) * rstride[i - 1];
+        rstride.push_back(val);
+    }
+    // NB: not tight, but hey
+    const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1];
+    std::vector<double> rdata(real_size); // host storage
+
+    // The complex data length is half + 1 of the real data length in the contiguous
+    // dimensions.  Since rocFFT is column-major, this is the first index.
+    std::vector<size_t> clength = length;
+    clength[0]                  = clength[0] / 2 + 1;
+    std::vector<size_t> cstride = {1};
+    for(unsigned int i = 1; i < clength.size(); ++i)
+    {
+        cstride.push_back(clength[i - 1] * cstride[i - 1]);
+    }
+    const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1];
+    std::vector<hipDoubleComplex> cdata(complex_size); // host storage
+
+    // Based on the direction, we set the input and output parameters appropriately.
+    const size_t isize  = forward ? real_size : complex_size;
+    const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex));
+    const std::vector<size_t> ilength = forward ? length : clength;
+    const std::vector<size_t> istride = forward ? rstride : cstride;
+
+    const size_t osize  = forward ? complex_size : real_size;
+    const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double));
+    const std::vector<size_t> olength = forward ? clength : length;
+    const std::vector<size_t> ostride = forward ? cstride : rstride;
+
+    // Print information about the transform:
+    std::cout << "direction: ";
+    std::cout << (forward ? "forward\n" : "inverse\n");
+    std::cout << "length:";
+    for(const auto i : length)
+        std::cout << " " << i;
+    std::cout << "\n";
+    std::cout << (inplace ? "in-place transform\n" : "out-of-place transform\n");
+    std::cout << "deviceID: " << deviceId << "\n";
+    std::cout << "input length:";
+    for(auto i : ilength)
+        std::cout << " " << i;
+    std::cout << "\n";
+    std::cout << "input buffer stride:";
+    for(auto i : istride)
+        std::cout << " " << i;
+    std::cout << "\n";
+    std::cout << "input buffer size: " << ibytes << "\n";
+
+    std::cout << "output length:";
+    for(auto i : olength)
+        std::cout << " " << i;
+    std::cout << "\n";
+    std::cout << "output buffer stride:";
+    for(auto i : ostride)
+        std::cout << " " << i;
+    std::cout << "\n";
+    std::cout << "output buffer size: " << obytes << "\n";
+    std::cout << std::endl;
+
+    // Set the device (deviceId is parsed as size_t; hipSetDevice takes an int):
+    if(hipSetDevice(static_cast<int>(deviceId)) != hipSuccess)
+        throw std::runtime_error("hipSetDevice failed.");
+
+    // Create HIP device object and initialize data
+    // Kernels are provided in examplekernels.h
+    void* gpu_in          = nullptr;
+    hipError_t hip_status = hipMalloc(&gpu_in, inplace ? std::max(ibytes, obytes) : ibytes);
+    if(hip_status != hipSuccess)
+        throw std::runtime_error("device error");
+
+    if(forward)
+    {
+        initreal_cm(length, istride, gpu_in);
+    }
+    else
+    {
+        init_hermitiancomplex_cm(length, ilength, istride, gpu_in);
+    }
+
+    // Print the input:
+    std::cout << "input:\n";
+    if(forward)
+    {
+        hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
+        printbuffer_cm(rdata, ilength, istride, 1, isize);
+    }
+    else
+    {
+        hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
+        printbuffer_cm(cdata, ilength, istride, 1, isize);
+
+        // Check that the buffer is Hermitian symmetric:
+        check_symmetry_cm(cdata, length, istride, 1, isize);
+    }
+
+    // rocfft_status can be used to capture API status info
+    rocfft_status rc = rocfft_status_success;
+
+    // Create a description struct to set the data layout:
+    rocfft_plan_description gpu_description = nullptr;
+    rc                                      = rocfft_plan_description_create(&gpu_description);
+    if(rc != rocfft_status_success)
+        throw std::runtime_error("failed to create plan description");
+
+    rc = rocfft_plan_description_set_data_layout(
+        gpu_description,
+        // input data format:
+        forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
+        // output data format:
+        forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
+        nullptr,
+        nullptr,
+        istride.size(), // input stride length
+        istride.data(), // input stride data
+        0,              // input batch distance
+        ostride.size(), // output stride length
+        ostride.data(), // output stride data
+        0);             // output batch distance
+    if(rc != rocfft_status_success)
+        throw std::runtime_error("failed to set data layout");
+
+    // We can also pass "nullptr" instead of a description; rocFFT will use reasonable
+    // default parameters.  If the data isn't contiguous, we need to set strides, etc,
+    // using the description.
+
+    // Create the FFT plan:
+    rocfft_plan gpu_plan = nullptr;
+    rc                   = rocfft_plan_create(&gpu_plan,
+                            place,
+                            direction,
+                            rocfft_precision_double,
+                            length.size(),    // Dimension
+                            length.data(),    // lengths
+                            1,                // Number of transforms
+                            gpu_description); // Description
+    if(rc != rocfft_status_success)
+        throw std::runtime_error("failed to create plan");
+
+    // Get the execution info for the fft plan (in particular, work memory requirements):
+    rocfft_execution_info planinfo = nullptr;
+    rc                             = rocfft_execution_info_create(&planinfo);
+    if(rc != rocfft_status_success)
+        throw std::runtime_error("failed to create execution info");
+
+    size_t workbuffersize = 0;
+    rc                    = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize);
+    if(rc != rocfft_status_success)
+        throw std::runtime_error("failed to get work buffer size");
+
+    // If the transform requires work memory, allocate a work buffer:
+    void* wbuffer = nullptr;
+    if(workbuffersize > 0)
+    {
+        hip_status = hipMalloc(&wbuffer, workbuffersize);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMalloc failed");
+
+        rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize);
+        if(rc != rocfft_status_success)
+            throw std::runtime_error("failed to set work buffer");
+    }
+
+    // If the transform is out-of-place, allocate the output buffer as well:
+    void* gpu_out = inplace ? gpu_in : nullptr;
+    if(!inplace)
+    {
+        hip_status = hipMalloc(&gpu_out, obytes);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMalloc failed");
+    }
+
+    // Execute the GPU transform. gpu_in and gpu_out are void*, so their addresses are already
+    // void** and serve as one-element buffer arrays without any cast:
+    rc = rocfft_execute(gpu_plan,  // plan
+                        &gpu_in,   // in_buffer
+                        &gpu_out,  // out_buffer
+                        planinfo); // execution info
+    if(rc != rocfft_status_success)
+        throw std::runtime_error("failed to execute");
+
+    // Get the output from the device and print to cout:
+    std::cout << "output:\n";
+    if(forward)
+    {
+        hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
+        printbuffer_cm(cdata, olength, ostride, 1, osize);
+    }
+    else
+    {
+        hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
+        if(hip_status != hipSuccess)
+            throw std::runtime_error("hipMemcpy failed.");
+        printbuffer_cm(rdata, olength, ostride, 1, osize);
+    }
+
+    // NOTE(review): any throw above skips the cleanup below; nothing here is RAII-managed.
+    // Clean up: free GPU memory:
+    if(hipFree(gpu_in) != hipSuccess)
+        throw std::runtime_error("hipFree failed.");
+
+    if(!inplace)
+    {
+        if(hipFree(gpu_out) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
+    }
+    if(wbuffer != nullptr)
+    {
+        if(hipFree(wbuffer) != hipSuccess)
+            throw std::runtime_error("hipFree failed.");
+    }
+
+    // Clean up: destroy plans:
+    if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
+        throw std::runtime_error("rocfft_execution_info_destroy failed.");
+    planinfo = nullptr;
+    if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_description_destroy failed.");
+    gpu_description = nullptr;
+    if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
+        throw std::runtime_error("rocfft_plan_destroy failed.");
+    gpu_plan = nullptr;
+
+    rocfft_cleanup();
+    return 0;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 77acf3f7d3..2f4d54976a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -180,6 +180,8 @@ set( MIOpen_Source
     rnn/Solutions/bwd_s_stream.cpp
     rnn/Solutions/bwd_multi_stream.cpp
     scalar.cpp
+    sigmoidfocalloss/problem_description.cpp
+    sigmoid_focal_loss_api.cpp
     softmax.cpp
     softmax_api.cpp
     softmax/problem_description.cpp
@@ -305,6 +307,10 @@ set( MIOpen_Source
     solver/reduce/forward_min.cpp
     solver/reduce/forward_prod.cpp
     solver/reduce/forward_sum.cpp
+    solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp
+    solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp
+    solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
+    solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp
     solver/softmax/attn_softmax.cpp
     solver/softmax/softmax.cpp
     subbuffers.cpp
@@ -461,6 +467,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/stride_array.hpp
         kernels/tensor_view.hpp
         kernels/utilities.inc
+        kernels/warp_shuffle.hpp
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc
@@ -503,6 +510,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenLRNBwd.cl
         kernels/MIOpenLRNFwd.cl
         kernels/MIOpenNeuron.cl
+        kernels/MIOpenLossSum.cpp
         kernels/MIOpenPooling.cl
         kernels/MIOpenPoolingBwd.cl
         kernels/MIOpenPoolingBwdND.cl
@@ -548,6 +556,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/gcnAsmBNBwdTrainSpatial.s
         kernels/MIOpenTensorKernels.cl
         kernels/MIOpenTensorKernelsHip.cpp
+        kernels/MIOpenSigmoidFocalLoss.cpp
         kernels/MIOpenSubTensorOpWithScalarKernel.cl
         kernels/MIOpenSubTensorOpWithSubTensorKernel.cl
         kernels/MIOpenSubTensorOpWithCastTensorKernel.cl
@@ -656,6 +665,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ocl/fusionopbiasbnactivocl.cpp
         reducecalculation.cpp
         reduceextreme.cpp
+        sigmoid_focal_loss.cpp
         transformers_adam_w.cpp
         ${PROJECT_BINARY_DIR}/db_path.cpp
         )
diff --git a/src/include/miopen/sigmoid_focal_loss.hpp b/src/include/miopen/sigmoid_focal_loss.hpp
new file mode 100644
index 0000000000..07d3e32d61
--- /dev/null
+++ b/src/include/miopen/sigmoid_focal_loss.hpp
@@ -0,0 +1,71 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_SIGMOID_FOCAL_LOSS_HPP_
+#define MIOPEN_SIGMOID_FOCAL_LOSS_HPP_
+
+#include <miopen/common.hpp>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+size_t GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle, // workspace query; presumably bytes
+                                               const TensorDescriptor& inputDesc,
+                                               const TensorDescriptor& targetDesc,
+                                               const TensorDescriptor& outputDesc,
+                                               miopenLossReductionMode_t reduction);
+
+miopenStatus_t SigmoidFocalLossForward(Handle& handle, // forward entry point
+                                       Data_t workspace, // mutable buffer
+                                       size_t workspaceSizeInBytes,
+                                       const TensorDescriptor& inputDesc,
+                                       ConstData_t input, // read-only
+                                       const TensorDescriptor& targetDesc,
+                                       ConstData_t target, // read-only
+                                       const TensorDescriptor& outputDesc,
+                                       Data_t output, // mutable: receives the result
+                                       float alpha, // kernels skip alpha weighting if < 0
+                                       float gamma, // exponent on (1 - pT) in the kernels
+                                       miopenLossReductionMode_t reduction);
+
+miopenStatus_t SigmoidFocalLossBackward(Handle& handle, // backward (gradient) entry point
+                                        const TensorDescriptor& inputDesc,
+                                        ConstData_t input, // read-only
+                                        const TensorDescriptor& targetDesc,
+                                        ConstData_t target, // read-only
+                                        const TensorDescriptor& doutputDesc,
+                                        ConstData_t doutput, // read-only upstream gradient
+                                        const TensorDescriptor& dinputDesc,
+                                        Data_t dinput, // mutable; reduced kernel skips if null
+                                        const TensorDescriptor& dtargetDesc,
+                                        Data_t dtarget, // mutable; reduced kernel skips if null
+                                        float alpha, // kernels skip alpha weighting if < 0
+                                        float gamma, // exponent on (1 - pT) in the kernels
+                                        miopenLossReductionMode_t reduction);
+
+} // namespace miopen
+#endif // MIOPEN_SIGMOID_FOCAL_LOSS_HPP_
diff --git a/src/include/miopen/sigmoidfocalloss/invoke_params.hpp b/src/include/miopen/sigmoidfocalloss/invoke_params.hpp
new file mode 100644
index 0000000000..e2801cead2
--- /dev/null
+++ b/src/include/miopen/sigmoidfocalloss/invoke_params.hpp
@@ -0,0 +1,79 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+#include <miopen/miopen.h>
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace sigmoidfocalloss {
+
+struct SigmoidFocalLossInvokeParams : public miopen::InvokeParams // state shared by fwd and bwd
+{
+    SigmoidFocalLossInvokeParams() = default;
+
+    const TensorDescriptor* inputDesc  = nullptr; // non-owning pointer
+    const TensorDescriptor* targetDesc = nullptr; // non-owning pointer
+
+    ConstData_t input                   = nullptr; // read-only
+    ConstData_t target                  = nullptr; // read-only
+    Data_t workspace                    = nullptr; // returned by GetWorkspace()
+    std::size_t workspace_size          = 0;       // returned by GetWorkspaceSize()
+    float alpha                         = 0.25f;   // float literal, consistent with gamma
+    float gamma                         = 2.0f;
+    miopenLossReductionMode_t reduction = MIOPEN_LOSS_REDUCTION_NONE;
+
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+struct FwdInvokeParams : SigmoidFocalLossInvokeParams // adds the forward output to the base
+{
+    FwdInvokeParams() = default;
+
+    const TensorDescriptor* outputDesc = nullptr; // non-owning pointer
+    Data_t output                      = nullptr; // mutable: forward result buffer
+};
+
+struct BwdInvokeParams : SigmoidFocalLossInvokeParams // adds gradient tensors to the base
+{
+    BwdInvokeParams() = default;
+
+    const TensorDescriptor* doutputDesc = nullptr; // non-owning pointer
+    const TensorDescriptor* dinputDesc  = nullptr; // non-owning pointer
+    const TensorDescriptor* dtargetDesc = nullptr; // non-owning pointer
+
+    ConstData_t doutput = nullptr; // upstream gradient, read-only
+    Data_t dinput       = nullptr; // gradient output: written by the kernel, hence Data_t
+    Data_t dtarget      = nullptr; // gradient output: written by the kernel, hence Data_t
+};
+
+} // namespace sigmoidfocalloss
+
+} // namespace miopen
diff --git a/src/include/miopen/sigmoidfocalloss/problem_description.hpp b/src/include/miopen/sigmoidfocalloss/problem_description.hpp
new file mode 100644
index 0000000000..3590b5c3d4
--- /dev/null
+++ b/src/include/miopen/sigmoidfocalloss/problem_description.hpp
@@ -0,0 +1,118 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/errors.hpp>
+#include <miopen/miopen.h>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace sigmoidfocalloss {
+
+bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); // no body in this file
+
+struct SigmoidFocalLossProblemDescription : ProblemDescriptionBase // shared fwd/bwd state
+{
+    SigmoidFocalLossProblemDescription(const TensorDescriptor& inputDesc_,
+                                       const TensorDescriptor& targetDesc_,
+                                       const miopenLossReductionMode_t reduction_)
+        : inputDesc(inputDesc_), targetDesc(targetDesc_), reduction(reduction_)
+    {
+        if(!checkSameLength(inputDesc, targetDesc)) // reject mismatched input/target
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SigmoidFocalLoss: Input, target tensor sizes do not match.");
+    }
+
+    const TensorDescriptor& GetInputDesc() const { return inputDesc; }
+    const TensorDescriptor& GetTargetDesc() const { return targetDesc; }
+
+public: // redundant in a struct (members are public by default)
+    TensorDescriptor inputDesc;  // stored by value: a copy of the constructor argument
+    TensorDescriptor targetDesc; // stored by value: a copy of the constructor argument
+    miopenLossReductionMode_t reduction;
+};
+
+struct SigmoidFocalLossFwdProblemDescription : SigmoidFocalLossProblemDescription // forward
+{
+    SigmoidFocalLossFwdProblemDescription(const TensorDescriptor& inputDesc_,
+                                          const TensorDescriptor& targetDesc_,
+                                          const TensorDescriptor& outputDesc_,
+                                          const miopenLossReductionMode_t reduction_)
+        : SigmoidFocalLossProblemDescription(inputDesc_, targetDesc_, reduction_),
+          outputDesc(outputDesc_)
+    {
+        miopenDataType_t dtype = inputDesc.GetType(); // input's type is the reference type
+        if(dtype != targetDesc.GetType() || dtype != outputDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SigmoidFocalLoss: Input, target, output tensor type do not match.");
+    } // only data types are compared here; output lengths are not checked in this constructor
+
+    NetworkConfig MakeNetworkConfig() const override; // declared only; no body in this header
+    const TensorDescriptor& GetOutputDesc() const { return outputDesc; }
+
+public: // redundant in a struct (members are public by default)
+    TensorDescriptor outputDesc; // stored by value: a copy of the constructor argument
+};
+
+struct SigmoidFocalLossBwdProblemDescription : SigmoidFocalLossProblemDescription // backward
+{
+    SigmoidFocalLossBwdProblemDescription(const TensorDescriptor& inputDesc_,
+                                          const TensorDescriptor& targetDesc_,
+                                          const TensorDescriptor& doutputDesc_,
+                                          const TensorDescriptor& dinputDesc_,
+                                          const TensorDescriptor& dtargetDesc_,
+                                          const miopenLossReductionMode_t reduction_)
+        : SigmoidFocalLossProblemDescription(inputDesc_, targetDesc_, reduction_),
+          doutputDesc(doutputDesc_),
+          dinputDesc(dinputDesc_),
+          dtargetDesc(dtargetDesc_)
+    {
+        miopenDataType_t dtype = inputDesc.GetType(); // input's type is the reference type
+        if(dtype != targetDesc.GetType() || dtype != doutputDesc.GetType() ||
+           dtype != dinputDesc.GetType() || dtype != dtargetDesc.GetType())
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "SigmoidFocalLoss: Input, target, doutput, dinput, dtarget tensor type do "
+                         "not match.");
+    } // only data types are compared here; gradient lengths are not checked in this constructor
+
+    NetworkConfig MakeNetworkConfig() const override; // declared only; no body in this header
+    const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; }
+    const TensorDescriptor& GetDinputDesc() const { return dinputDesc; }
+    const TensorDescriptor& GetDtargetDesc() const { return dtargetDesc; }
+
+public: // redundant in a struct (members are public by default)
+    TensorDescriptor doutputDesc; // stored by value
+    TensorDescriptor dinputDesc;  // stored by value
+    TensorDescriptor dtargetDesc; // stored by value
+};
+
+} // namespace sigmoidfocalloss
+
+} // namespace miopen
diff --git a/src/include/miopen/sigmoidfocalloss/solvers.hpp b/src/include/miopen/sigmoidfocalloss/solvers.hpp
new file mode 100644
index 0000000000..992ad5a9d6
--- /dev/null
+++ b/src/include/miopen/sigmoidfocalloss/solvers.hpp
@@ -0,0 +1,121 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+using SigmoidFocalLossFwdSolverBase =
+    NonTunableSolverBase<ExecutionContext,
+                         miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription>;
+
+struct SigmoidFocalLossFwd final : SigmoidFocalLossFwdSolverBase // presumably the reduced path
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<SigmoidFocalLossFwd>(); }
+
+    bool IsApplicable(const ExecutionContext& context, // declared here, defined out of line
+                      const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription&
+                          problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription&
+                                 problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem)
+        const override;
+
+    bool MayNeedWorkspace() const override { return true; } // only solver here overriding this
+};
+
+using SigmoidFocalLossBwdSolverBase =
+    NonTunableSolverBase<ExecutionContext,
+                         miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription>;
+
+struct SigmoidFocalLossBwd final : SigmoidFocalLossBwdSolverBase // no workspace overrides
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<SigmoidFocalLossBwd>(); }
+
+    bool IsApplicable(const ExecutionContext& context, // declared here, defined out of line
+                      const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription&
+                          problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription&
+                                 problem) const override;
+};
+
+using SigmoidFocalLossUnreducedFwdSolverBase = // same base type as the SigmoidFocalLossFwd alias
+    NonTunableSolverBase<ExecutionContext,
+                         miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription>;
+
+struct SigmoidFocalLossUnreducedFwd final : SigmoidFocalLossUnreducedFwdSolverBase
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<SigmoidFocalLossUnreducedFwd>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context, // declared here, defined out of line
+                      const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription&
+                          problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription&
+                                 problem) const override;
+}; // no GetWorkspaceSize/MayNeedWorkspace overrides are declared for this solver
+
+using SigmoidFocalLossUnreducedBwdSolverBase = // same base type as the SigmoidFocalLossBwd alias
+    NonTunableSolverBase<ExecutionContext,
+                         miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription>;
+
+struct SigmoidFocalLossUnreducedBwd final : SigmoidFocalLossUnreducedBwdSolverBase
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<SigmoidFocalLossUnreducedBwd>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context, // declared here, defined out of line
+                      const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription&
+                          problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription&
+                                 problem) const override;
+}; // no GetWorkspaceSize/MayNeedWorkspace overrides are declared for this solver
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/sigmoidfocalloss/utils.hpp b/src/include/miopen/sigmoidfocalloss/utils.hpp
new file mode 100644
index 0000000000..0dddceea7e
--- /dev/null
+++ b/src/include/miopen/sigmoidfocalloss/utils.hpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/mlo_internal.hpp>
+#include <miopen/kernel_info.hpp>
+#include <miopen/kernel_build_params.hpp>
+
+const auto make_hip_kernel = [](std::vector<size_t> localsize, // by-value copies, mutated below
+                                std::vector<size_t> gridsize,
+                                std::string kernel_file,
+                                std::string kernel_name,
+                                miopen::KernelBuildParameters build_params) {
+    while(localsize.size() < 3) // pad the block shape to three dimensions with 1s
+        localsize.push_back(1);
+    while(gridsize.size() < 3) // pad the grid shape to three dimensions with 1s
+        gridsize.push_back(1);
+    for(size_t i = 0; i < localsize.size(); ++i) // size_t: no signed/unsigned comparison
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]); // presumably rounds up to a multiple
+    return miopen::solver::KernelInfo{build_params.GenerateFor(miopen::kbp::HIP{}),
+                                      localsize,
+                                      gridsize,
+                                      kernel_file,
+                                      kernel_name};
+};
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 81c15f6bea..9f79cefc6b 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -59,7 +59,8 @@ enum class Primitive
     Mha,
     Softmax,
     Adam,
-    Item
+    Item,
+    Loss
 };
 
 struct MIOPEN_INTERNALS_EXPORT Id
diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp
index 9f7430ba8a..77a9b6ddae 100644
--- a/src/include/miopen/tensor_view_utils.hpp
+++ b/src/include/miopen/tensor_view_utils.hpp
@@ -27,8 +27,8 @@
 #ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_
 #define MIOPEN_TENSOR_VIEW_UTIL_HPP_
 
-#include <miopen/common.hpp>
 #include "../../kernels/tensor_view.hpp"
+#include "miopen/tensor.hpp"
 
 namespace miopen {
 
@@ -38,10 +38,15 @@ inline tensor_view_t<N> get_inner_expanded_tv(const TensorDescriptor Desc)
     auto dims    = Desc.GetLengths();
     auto strides = Desc.GetStrides();
 
-    tensor_view_t<N> tensor_view;
+    tensor_view_t<N> tensor_view{};
     for(size_t i = 0; i < N; ++i)
     {
-        if(i < dims.size())
+        if(dims.empty())
+        {
+            tensor_view.stride[i] = 0;
+            tensor_view.size[i]   = 0;
+        }
+        else if(i < dims.size())
         {
             tensor_view.stride[i] = strides[i];
             tensor_view.size[i]   = dims[i];
diff --git a/src/kernels/MIOpenLossSum.cpp b/src/kernels/MIOpenLossSum.cpp
new file mode 100644
index 0000000000..08d3a656f6
--- /dev/null
+++ b/src/kernels/MIOpenLossSum.cpp
@@ -0,0 +1,56 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "warp_shuffle.hpp"
+
+#ifndef IN_OUT_TYPE
+#define IN_OUT_TYPE float
+#endif
+
+template <typename TIO>
+__device__ void losssum(const TIO* input, TIO* output, size_t N) // one output value per block
+{
+    size_t gid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; // widened multiply
+
+    FLOAT_ACCUM val = gid < N ? CVT_FLOAT2ACCUM(input[gid]) : static_cast<FLOAT_ACCUM>(0.0f);
+    val             = block_reduce_sum(val); // every thread calls it; out-of-range ones pass 0
+
+    if(threadIdx.x == 0) // a single thread per block stores the block's result
+        output[blockIdx.x] = CVT_ACCUM2FLOAT(val);
+}
+
+extern "C" __global__ void // C linkage: the kernel symbol is not name-mangled
+LossSum(const IN_OUT_TYPE* __restrict__ input, IN_OUT_TYPE* __restrict__ output, size_t N)
+{
+    // Kernel entry point: forwards to losssum<> for the build-time IN_OUT_TYPE (default float).
+    losssum<IN_OUT_TYPE>(input, output, N);
+}
diff --git a/src/kernels/MIOpenSigmoidFocalLoss.cpp b/src/kernels/MIOpenSigmoidFocalLoss.cpp
new file mode 100644
index 0000000000..75c25c0e42
--- /dev/null
+++ b/src/kernels/MIOpenSigmoidFocalLoss.cpp
@@ -0,0 +1,329 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "tensor_view.hpp"
+
+#ifndef IN_OUT_TYPE
+#define IN_OUT_TYPE float
+#endif
+
+#ifndef CVT_ACCUM2FLOAT
+#define CVT_ACCUM2FLOAT(x) (float_to_bfloat16(x))
+#endif
+
+#ifndef CVT_FLOAT2ACCUM
+#define CVT_FLOAT2ACCUM(x) (bfloat16_to_float(x))
+#endif
+
+template <typename TIO>
+__device__ void sigmoidFocalLossFwd(const TIO* input,
+                                    const TIO* target, // read-only, same as input
+                                    TIO* workspace,    // receives one scaled loss per thread
+                                    float alpha,
+                                    float gamma,
+                                    float divisor,
+                                    tensor_view_t<5> input_tv,
+                                    tensor_view_t<5> target_tv)
+{
+    size_t gid = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x; // widened multiply
+
+    tensor_layout_t<5> idx(input_tv, gid);
+    if(idx.layout[0] >= input_tv.size[0]) // thread lies past the outermost extent
+        return;
+
+    FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]);
+    FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
+
+    FLOAT_ACCUM p      = 1 / (1 + exp(-i));                        // sigmoid(i)
+    FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p));     // binary cross-entropy
+    FLOAT_ACCUM pT     = p * t + (1 - p) * (1 - t);                // p if t == 1, 1 - p if t == 0
+    FLOAT_ACCUM loss   = ceLoss * pow(1 - pT, gamma);              // focal modulation
+
+    if(alpha >= 0) // a negative alpha disables the alpha_t weighting
+    {
+        FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t);
+        loss                = alpha_t * loss;
+    }
+
+    workspace[gid] = CVT_ACCUM2FLOAT(loss / divisor); // flat index, not a tensor view
+}
+
+extern "C" __global__ void SigmoidFocalLossFwd(const IN_OUT_TYPE* input, // C linkage kernel
+                                               IN_OUT_TYPE* target,
+                                               IN_OUT_TYPE* workspace,
+                                               float alpha,
+                                               float gamma,
+                                               float divisor,
+                                               tensor_view_t<5> input_tv,
+                                               tensor_view_t<5> target_tv)
+{
+    sigmoidFocalLossFwd<IN_OUT_TYPE>( // IN_OUT_TYPE is a build-time macro (default float)
+        input, target, workspace, alpha, gamma, divisor, input_tv, target_tv);
+}
+
+template <typename TIO>
+__device__ void sigmoidFocalLossBwd(const TIO* input,
+                                    const TIO* target,
+                                    const TIO* doutput,
+                                    TIO* dinput,  // optional: skipped when null
+                                    TIO* dtarget, // optional: skipped when null
+                                    float alpha,
+                                    float gamma,
+                                    float divisor,
+                                    tensor_view_t<5> input_tv,
+                                    tensor_view_t<5> target_tv,
+                                    tensor_view_t<5> doutput_tv,
+                                    tensor_view_t<5> dinput_tv,
+                                    tensor_view_t<5> dtarget_tv)
+{
+    size_t gid = threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x; // widened multiply
+
+    tensor_layout_t<5> idx(input_tv, gid);
+    tensor_layout_t<5> doIdx(doutput_tv, 0); // every thread reads element 0 of doutput
+    if(idx.layout[0] >= input_tv.size[0])    // thread lies past the outermost extent
+        return;
+
+    FLOAT_ACCUM i  = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]);
+    FLOAT_ACCUM t  = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
+    FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
+
+    FLOAT_ACCUM p       = 1 / (1 + exp(-i));                    // sigmoid(i)
+    FLOAT_ACCUM ceLoss  = -(t * log(p) + (1 - t) * log(1 - p)); // binary cross-entropy
+    FLOAT_ACCUM pT      = p * t + (1 - p) * (1 - t);
+    FLOAT_ACCUM powPt   = pow(1 - pT, gamma);
+    FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+    if(dinput)
+    {
+        FLOAT_ACCUM dpdi = exp(-i) / pow(1 + exp(-i), 2);
+        // dceloss/di = dceloss/dp * dp/di
+        FLOAT_ACCUM dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+        // dpowt/di = dpowt/dpT * dpT/dp * dp/di
+        FLOAT_ACCUM dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+        // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+        FLOAT_ACCUM dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+        FLOAT_ACCUM grad = dO * dLdi;
+
+        if(alpha >= 0) // a negative alpha disables the alpha_t weighting
+        {
+            grad *= alpha_t;
+        }
+        grad /= divisor;
+        dinput[dinput_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(grad);
+    }
+
+    if(dtarget)
+    {
+        FLOAT_ACCUM dcelossdt = -log(p) + log(1 - p);
+        FLOAT_ACCUM dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+        // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+        FLOAT_ACCUM dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+        FLOAT_ACCUM gradTarget = dO * dLdt;
+
+        if(alpha >= 0)
+        {
+            // dO * (alpha_t * dL/dt + dalpha_t/dt * L); dO must scale the weighted form too
+            gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt);
+        }
+        gradTarget /= divisor;
+        dtarget[dtarget_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(gradTarget);
+    }
+}
+
+extern "C" __global__ void SigmoidFocalLossBwd(const IN_OUT_TYPE* input, // C linkage kernel
+                                               const IN_OUT_TYPE* target,  // read-only
+                                               const IN_OUT_TYPE* doutput, // read-only
+                                               IN_OUT_TYPE* dinput,
+                                               IN_OUT_TYPE* dtarget,
+                                               float alpha,
+                                               float gamma,
+                                               float divisor,
+                                               tensor_view_t<5> input_tv,
+                                               tensor_view_t<5> target_tv,
+                                               tensor_view_t<5> doutput_tv,
+                                               tensor_view_t<5> dinput_tv,
+                                               tensor_view_t<5> dtarget_tv)
+{
+    sigmoidFocalLossBwd<IN_OUT_TYPE>(input, // IN_OUT_TYPE is a build-time macro (default float)
+                                     target,
+                                     doutput,
+                                     dinput,
+                                     dtarget,
+                                     alpha,
+                                     gamma,
+                                     divisor,
+                                     input_tv,
+                                     target_tv,
+                                     doutput_tv,
+                                     dinput_tv,
+                                     dtarget_tv);
+}
+
+// Element-wise sigmoid focal loss without reduction: each thread handles one element and writes
+// ceLoss * (1 - pT)^gamma, additionally scaled by alpha_t when alpha >= 0, to `output`.
+template <typename TIO>
+__device__ void sigmoidFocalLossUnreducedFwd(const TIO* input,
+                                             const TIO* target,
+                                             TIO* output,
+                                             float alpha,
+                                             float gamma,
+                                             tensor_view_t<5> input_tv,
+                                             tensor_view_t<5> target_tv,
+                                             tensor_view_t<5> output_tv)
+{
+    size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+    tensor_layout_t<5> idx(input_tv, gid);
+    if(idx.layout[0] >= input_tv.size[0])
+        return;
+
+    FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]);
+    FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
+    // p = sigmoid(i); ceLoss = BCE(p, t); pT blends p and (1 - p) according to t
+    FLOAT_ACCUM p      = 1 / (1 + exp(-i));
+    FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p));
+    FLOAT_ACCUM pT     = p * t + (1 - p) * (1 - t);
+    FLOAT_ACCUM loss   = ceLoss * pow(1 - pT, gamma);
+
+    if(alpha >= 0)
+    {
+        FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t);
+        loss                = alpha_t * loss;
+    }
+    output[output_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(loss);
+}
+
+extern "C" __global__ void SigmoidFocalLossUnreducedFwd(const IN_OUT_TYPE* input,
+                                                        IN_OUT_TYPE* target,
+                                                        IN_OUT_TYPE* output,
+                                                        float alpha,
+                                                        float gamma,
+                                                        tensor_view_t<5> input_tv,
+                                                        tensor_view_t<5> target_tv,
+                                                        tensor_view_t<5> output_tv)
+{ // Kernel entry point: forwards every argument to the IN_OUT_TYPE instantiation below.
+    sigmoidFocalLossUnreducedFwd<IN_OUT_TYPE>(
+        input, target, output, alpha, gamma, input_tv, target_tv, output_tv);
+}
+
+// Element-wise backward pass of the unreduced sigmoid focal loss. Each thread handles one element:
+// it writes dO * dL/dinput to `dinput` and dO * dL/dtarget to `dtarget`; a null output pointer
+// skips that gradient. alpha < 0 disables the alpha_t weighting.
+template <typename TIO>
+__device__ void sigmoidFocalLossUnreducedBwd(const TIO* input,
+                                             const TIO* target,
+                                             const TIO* doutput,
+                                             TIO* dinput,
+                                             TIO* dtarget,
+                                             float alpha,
+                                             float gamma,
+                                             tensor_view_t<5> input_tv,
+                                             tensor_view_t<5> target_tv,
+                                             tensor_view_t<5> doutput_tv,
+                                             tensor_view_t<5> dinput_tv,
+                                             tensor_view_t<5> dtarget_tv)
+{
+    size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+    tensor_layout_t<5> idx(input_tv, gid);
+    if(idx.layout[0] >= input_tv.size[0])
+        return;
+
+    FLOAT_ACCUM i  = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]);
+    FLOAT_ACCUM t  = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
+    FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(idx)]);
+
+    FLOAT_ACCUM p       = 1 / (1 + exp(-i));
+    FLOAT_ACCUM ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
+    FLOAT_ACCUM pT      = p * t + (1 - p) * (1 - t);
+    FLOAT_ACCUM powPt   = pow(1 - pT, gamma);
+    FLOAT_ACCUM alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+    if(dinput)
+    {
+        FLOAT_ACCUM dpdi = exp(-i) / pow(1 + exp(-i), 2);
+        // dceloss/di = dceloss/dp * dp/di
+        FLOAT_ACCUM dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+        // dpowt/di = dpowt/dpT * dpT/dp * dp/di
+        FLOAT_ACCUM dpowptdi = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+        // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+        FLOAT_ACCUM dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+        FLOAT_ACCUM grad = dO * dLdi;
+
+        if(alpha >= 0) // alpha_t does not depend on the input, so it only scales the gradient
+            grad *= alpha_t;
+        dinput[dinput_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(grad);
+    }
+
+    if(dtarget)
+    {
+        FLOAT_ACCUM dcelossdt = -log(p) + log(1 - p);
+        FLOAT_ACCUM dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+        // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+        FLOAT_ACCUM dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+        FLOAT_ACCUM gradTarget = dO * dLdt;
+
+        if(alpha >= 0)
+        {
+            // d(alpha_t * L)/dt = alpha_t * dL/dt + dalpha_t/dt * L; dO must scale both terms
+            gradTarget = dO * (alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt);
+        }
+        dtarget[dtarget_tv.get_tensor_view_idx(idx)] = CVT_ACCUM2FLOAT(gradTarget);
+    }
+}
+
+extern "C" __global__ void SigmoidFocalLossUnreducedBwd(const IN_OUT_TYPE* input,
+                                                        const IN_OUT_TYPE* target,
+                                                        const IN_OUT_TYPE* doutput,
+                                                        IN_OUT_TYPE* dinput,
+                                                        IN_OUT_TYPE* dtarget,
+                                                        float alpha,
+                                                        float gamma,
+                                                        tensor_view_t<5> input_tv,
+                                                        tensor_view_t<5> target_tv,
+                                                        tensor_view_t<5> doutput_tv,
+                                                        tensor_view_t<5> dinput_tv,
+                                                        tensor_view_t<5> dtarget_tv)
+{ // Kernel entry point: input, target and doutput are read-only; dinput and dtarget are outputs.
+    sigmoidFocalLossUnreducedBwd<IN_OUT_TYPE>(input,
+                                              target,
+                                              doutput,
+                                              dinput,
+                                              dtarget,
+                                              alpha,
+                                              gamma,
+                                              input_tv,
+                                              target_tv,
+                                              doutput_tv,
+                                              dinput_tv,
+                                              dtarget_tv);
+}
diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp
new file mode 100644
index 0000000000..ebd5861976
--- /dev/null
+++ b/src/kernels/warp_shuffle.hpp
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+
+#ifndef REDUCE_SIZE
+#define REDUCE_SIZE 256
+#endif
+
+// Sums `val` over the lanes of the calling warp using __shfl_down. The shuffle distance starts
+// at warpSize / 2 and is halved on every step until it reaches 1.
+// NOTE(review): after the last step lane 0 presumably holds the complete sum while the other
+// lanes hold partial sums only - confirm that callers read the result from lane 0.
+__device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val)
+{
+    // Distances visited: warpSize / 2, ..., 2, 1, i.e. 32, 16, 8, 4, 2, 1 for a 64-lane warp
+    // and 16, 8, 4, 2, 1 for a 32-lane warp.
+    for(int distance = warpSize / 2; distance >= 1; distance /= 2)
+    {
+        val += __shfl_down(val, distance);
+    }
+
+    return val;
+}
+
+
+__device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val) // block-wide sum via two warp reductions
+{
+    static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize]; // per-warp partial sums
+    auto lane = threadIdx.x % warpSize;
+    auto wid  = threadIdx.x / warpSize;
+    // Step 1: reduce the values held by the lanes of each warp.
+    val = warp_reduce_sum(val);
+    // Step 2: lane 0 of every warp stores its warp's result, then all threads hit the barrier.
+    if(lane == 0)
+        shared[wid] = val;
+    __syncthreads();
+    // Step 3: the first REDUCE_SIZE / warpSize threads reload the partials; warp 0 reduces them.
+    val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0;
+    if(wid == 0)
+        val = warp_reduce_sum(val);
+    // NOTE(review): assumes blockDim.x <= REDUCE_SIZE; presumably only thread 0 has the full sum.
+    return val;
+}
diff --git a/src/sigmoid_focal_loss.cpp b/src/sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..e1123a799c
--- /dev/null
+++ b/src/sigmoid_focal_loss.cpp
@@ -0,0 +1,170 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/miopen.h>
+#include <miopen/sigmoidfocalloss/invoke_params.hpp>
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/sigmoidfocalloss/solvers.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/find_solution.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/kernel_cache.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+// Workspace size in bytes for the reducing forward solver: 0 for MIOPEN_LOSS_REDUCTION_NONE,
+// otherwise the first size reported by the solver container, or size_t(-1) if there is none.
+size_t GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle,
+                                               const TensorDescriptor& inputDesc,
+                                               const TensorDescriptor& targetDesc,
+                                               const TensorDescriptor& outputDesc,
+                                               miopenLossReductionMode_t reduction)
+{
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        return 0;
+    }
+
+    auto ctx           = ExecutionContext{&handle};
+    const auto problem = sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription{
+        inputDesc, targetDesc, outputDesc, reduction};
+    const auto solvers = solver::SolverContainer<solver::sigmoidfocalloss::SigmoidFocalLossFwd>{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+
+    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
+}
+
+// Forward pass of the sigmoid focal loss. Builds the problem description and the invoke
+// parameters, then executes the "SigmoidFocalLossUnreducedFwd" solver when reduction is
+// MIOPEN_LOSS_REDUCTION_NONE and the "SigmoidFocalLossFwd" solver for every other mode.
+// Returns miopenStatusSuccess whenever the call returns normally.
+miopenStatus_t SigmoidFocalLossForward(Handle& handle,
+                                       Data_t workspace,
+                                       size_t workspaceSizeInBytes,
+                                       const TensorDescriptor& inputDesc,
+                                       ConstData_t input,
+                                       const TensorDescriptor& targetDesc,
+                                       ConstData_t target,
+                                       const TensorDescriptor& outputDesc,
+                                       Data_t output,
+                                       float alpha,
+                                       float gamma,
+                                       miopenLossReductionMode_t reduction)
+{
+    const auto problem = sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription{
+        inputDesc, targetDesc, outputDesc, reduction};
+
+    const auto invoke_params = [&] {
+        auto params           = sigmoidfocalloss::FwdInvokeParams{};
+        params.inputDesc      = &inputDesc;
+        params.targetDesc     = &targetDesc;
+        params.outputDesc     = &outputDesc;
+        params.input          = input;
+        params.target         = target;
+        params.output         = output;
+        params.workspace      = workspace;
+        params.workspace_size = workspaceSizeInBytes;
+        params.alpha          = alpha;
+        params.gamma          = gamma;
+        params.reduction      = reduction;
+        return params;
+    }();
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // No reduction requested: run the unreduced solver and finish.
+        using UnreducedSolver = solver::sigmoidfocalloss::SigmoidFocalLossUnreducedFwd;
+        solver::SolverContainer<UnreducedSolver>{}.ExecutePrimitive(
+            handle, problem, AlgorithmName{"SigmoidFocalLossUnreducedFwd"}, invoke_params);
+        return miopenStatusSuccess;
+    }
+
+    // Every other reduction mode goes through the "SigmoidFocalLossFwd" solver.
+    using ReducedSolver = solver::sigmoidfocalloss::SigmoidFocalLossFwd;
+    solver::SolverContainer<ReducedSolver>{}.ExecutePrimitive(
+        handle, problem, AlgorithmName{"SigmoidFocalLossFwd"}, invoke_params);
+    return miopenStatusSuccess;
+}
+
+// Backward pass of the sigmoid focal loss. Builds the problem description and the invoke
+// parameters, then executes the "SigmoidFocalLossUnreducedBwd" solver when reduction is
+// MIOPEN_LOSS_REDUCTION_NONE and the "SigmoidFocalLossBwd" solver for every other mode.
+// Returns miopenStatusSuccess whenever the call returns normally.
+miopenStatus_t SigmoidFocalLossBackward(Handle& handle,
+                                        const TensorDescriptor& inputDesc,
+                                        ConstData_t input,
+                                        const TensorDescriptor& targetDesc,
+                                        ConstData_t target,
+                                        const TensorDescriptor& doutputDesc,
+                                        ConstData_t doutput,
+                                        const TensorDescriptor& dinputDesc,
+                                        Data_t dinput,
+                                        const TensorDescriptor& dtargetDesc,
+                                        Data_t dtarget,
+                                        float alpha,
+                                        float gamma,
+                                        const miopenLossReductionMode_t reduction)
+{
+    const auto problem = sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription{
+        inputDesc, targetDesc, doutputDesc, dinputDesc, dtargetDesc, reduction};
+
+    const auto invoke_params = [&] {
+        auto params        = sigmoidfocalloss::BwdInvokeParams{};
+        params.inputDesc   = &inputDesc;
+        params.targetDesc  = &targetDesc;
+        params.doutputDesc = &doutputDesc;
+        params.dinputDesc  = &dinputDesc;
+        params.dtargetDesc = &dtargetDesc;
+        params.input       = input;
+        params.target      = target;
+        params.doutput     = doutput;
+        params.dinput      = dinput;
+        params.dtarget     = dtarget;
+        params.alpha       = alpha;
+        params.gamma       = gamma;
+        params.reduction   = reduction;
+        return params;
+    }();
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        // No reduction requested: run the unreduced solver and finish.
+        using UnreducedSolver = solver::sigmoidfocalloss::SigmoidFocalLossUnreducedBwd;
+        solver::SolverContainer<UnreducedSolver>{}.ExecutePrimitive(
+            handle, problem, AlgorithmName{"SigmoidFocalLossUnreducedBwd"}, invoke_params);
+        return miopenStatusSuccess;
+    }
+
+    // Every other reduction mode goes through the "SigmoidFocalLossBwd" solver.
+    using ReducedSolver = solver::sigmoidfocalloss::SigmoidFocalLossBwd;
+    solver::SolverContainer<ReducedSolver>{}.ExecutePrimitive(
+        handle, problem, AlgorithmName{"SigmoidFocalLossBwd"}, invoke_params);
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
diff --git a/src/sigmoid_focal_loss_api.cpp b/src/sigmoid_focal_loss_api.cpp
new file mode 100644
index 0000000000..2cc511bb28
--- /dev/null
+++ b/src/sigmoid_focal_loss_api.cpp
@@ -0,0 +1,192 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/miopen.h>
+#include <miopen/sigmoid_focal_loss.hpp>
+#include <miopen/errors.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/logger.hpp>
+#include <miopen/tensor_ops.hpp>
+
+// Streams a size vector as "{a,b,c}" (comma-separated, no spaces); an empty vector prints "{}".
+inline std::ostream& operator<<(std::ostream& os, const std::vector<size_t>& v)
+{
+    os << '{';
+    for(size_t i = 0; i < v.size(); ++i) // size_t index: same type as v.size()
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    return os << '}';
+}
+
+// Logs a driver command line describing this sigmoid focal loss call (name chosen by the input
+// data type, followed by -n/-T/-Si/-St/-F options) when command logging is enabled; otherwise
+// does nothing. `is_fwd` selects "-F 1" (true) or "-F 2" (false).
+static void LogCmdSigmoidFocalLoss(const miopenTensorDescriptor_t inputDesc,
+                                   const miopenTensorDescriptor_t targetDesc,
+                                   bool is_fwd)
+{
+    if(!miopen::IsLoggingCmd())
+        return;
+
+    std::stringstream cmd;
+    const auto& input_desc = miopen::deref(inputDesc);
+    const auto dtype       = input_desc.GetType();
+    // Only half, float and bfloat16 get a command name; any other type leaves it empty.
+    if(dtype == miopenHalf)
+        cmd << "sigmoidFocalLossfp16";
+    else if(dtype == miopenFloat)
+        cmd << "sigmoidFocalLossfp32";
+    else if(dtype == miopenBFloat16)
+        cmd << "sigmoidFocalLossbfp16";
+
+    MIOPEN_LOG_FUNCTION(inputDesc, targetDesc);
+    // Options: first input length, all input lengths, input strides, target strides, direction.
+    cmd << " -n " << input_desc.GetLengths()[0];
+    cmd << " -T " << input_desc.GetLengths();
+    cmd << " -Si " << input_desc.GetStrides();
+    cmd << " -St " << miopen::deref(targetDesc).GetStrides();
+    cmd << " -F " << (is_fwd ? "1" : "2");
+
+    MIOPEN_LOG_DRIVER_CMD(cmd.str());
+}
+
+// C API: writes the forward-pass workspace size for this problem to *sizeInBytes.
+extern "C" miopenStatus_t
+miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle,
+                                              const miopenTensorDescriptor_t inputDesc,
+                                              const miopenTensorDescriptor_t targetDesc,
+                                              const miopenTensorDescriptor_t outputDesc,
+                                              miopenLossReductionMode_t reduction,
+                                              size_t* sizeInBytes)
+{
+    MIOPEN_LOG_FUNCTION(handle, inputDesc, targetDesc, outputDesc, reduction, sizeInBytes);
+
+    return miopen::try_([&] {
+        miopen::deref(sizeInBytes) =
+            miopen::GetSigmoidFocalLossForwardWorkspaceSize(miopen::deref(handle),
+                                                            miopen::deref(inputDesc),
+                                                            miopen::deref(targetDesc),
+                                                            miopen::deref(outputDesc),
+                                                            reduction);
+    });
+}
+
+extern "C" miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle,
+                                                        void* workspace,
+                                                        size_t workspaceSizeInBytes,
+                                                        const miopenTensorDescriptor_t inputDesc,
+                                                        const void* input,
+                                                        const miopenTensorDescriptor_t targetDesc,
+                                                        const void* target,
+                                                        const miopenTensorDescriptor_t outputDesc,
+                                                        void* output,
+                                                        const float alpha,
+                                                        const float gamma,
+                                                        const miopenLossReductionMode_t reduction)
+{ // C API entry point for the forward pass: log the call, then run the C++ implementation.
+    MIOPEN_LOG_FUNCTION(handle,
+                        workspace,
+                        workspaceSizeInBytes,
+                        inputDesc,
+                        input,
+                        targetDesc,
+                        target,
+                        outputDesc,
+                        output,
+                        alpha,
+                        gamma,
+                        reduction);
+    // Additionally log the matching driver command (is_fwd = true).
+    LogCmdSigmoidFocalLoss(inputDesc, targetDesc, true);
+    // NOTE(review): try_ presumably maps exceptions thrown below to a miopenStatus_t - confirm.
+    return miopen::try_([&] {
+        miopen::SigmoidFocalLossForward(miopen::deref(handle),
+                                        DataCast(workspace),
+                                        workspaceSizeInBytes,
+                                        miopen::deref(inputDesc),
+                                        DataCast(input),
+                                        miopen::deref(targetDesc),
+                                        DataCast(target),
+                                        miopen::deref(outputDesc),
+                                        DataCast(output),
+                                        alpha,
+                                        gamma,
+                                        reduction);
+    });
+}
+
+extern "C" miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle,
+                                                         miopenTensorDescriptor_t inputDesc,
+                                                         const void* input,
+                                                         miopenTensorDescriptor_t targetDesc,
+                                                         const void* target,
+                                                         miopenTensorDescriptor_t doutputDesc,
+                                                         const void* doutput,
+                                                         miopenTensorDescriptor_t dinputDesc,
+                                                         void* dinput,
+                                                         miopenTensorDescriptor_t dtargetDesc,
+                                                         void* dtarget,
+                                                         float alpha,
+                                                         float gamma,
+                                                         const miopenLossReductionMode_t reduction)
+{ // C API entry point for the backward pass: log the call, then run the C++ implementation.
+    MIOPEN_LOG_FUNCTION(handle,
+                        inputDesc,
+                        input,
+                        targetDesc,
+                        target,
+                        doutputDesc,
+                        doutput,
+                        dinputDesc,
+                        dinput,
+                        dtargetDesc,
+                        dtarget,
+                        alpha,
+                        gamma,
+                        reduction);
+    // Additionally log the matching driver command (is_fwd = false).
+    LogCmdSigmoidFocalLoss(inputDesc, targetDesc, false);
+    // NOTE(review): try_ presumably maps exceptions thrown below to a miopenStatus_t - confirm.
+    return miopen::try_([&] {
+        miopen::SigmoidFocalLossBackward(miopen::deref(handle),
+                                         miopen::deref(inputDesc),
+                                         DataCast(input),
+                                         miopen::deref(targetDesc),
+                                         DataCast(target),
+                                         miopen::deref(doutputDesc),
+                                         DataCast(doutput),
+                                         miopen::deref(dinputDesc),
+                                         DataCast(dinput),
+                                         miopen::deref(dtargetDesc),
+                                         DataCast(dtarget),
+                                         alpha,
+                                         gamma,
+                                         reduction);
+    });
+}
diff --git a/src/sigmoidfocalloss/problem_description.cpp b/src/sigmoidfocalloss/problem_description.cpp
new file mode 100644
index 0000000000..825df9286e
--- /dev/null
+++ b/src/sigmoidfocalloss/problem_description.cpp
@@ -0,0 +1,88 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/names.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace sigmoidfocalloss {
+
+// True when both descriptors have the same number of dimensions and equal lengths in each one.
+bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y)
+{
+    auto num_dims = x.GetNumDims(); // the index below reuses this type: no signed/unsigned mix
+    if(num_dims != y.GetNumDims())
+        return false;
+    for(decltype(num_dims) i = 0; i < num_dims; ++i)
+        if(x.GetLengths()[i] != y.GetLengths()[i])
+            return false;
+    return true;
+}
+
+// Serializes the fields that identify a backward problem (reduction mode, input and target
+// data types, number of input dimensions and input element count) into the string wrapped by
+// the returned NetworkConfig.
+NetworkConfig SigmoidFocalLossBwdProblemDescription::MakeNetworkConfig() const
+{
+    std::ostringstream key;
+
+    // Each value is written right after its textual label; no other separators are inserted
+    // between the fields.
+    key << "sfl_bwd";
+    key << "reduction" << reduction;
+    key << "i_dtype" << inputDesc.GetType();
+    key << "t_dtype" << targetDesc.GetType();
+    key << "dim_num" << inputDesc.GetNumDims();
+    key << "size" << inputDesc.GetElementSize();
+
+    return NetworkConfig{key.str()};
+}
+
+// Serializes the fields that identify a forward problem (reduction mode, input and target
+// data types, number of input dimensions and input element count) into the string wrapped by
+// the returned NetworkConfig.
+NetworkConfig SigmoidFocalLossFwdProblemDescription::MakeNetworkConfig() const
+{
+    std::ostringstream key;
+
+    // Each value is written right after its textual label; no other separators are inserted
+    // between the fields.
+    key << "sfl_fwd";
+    key << "reduction" << reduction;
+    key << "i_dtype" << inputDesc.GetType();
+    key << "t_dtype" << targetDesc.GetType();
+    key << "dim_num" << inputDesc.GetNumDims();
+    key << "size" << inputDesc.GetElementSize();
+
+    return NetworkConfig{key.str()};
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace miopen
diff --git a/src/solver.cpp b/src/solver.cpp
index 6b451ca498..91def5d6eb 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -37,6 +37,7 @@
 #include <miopen/pooling/solvers.hpp>
 #include <miopen/reduce/solvers.hpp>
 #include <miopen/mha/solvers.hpp>
+#include <miopen/sigmoidfocalloss/solvers.hpp>
 #include <miopen/softmax/solvers.hpp>
 
 #include <miopen/conv_algo_name.hpp>
@@ -673,6 +674,17 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(),
              miopenConvolutionAlgoWinograd);
 
+    Register(registry,
+             ++id,
+             Primitive::Loss,
+             sigmoidfocalloss::SigmoidFocalLossUnreducedFwd{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Loss,
+             sigmoidfocalloss::SigmoidFocalLossUnreducedBwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossFwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossBwd{}.SolverDbId());
+
     // IMPORTANT: New solvers should be added to the end of the function!
 }
 
diff --git a/src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..4e5046da49
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/backward_reduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/sigmoidfocalloss/invoke_params.hpp>
+#include <miopen/sigmoidfocalloss/solvers.hpp>
+#include <miopen/sigmoid_focal_loss.hpp>
+#include <miopen/target_properties.hpp>
+#include <miopen/sigmoidfocalloss/utils.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE 256 // first make_hip_kernel argument and value of the LOCAL_SIZE build parameter
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Applicable whenever the input tensor has at most five dimensions; the
+// execution context is not consulted.
+bool SigmoidFocalLossBwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    return !(problem.GetInputDesc().GetNumDims() > 5);
+}
+
+// Builds the solution for the reduced backward pass: registers the
+// "SigmoidFocalLossBwd" kernel build and installs the invoker that launches it.
+ConvSolution SigmoidFocalLossBwd::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype     = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype        = problem.GetDinputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+    // A "bfloat16" type name is passed to the kernel build as "ushort".
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE is derived from the target tensor, not from the input.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE},
+                                                         {problem.GetInputDesc().GetElementSize()},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossBwd",
+                                                         build_params));
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::sigmoidfocalloss::BwdInvokeParams>();
+            auto input_tv         = get_inner_expanded_tv<5>(deref(params.inputDesc));
+            auto target_tv        = get_inner_expanded_tv<5>(deref(params.targetDesc));
+            auto doutput_tv       = get_inner_expanded_tv<5>(deref(params.doutputDesc));
+            auto dinput_tv        = get_inner_expanded_tv<5>(deref(params.dinputDesc));
+            auto dtarget_tv       = get_inner_expanded_tv<5>(deref(params.dtargetDesc));
+            // Divisor handed to the kernel: input element count for mean reduction, else 1.
+            float divisor = 1;
+            if(params.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+                divisor = deref(params.inputDesc).GetElementSize();
+
+            kernel(params.input,
+                   params.target,
+                   params.doutput,
+                   params.dinput,
+                   params.dtarget,
+                   params.alpha,
+                   params.gamma,
+                   divisor,
+                   input_tv,
+                   target_tv,
+                   doutput_tv,
+                   dinput_tv,
+                   dtarget_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..8d34198d73
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/backward_unreduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,113 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/sigmoidfocalloss/invoke_params.hpp>
+#include <miopen/sigmoidfocalloss/solvers.hpp>
+#include <miopen/sigmoid_focal_loss.hpp>
+#include <miopen/target_properties.hpp>
+#include <miopen/sigmoidfocalloss/utils.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE 256 // first make_hip_kernel argument and value of the LOCAL_SIZE build parameter
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Rejects problems whose input tensor has more than five dimensions and
+// accepts everything else; the execution context is ignored.
+bool SigmoidFocalLossUnreducedBwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    return problem.GetInputDesc().GetNumDims() <= 5;
+}
+
+// Builds the solution for the unreduced backward pass: registers the
+// "SigmoidFocalLossUnreducedBwd" kernel build and installs its invoker.
+ConvSolution SigmoidFocalLossUnreducedBwd::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossBwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype     = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype        = problem.GetDinputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+    // A "bfloat16" type name is passed to the kernel build as "ushort".
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE is derived from the target tensor, not from the input.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE},
+                                                         {problem.GetInputDesc().GetElementSize()},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossUnreducedBwd",
+                                                         build_params));
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::sigmoidfocalloss::BwdInvokeParams>();
+            auto input_tv         = get_inner_expanded_tv<5>(deref(params.inputDesc));
+            auto target_tv        = get_inner_expanded_tv<5>(deref(params.targetDesc));
+            auto doutput_tv       = get_inner_expanded_tv<5>(deref(params.doutputDesc));
+            auto dinput_tv        = get_inner_expanded_tv<5>(deref(params.dinputDesc));
+            auto dtarget_tv       = get_inner_expanded_tv<5>(deref(params.dtargetDesc));
+            kernel(params.input,
+                   params.target,
+                   params.doutput,
+                   params.dinput,
+                   params.dtarget,
+                   params.alpha,
+                   params.gamma,
+                   input_tv,
+                   target_tv,
+                   doutput_tv,
+                   dinput_tv,
+                   dtarget_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..f7daa8b84c
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,186 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/miopen.h>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/sigmoidfocalloss/invoke_params.hpp>
+#include <miopen/sigmoidfocalloss/solvers.hpp>
+#include <miopen/sigmoid_focal_loss.hpp>
+#include <miopen/target_properties.hpp>
+#include <miopen/sigmoidfocalloss/utils.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE 256 // first make_hip_kernel argument of the loss kernel and LOCAL_SIZE build value
+#define LOCAL_SIZE_REDUCE_FWD 256 // first make_hip_kernel argument and per-pass divisor of LossSum
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Applicable when the input tensor rank does not exceed five.
+bool SigmoidFocalLossFwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    const auto rank = problem.GetInputDesc().GetNumDims();
+    return rank <= 5;
+}
+
+// Builds the solution for the reduced forward pass: one "SigmoidFocalLossFwd"
+// launch that fills the workspace, followed by a chain of "LossSum" launches.
+ConvSolution SigmoidFocalLossFwd::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype     = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype        = problem.GetOutputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+    auto size         = problem.GetInputDesc().GetElementSize();
+    // A "bfloat16" type name is passed to the kernel build as "ushort".
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE is derived from the target tensor, not from the input.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    /* Prepare params for loss kernel */
+    result.construction_params.push_back(make_hip_kernel(
+        {LOCAL_SIZE}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params));
+
+    /* Prepare params for reduce kernels: one per pass, until a pass leaves a single value */
+    auto remaining = size;
+    do
+    {
+        result.construction_params.push_back(make_hip_kernel(
+            {LOCAL_SIZE_REDUCE_FWD}, {remaining}, "MIOpenLossSum.cpp", "LossSum", build_params));
+        remaining = AlignUp(remaining, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
+    } while(remaining > 1);
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) params = raw_params.CastTo<miopen::sigmoidfocalloss::FwdInvokeParams>();
+            auto size             = deref(params.inputDesc).GetElementSize();
+
+            auto elapsed = 0.f;
+            HipEventPtr start;
+            HipEventPtr stop;
+            // With profiling on, the whole launch sequence is timed by one pair of HIP events.
+            bool resetProfilingState = false;
+            if(handle_.IsProfilingEnabled())
+            {
+                resetProfilingState = true;
+                handle_.EnableProfiling(false);
+                start = miopen::make_hip_event();
+                stop  = miopen::make_hip_event();
+                hipEventRecord(start.get(), handle_.GetStream());
+            }
+
+            /* Execute loss kernel; divisor is the element count for mean reduction, else 1 */
+            {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                auto input_tv         = get_inner_expanded_tv<5>(deref(params.inputDesc));
+                auto target_tv        = get_inner_expanded_tv<5>(deref(params.targetDesc));
+                float divisor         = 1;
+                if(params.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+                    divisor = size;
+
+                kernel(params.input,
+                       params.target,
+                       params.workspace,
+                       params.alpha,
+                       params.gamma,
+                       divisor,
+                       input_tv,
+                       target_tv);
+            }
+            /* Execute reduce kernels: intermediate passes alternate between the two workspace
+             * regions, the last pass writes params.output. reduceOut starts after `size` values */
+            auto reduceIn = params.workspace;
+            auto reduceOut =
+                static_cast<Data_t>(static_cast<char*>(params.workspace) +
+                                    deref(params.inputDesc).GetElementSize() *
+                                        get_data_size(deref(params.outputDesc).GetType()));
+            for(std::size_t i = 1; i < kernels.size(); ++i)
+            {
+                decltype(auto) kernel = handle_.Run(kernels[i]);
+                if(i + 1 != kernels.size())
+                {
+                    kernel(reduceIn, reduceOut, size);
+                    std::swap(reduceIn, reduceOut);
+                }
+                else
+                {
+                    kernel(reduceIn, params.output, size);
+                }
+                size = AlignUp(size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
+            }
+
+            if(resetProfilingState)
+            {
+                handle_.EnableProfiling(true);
+            }
+
+            if(handle_.IsProfilingEnabled())
+            {
+                hipEventRecord(stop.get(), handle_.GetStream());
+                hipEventSynchronize(stop.get());
+                hipEventElapsedTime(&elapsed, start.get(), stop.get());
+                handle_.ResetKernelTime();
+                handle_.AccumKernelTime(elapsed);
+            }
+        };
+    };
+
+    return result;
+}
+
+// Workspace bytes: one output-typed slot per input element plus one per
+// LOCAL_SIZE_REDUCE_FWD-sized group of input elements (rounded up).
+std::size_t SigmoidFocalLossFwd::GetWorkspaceSize(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    const size_t elem_count  = problem.GetInputDesc().GetElementSize();
+    const size_t group_count = (elem_count + LOCAL_SIZE_REDUCE_FWD - 1) / LOCAL_SIZE_REDUCE_FWD;
+    const size_t elem_bytes  = get_data_size(problem.GetOutputDesc().GetType());
+    return (elem_count + group_count) * elem_bytes;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..91e8b48e49
--- /dev/null
+++ b/src/solver/sigmoidfocalloss/forward_unreduce_sigmoid_focal_loss.cpp
@@ -0,0 +1,107 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/sigmoidfocalloss/problem_description.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/sigmoidfocalloss/invoke_params.hpp>
+#include <miopen/sigmoidfocalloss/solvers.hpp>
+#include <miopen/sigmoid_focal_loss.hpp>
+#include <miopen/target_properties.hpp>
+#include <miopen/sigmoidfocalloss/utils.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE 256 // first make_hip_kernel argument and value of the LOCAL_SIZE build parameter
+
+namespace miopen {
+
+namespace solver {
+
+namespace sigmoidfocalloss {
+
+// Returns false for inputs with more than five dimensions and true for all
+// others; the execution context plays no part in the decision.
+bool SigmoidFocalLossUnreducedFwd::IsApplicable(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    return (problem.GetInputDesc().GetNumDims() > 5) ? false : true;
+}
+
+// Builds the solution for the unreduced forward pass: registers the
+// "SigmoidFocalLossUnreducedFwd" kernel build and installs its invoker.
+ConvSolution SigmoidFocalLossUnreducedFwd::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype     = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype        = problem.GetOutputDesc().GetType();
+    auto target_dtype = miopen::GetDataType(problem.GetTargetDesc().GetType());
+    // A "bfloat16" type name is passed to the kernel build as "ushort".
+    const auto build_params = KernelBuildParameters{
+        {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+        {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+        {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+        {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        // TARGET_TYPE is derived from the target tensor, not from the input.
+        {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : target_dtype},
+        {"LOCAL_SIZE", LOCAL_SIZE},
+    };
+
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE},
+                                                         {problem.GetInputDesc().GetElementSize()},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossUnreducedFwd",
+                                                         build_params));
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::sigmoidfocalloss::FwdInvokeParams>();
+            auto input_tv         = get_inner_expanded_tv<5>(deref(params.inputDesc));
+            auto target_tv        = get_inner_expanded_tv<5>(deref(params.targetDesc));
+            auto output_tv        = get_inner_expanded_tv<5>(deref(params.outputDesc));
+            kernel(params.input,
+                   params.target,
+                   params.output,
+                   params.alpha,
+                   params.gamma,
+                   input_tv,
+                   target_tv,
+                   output_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace sigmoidfocalloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/test/cpu_sigmoid_focal_loss.hpp b/test/cpu_sigmoid_focal_loss.hpp
new file mode 100644
index 0000000000..3b13b955e3
--- /dev/null
+++ b/test/cpu_sigmoid_focal_loss.hpp
@@ -0,0 +1,238 @@
+#pragma once
+
+#include "tensor_holder.hpp"
+#include "tensor_view.hpp"
+#include <miopen/tensor_view_utils.hpp>
+#include <cmath>
+
+// Host reference for the element-wise (unreduced) forward pass. For every
+// input element x with target t it stores
+//   bce(p, t) * (1 - p_t)^gamma,  p = sigmoid(x),  p_t = p*t + (1-p)*(1-t)
+// multiplied by alpha*t + (1-alpha)*(1-t) when alpha >= 0.
+template <class TIO>
+void cpu_sigmoid_focal_loss_unreduced_forward(tensor<TIO> input,
+                                              tensor<TIO> target,
+                                              tensor<TIO>& outputHost,
+                                              float alpha = 0.25,
+                                              float gamma = 2)
+{
+    auto in_view        = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto tgt_view       = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto out_view       = miopen::get_inner_expanded_tv<5>(outputHost.desc);
+    const size_t count  = input.desc.GetElementSize();
+    const bool weighted = alpha >= 0;
+
+    for(size_t n = 0; n < count; ++n)
+    {
+        tensor_layout_t<5> pos(in_view, n);
+        const float x = static_cast<float>(input[in_view.get_tensor_view_idx(pos)]);
+        const float y = static_cast<float>(target[tgt_view.get_tensor_view_idx(pos)]);
+
+        const float prob    = 1 / (1 + std::exp(-x));
+        const float bce     = -(y * std::log(prob) + (1 - y) * std::log(1 - prob));
+        const float prob_t  = prob * y + (1 - prob) * (1 - y);
+        const float focal   = bce * std::pow(1 - prob_t, gamma);
+        const float alpha_t = alpha * y + (1 - alpha) * (1 - y);
+        const float result  = weighted ? alpha_t * focal : focal;
+
+        outputHost[out_view.get_tensor_view_idx(pos)] = static_cast<TIO>(result);
+    }
+}
+
+// Host reference for the unreduced backward pass. For each input element it
+// writes the gradient w.r.t. the input into dinput and the gradient w.r.t. the
+// target into dtarget; a gradient tensor whose data is empty is skipped.
+// NOTE(review): when alpha >= 0 the dtarget value is computed without the
+// doutput factor - confirm this matches the device kernel and is intended.
+template <class TIO>
+void cpu_sigmoid_focal_loss_unreduced_backward(tensor<TIO> input,
+                                               tensor<TIO> target,
+                                               tensor<TIO> doutput,
+                                               tensor<TIO>& dinput,
+                                               tensor<TIO>& dtarget,
+                                               float alpha = 0.25,
+                                               float gamma = 2)
+{
+    auto in_view        = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto tgt_view       = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto dout_view      = miopen::get_inner_expanded_tv<5>(doutput.desc);
+    auto din_view       = miopen::get_inner_expanded_tv<5>(dinput.desc);
+    auto dtgt_view      = miopen::get_inner_expanded_tv<5>(dtarget.desc);
+    const size_t count  = input.desc.GetElementSize();
+    const bool weighted = alpha >= 0;
+
+    for(size_t n = 0; n < count; ++n)
+    {
+        tensor_layout_t<5> pos(in_view, n);
+
+        const float x    = static_cast<float>(input[in_view.get_tensor_view_idx(pos)]);
+        const float y    = static_cast<float>(target[tgt_view.get_tensor_view_idx(pos)]);
+        const float gout = static_cast<float>(doutput[dout_view.get_tensor_view_idx(pos)]);
+
+        // Terms shared by both gradients: sigmoid, cross entropy, p_t, focal factor, alpha weight.
+        const float prob    = 1 / (1 + std::exp(-x));
+        const float bce     = -(y * std::log(prob) + (1 - y) * std::log(1 - prob));
+        const float prob_t  = prob * y + (1 - prob) * (1 - y);
+        const float focal   = std::pow(1 - prob_t, gamma);
+        const float alpha_t = alpha * y + (1 - alpha) * (1 - y);
+
+        if(dinput.data.size() > 0)
+        {
+            const float dp_dx   = std::exp(-x) / std::pow(1 + std::exp(-x), 2);
+            const float dce_dx  = (-y / prob + (1 - y) / (1 - prob)) * dp_dx;
+            const float dpow_dx = gamma * std::pow(1 - prob_t, gamma - 1) * (1 - 2 * y) * dp_dx;
+            // Product rule on L = bce * focal with respect to the input.
+            const float dloss_dx = dce_dx * focal + bce * dpow_dx;
+            float grad_in        = gout * dloss_dx;
+            if(weighted)
+                grad_in *= alpha_t;
+            dinput[din_view.get_tensor_view_idx(pos)] = static_cast<TIO>(grad_in);
+        }
+
+        if(dtarget.data.size() > 0)
+        {
+            const float dce_dy  = -std::log(prob) + std::log(1 - prob);
+            const float dpow_dy = gamma * std::pow(1 - prob_t, gamma - 1) * (1 - 2 * prob);
+            // Product rule on L = bce * focal with respect to the target.
+            const float dloss_dy = dce_dy * focal + bce * dpow_dy;
+            // Weighted case: alpha_t * dL/dy + d(alpha_t)/dy * L, replacing the gout product.
+            float grad_tgt = gout * dloss_dy;
+            if(weighted)
+                grad_tgt = alpha_t * dloss_dy + (2 * alpha - 1) * bce * focal;
+            dtarget[dtgt_view.get_tensor_view_idx(pos)] = static_cast<TIO>(grad_tgt);
+        }
+    }
+}
+
+// Host reference for the reduced forward pass. Stage 1 stores loss / divisor
+// for every element n in workspace[n]. Stage 2 sums blocks of 256 values,
+// alternating between workspace offsets 0 and N (N = input element count),
+// and the pass that fits in a single block writes its sum to outputHost[0].
+template <class TIO>
+void cpu_sigmoid_focal_loss_forward(tensor<TIO> input,
+                                    tensor<TIO> target,
+                                    tensor<TIO>& workspace,
+                                    tensor<TIO>& outputHost,
+                                    float alpha   = 0.25,
+                                    float gamma   = 2,
+                                    float divisor = 1)
+{
+    auto in_view        = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto tgt_view       = miopen::get_inner_expanded_tv<5>(target.desc);
+    const size_t count  = input.desc.GetElementSize();
+    const bool weighted = alpha >= 0;
+
+    for(size_t n = 0; n < count; ++n)
+    {
+        tensor_layout_t<5> pos(in_view, n);
+        const float x = static_cast<float>(input[in_view.get_tensor_view_idx(pos)]);
+        const float y = static_cast<float>(target[tgt_view.get_tensor_view_idx(pos)]);
+
+        const float prob   = 1 / (1 + std::exp(-x));
+        const float bce    = -(y * std::log(prob) + (1 - y) * std::log(1 - prob));
+        const float prob_t = prob * y + (1 - prob) * (1 - y);
+        float loss         = bce * std::pow(1 - prob_t, gamma);
+        if(weighted)
+        {
+            const float alpha_t = alpha * y + (1 - alpha) * (1 - y);
+            loss                = alpha_t * loss;
+        }
+        workspace[n] = static_cast<TIO>(loss / divisor);
+    }
+
+    // Block-wise pairwise summation carried out in TIO; slots past the end are zero filled.
+    const int block  = 256;
+    int src_base     = 0;
+    int dst_base     = count;
+    size_t remaining = count;
+    do
+    {
+        for(int first = 0; first < remaining; first += block)
+        {
+            TIO partial[block];
+            for(int k = 0; k < block; ++k)
+                partial[k] = first + k < remaining ? workspace[src_base + first + k] : 0.0f;
+            for(int stride = block / 2; stride > 0; stride >>= 1)
+                for(int k = 0; k < stride; ++k)
+                    partial[k] += partial[k + stride];
+            // A pass with at most one block left produces the final value.
+            if(remaining <= block)
+                outputHost[0] = partial[0];
+            else
+                workspace[dst_base + first / block] = partial[0];
+        }
+        std::swap(src_base, dst_base);
+        remaining = (remaining + block - 1) / block;
+    } while(remaining > 1);
+}
+
+// Host reference for the reduced backward pass. doutput is read only at the
+// position of flat index 0 and every gradient is divided by divisor before
+// being stored; a gradient tensor whose data is empty is skipped.
+// NOTE(review): when alpha >= 0 the dtarget value is computed without the
+// doutput factor - confirm this matches the device kernel and is intended.
+template <class TIO>
+void cpu_sigmoid_focal_loss_backward(tensor<TIO> input,
+                                     tensor<TIO> target,
+                                     tensor<TIO> doutput,
+                                     tensor<TIO>& dinput,
+                                     tensor<TIO>& dtarget,
+                                     float alpha   = 0.25,
+                                     float gamma   = 2,
+                                     float divisor = 1)
+{
+    auto in_view        = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto tgt_view       = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto dout_view      = miopen::get_inner_expanded_tv<5>(doutput.desc);
+    auto din_view       = miopen::get_inner_expanded_tv<5>(dinput.desc);
+    auto dtgt_view      = miopen::get_inner_expanded_tv<5>(dtarget.desc);
+    const size_t count  = input.desc.GetElementSize();
+    const bool weighted = alpha >= 0;
+
+    // Layout for flat index 0 of the input view; used to address doutput on every iteration.
+    tensor_layout_t<5> first(in_view, 0);
+
+    for(size_t n = 0; n < count; ++n)
+    {
+        tensor_layout_t<5> pos(in_view, n);
+
+        const float x    = static_cast<float>(input[in_view.get_tensor_view_idx(pos)]);
+        const float y    = static_cast<float>(target[tgt_view.get_tensor_view_idx(pos)]);
+        const float gout = static_cast<float>(doutput[dout_view.get_tensor_view_idx(first)]);
+
+        // Terms shared by both gradients: sigmoid, cross entropy, p_t, focal factor, alpha weight.
+        const float prob    = 1 / (1 + std::exp(-x));
+        const float bce     = -(y * std::log(prob) + (1 - y) * std::log(1 - prob));
+        const float prob_t  = prob * y + (1 - prob) * (1 - y);
+        const float focal   = std::pow(1 - prob_t, gamma);
+        const float alpha_t = alpha * y + (1 - alpha) * (1 - y);
+
+        if(dinput.data.size() > 0)
+        {
+            const float dp_dx   = std::exp(-x) / std::pow(1 + std::exp(-x), 2);
+            const float dce_dx  = (-y / prob + (1 - y) / (1 - prob)) * dp_dx;
+            const float dpow_dx = gamma * std::pow(1 - prob_t, gamma - 1) * (1 - 2 * y) * dp_dx;
+            // Product rule on L = bce * focal with respect to the input.
+            const float dloss_dx = dce_dx * focal + bce * dpow_dx;
+            float grad_in        = gout * dloss_dx;
+            if(weighted)
+                grad_in *= alpha_t;
+            grad_in /= divisor;
+            dinput[din_view.get_tensor_view_idx(pos)] = static_cast<TIO>(grad_in);
+        }
+
+        if(dtarget.data.size() > 0)
+        {
+            const float dce_dy  = -std::log(prob) + std::log(1 - prob);
+            const float dpow_dy = gamma * std::pow(1 - prob_t, gamma - 1) * (1 - 2 * prob);
+            // Product rule on L = bce * focal with respect to the target.
+            const float dloss_dy = dce_dy * focal + bce * dpow_dy;
+            // Weighted case: alpha_t * dL/dy + d(alpha_t)/dy * L, replacing the gout product.
+            float grad_tgt = gout * dloss_dy;
+            if(weighted)
+                grad_tgt = alpha_t * dloss_dy + (2 * alpha - 1) * bce * focal;
+            grad_tgt /= divisor;
+            dtarget[dtgt_view.get_tensor_view_idx(pos)] = static_cast<TIO>(grad_tgt);
+        }
+    }
+}
diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp
new file mode 100644
index 0000000000..f2f6ec5d17
--- /dev/null
+++ b/test/gtest/sigmoid_focal_loss.cpp
@@ -0,0 +1,325 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "sigmoid_focal_loss.hpp"
+#include "miopen/bfloat16.hpp"
+#include "tensor_holder.hpp"
+#include <miopen/env.hpp>
+
+#define TEST_FWD_REDUCED
+#define TEST_BWD_REDUCED
+#define TEST_FWD_UNREDUCED
+#define TEST_BWD_UNREDUCED
+
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace sigmoidfocalloss {
+
+// Reads the MIOPEN_TEST_FLOAT_ARG environment value through env::value.
+// An empty value is returned as an empty string.
+std::string GetFloatArg()
+{
+    const auto& floatArg = env::value(MIOPEN_TEST_FLOAT_ARG);
+    const bool isBlank   = floatArg.empty();
+    // Both alternatives are spelled as std::string so the conditional has one type.
+    return isBlank ? std::string("") : std::string(floatArg);
+}
+
+struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest<float>
+{
+};
+
+struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest<half>
+{
+};
+
+struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest<bfloat16>
+{
+};
+
+struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest<float>
+{
+};
+
+struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest<half>
+{
+};
+
+struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest<bfloat16>
+{
+};
+
+struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest<float>
+{
+};
+
+struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest<half>
+{
+};
+
+struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
+{
+};
+
+struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest<float>
+{
+};
+
+struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest<half>
+{
+};
+
+struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
+{
+};
+}; // namespace sigmoidfocalloss
+
+using namespace sigmoidfocalloss;
+
+#ifdef TEST_FWD_REDUCED
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--float".
+TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--half".
+TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--bfloat16".
+TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
+
+#ifdef TEST_BWD_REDUCED
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--float".
+TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--half".
+TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--bfloat16".
+TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
+
+#ifdef TEST_FWD_UNREDUCED
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--float".
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--half".
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--bfloat16".
+TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
+
+#ifdef TEST_BWD_UNREDUCED
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--float".
+TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestFloat32,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--half".
+TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+
+// Skips unless !MIOPEN_TEST_ALL holds or MIOPEN_TEST_ALL is enabled with float arg "--bfloat16".
+TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest)
+{
+    const bool shouldRun =
+        !MIOPEN_TEST_ALL ||
+        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16");
+    if(!shouldRun)
+    {
+        GTEST_SKIP();
+    }
+    RunTest();
+    Verify();
+}
+
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestBFloat16,
+                         testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
new file mode 100644
index 0000000000..7443b7a94a
--- /dev/null
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -0,0 +1,489 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "cpu_sigmoid_focal_loss.hpp"
+#include "get_handle.hpp"
+#include "miopen/allocator.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/sigmoid_focal_loss.hpp>
+
+// Parameters of one SigmoidFocalLoss test case: tensor shape, layout and loss settings.
+struct SigmoidFocalLossTestCase
+{
+    std::vector<size_t> dims;
+    bool isContiguous                   = true;
+    float alpha                         = 0.25;
+    float gamma                         = 2;
+    miopenLossReductionMode_t reduction = MIOPEN_LOSS_REDUCTION_NONE;
+    friend std::ostream& operator<<(std::ostream& os, const SigmoidFocalLossTestCase& tc)
+    {
+        os << "dims: ";
+        for(auto dim : tc.dims)
+            os << dim << " ";
+        return os << "is_contiguous: " << tc.isContiguous << " alpha: " << tc.alpha
+                  << " gamma: " << tc.gamma;
+    }
+
+    std::vector<size_t> GetDims() const { return dims; }
+    // Members carry default initializers, so a default-constructed case has no indeterminate values.
+    SigmoidFocalLossTestCase() = default;
+
+    SigmoidFocalLossTestCase(std::vector<size_t> dim_,
+                             bool isContiguous_                   = true,
+                             miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE,
+                             float alpha_                         = 0.25,
+                             float gamma_                         = 2)
+        : dims(dim_),
+          isContiguous(isContiguous_),
+          alpha(alpha_),
+          gamma(gamma_),
+          reduction(reduction_)
+    {
+    }
+    // Packed strides for inputDim (empty in, empty out); if !isContiguous, ends are swapped.
+    std::vector<size_t> ComputeStrides(std::vector<size_t> inputDim) const
+    {
+        if(inputDim.empty())
+            return {};
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size(), 1);
+        for(size_t i = inputDim.size() - 1; i > 0; --i)
+            strides[i - 1] = strides[i] * inputDim[i];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
+// Shapes shared by all suites: ranks 1 to 5, each rank above 1 in both stride layouts.
+inline std::vector<SigmoidFocalLossTestCase> SigmoidFocalLossTestConfigs()
+{
+    return {
+        SigmoidFocalLossTestCase({4000}),                   // 1D cont
+        SigmoidFocalLossTestCase({100, 500}),               // 2D cont
+        SigmoidFocalLossTestCase({100, 500}, false),        // 2D non-cont
+        SigmoidFocalLossTestCase({10, 20, 200}),            // 3D cont
+        SigmoidFocalLossTestCase({10, 20, 200}, false),     // 3D non-cont
+        SigmoidFocalLossTestCase({8, 3, 20, 100}),          // 4D cont
+        SigmoidFocalLossTestCase({8, 3, 20, 100}, false),   // 4D non-cont
+        SigmoidFocalLossTestCase({2, 2, 3, 4, 100}),        // 5D cont
+        SigmoidFocalLossTestCase({2, 2, 3, 4, 100}, false), // 5D non-cont
+    };
+}
+
+// Unreduced forward fixture: runs the GPU op and the CPU reference on identical tensors.
+template <typename TIO>
+struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<SigmoidFocalLossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        auto in_dims    = config.GetDims();
+        auto in_strides = config.ComputeStrides(in_dims);
+
+        auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        input             = tensor<TIO>{in_dims, in_strides}.generate(in_gen_value);
+
+        auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        target             = tensor<TIO>{in_dims, in_strides}.generate(tar_gen_value);
+        // The GPU result buffer and the CPU reference buffer both start zero-filled.
+        output = tensor<TIO>{in_dims};
+        std::fill(output.begin(), output.end(), 0);
+
+        outputHost = tensor<TIO>{in_dims};
+        std::fill(outputHost.begin(), outputHost.end(), 0);
+
+        input_dev  = handle.Write(input.data);
+        target_dev = handle.Write(target.data);
+        output_dev = handle.Write(output.data);
+    }
+    // Runs the GPU forward (no workspace) and the CPU reference with the same alpha and gamma.
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+        status = miopen::SigmoidFocalLossForward(handle,
+                                                 nullptr,
+                                                 0,
+                                                 input.desc,
+                                                 input_dev.get(),
+                                                 target.desc,
+                                                 target_dev.get(),
+                                                 output.desc,
+                                                 output_dev.get(),
+                                                 config.alpha,
+                                                 config.gamma,
+                                                 config.reduction);
+        cpu_sigmoid_focal_loss_unreduced_forward<TIO>(
+            input, target, outputHost, config.alpha, config.gamma);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+        output.data = handle.Read<TIO>(output_dev, output.data.size());
+    }
+    // Compares GPU and CPU results: rms_range must stay below 10 * epsilon of TIO.
+    void Verify()
+    {
+        double threshold = std::numeric_limits<TIO>::epsilon();
+
+        auto error = miopen::rms_range(outputHost, output);
+
+        EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output));
+        EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error
+                                            << ",  Thresholdx10: " << threshold * 10;
+    }
+    SigmoidFocalLossTestCase config;
+
+    tensor<TIO> input;
+    tensor<TIO> target;
+    tensor<TIO> output;
+    tensor<TIO> outputHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+};
+
+// Unreduced backward fixture: GPU dInput/dTarget are compared with the CPU reference.
+template <typename TIO>
+struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<SigmoidFocalLossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        auto in_dims      = config.GetDims();
+        auto in_strides   = config.ComputeStrides(in_dims);
+        auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        input             = tensor<TIO>{in_dims, in_strides}.generate(in_gen_value);
+
+        auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        target             = tensor<TIO>{in_dims, in_strides}.generate(tar_gen_value);
+
+        auto dOut_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        dOutput             = tensor<TIO>{in_dims, in_strides}.generate(dOut_gen_value);
+        // Gradient buffers (GPU results and CPU references) all start zero-filled.
+        dInput = tensor<TIO>{in_dims};
+        std::fill(dInput.begin(), dInput.end(), 0);
+
+        dInputHost = tensor<TIO>{in_dims};
+        std::fill(dInputHost.begin(), dInputHost.end(), 0);
+
+        dTarget = tensor<TIO>{in_dims};
+        std::fill(dTarget.begin(), dTarget.end(), 0);
+
+        dTargetHost = tensor<TIO>{in_dims};
+        std::fill(dTargetHost.begin(), dTargetHost.end(), 0);
+
+        input_dev   = handle.Write(input.data);
+        target_dev  = handle.Write(target.data);
+        dOutput_dev = handle.Write(dOutput.data);
+        dInput_dev  = handle.Write(dInput.data);
+        dTarget_dev = handle.Write(dTarget.data);
+    }
+    // Runs the GPU backward and the CPU reference, then reads both gradients back.
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::SigmoidFocalLossBackward(handle,
+                                                  input.desc,
+                                                  input_dev.get(),
+                                                  target.desc,
+                                                  target_dev.get(),
+                                                  dOutput.desc,
+                                                  dOutput_dev.get(),
+                                                  dInput.desc,
+                                                  dInput_dev.get(),
+                                                  dTarget.desc,
+                                                  dTarget_dev.get(),
+                                                  config.alpha,
+                                                  config.gamma,
+                                                  config.reduction);
+        cpu_sigmoid_focal_loss_unreduced_backward<TIO>(
+            input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        dInput.data  = handle.Read<TIO>(dInput_dev, dInput.data.size());
+        dTarget.data = handle.Read<TIO>(dTarget_dev, dTarget.data.size());
+    }
+    // Checks each gradient: rms_range against its reference must stay below 10 * epsilon of TIO.
+    void Verify()
+    {
+        double threshold = std::numeric_limits<TIO>::epsilon();
+
+        auto dInputError = miopen::rms_range(dInputHost, dInput);
+
+        EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput));
+        EXPECT_TRUE(dInputError < threshold * 10)
+            << "dInput error output beyond tolerance Error: " << dInputError
+            << ",  Thresholdx10: " << threshold * 10;
+
+        auto dTargetError = miopen::rms_range(dTargetHost, dTarget);
+
+        EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget));
+        EXPECT_TRUE(dTargetError < threshold * 10)
+            << "dTarget error output beyond tolerance Error: " << dTargetError
+            << ",  Thresholdx10: " << threshold * 10;
+    }
+    SigmoidFocalLossTestCase config;
+
+    tensor<TIO> input;
+    tensor<TIO> target;
+    tensor<TIO> dOutput;
+    tensor<TIO> dInput;
+    tensor<TIO> dTarget;
+
+    tensor<TIO> dInputHost;
+    tensor<TIO> dTargetHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr dOutput_dev;
+    miopen::Allocator::ManageDataPtr dInput_dev;
+    miopen::Allocator::ManageDataPtr dTarget_dev;
+};
+
+// Reduced forward fixture: the single-element GPU result is compared with the CPU reference.
+template <typename TIO>
+struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+        // The case's reduction is overridden with a generated mode (enum value gen_0_to_B(2) + 1).
+        config.reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1));
+
+        auto in_dims    = config.GetDims();
+        auto in_strides = config.ComputeStrides(in_dims);
+
+        auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 20); };
+        input             = tensor<TIO>{in_dims, in_strides}.generate(in_gen_value);
+
+        auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 20); };
+        target             = tensor<TIO>{in_dims, in_strides}.generate(tar_gen_value);
+        // output is created first: the workspace query below reads output.desc.
+        output = tensor<TIO>(1);
+        std::fill(output.begin(), output.end(), 0);
+
+        outputHost = tensor<TIO>(1);
+        std::fill(outputHost.begin(), outputHost.end(), 0);
+
+        size_t workspaceSizeBytes = miopen::GetSigmoidFocalLossForwardWorkspaceSize(
+            handle, input.desc, target.desc, output.desc, config.reduction);
+        size_t workspaceElements = workspaceSizeBytes / sizeof(TIO);
+
+        workspace = tensor<TIO>(workspaceElements);
+        std::fill(workspace.begin(), workspace.end(), 0);
+
+        divisor = 1;
+        if(config.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        {
+            divisor *= input.desc.GetElementSize();
+        }
+
+        input_dev     = handle.Write(input.data);
+        target_dev    = handle.Write(target.data);
+        workspace_dev = handle.Write(workspace.data);
+        output_dev    = handle.Write(output.data);
+    }
+    // Runs the GPU forward with the workspace and the CPU reference with the same divisor.
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::SigmoidFocalLossForward(handle,
+                                                 workspace_dev.get(),
+                                                 workspace.GetDataByteSize(),
+                                                 input.desc,
+                                                 input_dev.get(),
+                                                 target.desc,
+                                                 target_dev.get(),
+                                                 output.desc,
+                                                 output_dev.get(),
+                                                 config.alpha,
+                                                 config.gamma,
+                                                 config.reduction);
+        cpu_sigmoid_focal_loss_forward<TIO>(
+            input, target, workspace, outputHost, config.alpha, config.gamma, divisor);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        output.data = handle.Read<TIO>(output_dev, output.data.size());
+    }
+    // Compares GPU and CPU results: rms_range must stay below 10 * epsilon of TIO.
+    void Verify()
+    {
+        double threshold = std::numeric_limits<TIO>::epsilon();
+
+        auto error = miopen::rms_range(outputHost, output);
+
+        EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output));
+        EXPECT_TRUE(error < threshold * 10)
+            << "Error output beyond tolerance Error: " << error
+            << ",  Thresholdx10: " << threshold * 10 << " Reduction: " << config.reduction;
+    }
+    SigmoidFocalLossTestCase config;
+
+    tensor<TIO> input;
+    tensor<TIO> target;
+    tensor<TIO> workspace;
+    tensor<TIO> output;
+
+    tensor<TIO> outputHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+
+    float divisor;
+};
+
+// Reduced backward fixture: GPU dInput/dTarget are compared with the CPU reference.
+template <typename TIO>
+struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle   = get_handle();
+        config          = GetParam();
+        auto in_dims    = config.GetDims();
+        auto in_strides = config.ComputeStrides(in_dims);
+        // The case's reduction is overridden with a generated mode (enum value gen_0_to_B(2) + 1).
+        config.reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1));
+
+        auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        input             = tensor<TIO>{in_dims, in_strides}.generate(in_gen_value);
+
+        auto tar_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
+        target             = tensor<TIO>{in_dims, in_strides}.generate(tar_gen_value);
+        // The incoming gradient is a single generated element.
+        dOutput    = tensor<TIO>(1);
+        dOutput[0] = prng::gen_descreet_uniform_sign<TIO>(0.1, 50);
+
+        dInput = tensor<TIO>{in_dims};
+        std::fill(dInput.begin(), dInput.end(), 0);
+
+        dInputHost = tensor<TIO>{in_dims};
+        std::fill(dInputHost.begin(), dInputHost.end(), 0);
+
+        dTarget = tensor<TIO>{in_dims};
+        std::fill(dTarget.begin(), dTarget.end(), 0);
+
+        dTargetHost = tensor<TIO>{in_dims};
+        std::fill(dTargetHost.begin(), dTargetHost.end(), 0);
+
+        divisor = 1;
+        if(config.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        {
+            divisor *= input.desc.GetElementSize();
+        }
+        input_dev   = handle.Write(input.data);
+        target_dev  = handle.Write(target.data);
+        dOutput_dev = handle.Write(dOutput.data);
+        dInput_dev  = handle.Write(dInput.data);
+        dTarget_dev = handle.Write(dTarget.data);
+    }
+    // Runs the GPU backward and the CPU reference (same divisor), then reads both gradients back.
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::SigmoidFocalLossBackward(handle,
+                                                  input.desc,
+                                                  input_dev.get(),
+                                                  target.desc,
+                                                  target_dev.get(),
+                                                  dOutput.desc,
+                                                  dOutput_dev.get(),
+                                                  dInput.desc,
+                                                  dInput_dev.get(),
+                                                  dTarget.desc,
+                                                  dTarget_dev.get(),
+                                                  config.alpha,
+                                                  config.gamma,
+                                                  config.reduction);
+        cpu_sigmoid_focal_loss_backward<TIO>(
+            input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma, divisor);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        dInput.data  = handle.Read<TIO>(dInput_dev, dInput.data.size());
+        dTarget.data = handle.Read<TIO>(dTarget_dev, dTarget.data.size());
+    }
+    // Checks each gradient: rms_range against its reference must stay below 10 * epsilon of TIO.
+    void Verify()
+    {
+        double threshold = std::numeric_limits<TIO>::epsilon();
+
+        auto dInputError = miopen::rms_range(dInputHost, dInput);
+
+        EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput));
+        EXPECT_TRUE(dInputError < threshold * 10)
+            << "dInput error output beyond tolerance Error: " << dInputError
+            << ",  Thresholdx10: " << threshold * 10;
+
+        auto dTargetError = miopen::rms_range(dTargetHost, dTarget);
+
+        EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget));
+        EXPECT_TRUE(dTargetError < threshold * 10)
+            << "dTarget error output beyond tolerance Error: " << dTargetError
+            << ",  Thresholdx10: " << threshold * 10;
+    }
+    SigmoidFocalLossTestCase config;
+
+    tensor<TIO> input;
+    tensor<TIO> target;
+    tensor<TIO> dOutput;
+    tensor<TIO> dInput;
+    tensor<TIO> dTarget;
+
+    tensor<TIO> dInputHost;
+    tensor<TIO> dTargetHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr dOutput_dev;
+    miopen::Allocator::ManageDataPtr dInput_dev;
+    miopen::Allocator::ManageDataPtr dTarget_dev;
+
+    float divisor;
+};

From 7a6dfa4fd626f685dad0a3bba48b14cb56ae1f66 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 24 Jul 2024 15:31:37 +0700
Subject: [PATCH 02/28] remove githooks

---
 .githooks/install       |  7 -------
 .githooks/post-checkout |  3 ---
 .githooks/post-commit   |  3 ---
 .githooks/post-merge    |  3 ---
 .githooks/pre-commit    | 43 -----------------------------------------
 .githooks/pre-push      |  3 ---
 6 files changed, 62 deletions(-)
 delete mode 100755 .githooks/install
 delete mode 100755 .githooks/post-checkout
 delete mode 100755 .githooks/post-commit
 delete mode 100755 .githooks/post-merge
 delete mode 100755 .githooks/pre-commit
 delete mode 100755 .githooks/pre-push

diff --git a/.githooks/install b/.githooks/install
deleted file mode 100755
index 52fec83a2f..0000000000
--- a/.githooks/install
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-
-cd $(git rev-parse --git-dir)
-
-echo "Installing hooks..." 
-ln -s ../.githooks hooks
-echo "Done!"
diff --git a/.githooks/post-checkout b/.githooks/post-checkout
deleted file mode 100755
index ca7fcb4008..0000000000
--- a/.githooks/post-checkout
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs post-checkout "$@"
diff --git a/.githooks/post-commit b/.githooks/post-commit
deleted file mode 100755
index 52b339cb3f..0000000000
--- a/.githooks/post-commit
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs post-commit "$@"
diff --git a/.githooks/post-merge b/.githooks/post-merge
deleted file mode 100755
index a912e667aa..0000000000
--- a/.githooks/post-merge
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs post-merge "$@"
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
deleted file mode 100755
index e166dadd03..0000000000
--- a/.githooks/pre-commit
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/sh
-#
-# This pre-commit hook checks if any versions of clang-format
-# are installed, and if so, uses the installed version to format
-# the staged changes.
-
-base=clang-format-12
-format=""
-
-# Redirect output to stderr.
-exec 1>&2
-
- # check if clang-format is installed
-type "$base" >/dev/null 2>&1 && format="$base"
-
-# no versions of clang-format are installed
-if [ -z "$format" ]
-then
-    echo "$base is not installed. Pre-commit hook will not be executed."
-    exit 0
-fi
-
-# Do everything from top - level
-cd $(git rev-parse --show-toplevel)
-
-if git rev-parse --verify HEAD >/dev/null 2>&1
-then
-    against=HEAD
-else
-    # Initial commit: diff against an empty tree object
-    against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
-fi
-
-# do the formatting
-for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$')
-do
-    if [ -e "$file" ]
-    then
-        echo "$format $file"
-        "$format" -i -style=file "$file"
-    fi
-done
-
diff --git a/.githooks/pre-push b/.githooks/pre-push
deleted file mode 100755
index 0f0089bc25..0000000000
--- a/.githooks/pre-push
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs pre-push "$@"

From 42aafe37e6769e1979ec5074f9ccdbedfa76a08b Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 24 Jul 2024 15:33:42 +0700
Subject: [PATCH 03/28] add .githooks

---
 .githooks/install       |  7 +++++++
 .githooks/post-checkout |  3 +++
 .githooks/post-commit   |  3 +++
 .githooks/post-merge    |  3 +++
 .githooks/pre-commit    | 43 +++++++++++++++++++++++++++++++++++++++++
 .githooks/pre-push      |  3 +++
 6 files changed, 62 insertions(+)
 create mode 100755 .githooks/install
 create mode 100755 .githooks/post-checkout
 create mode 100755 .githooks/post-commit
 create mode 100755 .githooks/post-merge
 create mode 100755 .githooks/pre-commit
 create mode 100755 .githooks/pre-push

diff --git a/.githooks/install b/.githooks/install
new file mode 100755
index 0000000000..52fec83a2f
--- /dev/null
+++ b/.githooks/install
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+cd $(git rev-parse --git-dir)
+
+echo "Installing hooks..." 
+ln -s ../.githooks hooks
+echo "Done!"
diff --git a/.githooks/post-checkout b/.githooks/post-checkout
new file mode 100755
index 0000000000..ca7fcb4008
--- /dev/null
+++ b/.githooks/post-checkout
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs post-checkout "$@"
diff --git a/.githooks/post-commit b/.githooks/post-commit
new file mode 100755
index 0000000000..52b339cb3f
--- /dev/null
+++ b/.githooks/post-commit
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs post-commit "$@"
diff --git a/.githooks/post-merge b/.githooks/post-merge
new file mode 100755
index 0000000000..a912e667aa
--- /dev/null
+++ b/.githooks/post-merge
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs post-merge "$@"
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
new file mode 100755
index 0000000000..e166dadd03
--- /dev/null
+++ b/.githooks/pre-commit
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# This pre-commit hook checks if any versions of clang-format
+# are installed, and if so, uses the installed version to format
+# the staged changes.
+
+base=clang-format-12
+format=""
+
+# Redirect output to stderr.
+exec 1>&2
+
+ # check if clang-format is installed
+type "$base" >/dev/null 2>&1 && format="$base"
+
+# no versions of clang-format are installed
+if [ -z "$format" ]
+then
+    echo "$base is not installed. Pre-commit hook will not be executed."
+    exit 0
+fi
+
+# Do everything from top - level
+cd $(git rev-parse --show-toplevel)
+
+if git rev-parse --verify HEAD >/dev/null 2>&1
+then
+    against=HEAD
+else
+    # Initial commit: diff against an empty tree object
+    against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
+fi
+
+# do the formatting
+for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$')
+do
+    if [ -e "$file" ]
+    then
+        echo "$format $file"
+        "$format" -i -style=file "$file"
+    fi
+done
+
diff --git a/.githooks/pre-push b/.githooks/pre-push
new file mode 100755
index 0000000000..0f0089bc25
--- /dev/null
+++ b/.githooks/pre-push
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
+git lfs pre-push "$@"

From 605542b9ebfdb8b7378b8d2a860cbe7af8eb117f Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 24 Jul 2024 15:36:52 +0700
Subject: [PATCH 04/28] add githooks

---
 .githooks/post-checkout | 3 ---
 .githooks/post-commit   | 3 ---
 .githooks/post-merge    | 3 ---
 .githooks/pre-push      | 3 ---
 4 files changed, 12 deletions(-)
 delete mode 100755 .githooks/post-checkout
 delete mode 100755 .githooks/post-commit
 delete mode 100755 .githooks/post-merge
 delete mode 100755 .githooks/pre-push

diff --git a/.githooks/post-checkout b/.githooks/post-checkout
deleted file mode 100755
index ca7fcb4008..0000000000
--- a/.githooks/post-checkout
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs post-checkout "$@"
diff --git a/.githooks/post-commit b/.githooks/post-commit
deleted file mode 100755
index 52b339cb3f..0000000000
--- a/.githooks/post-commit
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs post-commit "$@"
diff --git a/.githooks/post-merge b/.githooks/post-merge
deleted file mode 100755
index a912e667aa..0000000000
--- a/.githooks/post-merge
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs post-merge "$@"
diff --git a/.githooks/pre-push b/.githooks/pre-push
deleted file mode 100755
index 0f0089bc25..0000000000
--- a/.githooks/pre-push
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks').\n"; exit 2; }
-git lfs pre-push "$@"

From 0a6dfa2c151864cc5cbe5682c82eb1f857569d73 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 24 Jul 2024 17:43:32 +0700
Subject: [PATCH 05/28] add Tcheck type in driver

---
 driver/dm_sigmoid_focal_loss.cpp     |   6 +-
 driver/sigmoid_focal_loss_driver.hpp | 375 ++++++++++++++-------------
 2 files changed, 198 insertions(+), 183 deletions(-)

diff --git a/driver/dm_sigmoid_focal_loss.cpp b/driver/dm_sigmoid_focal_loss.cpp
index 001f2964b5..3ec7e9ac31 100644
--- a/driver/dm_sigmoid_focal_loss.cpp
+++ b/driver/dm_sigmoid_focal_loss.cpp
@@ -30,11 +30,11 @@
 static Driver* makeDriver(const std::string& base_arg)
 {
     if(base_arg == "sigmoidfocalloss")
-        return new SigmoidFocalLossDriver<float>();
+        return new SigmoidFocalLossDriver<float, float>();
     else if(base_arg == "sigmoidfocallossfp16")
-        return new SigmoidFocalLossDriver<float16>();
+        return new SigmoidFocalLossDriver<float16, float>();
     else if(base_arg == "sigmoidfocallossbfp16")
-        return new SigmoidFocalLossDriver<bfloat16>();
+        return new SigmoidFocalLossDriver<bfloat16, float>();
     return nullptr;
 }
 
diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index 6c739a3911..8bdc350b2f 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -32,19 +32,17 @@
 #include <miopen/miopen.h>
 #include "tensor_driver.hpp"
 #include "timer.hpp"
-#include "random.hpp"
 #include <../test/tensor_holder.hpp>
 #include <../test/verify.hpp>
 #include <cmath>
-#include <iostream>
 #include <vector>
 
-template <typename TIO>
-void mloSigmoidFocalLossUnreducedFwdRunHost(TIO* input,
+template <typename Tgpu, typename Tcheck>
+void mloSigmoidFocalLossUnreducedFwdRunHost(Tgpu* input,
                                             miopenTensorDescriptor_t inputDesc,
-                                            TIO* target,
+                                            Tgpu* target,
                                             miopenTensorDescriptor_t targetDesc,
-                                            TIO* outputHost,
+                                            Tcheck* outputHost,
                                             miopenTensorDescriptor_t outputDesc,
                                             float alpha = 0.25,
                                             float gamma = 2)
@@ -58,34 +56,34 @@ void mloSigmoidFocalLossUnreducedFwdRunHost(TIO* input,
     {
         tensor_layout_t<5> idx(input_tv, id);
 
-        float i = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
+        Tcheck i = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
+        Tcheck t = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
 
-        float sig    = 1 / (1 + exp(-i));
-        float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
-        float sigT   = sig * t + (1 - sig) * (1 - t);
-        float loss   = ceLoss * pow(1 - sigT, gamma);
+        Tcheck sig    = 1 / (1 + exp(-i));
+        Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
+        Tcheck sigT   = sig * t + (1 - sig) * (1 - t);
+        Tcheck loss   = ceLoss * pow(1 - sigT, gamma);
 
         if(alpha >= 0)
         {
-            float alphaT = alpha * t + (1 - alpha) * (1 - t);
-            loss         = alphaT * loss;
+            Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t);
+            loss          = alphaT * loss;
         }
 
-        outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(loss);
+        outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(loss);
     }
 }
 
-template <class TIO>
-void mloSigmoidFocalLossUnreducedBwdRunHost(TIO* input,
+template <typename Tgpu, typename Tcheck>
+void mloSigmoidFocalLossUnreducedBwdRunHost(Tgpu* input,
                                             miopenTensorDescriptor_t inputDesc,
-                                            TIO* target,
+                                            Tgpu* target,
                                             miopenTensorDescriptor_t targetDesc,
-                                            TIO* doutput,
+                                            Tgpu* doutput,
                                             miopenTensorDescriptor_t doutputDesc,
-                                            TIO* dinput,
+                                            Tcheck* dinput,
                                             miopenTensorDescriptor_t dinputDesc,
-                                            TIO* dtarget,
+                                            Tcheck* dtarget,
                                             miopenTensorDescriptor_t dtargetDesc,
                                             float alpha = 0.25,
                                             float gamma = 2)
@@ -101,58 +99,58 @@ void mloSigmoidFocalLossUnreducedBwdRunHost(TIO* input,
     {
         tensor_layout_t<5> idx(input_tv, id);
 
-        float i  = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t  = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
-        float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
+        Tcheck i  = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
+        Tcheck t  = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
+        Tcheck dO = static_cast<Tcheck>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
 
-        float p       = 1 / (1 + exp(-i));
-        float ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
-        float pT      = p * t + (1 - p) * (1 - t);
-        float powPt   = pow(1 - pT, gamma);
-        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
+        Tcheck p       = 1 / (1 + exp(-i));
+        Tcheck ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
+        Tcheck pT      = p * t + (1 - p) * (1 - t);
+        Tcheck powPt   = pow(1 - pT, gamma);
+        Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t);
 
         if(dinput)
         {
-            float dpdi      = exp(-i) / pow(1 + exp(-i), 2);
-            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
-            float dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+            Tcheck dpdi      = exp(-i) / pow(1 + exp(-i), 2);
+            Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            Tcheck dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
 
             // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
-            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
-            float grad = dO * dLdi;
+            Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            Tcheck grad = dO * dLdi;
 
             if(alpha >= 0)
             {
                 grad *= alpha_t;
             }
-            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(grad);
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(grad);
         }
 
         if(dtarget)
         {
-            float dcelossdt = -log(p) + log(1 - p);
-            float dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            Tcheck dcelossdt = -log(p) + log(1 - p);
+            Tcheck dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
             // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
-            float dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
-            float gradTarget = dO * dLdt;
+            Tcheck dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+            Tcheck gradTarget = dO * dLdt;
 
             if(alpha >= 0)
             {
                 // alpha_t * dL/dt + dalpha_t/dt * dL
                 gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
             }
-            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(gradTarget);
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(gradTarget);
         }
     }
 }
 
-template <typename TIO>
-void mloSigmoidFocalLossFwdRunHost(TIO* input,
+template <typename Tgpu, typename Tcheck>
+void mloSigmoidFocalLossFwdRunHost(Tgpu* input,
                                    miopenTensorDescriptor_t inputDesc,
-                                   TIO* target,
+                                   Tgpu* target,
                                    miopenTensorDescriptor_t targetDesc,
-                                   TIO* workspace,
-                                   TIO* ref_output,
+                                   Tcheck* workspaceHost,
+                                   Tcheck* outputHost,
                                    float alpha   = 0.25,
                                    float gamma   = 2,
                                    float divisor = 1)
@@ -165,21 +163,21 @@ void mloSigmoidFocalLossFwdRunHost(TIO* input,
     {
         tensor_layout_t<5> idx(input_tv, id);
 
-        float i = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
+        Tcheck i = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
+        Tcheck t = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
 
-        float sig    = 1 / (1 + exp(-i));
-        float ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
-        float sigT   = sig * t + (1 - sig) * (1 - t);
-        float loss   = ceLoss * pow(1 - sigT, gamma);
+        Tcheck sig    = 1 / (1 + exp(-i));
+        Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
+        Tcheck sigT   = sig * t + (1 - sig) * (1 - t);
+        Tcheck loss   = ceLoss * pow(1 - sigT, gamma);
 
         if(alpha >= 0)
         {
-            float alphaT = alpha * t + (1 - alpha) * (1 - t);
-            loss         = alphaT * loss;
+            Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t);
+            loss          = alphaT * loss;
         }
 
-        workspace[id] = static_cast<TIO>(loss / divisor);
+        workspaceHost[id] = static_cast<Tcheck>(loss / divisor);
     }
 
     // Reduce loss
@@ -191,32 +189,32 @@ void mloSigmoidFocalLossFwdRunHost(TIO* input,
     {
         for(int i = 0; i < _size; i += local_size)
         {
-            TIO shared[local_size];
+            Tcheck shared[local_size];
             for(int j = 0; j < local_size; ++j)
-                shared[j] = i + j < _size ? workspace[offset_a + i + j] : 0.0f;
+                shared[j] = i + j < _size ? workspaceHost[offset_a + i + j] : 0.0f;
             for(int offset = local_size / 2; offset > 0; offset >>= 1)
                 for(int j = 0; j < offset; ++j)
                     shared[j] += shared[j + offset];
             if(_size <= local_size)
-                ref_output[0] = shared[0];
+                outputHost[0] = shared[0];
             else
-                workspace[offset_b + i / local_size] = shared[0];
+                workspaceHost[offset_b + i / local_size] = shared[0];
         }
         std::swap(offset_a, offset_b);
         _size = (_size + local_size - 1) / local_size;
     } while(_size > 1);
 }
 
-template <class TIO>
-void mloSigmoidFocalLossBwdRunHost(TIO* input,
+template <typename Tgpu, typename Tcheck>
+void mloSigmoidFocalLossBwdRunHost(Tgpu* input,
                                    miopenTensorDescriptor_t inputDesc,
-                                   TIO* target,
+                                   Tgpu* target,
                                    miopenTensorDescriptor_t targetDesc,
-                                   TIO* doutput,
+                                   Tgpu* doutput,
                                    miopenTensorDescriptor_t doutputDesc,
-                                   TIO* dinput,
+                                   Tcheck* dinput,
                                    miopenTensorDescriptor_t dinputDesc,
-                                   TIO* dtarget,
+                                   Tcheck* dtarget,
                                    miopenTensorDescriptor_t dtargetDesc,
                                    float alpha   = 0.25,
                                    float gamma   = 2,
@@ -236,41 +234,41 @@ void mloSigmoidFocalLossBwdRunHost(TIO* input,
     {
         tensor_layout_t<5> idx(input_tv, id);
 
-        float i  = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t  = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
-        float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
+        Tcheck i  = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
+        Tcheck t  = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
+        Tcheck dO = static_cast<Tcheck>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
 
-        float p       = 1 / (1 + exp(-i));
-        float ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
-        float pT      = p * t + (1 - p) * (1 - t);
-        float powPt   = pow(1 - pT, gamma);
-        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
+        Tcheck p       = 1 / (1 + exp(-i));
+        Tcheck ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
+        Tcheck pT      = p * t + (1 - p) * (1 - t);
+        Tcheck powPt   = pow(1 - pT, gamma);
+        Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t);
 
         if(dinput)
         {
-            float dpdi      = exp(-i) / pow(1 + exp(-i), 2);
-            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
-            float dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+            Tcheck dpdi      = exp(-i) / pow(1 + exp(-i), 2);
+            Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            Tcheck dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
 
             // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
-            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
-            float grad = dO * dLdi;
+            Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            Tcheck grad = dO * dLdi;
 
             if(alpha >= 0)
             {
                 grad *= alpha_t;
             }
             grad /= divisor;
-            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(grad);
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(grad);
         }
 
         if(dtarget)
         {
-            float dcelossdt = -log(p) + log(1 - p);
-            float dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            Tcheck dcelossdt = -log(p) + log(1 - p);
+            Tcheck dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
             // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
-            float dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
-            float gradTarget = dO * dLdt;
+            Tcheck dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+            Tcheck gradTarget = dO * dLdt;
 
             if(alpha >= 0)
             {
@@ -278,12 +276,12 @@ void mloSigmoidFocalLossBwdRunHost(TIO* input,
                 gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
             }
             gradTarget /= divisor;
-            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(gradTarget);
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(gradTarget);
         }
     }
 }
 
-template <typename TIO>
+template <typename Tgpu, typename Tcheck>
 class SigmoidFocalLossDriver : public Driver
 {
 public:
@@ -296,7 +294,7 @@ class SigmoidFocalLossDriver : public Driver
         miopenCreateTensorDescriptor(&dinputDesc);
         miopenCreateTensorDescriptor(&dtargetDesc);
 
-        data_type = miopen_type<TIO>{};
+        data_type = miopen_type<Tgpu>{};
     }
 
     std::vector<int> ComputeStrides(std::vector<int> input);
@@ -314,6 +312,7 @@ class SigmoidFocalLossDriver : public Driver
     int RunBackwardGPU() override;
     int RunBackwardCPU();
 
+    Tcheck GetTolerance();
     int VerifyBackward() override;
     int VerifyForward() override;
     ~SigmoidFocalLossDriver() override
@@ -344,16 +343,17 @@ class SigmoidFocalLossDriver : public Driver
     std::unique_ptr<GPUMem> dtarget_dev;
     std::unique_ptr<GPUMem> workspace_dev;
 
-    std::vector<TIO> input;
-    std::vector<TIO> target;
-    std::vector<TIO> output;
-    std::vector<TIO> outputHost;
-    std::vector<TIO> doutput;
-    std::vector<TIO> dinput;
-    std::vector<TIO> dinputHost;
-    std::vector<TIO> dtarget;
-    std::vector<TIO> dtargetHost;
-    std::vector<TIO> workspace;
+    std::vector<Tgpu> input;
+    std::vector<Tgpu> target;
+    std::vector<Tgpu> output;
+    std::vector<Tcheck> outputHost;
+    std::vector<Tgpu> doutput;
+    std::vector<Tgpu> dinput;
+    std::vector<Tcheck> dinputHost;
+    std::vector<Tgpu> dtarget;
+    std::vector<Tcheck> dtargetHost;
+    std::vector<Tgpu> workspace;
+    std::vector<Tcheck> workspaceHost;
 
     float alpha;
     float gamma;
@@ -365,8 +365,8 @@ class SigmoidFocalLossDriver : public Driver
     size_t workSpaceSizeInBytes;
 };
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::ParseCmdLineArgs(int argc, char* argv[])
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::ParseCmdLineArgs(int argc, char* argv[])
 {
     inflags.Parse(argc, argv);
 
@@ -377,8 +377,8 @@ int SigmoidFocalLossDriver<TIO>::ParseCmdLineArgs(int argc, char* argv[])
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::GetandSetData()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::GetandSetData()
 {
     auto inDims              = inflags.GetValueTensor("dim-lengths").lengths;
     alpha                    = inflags.GetValueDouble("alpha");
@@ -425,8 +425,8 @@ int SigmoidFocalLossDriver<TIO>::GetandSetData()
 }
 
 // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False
-template <typename TIO>
-std::vector<int> SigmoidFocalLossDriver<TIO>::ComputeStrides(std::vector<int> inputDim)
+template <typename Tgpu, typename Tcheck>
+std::vector<int> SigmoidFocalLossDriver<Tgpu, Tcheck>::ComputeStrides(std::vector<int> inputDim)
 {
     if(!isContiguous)
         std::swap(inputDim.front(), inputDim.back());
@@ -439,8 +439,8 @@ std::vector<int> SigmoidFocalLossDriver<TIO>::ComputeStrides(std::vector<int> in
     return strides;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::AddCmdLineArgs()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::AddCmdLineArgs()
 {
     inflags.AddInputFlag("forw", 'F', "1", "Run only Forward (Default=1)", "int");
     inflags.AddTensorFlag(
@@ -461,8 +461,8 @@ int SigmoidFocalLossDriver<TIO>::AddCmdLineArgs()
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::AllocateBuffersAndCopy()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
 {
     size_t in_sz     = miopen::deref(inputDesc).GetElementSize();
     size_t target_sz = miopen::deref(targetDesc).GetElementSize();
@@ -473,42 +473,44 @@ int SigmoidFocalLossDriver<TIO>::AllocateBuffersAndCopy()
 
     uint32_t ctx = 0;
 
-    input_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(TIO)));
-    target_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, target_sz, sizeof(TIO)));
-    output_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(TIO)));
-    doutput_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, dO_sz, sizeof(TIO)));
-    dinput_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, dI_sz, sizeof(TIO)));
-    dtarget_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, dT_sz, sizeof(TIO)));
+    input_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
+    target_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, target_sz, sizeof(Tgpu)));
+    output_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(Tgpu)));
+    doutput_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, dO_sz, sizeof(Tgpu)));
+    dinput_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, dI_sz, sizeof(Tgpu)));
+    dtarget_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, dT_sz, sizeof(Tgpu)));
 
     miopenGetSigmoidFocalLossForwardWorkspaceSize(
         handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes);
     workspace_dev =
-        std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(TIO), sizeof(TIO)));
-
-    input       = std::vector<TIO>(in_sz, static_cast<TIO>(0));
-    target      = std::vector<TIO>(target_sz, static_cast<TIO>(0));
-    output      = std::vector<TIO>(out_sz, static_cast<TIO>(0));
-    outputHost  = std::vector<TIO>(out_sz, static_cast<TIO>(0));
-    doutput     = std::vector<TIO>(dO_sz, static_cast<TIO>(0));
-    dinput      = std::vector<TIO>(dI_sz, static_cast<TIO>(0));
-    dinputHost  = std::vector<TIO>(dI_sz, static_cast<TIO>(0));
-    dtarget     = std::vector<TIO>(dT_sz, static_cast<TIO>(0));
-    dtargetHost = std::vector<TIO>(dT_sz, static_cast<TIO>(0));
-    workspace   = std::vector<TIO>(workSpaceSizeInBytes / sizeof(TIO), static_cast<TIO>(0));
+        std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu)));
+
+    input                 = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    target                = std::vector<Tgpu>(target_sz, static_cast<Tgpu>(0));
+    output                = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+    outputHost            = std::vector<Tcheck>(out_sz, static_cast<Tcheck>(0));
+    doutput               = std::vector<Tgpu>(dO_sz, static_cast<Tgpu>(0));
+    dinput                = std::vector<Tgpu>(dI_sz, static_cast<Tgpu>(0));
+    dinputHost            = std::vector<Tcheck>(dI_sz, static_cast<Tcheck>(0));
+    dtarget               = std::vector<Tgpu>(dT_sz, static_cast<Tgpu>(0));
+    dtargetHost           = std::vector<Tcheck>(dT_sz, static_cast<Tcheck>(0));
+    size_t workSpaceElems = workSpaceSizeInBytes / sizeof(Tgpu);
+    workspace             = std::vector<Tgpu>(workSpaceElems, static_cast<Tgpu>(0));
+    workspaceHost         = std::vector<Tcheck>(workSpaceElems, static_cast<Tcheck>(0));
 
     for(int i = 0; i < in_sz; i++)
     {
-        input[i]  = prng::gen_A_to_B<TIO>(static_cast<TIO>(-2), static_cast<TIO>(2));
-        target[i] = prng::gen_A_to_B<TIO>(static_cast<TIO>(-2), static_cast<TIO>(2));
+        input[i]  = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
+        target[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
     }
     for(int i = 0; i < dO_sz; ++i)
     {
-        doutput[i] = prng::gen_A_to_B<TIO>(static_cast<TIO>(-2), static_cast<TIO>(2));
+        doutput[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
     }
 
-    fill(output.begin(), output.end(), static_cast<TIO>(0));
-    fill(dinput.begin(), dinput.end(), static_cast<TIO>(0));
-    fill(dtarget.begin(), dtarget.end(), static_cast<TIO>(0));
+    fill(output.begin(), output.end(), static_cast<Tgpu>(0));
+    fill(dinput.begin(), dinput.end(), static_cast<Tgpu>(0));
+    fill(dtarget.begin(), dtarget.end(), static_cast<Tgpu>(0));
 
     if(input_dev->ToGPU(GetStream(), input.data()) != 0)
         std::cerr << "Error copying (in) to GPU, size: " << input_dev->GetSize() << std::endl;
@@ -534,8 +536,8 @@ int SigmoidFocalLossDriver<TIO>::AllocateBuffersAndCopy()
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::RunForwardGPU()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunForwardGPU()
 {
     float kernel_total_time = 0;
     float kernel_first_time = 0;
@@ -585,38 +587,38 @@ int SigmoidFocalLossDriver<TIO>::RunForwardGPU()
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::RunForwardCPU()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunForwardCPU()
 {
     if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
     {
-        mloSigmoidFocalLossUnreducedFwdRunHost<TIO>(input.data(),
+        mloSigmoidFocalLossUnreducedFwdRunHost<Tgpu, Tcheck>(input.data(),
+                                                             inputDesc,
+                                                             target.data(),
+                                                             targetDesc,
+                                                             outputHost.data(),
+                                                             outputDesc,
+                                                             alpha,
+                                                             gamma);
+    }
+    else
+    {
+        mloSigmoidFocalLossFwdRunHost<Tgpu, Tcheck>(input.data(),
                                                     inputDesc,
                                                     target.data(),
                                                     targetDesc,
+                                                    workspaceHost.data(),
                                                     outputHost.data(),
-                                                    outputDesc,
                                                     alpha,
-                                                    gamma);
-    }
-    else
-    {
-        mloSigmoidFocalLossFwdRunHost<TIO>(input.data(),
-                                           inputDesc,
-                                           target.data(),
-                                           targetDesc,
-                                           workspace.data(),
-                                           outputHost.data(),
-                                           alpha,
-                                           gamma,
-                                           divisor);
+                                                    gamma,
+                                                    divisor);
     }
 
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::RunBackwardGPU()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunBackwardGPU()
 {
     float kernel_total_time = 0;
     float kernel_first_time = 0;
@@ -678,10 +680,10 @@ int SigmoidFocalLossDriver<TIO>::RunBackwardGPU()
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::RunBackwardCPU()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunBackwardCPU()
 {
-    TIO* p_dtarget = nullptr;
+    Tcheck* p_dtarget = nullptr;
     if(isTargetGradientComputed)
     {
         p_dtarget = dtargetHost.data();
@@ -689,7 +691,22 @@ int SigmoidFocalLossDriver<TIO>::RunBackwardCPU()
     if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
     {
 
-        mloSigmoidFocalLossUnreducedBwdRunHost<TIO>(input.data(),
+        mloSigmoidFocalLossUnreducedBwdRunHost<Tgpu, Tcheck>(input.data(),
+                                                             inputDesc,
+                                                             target.data(),
+                                                             targetDesc,
+                                                             doutput.data(),
+                                                             doutputDesc,
+                                                             dinputHost.data(),
+                                                             dinputDesc,
+                                                             p_dtarget,
+                                                             dtargetDesc,
+                                                             alpha,
+                                                             gamma);
+    }
+    else
+    {
+        mloSigmoidFocalLossBwdRunHost<Tgpu, Tcheck>(input.data(),
                                                     inputDesc,
                                                     target.data(),
                                                     targetDesc,
@@ -700,35 +717,33 @@ int SigmoidFocalLossDriver<TIO>::RunBackwardCPU()
                                                     p_dtarget,
                                                     dtargetDesc,
                                                     alpha,
-                                                    gamma);
-    }
-    else
-    {
-        mloSigmoidFocalLossBwdRunHost<TIO>(input.data(),
-                                           inputDesc,
-                                           target.data(),
-                                           targetDesc,
-                                           doutput.data(),
-                                           doutputDesc,
-                                           dinputHost.data(),
-                                           dinputDesc,
-                                           p_dtarget,
-                                           dtargetDesc,
-                                           alpha,
-                                           gamma,
-                                           divisor);
+                                                    gamma,
+                                                    divisor);
     }
 
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::VerifyForward()
+template <typename Tgpu, typename Tcheck>
+Tcheck SigmoidFocalLossDriver<Tgpu, Tcheck>::GetTolerance()
+{
+    // Error bound for the Verify* checks: 1.5e-6 when Tgpu is float, 8.2e-3 otherwise. The fp16
+    // mantissa is 13 bits shorter than fp32's, so its error is ~2^13 (=8192) times larger.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 has a 7-bit mantissa, 3 bits shorter than fp16's, so allow 2^3 (=8) times more error.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::VerifyForward()
 {
     RunForwardCPU();
 
-    double tolerance = std::numeric_limits<TIO>::epsilon() * 10;
-    auto error       = miopen::rms_range(outputHost, output);
+    const Tcheck tolerance = GetTolerance();
+    auto error             = miopen::rms_range(outputHost, output);
 
     if(!std::isfinite(error) || error > tolerance)
     {
@@ -745,14 +760,14 @@ int SigmoidFocalLossDriver<TIO>::VerifyForward()
     return miopenStatusSuccess;
 }
 
-template <typename TIO>
-int SigmoidFocalLossDriver<TIO>::VerifyBackward()
+template <typename Tgpu, typename Tcheck>
+int SigmoidFocalLossDriver<Tgpu, Tcheck>::VerifyBackward()
 {
     RunBackwardCPU();
 
-    double tolerance  = std::numeric_limits<TIO>::epsilon() * 10;
-    auto dinputError  = miopen::rms_range(dinputHost, dinput);
-    auto dtargetError = miopen::rms_range(dtargetHost, dtarget);
+    const Tcheck tolerance = GetTolerance();
+    auto dinputError       = miopen::rms_range(dinputHost, dinput);
+    auto dtargetError      = miopen::rms_range(dtargetHost, dtarget);
 
     if(!std::isfinite(dinputError) || dinputError > tolerance)
     {

From 9d75374633b228f45fbe89eae40ac39e1d5dbe56 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 24 Jul 2024 18:19:37 +0700
Subject: [PATCH 06/28] fix cppcheck err

---
 .../miopen/solver/implicitgemm_ck_util.hpp    | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp
index ff25d5f622..abdd171227 100644
--- a/src/include/miopen/solver/implicitgemm_ck_util.hpp
+++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -680,7 +680,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
         internal::MakeTaggedTransposeInstances<CKArgsType>(
             result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des);
 
-    result.invoker_factory = [split_k             = split_k,
+    result.invoker_factory = [split_k,
                               ck_args             = std::move(ck_args),
                               sh_conv_ptr         = std::shared_ptr{std::move(*ptr_iter)},
                               input1_tr_inst      = std::move(_input1_tr_inst),
@@ -689,7 +689,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                               output_init_tr_inst = std::move(_output_init_tr_inst),
                               ck_buff_des =
                                   _ck_buff_des](const std::vector<Kernel>& kernels) mutable {
-        return [split_k = split_k,
+        return [split_k,
                 kernels,
                 ck_args             = std::move(ck_args),
                 sh_conv_ptr         = std::move(sh_conv_ptr),
@@ -697,8 +697,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                 input2_tr_inst      = std::move(input2_tr_inst),
                 output_tr_inst      = std::move(output_tr_inst),
                 output_init_tr_inst = std::move(output_init_tr_inst),
-                ck_buff_des         = ck_buff_des](const Handle& handle,
-                                           const AnyInvokeParams& primitive_parameters) mutable {
+                ck_buff_des](const Handle& handle,
+                             const AnyInvokeParams& primitive_parameters) mutable {
             handle.ResetKernelTime();
 
             const auto& data_ctx = primitive_parameters.CastTo<CastType>();
@@ -826,17 +826,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&,
         [[maybe_unused]] bool should_allocated_wrw_buffer =
             ShouldAllocateWorkSpaceBufferForWRW(problem);
 
-        result.invoker_factory = [split_k                     = split_k,
-                                  ck_args                     = CKArgsType{problem},
-                                  alpha_beta_case             = alpha_beta_case,
-                                  should_allocated_wrw_buffer = should_allocated_wrw_buffer,
+        result.invoker_factory = [split_k,
+                                  ck_args = CKArgsType{problem},
+                                  alpha_beta_case,
+                                  should_allocated_wrw_buffer,
                                   sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](
                                      const std::vector<Kernel>&) mutable {
-            return [split_k                     = split_k,
-                    ck_args                     = std::move(ck_args),
-                    alpha_beta_case             = alpha_beta_case,
-                    should_allocated_wrw_buffer = should_allocated_wrw_buffer,
-                    sh_conv_ptr                 = std::move(sh_conv_ptr)](
+            return [split_k,
+                    ck_args = std::move(ck_args),
+                    alpha_beta_case,
+                    should_allocated_wrw_buffer,
+                    sh_conv_ptr = std::move(sh_conv_ptr)](
                        const Handle& handle, const AnyInvokeParams& primitive_parameters) {
                 const auto& data_ctx = primitive_parameters.CastTo<CastType>();
                 std::unique_ptr<ck::tensor_operation::device::BaseArgument> argument_ptr;

From f91144c69e0ab87533a056d7cb3d4d9f940505c0 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Fri, 26 Jul 2024 19:00:38 +0700
Subject: [PATCH 07/28] add MIOPEN_INTERNALS_EXPORT

---
 src/include/miopen/sigmoid_focal_loss.hpp | 62 ++++++++++++-----------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/src/include/miopen/sigmoid_focal_loss.hpp b/src/include/miopen/sigmoid_focal_loss.hpp
index 07d3e32d61..cbb6dff65d 100644
--- a/src/include/miopen/sigmoid_focal_loss.hpp
+++ b/src/include/miopen/sigmoid_focal_loss.hpp
@@ -33,39 +33,41 @@ namespace miopen {
 struct Handle;
 struct TensorDescriptor;
 
-size_t GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle,
-                                               const TensorDescriptor& inputDesc,
-                                               const TensorDescriptor& targetDesc,
-                                               const TensorDescriptor& outputDesc,
-                                               miopenLossReductionMode_t reduction);
-
-miopenStatus_t SigmoidFocalLossForward(Handle& handle,
-                                       Data_t workspace,
-                                       size_t workspaceSizeInBytes,
-                                       const TensorDescriptor& inputDesc,
-                                       ConstData_t input,
-                                       const TensorDescriptor& targetDesc,
-                                       ConstData_t target,
-                                       const TensorDescriptor& outputDesc,
-                                       Data_t output,
-                                       float alpha,
-                                       float gamma,
-                                       miopenLossReductionMode_t reduction);
-
-miopenStatus_t SigmoidFocalLossBackward(Handle& handle,
+MIOPEN_INTERNALS_EXPORT size_t
+GetSigmoidFocalLossForwardWorkspaceSize(Handle& handle,
                                         const TensorDescriptor& inputDesc,
-                                        ConstData_t input,
                                         const TensorDescriptor& targetDesc,
-                                        ConstData_t target,
-                                        const TensorDescriptor& doutputDesc,
-                                        ConstData_t doutput,
-                                        const TensorDescriptor& dinputDesc,
-                                        Data_t dinput,
-                                        const TensorDescriptor& dtargetDesc,
-                                        Data_t dtarget,
-                                        float alpha,
-                                        float gamma,
+                                        const TensorDescriptor& outputDesc,
                                         miopenLossReductionMode_t reduction);
 
+MIOPEN_INTERNALS_EXPORT miopenStatus_t SigmoidFocalLossForward(Handle& handle,
+                                                               Data_t workspace,
+                                                               size_t workspaceSizeInBytes,
+                                                               const TensorDescriptor& inputDesc,
+                                                               ConstData_t input,
+                                                               const TensorDescriptor& targetDesc,
+                                                               ConstData_t target,
+                                                               const TensorDescriptor& outputDesc,
+                                                               Data_t output,
+                                                               float alpha,
+                                                               float gamma,
+                                                               miopenLossReductionMode_t reduction);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t
+SigmoidFocalLossBackward(Handle& handle,
+                         const TensorDescriptor& inputDesc,
+                         ConstData_t input,
+                         const TensorDescriptor& targetDesc,
+                         ConstData_t target,
+                         const TensorDescriptor& doutputDesc,
+                         ConstData_t doutput,
+                         const TensorDescriptor& dinputDesc,
+                         Data_t dinput,
+                         const TensorDescriptor& dtargetDesc,
+                         Data_t dtarget,
+                         float alpha,
+                         float gamma,
+                         miopenLossReductionMode_t reduction);
+
 } // namespace miopen
 #endif // MIOPEN_SIGMOID_FOCAL_LOSS_HPP_

From 35c6ee696d216581078976e089ca6d58ee8e4252 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 30 Jul 2024 10:54:38 +0700
Subject: [PATCH 08/28] change gtest naming format following new convention

---
 driver/sigmoid_focal_loss_driver.hpp          |   4 -
 .../forward_reduce_sigmoid_focal_loss.cpp     |   5 +-
 test/gtest/sigmoid_focal_loss.cpp             | 111 ++++++++----------
 3 files changed, 52 insertions(+), 68 deletions(-)

diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index 8bdc350b2f..603d1777e5 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -508,10 +508,6 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
         doutput[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
     }
 
-    fill(output.begin(), output.end(), static_cast<Tgpu>(0));
-    fill(dinput.begin(), dinput.end(), static_cast<Tgpu>(0));
-    fill(dtarget.begin(), dtarget.end(), static_cast<Tgpu>(0));
-
     if(input_dev->ToGPU(GetStream(), input.data()) != 0)
         std::cerr << "Error copying (in) to GPU, size: " << input_dev->GetSize() << std::endl;
 
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
index f7daa8b84c..a9abcf2e96 100644
--- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -97,7 +97,8 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
             HipEventPtr stop;
 
             bool resetProfilingState = false;
-            if(handle_.IsProfilingEnabled())
+            const bool profiling     = handle_.IsProfilingEnabled();
+            if(profiling)
             {
                 resetProfilingState = true;
                 handle_.EnableProfiling(false);
@@ -153,7 +154,7 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
                 handle_.EnableProfiling(true);
             }
 
-            if(handle_.IsProfilingEnabled())
+            if(profiling)
             {
                 hipEventRecord(stop.get(), handle_.GetStream());
                 hipEventSynchronize(stop.get());
diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp
index f2f6ec5d17..fa90ceb218 100644
--- a/test/gtest/sigmoid_focal_loss.cpp
+++ b/test/gtest/sigmoid_focal_loss.cpp
@@ -25,15 +25,10 @@
  *******************************************************************************/
 
 #include "sigmoid_focal_loss.hpp"
-#include "miopen/bfloat16.hpp"
 #include "tensor_holder.hpp"
+#include <miopen/bfloat16.hpp>
 #include <miopen/env.hpp>
 
-#define TEST_FWD_REDUCED
-#define TEST_BWD_REDUCED
-#define TEST_FWD_UNREDUCED
-#define TEST_BWD_UNREDUCED
-
 MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
 MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
 
@@ -49,59 +44,58 @@ std::string GetFloatArg()
     return tmp;
 }
 
-struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest<float>
+struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest<float>
 {
 };
 
-struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest<half>
+struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest<half>
 {
 };
 
-struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest<bfloat16>
+struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest<bfloat16>
 {
 };
 
-struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest<float>
+struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest<float>
 {
 };
 
-struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest<half>
+struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest<half>
 {
 };
 
-struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest<bfloat16>
+struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest<bfloat16>
 {
 };
 
-struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest<float>
+struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest<float>
 {
 };
 
-struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest<half>
+struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest<half>
 {
 };
 
-struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
+struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
 {
 };
 
-struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest<float>
+struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest<float>
 {
 };
 
-struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest<half>
+struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest<half>
 {
 };
 
-struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
+struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
 {
 };
 }; // namespace sigmoidfocalloss
 
 using namespace sigmoidfocalloss;
 
-#ifdef TEST_FWD_REDUCED
-TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
+TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -115,11 +109,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
-                         SigmoidFocalLossForwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_fwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
+TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -133,11 +127,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
-                         SigmoidFocalLossForwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_fwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
+TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -151,13 +145,11 @@ TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
-                         SigmoidFocalLossForwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_fwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif
 
-#ifdef TEST_BWD_REDUCED
-TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
+TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -171,11 +163,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
-                         SigmoidFocalLossBackwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_bwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
+TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -189,11 +181,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
-                         SigmoidFocalLossBackwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_bwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
+TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -207,13 +199,11 @@ TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
-                         SigmoidFocalLossBackwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_bwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif
 
-#ifdef TEST_FWD_UNREDUCED
-TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -227,11 +217,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedFor
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
-                         SigmoidFocalLossUnreducedForwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_fwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -245,11 +235,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedFor
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
-                         SigmoidFocalLossUnreducedForwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_fwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -263,13 +253,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedFo
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
-                         SigmoidFocalLossUnreducedForwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_fwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif
 
-#ifdef TEST_BWD_UNREDUCED
-TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -283,11 +271,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBa
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
-                         SigmoidFocalLossUnreducedBackwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_bwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -301,11 +289,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBa
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
-                         SigmoidFocalLossUnreducedBackwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_bwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -319,7 +307,6 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedB
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
-                         SigmoidFocalLossUnreducedBackwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_bwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif

From 2b16cdb1c4538cbef3e5279c885708fd564344b0 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 30 Jul 2024 11:48:20 +0700
Subject: [PATCH 09/28] update driver random bound

---
 driver/sigmoid_focal_loss_driver.hpp              | 15 ++++++++++++---
 .../forward_reduce_sigmoid_focal_loss.cpp         | 14 ++++++--------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index 603d1777e5..7b51e38fd4 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -498,14 +498,23 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
     workspace             = std::vector<Tgpu>(workSpaceElems, static_cast<Tgpu>(0));
     workspaceHost         = std::vector<Tcheck>(workSpaceElems, static_cast<Tcheck>(0));
 
+    float randomBound = 2;
+    // For half, the random bound is smaller to avoid half overflow
+    if(data_type == miopenHalf && reduction != MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        randomBound = 0.5;
+    }
     for(int i = 0; i < in_sz; i++)
     {
-        input[i]  = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
-        target[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
+        input[i] =
+            prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-randomBound), static_cast<Tgpu>(randomBound));
+        target[i] =
+            prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-randomBound), static_cast<Tgpu>(randomBound));
     }
     for(int i = 0; i < dO_sz; ++i)
     {
-        doutput[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-2), static_cast<Tgpu>(2));
+        doutput[i] =
+            prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-randomBound), static_cast<Tgpu>(randomBound));
     }
 
     if(input_dev->ToGPU(GetStream(), input.data()) != 0)
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
index a9abcf2e96..d3f874251f 100644
--- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -96,11 +96,9 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
             HipEventPtr start;
             HipEventPtr stop;
 
-            bool resetProfilingState = false;
-            const bool profiling     = handle_.IsProfilingEnabled();
+            const bool profiling = handle_.IsProfilingEnabled();
             if(profiling)
             {
-                resetProfilingState = true;
                 handle_.EnableProfiling(false);
                 start = miopen::make_hip_event();
                 stop  = miopen::make_hip_event();
@@ -149,18 +147,18 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
                 size = AlignUp(size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
             }
 
-            if(resetProfilingState)
-            {
-                handle_.EnableProfiling(true);
-            }
-
             if(profiling)
             {
                 hipEventRecord(stop.get(), handle_.GetStream());
                 hipEventSynchronize(stop.get());
                 hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                hipEventDestroy(start.get());
+                hipEventDestroy(stop.get());
                 handle_.ResetKernelTime();
                 handle_.AccumKernelTime(elapsed);
+
+                handle_.EnableProfiling(true);
             };
         };
     };

From ee5952a8db21d5a2ec712d924d359272346f71ba Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 30 Jul 2024 13:22:53 +0700
Subject: [PATCH 10/28] try revert back unit-test file to check pipeline

---
 test/gtest/sigmoid_focal_loss.cpp | 111 +++++++++++++++++-------------
 1 file changed, 62 insertions(+), 49 deletions(-)

diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp
index fa90ceb218..f2f6ec5d17 100644
--- a/test/gtest/sigmoid_focal_loss.cpp
+++ b/test/gtest/sigmoid_focal_loss.cpp
@@ -25,10 +25,15 @@
  *******************************************************************************/
 
 #include "sigmoid_focal_loss.hpp"
+#include "miopen/bfloat16.hpp"
 #include "tensor_holder.hpp"
-#include <miopen/bfloat16.hpp>
 #include <miopen/env.hpp>
 
+#define TEST_FWD_REDUCED
+#define TEST_BWD_REDUCED
+#define TEST_FWD_UNREDUCED
+#define TEST_BWD_UNREDUCED
+
 MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
 MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
 
@@ -44,58 +49,59 @@ std::string GetFloatArg()
     return tmp;
 }
 
-struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest<float>
+struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest<float>
 {
 };
 
-struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest<half>
+struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest<half>
 {
 };
 
-struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest<bfloat16>
+struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest<bfloat16>
 {
 };
 
-struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest<float>
+struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest<float>
 {
 };
 
-struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest<half>
+struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest<half>
 {
 };
 
-struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest<bfloat16>
+struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest<bfloat16>
 {
 };
 
-struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest<float>
+struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest<float>
 {
 };
 
-struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest<half>
+struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest<half>
 {
 };
 
-struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
+struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
 {
 };
 
-struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest<float>
+struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest<float>
 {
 };
 
-struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest<half>
+struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest<half>
 {
 };
 
-struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
+struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
 {
 };
 }; // namespace sigmoidfocalloss
 
 using namespace sigmoidfocalloss;
 
-TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test)
+#ifdef TEST_FWD_REDUCED
+TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -109,11 +115,11 @@ TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLoss_fwd_FP32,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestFloat32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test)
+TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -127,11 +133,11 @@ TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLoss_fwd_FP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test)
+TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -145,11 +151,13 @@ TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLoss_fwd_BFP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
+                         SigmoidFocalLossForwardTestBFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
 
-TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test)
+#ifdef TEST_BWD_REDUCED
+TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -163,11 +171,11 @@ TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLoss_bwd_FP32,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestFloat32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test)
+TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -181,11 +189,11 @@ TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLoss_bwd_FP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test)
+TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -199,11 +207,13 @@ TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLoss_bwd_BFP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
+                         SigmoidFocalLossBackwardTestBFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
 
-TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test)
+#ifdef TEST_FWD_UNREDUCED
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -217,11 +227,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLossUnreduced_fwd_FP32,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestFloat32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test)
+TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -235,11 +245,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLossUnreduced_fwd_FP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test)
+TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -253,11 +263,13 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLossUnreduced_fwd_BFP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
+                         SigmoidFocalLossUnreducedForwardTestBFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif
 
-TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test)
+#ifdef TEST_BWD_UNREDUCED
+TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -271,11 +283,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLossUnreduced_bwd_FP32,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestFloat32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test)
+TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -289,11 +301,11 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLossUnreduced_bwd_FP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test)
+TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -307,6 +319,7 @@ TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(Full,
-                         GPU_SigmoidFocalLossUnreduced_bwd_BFP16,
+INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
+                         SigmoidFocalLossUnreducedBackwardTestBFloat16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
+#endif

From 4fcf6897edf2e4d0459beb7239cad653d7a35252 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 30 Jul 2024 14:55:11 +0700
Subject: [PATCH 11/28] try __hip_ds_swizzlef_N

---
 src/kernels/warp_shuffle.hpp | 63 ++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp
index ebd5861976..5caf7d2ec4 100644
--- a/src/kernels/warp_shuffle.hpp
+++ b/src/kernels/warp_shuffle.hpp
@@ -52,21 +52,78 @@ __device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val)
     return val;
 }
 
+template <uint32_t WARP_SIZE, uint32_t SWIZZLE_SIZE = WARP_SIZE>
+__forceinline__ __device__ float reductionFullWarp(float reduced_val, uint32_t laneId)
+{
+    static_assert(WARP_SIZE != 0, "WARP_SIZEmust not be 0");
+    static_assert((SWIZZLE_SIZE & (SWIZZLE_SIZE - 1)) == 0,
+                  "WARP_SIZE and SWIZZLE must be a power of 2");
+    // Recursion over SWIZZLE_SIZE: a size of 1 is the base case and returns the input unchanged.
+    if constexpr(SWIZZLE_SIZE == 1)
+        return reduced_val;
+    // Reduce with half the swizzle size first; the step for this size is added afterwards.
+    reduced_val = reductionFullWarp<WARP_SIZE, (SWIZZLE_SIZE >> 1)>(reduced_val, laneId);
+
+    constexpr uint32_t warp_msk = (WARP_SIZE - 1);
+
+    float tmp;
+    if constexpr(SWIZZLE_SIZE >= 64)
+    {
+        // swizzle can handle only 32 lanes, so this size switches to bpermute
+        uint32_t idx = laneId ^ (SWIZZLE_SIZE >> 1);
+        // Read the own lane instead when idx reaches the next WARP_SIZE-aligned boundary.
+        idx = idx >= ((laneId + WARP_SIZE) & ~warp_msk) ? laneId : idx;
+        int itmp =
+            __builtin_amdgcn_ds_bpermute(static_cast<int>(idx << 2), __float_as_int(reduced_val));
+        tmp = __int_as_float(itmp);
+    }
+    else
+    {
+        // Butterfly reduction based on __shfl_xor.
+        // swizzle <xor_mask[14:10], or_mask[9:5], and_mask[4:0]>()
+        constexpr uint32_t xor_off = 10;
+        // constexpr uint32_t or_off  = 5;
+        constexpr uint32_t and_off = 0;
+
+        constexpr uint32_t field_msk = 0x1f;
+
+        constexpr uint32_t and_msk = warp_msk & field_msk;
+        // constexpr uint32_t or_msk  = 0;
+        constexpr uint32_t xor_msk = (SWIZZLE_SIZE >> 1) & field_msk;
+
+        // clang-tidy does not like that (or_msk << or_off) is zero
+        // and claims that it's redundant, but it's required for the
+        // __hip_ds_swizzlef_N reference. Meanwhile, swizzle_op generation
+        // must be a part of the hip intrinsics, because it depends on the ISA,
+        // like __hip_ds_swizzlef_N<xor_mask, or_mask, and_mask>.
+        // For some reason NOLINT doesn't work.
+        // NOLINTBEGIN
+        constexpr uint32_t swizzle_op =
+            (xor_msk << xor_off) /* | (or_msk << or_off) */ | (and_msk << and_off);
+        // NOLINTEND
+
+        tmp = __hip_ds_swizzlef_N<swizzle_op>(reduced_val);
+    }
+    // Add the value fetched through bpermute/swizzle to this lane's own partial sum.
+    return tmp + reduced_val;
+};
+
 __device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val)
 {
     static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize];
     auto lane = threadIdx.x % warpSize;
     auto wid  = threadIdx.x / warpSize;
 
-    val = warp_reduce_sum(val);
+    val = reductionFullWarp<warpSize>(val, lane);
 
     if(lane == 0)
         shared[wid] = val;
     __syncthreads();
 
     val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0;
-    if(wid == 0)
-        val = warp_reduce_sum(val);
+    // if(wid == 0)
+    //     val = warp_reduce_sum(val);
+    val = reductionFullWarp<warpSize>(val, lane);
 
     return val;
 }

From debd5301a9e1af5a6ccbc0782be555ddb9ba9bcd Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 30 Jul 2024 15:17:29 +0700
Subject: [PATCH 12/28] change unit-test format

---
 test/gtest/sigmoid_focal_loss.cpp | 111 +++++++++++++-----------------
 1 file changed, 49 insertions(+), 62 deletions(-)

diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp
index f2f6ec5d17..fa90ceb218 100644
--- a/test/gtest/sigmoid_focal_loss.cpp
+++ b/test/gtest/sigmoid_focal_loss.cpp
@@ -25,15 +25,10 @@
  *******************************************************************************/
 
 #include "sigmoid_focal_loss.hpp"
-#include "miopen/bfloat16.hpp"
 #include "tensor_holder.hpp"
+#include <miopen/bfloat16.hpp>
 #include <miopen/env.hpp>
 
-#define TEST_FWD_REDUCED
-#define TEST_BWD_REDUCED
-#define TEST_FWD_UNREDUCED
-#define TEST_BWD_UNREDUCED
-
 MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
 MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
 
@@ -49,59 +44,58 @@ std::string GetFloatArg()
     return tmp;
 }
 
-struct SigmoidFocalLossForwardTestFloat32 : SigmoidFocalLossFwdTest<float>
+struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest<float>
 {
 };
 
-struct SigmoidFocalLossForwardTestFloat16 : SigmoidFocalLossFwdTest<half>
+struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest<half>
 {
 };
 
-struct SigmoidFocalLossForwardTestBFloat16 : SigmoidFocalLossFwdTest<bfloat16>
+struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest<bfloat16>
 {
 };
 
-struct SigmoidFocalLossBackwardTestFloat32 : SigmoidFocalLossBwdTest<float>
+struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest<float>
 {
 };
 
-struct SigmoidFocalLossBackwardTestFloat16 : SigmoidFocalLossBwdTest<half>
+struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest<half>
 {
 };
 
-struct SigmoidFocalLossBackwardTestBFloat16 : SigmoidFocalLossBwdTest<bfloat16>
+struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest<bfloat16>
 {
 };
 
-struct SigmoidFocalLossUnreducedForwardTestFloat32 : SigmoidFocalLossUnreducedFwdTest<float>
+struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest<float>
 {
 };
 
-struct SigmoidFocalLossUnreducedForwardTestFloat16 : SigmoidFocalLossUnreducedFwdTest<half>
+struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest<half>
 {
 };
 
-struct SigmoidFocalLossUnreducedForwardTestBFloat16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
+struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
 {
 };
 
-struct SigmoidFocalLossUnreducedBackwardTestFloat32 : SigmoidFocalLossUnreducedBwdTest<float>
+struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest<float>
 {
 };
 
-struct SigmoidFocalLossUnreducedBackwardTestFloat16 : SigmoidFocalLossUnreducedBwdTest<half>
+struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest<half>
 {
 };
 
-struct SigmoidFocalLossUnreducedBackwardTestBFloat16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
+struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
 {
 };
 }; // namespace sigmoidfocalloss
 
 using namespace sigmoidfocalloss;
 
-#ifdef TEST_FWD_REDUCED
-TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
+TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -115,11 +109,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat32, SigmoidFocalLossForwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
-                         SigmoidFocalLossForwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_fwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
+TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -133,11 +127,11 @@ TEST_P(SigmoidFocalLossForwardTestFloat16, SigmoidFocalLossForwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
-                         SigmoidFocalLossForwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_fwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
+TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -151,13 +145,11 @@ TEST_P(SigmoidFocalLossForwardTestBFloat16, SigmoidFocalLossForwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossForwardTestSet,
-                         SigmoidFocalLossForwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_fwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif
 
-#ifdef TEST_BWD_REDUCED
-TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
+TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -171,11 +163,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat32, SigmoidFocalLossBackwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
-                         SigmoidFocalLossBackwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_bwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
+TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -189,11 +181,11 @@ TEST_P(SigmoidFocalLossBackwardTestFloat16, SigmoidFocalLossBackwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
-                         SigmoidFocalLossBackwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_bwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
+TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -207,13 +199,11 @@ TEST_P(SigmoidFocalLossBackwardTestBFloat16, SigmoidFocalLossBackwardTest)
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossBackwardTestSet,
-                         SigmoidFocalLossBackwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLoss_bwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif
 
-#ifdef TEST_FWD_UNREDUCED
-TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedForwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -227,11 +217,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat32, SigmoidFocalLossUnreducedFor
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
-                         SigmoidFocalLossUnreducedForwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_fwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedForwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -245,11 +235,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestFloat16, SigmoidFocalLossUnreducedFor
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
-                         SigmoidFocalLossUnreducedForwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_fwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedForwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -263,13 +253,11 @@ TEST_P(SigmoidFocalLossUnreducedForwardTestBFloat16, SigmoidFocalLossUnreducedFo
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedForwardTestSet,
-                         SigmoidFocalLossUnreducedForwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_fwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif
 
-#ifdef TEST_BWD_UNREDUCED
-TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBackwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
@@ -283,11 +271,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat32, SigmoidFocalLossUnreducedBa
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
-                         SigmoidFocalLossUnreducedBackwardTestFloat32,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_bwd_FP32,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBackwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
@@ -301,11 +289,11 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestFloat16, SigmoidFocalLossUnreducedBa
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
-                         SigmoidFocalLossUnreducedBackwardTestFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_bwd_FP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
 
-TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedBackwardTest)
+TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test)
 {
     if(!MIOPEN_TEST_ALL ||
        (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
@@ -319,7 +307,6 @@ TEST_P(SigmoidFocalLossUnreducedBackwardTestBFloat16, SigmoidFocalLossUnreducedB
     }
 };
 
-INSTANTIATE_TEST_SUITE_P(SigmoidFocalLossUnreducedBackwardTestSet,
-                         SigmoidFocalLossUnreducedBackwardTestBFloat16,
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_SigmoidFocalLossUnreduced_bwd_BFP16,
                          testing::ValuesIn(SigmoidFocalLossTestConfigs()));
-#endif

From 182ea0b86a794d0de1b5b416e1c25576ed7b4f59 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 31 Jul 2024 13:26:31 +0700
Subject: [PATCH 13/28] use MultiBufferWorkspaceTraits

---
 .../miopen/sigmoidfocalloss/solvers.hpp       |  3 ++
 .../forward_reduce_sigmoid_focal_loss.cpp     | 30 +++++++++++--------
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/include/miopen/sigmoidfocalloss/solvers.hpp b/src/include/miopen/sigmoidfocalloss/solvers.hpp
index 992ad5a9d6..9cb3bd15e8 100644
--- a/src/include/miopen/sigmoidfocalloss/solvers.hpp
+++ b/src/include/miopen/sigmoidfocalloss/solvers.hpp
@@ -50,6 +50,9 @@ struct SigmoidFocalLossFwd final : SigmoidFocalLossFwdSolverBase
                              const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription&
                                  problem) const override;
 
+    MultiBufferWorkspaceTraits GetMultiBufferWorkspaceTraits(
+        const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const;
+
     std::size_t
     GetWorkspaceSize(const ExecutionContext& context,
                      const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem)
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
index d3f874251f..f1a37fc54f 100644
--- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -24,6 +24,7 @@
  *
  *******************************************************************************/
 
+#include "miopen/buffer_info.hpp"
 #include <miopen/sigmoidfocalloss/problem_description.hpp>
 #include <miopen/miopen.h>
 #include <miopen/datatype.hpp>
@@ -36,7 +37,7 @@
 #include <miopen/tensor_view_utils.hpp>
 
 #define LOCAL_SIZE 256
-#define LOCAL_SIZE_REDUCE_FWD 256
+#define LOCAL_SIZE_REDUCE 256
 
 namespace miopen {
 
@@ -83,11 +84,11 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
     do
     {
         result.construction_params.push_back(make_hip_kernel(
-            {LOCAL_SIZE_REDUCE_FWD}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
-        _size = AlignUp(_size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
+            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
+        _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
     } while(_size > 1);
 
-    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+    result.invoker_factory = [this, problem](const std::vector<Kernel>& kernels) {
         return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
             decltype(auto) params = raw_params.CastTo<miopen::sigmoidfocalloss::FwdInvokeParams>();
             auto size             = deref(params.inputDesc).GetElementSize();
@@ -127,11 +128,11 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
             }
 
             /* Execute reduce kernels */
+            auto wt       = GetMultiBufferWorkspaceTraits(problem);
             auto reduceIn = params.workspace;
             auto reduceOut =
-                static_cast<Data_t>(static_cast<char*>(params.workspace) +
-                                    deref(params.inputDesc).GetElementSize() *
-                                        get_data_size(deref(params.outputDesc).GetType()));
+                static_cast<Data_t>(static_cast<char*>(params.workspace) + wt.GetOffset(1));
+
             for(int i = 1; i < kernels.size(); ++i)
             {
                 decltype(auto) kernel = handle_.Run(kernels[i]);
@@ -144,7 +145,7 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
                 {
                     kernel(reduceIn, params.output, size);
                 }
-                size = AlignUp(size, LOCAL_SIZE_REDUCE_FWD) / LOCAL_SIZE_REDUCE_FWD;
+                size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
             }
 
             if(profiling)
@@ -169,13 +170,18 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
 std::size_t SigmoidFocalLossFwd::GetWorkspaceSize(
     const ExecutionContext& /*context*/,
     const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
+{
+    return GetMultiBufferWorkspaceTraits(problem).GetSize();
+}
+
+MultiBufferWorkspaceTraits SigmoidFocalLossFwd::GetMultiBufferWorkspaceTraits(
+    const miopen::sigmoidfocalloss::SigmoidFocalLossFwdProblemDescription& problem) const
 {
     size_t inputElements  = problem.GetInputDesc().GetElementSize();
-    size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE_FWD - 1) / LOCAL_SIZE_REDUCE_FWD;
-    size_t res =
-        (inputElements + reduceElements) * get_data_size(problem.GetOutputDesc().GetType());
+    size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
+    size_t elementSize    = get_data_size(problem.GetOutputDesc().GetType());
 
-    return res;
+    return MultiBufferWorkspaceTraits{inputElements * elementSize, reduceElements * elementSize};
 }
 
 } // namespace sigmoidfocalloss

From 19c6390ea9fcb1363969612624a01a586722853d Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Mon, 5 Aug 2024 17:33:54 +0700
Subject: [PATCH 14/28] remove redundant files

---
 driver/driver.hpp |   3 +-
 rocfft_r2c_ex.cpp | 317 ----------------------------------------------
 2 files changed, 1 insertion(+), 319 deletions(-)
 delete mode 100644 rocfft_r2c_ex.cpp

diff --git a/driver/driver.hpp b/driver/driver.hpp
index 749ee16a17..1c4e59c371 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -206,8 +206,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
        arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" &&
        arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" &&
-       arg != "getitemfp16" && arg != "getitembfp16" && arg != "transformersadamwfp16" &&
-       arg != "transformersampadamw" && arg != "reducecalculation" &&
+       arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" &&
        arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" &&
        arg != "sigmoidfocalloss" && arg != "sigmoidfocallossfp16" &&
        arg != "sigmoidfocallossbfp16" && arg != "--version")
diff --git a/rocfft_r2c_ex.cpp b/rocfft_r2c_ex.cpp
deleted file mode 100644
index 8c17fac21b..0000000000
--- a/rocfft_r2c_ex.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE.
-
-#include <complex>
-#include <functional>
-#include <iostream>
-#include <numeric>
-#include <vector>
-
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_vector_types.h>
-#include <rocfft/rocfft.h>
-
-#include "../../../shared/CLI11.hpp"
-#include "examplekernels.h"
-#include "exampleutils.h"
-#include <stdexcept>
-
-int main(int argc, char* argv[])
-{
-    std::cout << "rocfft double-precision real/complex transform\n" << std::endl;
-
-    // Length of transform:
-    std::vector<size_t> length = {8};
-
-    // Gpu device id:
-    size_t deviceId = 0;
-
-    // Command-line options:
-    CLI::App app{"rocfft sample command line options"};
-    app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
-    CLI::Option* opt_outofplace =
-        app.add_flag("-o, --outofplace", "Perform an out-of-place transform");
-    CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform");
-    app.add_option(
-        "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)");
-
-    try
-    {
-        app.parse(argc, argv);
-    }
-    catch(const CLI::ParseError& e)
-    {
-        return app.exit(e);
-    }
-
-    // Placeness for the transform
-    if(rocfft_setup() != rocfft_status_success)
-        throw std::runtime_error("rocfft_setup failed.");
-    const rocfft_result_placement place =
-        *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace;
-    const bool inplace = place == rocfft_placement_inplace;
-
-    // Direction of transform
-    const rocfft_transform_type direction =
-        *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward;
-    const bool forward = direction == rocfft_transform_type_real_forward;
-
-    // Set up the strides and buffer size for the real values:
-    std::vector<size_t> rstride = {1};
-    for(unsigned int i = 1; i < length.size(); ++i)
-    {
-        // In-place transforms need space for two extra real values in the contiguous
-        // direction.
-        auto val = (length[i - 1] + ((inplace && i == 1) ? 2 : 0)) * rstride[i - 1];
-        rstride.push_back(val);
-    }
-    // NB: not tight, but hey
-    const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1];
-    std::vector<double> rdata(real_size); // host storage
-
-    // The complex data length is half + 1 of the real data length in the contiguous
-    // dimensions.  Since rocFFT is column-major, this is the first index.
-    std::vector<size_t> clength = length;
-    clength[0]                  = clength[0] / 2 + 1;
-    std::vector<size_t> cstride = {1};
-    for(unsigned int i = 1; i < clength.size(); ++i)
-    {
-        cstride.push_back(clength[i - 1] * cstride[i - 1]);
-    }
-    const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1];
-    std::vector<hipDoubleComplex> cdata(complex_size); // host storage
-
-    // Based on the direction, we set the input and output parameters appropriately.
-    const size_t isize  = forward ? real_size : complex_size;
-    const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex));
-    const std::vector<size_t> ilength = forward ? length : clength;
-    const std::vector<size_t> istride = forward ? rstride : cstride;
-
-    const size_t osize  = forward ? complex_size : real_size;
-    const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double));
-    const std::vector<size_t> olength = forward ? clength : length;
-    const std::vector<size_t> ostride = forward ? cstride : rstride;
-
-    // Print information about the transform:
-    std::cout << "direction: ";
-    if(forward)
-        std::cout << "forward\n";
-    else
-        std::cout << "inverse\n";
-    std::cout << "length:";
-    for(const auto i : length)
-        std::cout << " " << i;
-    std::cout << "\n";
-    if(inplace)
-        std::cout << "in-place transform\n";
-    else
-        std::cout << "out-of-place transform\n";
-    std::cout << "deviceID: " << deviceId << "\n";
-    std::cout << "input length:";
-    for(auto i : ilength)
-        std::cout << " " << i;
-    std::cout << "\n";
-    std::cout << "input buffer stride:";
-    for(auto i : istride)
-        std::cout << " " << i;
-    std::cout << "\n";
-    std::cout << "input buffer size: " << ibytes << "\n";
-
-    std::cout << "output length:";
-    for(auto i : olength)
-        std::cout << " " << i;
-    std::cout << "\n";
-    std::cout << "output buffer stride:";
-    for(auto i : ostride)
-        std::cout << " " << i;
-    std::cout << "\n";
-    std::cout << "output buffer size: " << obytes << "\n";
-    std::cout << std::endl;
-
-    // Set the device:
-    if(hipSetDevice(deviceId) != hipSuccess)
-        throw std::runtime_error("hipSetDevice failed.");
-
-    // Create HIP device object and initialize data
-    // Kernels are provided in examplekernels.h
-    void* gpu_in          = nullptr;
-    hipError_t hip_status = hipMalloc(&gpu_in, inplace ? std::max(ibytes, obytes) : ibytes);
-    if(hip_status != hipSuccess)
-        throw std::runtime_error("device error");
-
-    if(forward)
-    {
-        initreal_cm(length, istride, gpu_in);
-    }
-    else
-    {
-        init_hermitiancomplex_cm(length, ilength, istride, gpu_in);
-    }
-
-    // Print the input:
-    std::cout << "input:\n";
-    if(forward)
-    {
-        hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
-        if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMemcpy failed.");
-        printbuffer_cm(rdata, ilength, istride, 1, isize);
-    }
-    else
-    {
-        hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
-        if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMemcpy failed.");
-        printbuffer_cm(cdata, ilength, istride, 1, isize);
-
-        // Check that the buffer is Hermitian symmetric:
-        check_symmetry_cm(cdata, length, istride, 1, isize);
-    }
-
-    // rocfft_status can be used to capture API status info
-    rocfft_status rc = rocfft_status_success;
-
-    // Create the a descrition struct to set data layout:
-    rocfft_plan_description gpu_description = nullptr;
-    rc                                      = rocfft_plan_description_create(&gpu_description);
-    if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to create plan description");
-
-    rc = rocfft_plan_description_set_data_layout(
-        gpu_description,
-        // input data format:
-        forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
-        // output data format:
-        forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
-        nullptr,
-        nullptr,
-        istride.size(), // input stride length
-        istride.data(), // input stride data
-        0,              // input batch distance
-        ostride.size(), // output stride length
-        ostride.data(), // output stride data
-        0);             // ouptut batch distance
-    if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to set data layout");
-
-    // We can also pass "nullptr" instead of a description; rocFFT will use reasonable
-    // default parameters.  If the data isn't contiguous, we need to set strides, etc,
-    // using the description.
-
-    // Create the FFT plan:
-    rocfft_plan gpu_plan = nullptr;
-    rc                   = rocfft_plan_create(&gpu_plan,
-                            place,
-                            direction,
-                            rocfft_precision_double,
-                            length.size(),    // Dimension
-                            length.data(),    // lengths
-                            1,                // Number of transforms
-                            gpu_description); // Description
-    if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to create plan");
-
-    // Get the execution info for the fft plan (in particular, work memory requirements):
-    rocfft_execution_info planinfo = nullptr;
-    rc                             = rocfft_execution_info_create(&planinfo);
-    if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to create execution info");
-
-    size_t workbuffersize = 0;
-    rc                    = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize);
-    if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to get work buffer size");
-
-    // If the transform requires work memory, allocate a work buffer:
-    void* wbuffer = nullptr;
-    if(workbuffersize > 0)
-    {
-        hip_status = hipMalloc(&wbuffer, workbuffersize);
-        if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMalloc failed");
-
-        rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize);
-        if(rc != rocfft_status_success)
-            throw std::runtime_error("failed to set work buffer");
-    }
-
-    // If the transform is out-of-place, allocate the output buffer as well:
-    void* gpu_out = inplace ? gpu_in : nullptr;
-    if(!inplace)
-    {
-        hip_status = hipMalloc(&gpu_out, obytes);
-        if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMalloc failed");
-    }
-
-    // Execute the GPU transform:
-    rc = rocfft_execute(gpu_plan,         // plan
-                        (void**)&gpu_in,  // in_buffer
-                        (void**)&gpu_out, // out_buffer
-                        planinfo);        // execution info
-    if(rc != rocfft_status_success)
-        throw std::runtime_error("failed to execute");
-
-    // Get the output from the device and print to cout:
-    std::cout << "output:\n";
-    if(forward)
-    {
-        hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
-        if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMemcpy failed.");
-        printbuffer_cm(cdata, olength, ostride, 1, osize);
-    }
-    else
-    {
-        hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
-        if(hip_status != hipSuccess)
-            throw std::runtime_error("hipMemcpy failed.");
-        printbuffer_cm(rdata, olength, ostride, 1, osize);
-    }
-
-    // Clean up: free GPU memory:
-    if(hipFree(gpu_in) != hipSuccess)
-        throw std::runtime_error("hipFree failed.");
-
-    if(!inplace)
-    {
-        if(hipFree(gpu_out) != hipSuccess)
-            throw std::runtime_error("hipFree failed.");
-    }
-    if(wbuffer != nullptr)
-    {
-        if(hipFree(wbuffer) != hipSuccess)
-            throw std::runtime_error("hipFree failed.");
-    }
-
-    // Clean up: destroy plans:
-    if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
-        throw std::runtime_error("rocfft_execution_info_destroy failed.");
-    planinfo = nullptr;
-    if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
-        throw std::runtime_error("rocfft_plan_description_destroy failed.");
-    gpu_description = nullptr;
-    if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
-        throw std::runtime_error("rocfft_plan_destroy failed.");
-    gpu_plan = nullptr;
-
-    rocfft_cleanup();
-    return 0;
-}

From ae2ee253d3e2fa531c5e611846dae5e5ebdcc07b Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Mon, 5 Aug 2024 21:22:36 +0700
Subject: [PATCH 15/28] rollback
 src/include/miopen/solver/implicitgemm_ck_util.hpp

---
 .../miopen/solver/implicitgemm_ck_util.hpp    | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp
index abdd171227..ff25d5f622 100644
--- a/src/include/miopen/solver/implicitgemm_ck_util.hpp
+++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -680,7 +680,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
         internal::MakeTaggedTransposeInstances<CKArgsType>(
             result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des);
 
-    result.invoker_factory = [split_k,
+    result.invoker_factory = [split_k             = split_k,
                               ck_args             = std::move(ck_args),
                               sh_conv_ptr         = std::shared_ptr{std::move(*ptr_iter)},
                               input1_tr_inst      = std::move(_input1_tr_inst),
@@ -689,7 +689,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                               output_init_tr_inst = std::move(_output_init_tr_inst),
                               ck_buff_des =
                                   _ck_buff_des](const std::vector<Kernel>& kernels) mutable {
-        return [split_k,
+        return [split_k = split_k,
                 kernels,
                 ck_args             = std::move(ck_args),
                 sh_conv_ptr         = std::move(sh_conv_ptr),
@@ -697,8 +697,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                 input2_tr_inst      = std::move(input2_tr_inst),
                 output_tr_inst      = std::move(output_tr_inst),
                 output_init_tr_inst = std::move(output_init_tr_inst),
-                ck_buff_des](const Handle& handle,
-                             const AnyInvokeParams& primitive_parameters) mutable {
+                ck_buff_des         = ck_buff_des](const Handle& handle,
+                                           const AnyInvokeParams& primitive_parameters) mutable {
             handle.ResetKernelTime();
 
             const auto& data_ctx = primitive_parameters.CastTo<CastType>();
@@ -826,17 +826,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&,
         [[maybe_unused]] bool should_allocated_wrw_buffer =
             ShouldAllocateWorkSpaceBufferForWRW(problem);
 
-        result.invoker_factory = [split_k,
-                                  ck_args = CKArgsType{problem},
-                                  alpha_beta_case,
-                                  should_allocated_wrw_buffer,
+        result.invoker_factory = [split_k                     = split_k,
+                                  ck_args                     = CKArgsType{problem},
+                                  alpha_beta_case             = alpha_beta_case,
+                                  should_allocated_wrw_buffer = should_allocated_wrw_buffer,
                                   sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](
                                      const std::vector<Kernel>&) mutable {
-            return [split_k,
-                    ck_args = std::move(ck_args),
-                    alpha_beta_case,
-                    should_allocated_wrw_buffer,
-                    sh_conv_ptr = std::move(sh_conv_ptr)](
+            return [split_k                     = split_k,
+                    ck_args                     = std::move(ck_args),
+                    alpha_beta_case             = alpha_beta_case,
+                    should_allocated_wrw_buffer = should_allocated_wrw_buffer,
+                    sh_conv_ptr                 = std::move(sh_conv_ptr)](
                        const Handle& handle, const AnyInvokeParams& primitive_parameters) {
                 const auto& data_ctx = primitive_parameters.CastTo<CastType>();
                 std::unique_ptr<ck::tensor_operation::device::BaseArgument> argument_ptr;

From c1c602c0f6f18846c2c30fd1ce7119b3737a1bde Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 6 Aug 2024 11:25:22 +0700
Subject: [PATCH 16/28] revert warp_shuffle using shfl_down

---
 src/kernels/warp_shuffle.hpp | 68 +++++-------------------------------
 1 file changed, 8 insertions(+), 60 deletions(-)

diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp
index 5caf7d2ec4..c1b53ea565 100644
--- a/src/kernels/warp_shuffle.hpp
+++ b/src/kernels/warp_shuffle.hpp
@@ -24,6 +24,9 @@
  *
  *******************************************************************************/
 
+#ifndef GUARD_WARP_SHUFFLE_HPP
+#define GUARD_WARP_SHUFFLE_HPP
+
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
@@ -52,78 +55,23 @@ __device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val)
     return val;
 }
 
-template <uint32_t WARP_SIZE, uint32_t SWIZZLE_SIZE = WARP_SIZE>
-__forceinline__ __device__ float reductionFullWarp(float reduced_val, uint32_t laneId)
-{
-    static_assert(WARP_SIZE != 0, "WARP_SIZEmust not be 0");
-    static_assert((SWIZZLE_SIZE & (SWIZZLE_SIZE - 1)) == 0,
-                  "WARP_SIZE and SWIZZLE must be a power of 2");
-
-    if constexpr(SWIZZLE_SIZE == 1)
-        return reduced_val;
-
-    reduced_val = reductionFullWarp<WARP_SIZE, (SWIZZLE_SIZE >> 1)>(reduced_val, laneId);
-
-    constexpr uint32_t warp_msk = (WARP_SIZE - 1);
-
-    float tmp;
-    if constexpr(SWIZZLE_SIZE >= 64)
-    {
-        // swizzle can handle only 32 lanes, switching to bpermute
-        uint32_t idx = laneId ^ (SWIZZLE_SIZE >> 1);
-
-        idx = idx >= ((laneId + WARP_SIZE) & ~warp_msk) ? laneId : idx;
-        int itmp =
-            __builtin_amdgcn_ds_bpermute(static_cast<int>(idx << 2), __float_as_int(reduced_val));
-        tmp = __int_as_float(itmp);
-    }
-    else
-    {
-        // butterfly reduction based on __shfl_xor
-        // swizzle <xor_mask[14:10], or_mask[9:5], and_mask[4:0]>()
-        constexpr uint32_t xor_off = 10;
-        // constexpr uint32_t or_off  = 5;
-        constexpr uint32_t and_off = 0;
-
-        constexpr uint32_t field_msk = 0x1f;
-
-        constexpr uint32_t and_msk = warp_msk & field_msk;
-        // constexpr uint32_t or_msk  = 0;
-        constexpr uint32_t xor_msk = (SWIZZLE_SIZE >> 1) & field_msk;
-
-        // clang tidy does not like that (or_msk << or_off) is zero
-        // and cliams that it's redundant, but it's required for
-        // __hip_ds_swizzlef_N reference. Menawhile swizzle_op generation
-        // must be a part of hip intrinsics, because it depends on ISA
-        // like __hip_ds_swizzlef_N<xor_mask, or_mask, and_mask>
-        // For some reason NILINT doesn't work.
-        // NOLINTBEGIN
-        constexpr uint32_t swizzle_op =
-            (xor_msk << xor_off) /* | (or_msk << or_off) */ | (and_msk << and_off);
-        // NOLINTEND
-
-        tmp = __hip_ds_swizzlef_N<swizzle_op>(reduced_val);
-    }
-
-    return tmp + reduced_val;
-};
-
 __device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val)
 {
     static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize];
     auto lane = threadIdx.x % warpSize;
     auto wid  = threadIdx.x / warpSize;
 
-    val = reductionFullWarp<warpSize>(val, lane);
+    val = warp_reduce_sum(val);
 
     if(lane == 0)
         shared[wid] = val;
     __syncthreads();
 
     val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0;
-    // if(wid == 0)
-    //     val = warp_reduce_sum(val);
-    val = reductionFullWarp<warpSize>(val, lane);
+    if(wid == 0)
+        val = warp_reduce_sum(val);
 
     return val;
 }
+
+#endif // GUARD_WARP_SHUFFLE_HPP

From edcd7e7018a4745c58b0d4a48a8c479c0f6d9e64 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 6 Aug 2024 16:41:54 +0700
Subject: [PATCH 17/28] include header in .cpp file

---
 src/include/miopen/sigmoid_focal_loss.hpp | 1 +
 src/sigmoid_focal_loss.cpp                | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/include/miopen/sigmoid_focal_loss.hpp b/src/include/miopen/sigmoid_focal_loss.hpp
index cbb6dff65d..353c8b479a 100644
--- a/src/include/miopen/sigmoid_focal_loss.hpp
+++ b/src/include/miopen/sigmoid_focal_loss.hpp
@@ -26,6 +26,7 @@
 #ifndef MIOPEN_SIGMOID_FOCAL_LOSS_HPP_
 #define MIOPEN_SIGMOID_FOCAL_LOSS_HPP_
 
+#include <miopen/miopen.h>
 #include <miopen/common.hpp>
 
 namespace miopen {
diff --git a/src/sigmoid_focal_loss.cpp b/src/sigmoid_focal_loss.cpp
index e1123a799c..3858f0a918 100644
--- a/src/sigmoid_focal_loss.cpp
+++ b/src/sigmoid_focal_loss.cpp
@@ -25,6 +25,7 @@
  *******************************************************************************/
 
 #include <miopen/miopen.h>
+#include <miopen/sigmoid_focal_loss.hpp>
 #include <miopen/sigmoidfocalloss/invoke_params.hpp>
 #include <miopen/sigmoidfocalloss/problem_description.hpp>
 #include <miopen/sigmoidfocalloss/solvers.hpp>

From 091aa5b8b8345cb8ffcb7a80ef42745703d53ad1 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Thu, 8 Aug 2024 16:20:04 +0700
Subject: [PATCH 18/28] merge duplicate code to validate in CPU and driver

---
 driver/mloSigmoidFocalLossHost.hpp   | 163 +++++++++++++
 driver/sigmoid_focal_loss_driver.hpp | 327 +++------------------------
 test/cpu_sigmoid_focal_loss.hpp      | 146 +++---------
 test/gtest/sigmoid_focal_loss.hpp    |  36 ++-
 4 files changed, 252 insertions(+), 420 deletions(-)
 create mode 100644 driver/mloSigmoidFocalLossHost.hpp

diff --git a/driver/mloSigmoidFocalLossHost.hpp b/driver/mloSigmoidFocalLossHost.hpp
new file mode 100644
index 0000000000..555c0b4e88
--- /dev/null
+++ b/driver/mloSigmoidFocalLossHost.hpp
@@ -0,0 +1,163 @@
+#include <miopen/miopen.h>
+#include <miopen/tensor_view_utils.hpp>
+
+template <typename Tgpu, typename Tcheck>
+void mloSigmoidFocalLossFwdRunHost(Tgpu* input,
+                                   miopenTensorDescriptor_t inputDesc,
+                                   Tgpu* target,
+                                   miopenTensorDescriptor_t targetDesc,
+                                   Tcheck* outputHost,
+                                   miopenTensorDescriptor_t outputDesc,
+                                   Tcheck* workspaceHost,
+                                   float alpha,
+                                   float gamma,
+                                   miopenLossReductionMode_t reduction,
+                                   float divisor)
+{
+    auto input_tv    = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto target_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
+    auto output_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc));
+    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        Tcheck i = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
+        Tcheck t = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
+
+        Tcheck sig    = 1 / (1 + exp(-i));
+        Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
+        Tcheck sigT   = sig * t + (1 - sig) * (1 - t);
+        Tcheck loss   = ceLoss * pow(1 - sigT, gamma);
+
+        if(alpha >= 0)
+        {
+            Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t);
+            loss          = alphaT * loss;
+        }
+
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            outputHost[output_tv.get_tensor_view_idx(idx)] = loss;
+        }
+        else
+        {
+            workspaceHost[id] = static_cast<Tcheck>(loss / divisor);
+        }
+    }
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+        return;
+
+    // Reduce loss
+    const int local_size = 256;
+    int offset_a         = 0;
+    int offset_b         = inputSize;
+    size_t _size         = inputSize;
+    do
+    {
+        for(int i = 0; i < _size; i += local_size)
+        {
+            Tcheck shared[local_size];
+            for(int j = 0; j < local_size; ++j)
+                shared[j] = i + j < _size ? workspaceHost[offset_a + i + j] : 0.0f;
+            for(int offset = local_size / 2; offset > 0; offset >>= 1)
+                for(int j = 0; j < offset; ++j)
+                    shared[j] += shared[j + offset];
+            if(_size <= local_size)
+                outputHost[0] = shared[0];
+            else
+                workspaceHost[offset_b + i / local_size] = shared[0];
+        }
+        std::swap(offset_a, offset_b);
+        _size = (_size + local_size - 1) / local_size;
+    } while(_size > 1);
+}
+
+template <typename Tgpu, typename Tcheck>
+void mloSigmoidFocalLossBwdRunHost(Tgpu* input,
+                                   miopenTensorDescriptor_t inputDesc,
+                                   Tgpu* target,
+                                   miopenTensorDescriptor_t targetDesc,
+                                   Tgpu* doutput,
+                                   miopenTensorDescriptor_t doutputDesc,
+                                   Tcheck* dinput,
+                                   miopenTensorDescriptor_t dinputDesc,
+                                   Tcheck* dtarget,
+                                   miopenTensorDescriptor_t dtargetDesc,
+                                   float alpha,
+                                   float gamma,
+                                   miopenLossReductionMode_t reduction,
+                                   float divisor)
+{
+    auto input_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
+    auto target_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
+    auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc));
+    auto dinput_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc));
+    auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc));
+
+    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
+
+    tensor_layout_t<5> doIdx(input_tv, 0);
+    Tcheck dO = static_cast<Tcheck>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
+
+    for(size_t id = 0; id < inputSize; ++id)
+    {
+        tensor_layout_t<5> idx(input_tv, id);
+
+        Tcheck i = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
+        Tcheck t = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            dO = static_cast<Tcheck>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
+        }
+
+        Tcheck p       = 1 / (1 + exp(-i));
+        Tcheck ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
+        Tcheck pT      = p * t + (1 - p) * (1 - t);
+        Tcheck powPt   = pow(1 - pT, gamma);
+        Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t);
+
+        if(dinput)
+        {
+            Tcheck dpdi      = exp(-i) / pow(1 + exp(-i), 2);
+            Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
+            Tcheck dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
+
+            // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
+            Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
+            Tcheck grad = dO * dLdi;
+
+            if(alpha >= 0)
+            {
+                grad *= alpha_t;
+            }
+            if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+            {
+                grad /= divisor;
+            }
+            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(grad);
+        }
+
+        if(dtarget)
+        {
+            Tcheck dcelossdt = -log(p) + log(1 - p);
+            Tcheck dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
+            // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
+            Tcheck dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
+            Tcheck gradTarget = dO * dLdt;
+
+            if(alpha >= 0)
+            {
+                // alpha_t * dL/dt + dalpha_t/dt * dL
+                gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
+            }
+            if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+            {
+                gradTarget /= divisor;
+            }
+            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(gradTarget);
+        }
+    }
+}
diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index 7b51e38fd4..a6ee50fbd1 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -28,259 +28,15 @@
 #include "InputFlags.hpp"
 #include "driver.hpp"
 #include <miopen/errors.hpp>
-#include <miopen/tensor_view_utils.hpp>
 #include <miopen/miopen.h>
 #include "tensor_driver.hpp"
 #include "timer.hpp"
+#include "mloSigmoidFocalLossHost.hpp"
 #include <../test/tensor_holder.hpp>
 #include <../test/verify.hpp>
 #include <cmath>
 #include <vector>
 
-template <typename Tgpu, typename Tcheck>
-void mloSigmoidFocalLossUnreducedFwdRunHost(Tgpu* input,
-                                            miopenTensorDescriptor_t inputDesc,
-                                            Tgpu* target,
-                                            miopenTensorDescriptor_t targetDesc,
-                                            Tcheck* outputHost,
-                                            miopenTensorDescriptor_t outputDesc,
-                                            float alpha = 0.25,
-                                            float gamma = 2)
-{
-    auto input_tv    = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
-    auto target_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
-    auto output_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc));
-    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
-
-    for(size_t id = 0; id < inputSize; ++id)
-    {
-        tensor_layout_t<5> idx(input_tv, id);
-
-        Tcheck i = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
-        Tcheck t = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
-
-        Tcheck sig    = 1 / (1 + exp(-i));
-        Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
-        Tcheck sigT   = sig * t + (1 - sig) * (1 - t);
-        Tcheck loss   = ceLoss * pow(1 - sigT, gamma);
-
-        if(alpha >= 0)
-        {
-            Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t);
-            loss          = alphaT * loss;
-        }
-
-        outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(loss);
-    }
-}
-
-template <typename Tgpu, typename Tcheck>
-void mloSigmoidFocalLossUnreducedBwdRunHost(Tgpu* input,
-                                            miopenTensorDescriptor_t inputDesc,
-                                            Tgpu* target,
-                                            miopenTensorDescriptor_t targetDesc,
-                                            Tgpu* doutput,
-                                            miopenTensorDescriptor_t doutputDesc,
-                                            Tcheck* dinput,
-                                            miopenTensorDescriptor_t dinputDesc,
-                                            Tcheck* dtarget,
-                                            miopenTensorDescriptor_t dtargetDesc,
-                                            float alpha = 0.25,
-                                            float gamma = 2)
-{
-    auto input_tv    = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
-    auto target_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
-    auto doutput_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc));
-    auto dinput_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc));
-    auto dtarget_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc));
-    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
-
-    for(size_t id = 0; id < inputSize; ++id)
-    {
-        tensor_layout_t<5> idx(input_tv, id);
-
-        Tcheck i  = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
-        Tcheck t  = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
-        Tcheck dO = static_cast<Tcheck>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
-
-        Tcheck p       = 1 / (1 + exp(-i));
-        Tcheck ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
-        Tcheck pT      = p * t + (1 - p) * (1 - t);
-        Tcheck powPt   = pow(1 - pT, gamma);
-        Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t);
-
-        if(dinput)
-        {
-            Tcheck dpdi      = exp(-i) / pow(1 + exp(-i), 2);
-            Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
-            Tcheck dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
-
-            // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
-            Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
-            Tcheck grad = dO * dLdi;
-
-            if(alpha >= 0)
-            {
-                grad *= alpha_t;
-            }
-            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(grad);
-        }
-
-        if(dtarget)
-        {
-            Tcheck dcelossdt = -log(p) + log(1 - p);
-            Tcheck dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
-            // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
-            Tcheck dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
-            Tcheck gradTarget = dO * dLdt;
-
-            if(alpha >= 0)
-            {
-                // alpha_t * dL/dt + dalpha_t/dt * dL
-                gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
-            }
-            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(gradTarget);
-        }
-    }
-}
-
-template <typename Tgpu, typename Tcheck>
-void mloSigmoidFocalLossFwdRunHost(Tgpu* input,
-                                   miopenTensorDescriptor_t inputDesc,
-                                   Tgpu* target,
-                                   miopenTensorDescriptor_t targetDesc,
-                                   Tcheck* workspaceHost,
-                                   Tcheck* outputHost,
-                                   float alpha   = 0.25,
-                                   float gamma   = 2,
-                                   float divisor = 1)
-{
-    auto input_tv    = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
-    auto target_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
-    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
-
-    for(size_t id = 0; id < inputSize; ++id)
-    {
-        tensor_layout_t<5> idx(input_tv, id);
-
-        Tcheck i = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
-        Tcheck t = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
-
-        Tcheck sig    = 1 / (1 + exp(-i));
-        Tcheck ceLoss = -(t * log(sig) + (1 - t) * log(1 - sig));
-        Tcheck sigT   = sig * t + (1 - sig) * (1 - t);
-        Tcheck loss   = ceLoss * pow(1 - sigT, gamma);
-
-        if(alpha >= 0)
-        {
-            Tcheck alphaT = alpha * t + (1 - alpha) * (1 - t);
-            loss          = alphaT * loss;
-        }
-
-        workspaceHost[id] = static_cast<Tcheck>(loss / divisor);
-    }
-
-    // Reduce loss
-    const int local_size = 256;
-    int offset_a         = 0;
-    int offset_b         = inputSize;
-    size_t _size         = inputSize;
-    do
-    {
-        for(int i = 0; i < _size; i += local_size)
-        {
-            Tcheck shared[local_size];
-            for(int j = 0; j < local_size; ++j)
-                shared[j] = i + j < _size ? workspaceHost[offset_a + i + j] : 0.0f;
-            for(int offset = local_size / 2; offset > 0; offset >>= 1)
-                for(int j = 0; j < offset; ++j)
-                    shared[j] += shared[j + offset];
-            if(_size <= local_size)
-                outputHost[0] = shared[0];
-            else
-                workspaceHost[offset_b + i / local_size] = shared[0];
-        }
-        std::swap(offset_a, offset_b);
-        _size = (_size + local_size - 1) / local_size;
-    } while(_size > 1);
-}
-
-template <typename Tgpu, typename Tcheck>
-void mloSigmoidFocalLossBwdRunHost(Tgpu* input,
-                                   miopenTensorDescriptor_t inputDesc,
-                                   Tgpu* target,
-                                   miopenTensorDescriptor_t targetDesc,
-                                   Tgpu* doutput,
-                                   miopenTensorDescriptor_t doutputDesc,
-                                   Tcheck* dinput,
-                                   miopenTensorDescriptor_t dinputDesc,
-                                   Tcheck* dtarget,
-                                   miopenTensorDescriptor_t dtargetDesc,
-                                   float alpha   = 0.25,
-                                   float gamma   = 2,
-                                   float divisor = 1)
-{
-    auto input_tv   = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc));
-    auto target_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(targetDesc));
-    auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc));
-    auto dinput_tv  = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc));
-    auto dtarget_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dtargetDesc));
-
-    size_t inputSize = miopen::deref(inputDesc).GetElementSize();
-
-    tensor_layout_t<5> doIdx(input_tv, 0);
-
-    for(size_t id = 0; id < inputSize; ++id)
-    {
-        tensor_layout_t<5> idx(input_tv, id);
-
-        Tcheck i  = static_cast<Tcheck>(input[input_tv.get_tensor_view_idx(idx)]);
-        Tcheck t  = static_cast<Tcheck>(target[target_tv.get_tensor_view_idx(idx)]);
-        Tcheck dO = static_cast<Tcheck>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
-
-        Tcheck p       = 1 / (1 + exp(-i));
-        Tcheck ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
-        Tcheck pT      = p * t + (1 - p) * (1 - t);
-        Tcheck powPt   = pow(1 - pT, gamma);
-        Tcheck alpha_t = alpha * t + (1 - alpha) * (1 - t);
-
-        if(dinput)
-        {
-            Tcheck dpdi      = exp(-i) / pow(1 + exp(-i), 2);
-            Tcheck dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
-            Tcheck dpowptdi  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
-
-            // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
-            Tcheck dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
-            Tcheck grad = dO * dLdi;
-
-            if(alpha >= 0)
-            {
-                grad *= alpha_t;
-            }
-            grad /= divisor;
-            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(grad);
-        }
-
-        if(dtarget)
-        {
-            Tcheck dcelossdt = -log(p) + log(1 - p);
-            Tcheck dpowptdt  = gamma * pow(1 - pT, gamma - 1) * (1 - 2 * p);
-            // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
-            Tcheck dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
-            Tcheck gradTarget = dO * dLdt;
-
-            if(alpha >= 0)
-            {
-                // alpha_t * dL/dt + dalpha_t/dt * dL
-                gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
-            }
-            gradTarget /= divisor;
-            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<Tcheck>(gradTarget);
-        }
-    }
-}
-
 template <typename Tgpu, typename Tcheck>
 class SigmoidFocalLossDriver : public Driver
 {
@@ -595,30 +351,17 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunForwardGPU()
 template <typename Tgpu, typename Tcheck>
 int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunForwardCPU()
 {
-    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
-    {
-        mloSigmoidFocalLossUnreducedFwdRunHost<Tgpu, Tcheck>(input.data(),
-                                                             inputDesc,
-                                                             target.data(),
-                                                             targetDesc,
-                                                             outputHost.data(),
-                                                             outputDesc,
-                                                             alpha,
-                                                             gamma);
-    }
-    else
-    {
-        mloSigmoidFocalLossFwdRunHost<Tgpu, Tcheck>(input.data(),
-                                                    inputDesc,
-                                                    target.data(),
-                                                    targetDesc,
-                                                    workspaceHost.data(),
-                                                    outputHost.data(),
-                                                    alpha,
-                                                    gamma,
-                                                    divisor);
-    }
-
+    mloSigmoidFocalLossFwdRunHost<Tgpu, Tcheck>(input.data(),
+                                                inputDesc,
+                                                target.data(),
+                                                targetDesc,
+                                                outputHost.data(),
+                                                outputDesc,
+                                                workspaceHost.data(),
+                                                alpha,
+                                                gamma,
+                                                reduction,
+                                                divisor);
     return miopenStatusSuccess;
 }
 
@@ -693,38 +436,20 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunBackwardCPU()
     {
         p_dtarget = dtargetHost.data();
     }
-    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
-    {
-
-        mloSigmoidFocalLossUnreducedBwdRunHost<Tgpu, Tcheck>(input.data(),
-                                                             inputDesc,
-                                                             target.data(),
-                                                             targetDesc,
-                                                             doutput.data(),
-                                                             doutputDesc,
-                                                             dinputHost.data(),
-                                                             dinputDesc,
-                                                             p_dtarget,
-                                                             dtargetDesc,
-                                                             alpha,
-                                                             gamma);
-    }
-    else
-    {
-        mloSigmoidFocalLossBwdRunHost<Tgpu, Tcheck>(input.data(),
-                                                    inputDesc,
-                                                    target.data(),
-                                                    targetDesc,
-                                                    doutput.data(),
-                                                    doutputDesc,
-                                                    dinputHost.data(),
-                                                    dinputDesc,
-                                                    p_dtarget,
-                                                    dtargetDesc,
-                                                    alpha,
-                                                    gamma,
-                                                    divisor);
-    }
+    mloSigmoidFocalLossBwdRunHost<Tgpu, Tcheck>(input.data(),
+                                                inputDesc,
+                                                target.data(),
+                                                targetDesc,
+                                                doutput.data(),
+                                                doutputDesc,
+                                                dinputHost.data(),
+                                                dinputDesc,
+                                                p_dtarget,
+                                                dtargetDesc,
+                                                alpha,
+                                                gamma,
+                                                reduction,
+                                                divisor);
 
     return miopenStatusSuccess;
 }
diff --git a/test/cpu_sigmoid_focal_loss.hpp b/test/cpu_sigmoid_focal_loss.hpp
index 3b13b955e3..f1df613b54 100644
--- a/test/cpu_sigmoid_focal_loss.hpp
+++ b/test/cpu_sigmoid_focal_loss.hpp
@@ -1,16 +1,20 @@
 #pragma once
 
+#include "miopen/miopen.h"
 #include "tensor_holder.hpp"
 #include "tensor_view.hpp"
 #include <miopen/tensor_view_utils.hpp>
 #include <cmath>
 
 template <class TIO>
-void cpu_sigmoid_focal_loss_unreduced_forward(tensor<TIO> input,
-                                              tensor<TIO> target,
-                                              tensor<TIO>& outputHost,
-                                              float alpha = 0.25,
-                                              float gamma = 2)
+void cpu_sigmoid_focal_loss_forward(tensor<TIO> input,
+                                    tensor<TIO> target,
+                                    tensor<TIO>& workspace,
+                                    tensor<TIO>& outputHost,
+                                    float alpha,
+                                    float gamma,
+                                    miopenLossReductionMode_t reduction,
+                                    float divisor)
 {
     auto input_tv    = miopen::get_inner_expanded_tv<5>(input.desc);
     auto target_tv   = miopen::get_inner_expanded_tv<5>(target.desc);
@@ -35,111 +39,18 @@ void cpu_sigmoid_focal_loss_unreduced_forward(tensor<TIO> input,
             loss         = alphaT * loss;
         }
 
-        outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(loss);
-    }
-}
-
-template <class TIO>
-void cpu_sigmoid_focal_loss_unreduced_backward(tensor<TIO> input,
-                                               tensor<TIO> target,
-                                               tensor<TIO> doutput,
-                                               tensor<TIO>& dinput,
-                                               tensor<TIO>& dtarget,
-                                               float alpha = 0.25,
-                                               float gamma = 2)
-{
-    auto input_tv    = miopen::get_inner_expanded_tv<5>(input.desc);
-    auto target_tv   = miopen::get_inner_expanded_tv<5>(target.desc);
-    auto doutput_tv  = miopen::get_inner_expanded_tv<5>(doutput.desc);
-    auto dinput_tv   = miopen::get_inner_expanded_tv<5>(dinput.desc);
-    auto dtarget_tv  = miopen::get_inner_expanded_tv<5>(dtarget.desc);
-    size_t inputSize = input.desc.GetElementSize();
-
-    for(size_t id = 0; id < inputSize; ++id)
-    {
-        tensor_layout_t<5> idx(input_tv, id);
-
-        float i  = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t  = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
-        float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
-
-        float p       = 1 / (1 + std::exp(-i));
-        float ceLoss  = -(t * std::log(p) + (1 - t) * std::log(1 - p));
-        float pT      = p * t + (1 - p) * (1 - t);
-        float powPt   = std::pow(1 - pT, gamma);
-        float alpha_t = alpha * t + (1 - alpha) * (1 - t);
-
-        if(dinput.data.size() > 0)
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
         {
-            float dpdi      = std::exp(-i) / std::pow(1 + std::exp(-i), 2);
-            float dcelossdi = (-t / p + (1 - t) / (1 - p)) * dpdi;
-            float dpowptdi  = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * t) * dpdi;
-
-            // L = ce_loss * pow_pt => dL/di = dceloss/di * pow_pt + ce_loss * dpowpt/di
-            float dLdi = dcelossdi * powPt + ceLoss * dpowptdi;
-            float grad = dO * dLdi;
-
-            if(alpha >= 0)
-            {
-                grad *= alpha_t;
-            }
-            dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(grad);
+            outputHost[output_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(loss);
         }
-
-        if(dtarget.data.size() > 0)
+        else
         {
-            float dcelossdt = -std::log(p) + std::log(1 - p);
-            float dpowptdt  = gamma * std::pow(1 - pT, gamma - 1) * (1 - 2 * p);
-            // L = ce_loss * pow_pt => dL/dt = dceloss/dt * pow_pt + ce_loss * dpowpt/dt
-            float dLdt       = dcelossdt * powPt + ceLoss * dpowptdt;
-            float gradTarget = dO * dLdt;
-
-            if(alpha >= 0)
-            {
-                // alpha_t * dL/dt + dalpha_t/dt * dL
-                gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
-            }
-            dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(gradTarget);
+            workspace[id] = static_cast<TIO>(loss / divisor);
         }
     }
-}
 
-template <class TIO>
-void cpu_sigmoid_focal_loss_forward(tensor<TIO> input,
-                                    tensor<TIO> target,
-                                    tensor<TIO>& workspace,
-                                    tensor<TIO>& outputHost,
-                                    float alpha   = 0.25,
-                                    float gamma   = 2,
-                                    float divisor = 1)
-{
-    auto input_tv    = miopen::get_inner_expanded_tv<5>(input.desc);
-    auto target_tv   = miopen::get_inner_expanded_tv<5>(target.desc);
-    size_t inputSize = input.desc.GetElementSize();
-    // float reduction_float;
-
-    for(size_t id = 0; id < inputSize; ++id)
-    {
-        tensor_layout_t<5> idx(input_tv, id);
-
-        float i = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
-
-        float sig    = 1 / (1 + std::exp(-i));
-        float ceLoss = -(t * std::log(sig) + (1 - t) * std::log(1 - sig));
-        float sigT   = sig * t + (1 - sig) * (1 - t);
-        float loss   = ceLoss * std::pow(1 - sigT, gamma);
-
-        if(alpha >= 0)
-        {
-            float alphaT = alpha * t + (1 - alpha) * (1 - t);
-            loss         = alphaT * loss;
-        }
-        // reduction_float += (loss / divisor);
-
-        workspace[id] = static_cast<TIO>(loss / divisor);
-    }
-    // std::cout << "Reduction result in float" << reduction_float << " " << divisor << std::endl;
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+        return;
 
     // Reduce loss
     const int local_size = 256;
@@ -172,9 +83,10 @@ void cpu_sigmoid_focal_loss_backward(tensor<TIO> input,
                                      tensor<TIO> doutput,
                                      tensor<TIO>& dinput,
                                      tensor<TIO>& dtarget,
-                                     float alpha   = 0.25,
-                                     float gamma   = 2,
-                                     float divisor = 1)
+                                     float alpha,
+                                     float gamma,
+                                     miopenLossReductionMode_t reduction,
+                                     float divisor)
 {
     auto input_tv   = miopen::get_inner_expanded_tv<5>(input.desc);
     auto target_tv  = miopen::get_inner_expanded_tv<5>(target.desc);
@@ -185,14 +97,18 @@ void cpu_sigmoid_focal_loss_backward(tensor<TIO> input,
     size_t inputSize = input.desc.GetElementSize();
 
     tensor_layout_t<5> doIdx(input_tv, 0);
+    float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
 
     for(size_t id = 0; id < inputSize; ++id)
     {
         tensor_layout_t<5> idx(input_tv, id);
 
-        float i  = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
-        float t  = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
-        float dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
+        float i = static_cast<float>(input[input_tv.get_tensor_view_idx(idx)]);
+        float t = static_cast<float>(target[target_tv.get_tensor_view_idx(idx)]);
+        if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            dO = static_cast<float>(doutput[doutput_tv.get_tensor_view_idx(idx)]);
+        }
 
         float p       = 1 / (1 + std::exp(-i));
         float ceLoss  = -(t * std::log(p) + (1 - t) * std::log(1 - p));
@@ -214,7 +130,10 @@ void cpu_sigmoid_focal_loss_backward(tensor<TIO> input,
             {
                 grad *= alpha_t;
             }
-            grad /= divisor;
+            if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+            {
+                grad /= divisor;
+            }
             dinput[dinput_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(grad);
         }
 
@@ -231,7 +150,10 @@ void cpu_sigmoid_focal_loss_backward(tensor<TIO> input,
                 // alpha_t * dL/dt + dalpha_t/dt * dL
                 gradTarget = alpha_t * dLdt + (2 * alpha - 1) * ceLoss * powPt;
             }
-            gradTarget /= divisor;
+            if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
+            {
+                gradTarget /= divisor;
+            }
             dtarget[dtarget_tv.get_tensor_view_idx(idx)] = static_cast<TIO>(gradTarget);
         }
     }
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
index 7443b7a94a..e612b2f86b 100644
--- a/test/gtest/sigmoid_focal_loss.hpp
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -143,7 +143,9 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<Sigmoi
                                                  config.alpha,
                                                  config.gamma,
                                                  config.reduction);
-        cpu_sigmoid_focal_loss_unreduced_forward<TIO>(input, target, outputHost, config.alpha);
+        tensor<TIO> workspace;
+        cpu_sigmoid_focal_loss_forward<TIO>(
+            input, target, workspace, outputHost, config.alpha, config.gamma, config.reduction, 1);
 
         EXPECT_EQ(status, miopenStatusSuccess);
         output.data = handle.Read<TIO>(output_dev, output.data.size());
@@ -231,8 +233,15 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<Sigmoi
                                                   config.alpha,
                                                   config.gamma,
                                                   config.reduction);
-        cpu_sigmoid_focal_loss_unreduced_backward<TIO>(
-            input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma);
+        cpu_sigmoid_focal_loss_backward<TIO>(input,
+                                             target,
+                                             dOutput,
+                                             dInputHost,
+                                             dTargetHost,
+                                             config.alpha,
+                                             config.gamma,
+                                             config.reduction,
+                                             1);
 
         EXPECT_EQ(status, miopenStatusSuccess);
 
@@ -339,8 +348,14 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
                                                  config.alpha,
                                                  config.gamma,
                                                  config.reduction);
-        cpu_sigmoid_focal_loss_forward<TIO>(
-            input, target, workspace, outputHost, config.alpha, config.gamma, divisor);
+        cpu_sigmoid_focal_loss_forward<TIO>(input,
+                                            target,
+                                            workspace,
+                                            outputHost,
+                                            config.alpha,
+                                            config.gamma,
+                                            config.reduction,
+                                            divisor);
 
         EXPECT_EQ(status, miopenStatusSuccess);
 
@@ -441,8 +456,15 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
                                                   config.alpha,
                                                   config.gamma,
                                                   config.reduction);
-        cpu_sigmoid_focal_loss_backward<TIO>(
-            input, target, dOutput, dInputHost, dTargetHost, config.alpha, config.gamma, divisor);
+        cpu_sigmoid_focal_loss_backward<TIO>(input,
+                                             target,
+                                             dOutput,
+                                             dInputHost,
+                                             dTargetHost,
+                                             config.alpha,
+                                             config.gamma,
+                                             config.reduction,
+                                             divisor);
 
         EXPECT_EQ(status, miopenStatusSuccess);
 

From cf3bcc0da219274a658f0c4e9a6f347a2ac18c37 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Thu, 8 Aug 2024 17:10:50 +0700
Subject: [PATCH 19/28] remove param reduction in test config

---
 include/miopen/miopen.h           |  2 +-
 test/gtest/sigmoid_focal_loss.hpp | 61 +++++++++++++++----------------
 2 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index c983f92619..e36f80814d 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -1603,7 +1603,7 @@ miopenConvolutionBackwardWeightsGetSolution(miopenHandle_t handle,
  * as part of the
  * miopenConvSolution_t struct.
  *
- * @param handle         MIOpen handle (input)
+ * @param handle         MIOpen handle (input
  * @param dyDesc         Tensor descriptor for data tensor dy (input)
  * @param xDesc          Tensor descriptor for data tensor x (input)
  * @param convDesc       Convolution layer descriptor (input)
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
index e612b2f86b..7f2c1314b3 100644
--- a/test/gtest/sigmoid_focal_loss.hpp
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -39,7 +39,6 @@ struct SigmoidFocalLossTestCase
     bool isContiguous;
     float alpha;
     float gamma;
-    miopenLossReductionMode_t reduction;
     friend std::ostream& operator<<(std::ostream& os, const SigmoidFocalLossTestCase& tc)
     {
         os << "dims: ";
@@ -56,15 +55,11 @@ struct SigmoidFocalLossTestCase
     SigmoidFocalLossTestCase() {}
 
     SigmoidFocalLossTestCase(std::vector<size_t> dim_,
-                             bool isContiguous_                   = true,
-                             miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE,
-                             float alpha_                         = 0.25,
-                             float gamma_                         = 2)
-        : dims(dim_),
-          isContiguous(isContiguous_),
-          alpha(alpha_),
-          gamma(gamma_),
-          reduction(reduction_)
+                             bool isContiguous_ = true,
+                             //  miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE,
+                             float alpha_ = 0.25,
+                             float gamma_ = 2)
+        : dims(dim_), isContiguous(isContiguous_), alpha(alpha_), gamma(gamma_)
     {
     }
 
@@ -94,6 +89,10 @@ inline std::vector<SigmoidFocalLossTestCase> SigmoidFocalLossTestConfigs()
         SigmoidFocalLossTestCase({8, 3, 20, 100}, false),   // 4D non-cont
         SigmoidFocalLossTestCase({2, 2, 3, 4, 100}),        // 5D cont
         SigmoidFocalLossTestCase({2, 2, 3, 4, 100}, false), // 5D non-cont
+        SigmoidFocalLossTestCase({10},
+                                 true,
+                                 0.6,
+                                 3), // 1D cont, custom alpha, gamma
     };
 }
 
@@ -105,6 +104,7 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<Sigmoi
     {
         auto&& handle = get_handle();
         config        = GetParam();
+        reduction     = MIOPEN_LOSS_REDUCTION_NONE;
 
         auto in_dims    = config.GetDims();
         auto in_strides = config.ComputeStrides(in_dims);
@@ -142,10 +142,10 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<Sigmoi
                                                  output_dev.get(),
                                                  config.alpha,
                                                  config.gamma,
-                                                 config.reduction);
+                                                 reduction);
         tensor<TIO> workspace;
         cpu_sigmoid_focal_loss_forward<TIO>(
-            input, target, workspace, outputHost, config.alpha, config.gamma, config.reduction, 1);
+            input, target, workspace, outputHost, config.alpha, config.gamma, reduction, 1);
 
         EXPECT_EQ(status, miopenStatusSuccess);
         output.data = handle.Read<TIO>(output_dev, output.data.size());
@@ -162,6 +162,7 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<Sigmoi
                                             << ",  Thresholdx10: " << threshold * 10;
     }
     SigmoidFocalLossTestCase config;
+    miopenLossReductionMode_t reduction;
 
     tensor<TIO> input;
     tensor<TIO> target;
@@ -182,6 +183,7 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<Sigmoi
     {
         auto&& handle = get_handle();
         config        = GetParam();
+        reduction     = MIOPEN_LOSS_REDUCTION_NONE;
 
         auto in_dims      = config.GetDims();
         auto in_strides   = config.ComputeStrides(in_dims);
@@ -232,7 +234,7 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<Sigmoi
                                                   dTarget_dev.get(),
                                                   config.alpha,
                                                   config.gamma,
-                                                  config.reduction);
+                                                  reduction);
         cpu_sigmoid_focal_loss_backward<TIO>(input,
                                              target,
                                              dOutput,
@@ -240,7 +242,7 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<Sigmoi
                                              dTargetHost,
                                              config.alpha,
                                              config.gamma,
-                                             config.reduction,
+                                             reduction,
                                              1);
 
         EXPECT_EQ(status, miopenStatusSuccess);
@@ -268,6 +270,7 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<Sigmoi
             << ",  Thresholdx10: " << threshold * 10;
     }
     SigmoidFocalLossTestCase config;
+    miopenLossReductionMode_t reduction;
 
     tensor<TIO> input;
     tensor<TIO> target;
@@ -294,7 +297,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
         auto&& handle = get_handle();
         config        = GetParam();
 
-        config.reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1));
+        reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1));
 
         auto in_dims    = config.GetDims();
         auto in_strides = config.ComputeStrides(in_dims);
@@ -306,7 +309,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
         target             = tensor<TIO>{in_dims, in_strides}.generate(tar_gen_value);
 
         size_t workspaceSizeBytes = miopen::GetSigmoidFocalLossForwardWorkspaceSize(
-            handle, input.desc, target.desc, output.desc, config.reduction);
+            handle, input.desc, target.desc, output.desc, reduction);
         size_t workspaceElements = workspaceSizeBytes / sizeof(TIO);
 
         workspace = tensor<TIO>(workspaceElements);
@@ -319,7 +322,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
         std::fill(outputHost.begin(), outputHost.end(), 0);
 
         divisor = 1;
-        if(config.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
         {
             divisor *= input.desc.GetElementSize();
         }
@@ -347,15 +350,9 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
                                                  output_dev.get(),
                                                  config.alpha,
                                                  config.gamma,
-                                                 config.reduction);
-        cpu_sigmoid_focal_loss_forward<TIO>(input,
-                                            target,
-                                            workspace,
-                                            outputHost,
-                                            config.alpha,
-                                            config.gamma,
-                                            config.reduction,
-                                            divisor);
+                                                 reduction);
+        cpu_sigmoid_focal_loss_forward<TIO>(
+            input, target, workspace, outputHost, config.alpha, config.gamma, reduction, divisor);
 
         EXPECT_EQ(status, miopenStatusSuccess);
 
@@ -371,9 +368,10 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
         EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output));
         EXPECT_TRUE(error < threshold * 10)
             << "Error output beyond tolerance Error: " << error
-            << ",  Thresholdx10: " << threshold * 10 << " Reduction: " << config.reduction;
+            << ",  Thresholdx10: " << threshold * 10 << " Reduction: " << reduction;
     }
     SigmoidFocalLossTestCase config;
+    miopenLossReductionMode_t reduction;
 
     tensor<TIO> input;
     tensor<TIO> target;
@@ -401,7 +399,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
         auto in_dims    = config.GetDims();
         auto in_strides = config.ComputeStrides(in_dims);
 
-        config.reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1));
+        reduction = miopenLossReductionMode_t(int(prng::gen_0_to_B(2) + 1));
 
         auto in_gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<TIO>(0.1, 50); };
         input             = tensor<TIO>{in_dims, in_strides}.generate(in_gen_value);
@@ -425,7 +423,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
         std::fill(dTargetHost.begin(), dTargetHost.end(), 0);
 
         divisor = 1;
-        if(config.reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+        if(reduction == MIOPEN_LOSS_REDUCTION_MEAN)
         {
             divisor *= input.desc.GetElementSize();
         }
@@ -455,7 +453,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
                                                   dTarget_dev.get(),
                                                   config.alpha,
                                                   config.gamma,
-                                                  config.reduction);
+                                                  reduction);
         cpu_sigmoid_focal_loss_backward<TIO>(input,
                                              target,
                                              dOutput,
@@ -463,7 +461,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
                                              dTargetHost,
                                              config.alpha,
                                              config.gamma,
-                                             config.reduction,
+                                             reduction,
                                              divisor);
 
         EXPECT_EQ(status, miopenStatusSuccess);
@@ -491,6 +489,7 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
             << ",  Thresholdx10: " << threshold * 10;
     }
     SigmoidFocalLossTestCase config;
+    miopenLossReductionMode_t reduction;
 
     tensor<TIO> input;
     tensor<TIO> target;

From a41a00587df26d87eba1b7d167484090769adf5e Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Mon, 19 Aug 2024 15:46:20 +0700
Subject: [PATCH 20/28] change verify algo in CPU to naive accumulate in reduce
 kernels

---
 driver/mloSigmoidFocalLossHost.hpp   | 30 +-------------
 driver/sigmoid_focal_loss_driver.hpp | 33 ++++++++++------
 test/cpu_sigmoid_focal_loss.hpp      | 59 +++++++++++++---------------
 test/gtest/sigmoid_focal_loss.hpp    | 44 ++++++++++-----------
 4 files changed, 72 insertions(+), 94 deletions(-)

diff --git a/driver/mloSigmoidFocalLossHost.hpp b/driver/mloSigmoidFocalLossHost.hpp
index 555c0b4e88..2f77cd10ee 100644
--- a/driver/mloSigmoidFocalLossHost.hpp
+++ b/driver/mloSigmoidFocalLossHost.hpp
@@ -8,7 +8,6 @@ void mloSigmoidFocalLossFwdRunHost(Tgpu* input,
                                    miopenTensorDescriptor_t targetDesc,
                                    Tcheck* outputHost,
                                    miopenTensorDescriptor_t outputDesc,
-                                   Tcheck* workspaceHost,
                                    float alpha,
                                    float gamma,
                                    miopenLossReductionMode_t reduction,
@@ -43,36 +42,9 @@ void mloSigmoidFocalLossFwdRunHost(Tgpu* input,
         }
         else
         {
-            workspaceHost[id] = static_cast<Tcheck>(loss / divisor);
+            outputHost[0] += static_cast<Tcheck>(loss / divisor);
         }
     }
-
-    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
-        return;
-
-    // Reduce loss
-    const int local_size = 256;
-    int offset_a         = 0;
-    int offset_b         = inputSize;
-    size_t _size         = inputSize;
-    do
-    {
-        for(int i = 0; i < _size; i += local_size)
-        {
-            Tcheck shared[local_size];
-            for(int j = 0; j < local_size; ++j)
-                shared[j] = i + j < _size ? workspaceHost[offset_a + i + j] : 0.0f;
-            for(int offset = local_size / 2; offset > 0; offset >>= 1)
-                for(int j = 0; j < offset; ++j)
-                    shared[j] += shared[j + offset];
-            if(_size <= local_size)
-                outputHost[0] = shared[0];
-            else
-                workspaceHost[offset_b + i / local_size] = shared[0];
-        }
-        std::swap(offset_a, offset_b);
-        _size = (_size + local_size - 1) / local_size;
-    } while(_size > 1);
 }
 
 template <typename Tgpu, typename Tcheck>
diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index a6ee50fbd1..1e14efd548 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -37,6 +37,8 @@
 #include <cmath>
 #include <vector>
 
+const float MAX_FP16 = 65504;
+
 template <typename Tgpu, typename Tcheck>
 class SigmoidFocalLossDriver : public Driver
 {
@@ -109,7 +111,6 @@ class SigmoidFocalLossDriver : public Driver
     std::vector<Tgpu> dtarget;
     std::vector<Tcheck> dtargetHost;
     std::vector<Tgpu> workspace;
-    std::vector<Tcheck> workspaceHost;
 
     float alpha;
     float gamma;
@@ -252,7 +253,6 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
     dtargetHost           = std::vector<Tcheck>(dT_sz, static_cast<Tcheck>(0));
     size_t workSpaceElems = workSpaceSizeInBytes / sizeof(Tgpu);
     workspace             = std::vector<Tgpu>(workSpaceElems, static_cast<Tgpu>(0));
-    workspaceHost         = std::vector<Tcheck>(workSpaceElems, static_cast<Tcheck>(0));
 
     float randomBound = 2;
     // For half, the random bound is smaller to avoid half overflow
@@ -357,7 +357,6 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunForwardCPU()
                                                 targetDesc,
                                                 outputHost.data(),
                                                 outputDesc,
-                                                workspaceHost.data(),
                                                 alpha,
                                                 gamma,
                                                 reduction,
@@ -457,13 +456,19 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::RunBackwardCPU()
 template <typename Tgpu, typename Tcheck>
 Tcheck SigmoidFocalLossDriver<Tgpu, Tcheck>::GetTolerance()
 {
-    // Computation error of fp16 is ~2^13 (=8192) bigger than
-    // the one of fp32 because mantissa is shorter by 13 bits.
-    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+    Tcheck tolerance; // error bound returned to the Verify* callers
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3; // fp32 : other types
+        // bf16 keeps 7 mantissa bits, 3 fewer than fp16, so allow 2^3 = 8x the error.
+        if(std::is_same<Tgpu, bfloat16>::value)
+            tolerance *= 8.0;
+    }
+    else
+    {
+        tolerance = std::is_same<Tgpu, float>::value ? 1.0e-2 : 8.2e-1; // reduced: looser bound
+    }
 
-    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
-    if(std::is_same<Tgpu, bfloat16>::value)
-        tolerance *= 8.0;
     return tolerance;
 }
 
@@ -472,6 +477,12 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::VerifyForward()
 {
     RunForwardCPU();
 
+    if(miopen::deref(inputDesc).GetType() == miopenHalf &&
+       reduction != MIOPEN_LOSS_REDUCTION_NONE && std::abs(outputHost[0]) > MAX_FP16)
+    { // Diagnostic only; std::abs keeps the floating-point overload (no int truncation).
+        std::cout << "Float16 overflow - CPU output: " << outputHost[0] << std::endl;
+    }
+
     const Tcheck tolerance = GetTolerance();
     auto error             = miopen::rms_range(outputHost, output);
 
@@ -503,13 +514,13 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::VerifyBackward()
     {
         std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dinputError
                   << " > " << tolerance << std::endl;
-        return EC_VerifyFwd;
+        return EC_VerifyBwd;
     }
     else if(isTargetGradientComputed && (!std::isfinite(dtargetError) || dtargetError > tolerance))
     {
         std::cout << "Backward " << reduction << " Sigmoid Focal Loss FAILED: " << dtargetError
                   << " > " << tolerance << std::endl;
-        return EC_VerifyFwd;
+        return EC_VerifyBwd;
     }
     else
     {
diff --git a/test/cpu_sigmoid_focal_loss.hpp b/test/cpu_sigmoid_focal_loss.hpp
index f1df613b54..fe21c94e27 100644
--- a/test/cpu_sigmoid_focal_loss.hpp
+++ b/test/cpu_sigmoid_focal_loss.hpp
@@ -9,17 +9,17 @@
 template <class TIO>
 void cpu_sigmoid_focal_loss_forward(tensor<TIO> input,
                                     tensor<TIO> target,
-                                    tensor<TIO>& workspace,
                                     tensor<TIO>& outputHost,
                                     float alpha,
                                     float gamma,
                                     miopenLossReductionMode_t reduction,
                                     float divisor)
 {
-    auto input_tv    = miopen::get_inner_expanded_tv<5>(input.desc);
-    auto target_tv   = miopen::get_inner_expanded_tv<5>(target.desc);
-    auto output_tv   = miopen::get_inner_expanded_tv<5>(outputHost.desc);
-    size_t inputSize = input.desc.GetElementSize();
+    auto input_tv     = miopen::get_inner_expanded_tv<5>(input.desc);
+    auto target_tv    = miopen::get_inner_expanded_tv<5>(target.desc);
+    auto output_tv    = miopen::get_inner_expanded_tv<5>(outputHost.desc);
+    size_t inputSize  = input.desc.GetElementSize();
+    float outputFloat = 0;
 
     for(size_t id = 0; id < inputSize; ++id)
     {
@@ -45,36 +45,14 @@ void cpu_sigmoid_focal_loss_forward(tensor<TIO> input,
         }
         else
         {
-            workspace[id] = static_cast<TIO>(loss / divisor);
+            outputFloat += loss / divisor;
         }
     }
 
-    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
-        return;
-
-    // Reduce loss
-    const int local_size = 256;
-    int offset_a         = 0;
-    int offset_b         = inputSize;
-    size_t _size         = inputSize;
-    do
+    if(reduction != MIOPEN_LOSS_REDUCTION_NONE)
     {
-        for(int i = 0; i < _size; i += local_size)
-        {
-            TIO shared[local_size];
-            for(int j = 0; j < local_size; ++j)
-                shared[j] = i + j < _size ? workspace[offset_a + i + j] : 0.0f;
-            for(int offset = local_size / 2; offset > 0; offset >>= 1)
-                for(int j = 0; j < offset; ++j)
-                    shared[j] += shared[j + offset];
-            if(_size <= local_size)
-                outputHost[0] = shared[0];
-            else
-                workspace[offset_b + i / local_size] = shared[0];
-        }
-        std::swap(offset_a, offset_b);
-        _size = (_size + local_size - 1) / local_size;
-    } while(_size > 1);
+        outputHost[0] = static_cast<TIO>(outputFloat);
+    }
 }
 
 template <class TIO>
@@ -158,3 +136,22 @@ void cpu_sigmoid_focal_loss_backward(tensor<TIO> input,
         }
     }
 }
+
+template <typename TIO>
+float get_tolerance(miopenLossReductionMode_t reduction) // bound for the RMS error checks
+{
+    float tolerance;
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        tolerance = std::is_same<TIO, float>::value ? 1.5e-6 : 8.2e-3; // fp32 : other types
+        // bf16 keeps 7 mantissa bits, 3 fewer than fp16, so allow 2^3 = 8x the error.
+        if(std::is_same<TIO, bfloat16>::value)
+            tolerance *= 8.0;
+    }
+    else
+    {
+        tolerance = std::is_same<TIO, float>::value ? 1.0e-2 : 8.2e-1; // reduced: looser bound
+    }
+
+    return tolerance;
+}
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
index 7f2c1314b3..db8c0aa7ca 100644
--- a/test/gtest/sigmoid_focal_loss.hpp
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -56,9 +56,8 @@ struct SigmoidFocalLossTestCase
 
     SigmoidFocalLossTestCase(std::vector<size_t> dim_,
                              bool isContiguous_ = true,
-                             //  miopenLossReductionMode_t reduction_ = MIOPEN_LOSS_REDUCTION_NONE,
-                             float alpha_ = 0.25,
-                             float gamma_ = 2)
+                             float alpha_       = 0.25,
+                             float gamma_       = 2)
         : dims(dim_), isContiguous(isContiguous_), alpha(alpha_), gamma(gamma_)
     {
     }
@@ -143,9 +142,8 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<Sigmoi
                                                  config.alpha,
                                                  config.gamma,
                                                  reduction);
-        tensor<TIO> workspace;
         cpu_sigmoid_focal_loss_forward<TIO>(
-            input, target, workspace, outputHost, config.alpha, config.gamma, reduction, 1);
+            input, target, outputHost, config.alpha, config.gamma, reduction, 1);
 
         EXPECT_EQ(status, miopenStatusSuccess);
         output.data = handle.Read<TIO>(output_dev, output.data.size());
@@ -153,13 +151,13 @@ struct SigmoidFocalLossUnreducedFwdTest : public ::testing::TestWithParam<Sigmoi
 
     void Verify()
     {
-        double threshold = std::numeric_limits<TIO>::epsilon();
+        double threshold = get_tolerance<TIO>(reduction);
 
         auto error = miopen::rms_range(outputHost, output);
 
         EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output));
-        EXPECT_TRUE(error < threshold * 10) << "Error output beyond tolerance Error: " << error
-                                            << ",  Thresholdx10: " << threshold * 10;
+        EXPECT_TRUE(error < threshold)
+            << "Error output beyond tolerance Error: " << error << ",  Threshold: " << threshold;
     }
     SigmoidFocalLossTestCase config;
     miopenLossReductionMode_t reduction;
@@ -253,21 +251,21 @@ struct SigmoidFocalLossUnreducedBwdTest : public ::testing::TestWithParam<Sigmoi
 
     void Verify()
     {
-        double threshold = std::numeric_limits<TIO>::epsilon();
+        double threshold = get_tolerance<TIO>(reduction);
 
         auto dInputError = miopen::rms_range(dInputHost, dInput);
 
         EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput));
-        EXPECT_TRUE(dInputError < threshold * 10)
+        EXPECT_TRUE(dInputError < threshold)
             << "dInput error output beyond tolerance Error: " << dInputError
-            << ",  Thresholdx10: " << threshold * 10;
+            << ",  Threshold: " << threshold;
 
         auto dTargetError = miopen::rms_range(dTargetHost, dTarget);
 
         EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget));
-        EXPECT_TRUE(dTargetError < threshold * 10)
+        EXPECT_TRUE(dTargetError < threshold)
             << "dTarget error output beyond tolerance Error: " << dTargetError
-            << ",  Thresholdx10: " << threshold * 10;
+            << ",  Threshold: " << threshold;
     }
     SigmoidFocalLossTestCase config;
     miopenLossReductionMode_t reduction;
@@ -352,7 +350,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
                                                  config.gamma,
                                                  reduction);
         cpu_sigmoid_focal_loss_forward<TIO>(
-            input, target, workspace, outputHost, config.alpha, config.gamma, reduction, divisor);
+            input, target, outputHost, config.alpha, config.gamma, reduction, divisor);
 
         EXPECT_EQ(status, miopenStatusSuccess);
 
@@ -361,14 +359,14 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
 
     void Verify()
     {
-        double threshold = std::numeric_limits<TIO>::epsilon();
+        double threshold = get_tolerance<TIO>(reduction);
 
         auto error = miopen::rms_range(outputHost, output);
 
         EXPECT_TRUE(miopen::range_distance(outputHost) == miopen::range_distance(output));
-        EXPECT_TRUE(error < threshold * 10)
-            << "Error output beyond tolerance Error: " << error
-            << ",  Thresholdx10: " << threshold * 10 << " Reduction: " << reduction;
+        EXPECT_TRUE(error < threshold)
+            << "Error output beyond tolerance Error: " << error << ",  Threshold: " << threshold
+            << " Reduction: " << reduction;
     }
     SigmoidFocalLossTestCase config;
     miopenLossReductionMode_t reduction;
@@ -472,21 +470,21 @@ struct SigmoidFocalLossBwdTest : public ::testing::TestWithParam<SigmoidFocalLos
 
     void Verify()
     {
-        double threshold = std::numeric_limits<TIO>::epsilon();
+        double threshold = get_tolerance<TIO>(reduction);
 
         auto dInputError = miopen::rms_range(dInputHost, dInput);
 
         EXPECT_TRUE(miopen::range_distance(dInputHost) == miopen::range_distance(dInput));
-        EXPECT_TRUE(dInputError < threshold * 10)
+        EXPECT_TRUE(dInputError < threshold)
             << "dInput error output beyond tolerance Error: " << dInputError
-            << ",  Thresholdx10: " << threshold * 10;
+            << ",  Threshold: " << threshold;
 
         auto dTargetError = miopen::rms_range(dTargetHost, dTarget);
 
         EXPECT_TRUE(miopen::range_distance(dTargetHost) == miopen::range_distance(dTarget));
-        EXPECT_TRUE(dTargetError < threshold * 10)
+        EXPECT_TRUE(dTargetError < threshold)
             << "dTarget error output beyond tolerance Error: " << dTargetError
-            << ",  Thresholdx10: " << threshold * 10;
+            << ",  Threshold: " << threshold;
     }
     SigmoidFocalLossTestCase config;
     miopenLossReductionMode_t reduction;

From afc738f1f492dc3d444c38cf91f1f94941943f80 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 27 Aug 2024 13:20:50 +0700
Subject: [PATCH 21/28] merge code with reduce kernel used in develop branch

---
 driver/sigmoid_focal_loss_driver.hpp          | 11 +--
 src/CMakeLists.txt                            |  2 -
 src/kernels/MIOpenLossSum.cpp                 | 56 --------------
 src/kernels/MIOpenReduceSum.cpp               |  8 +-
 src/kernels/MIOpenSigmoidFocalLoss.cpp        |  6 +-
 src/kernels/warp_shuffle.hpp                  | 77 -------------------
 .../forward_reduce_sigmoid_focal_loss.cpp     | 21 +++--
 test/gtest/sigmoid_focal_loss.hpp             |  7 +-
 8 files changed, 27 insertions(+), 161 deletions(-)
 delete mode 100644 src/kernels/MIOpenLossSum.cpp
 delete mode 100644 src/kernels/warp_shuffle.hpp

diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index 1e14efd548..23071d7f97 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -110,7 +110,6 @@ class SigmoidFocalLossDriver : public Driver
     std::vector<Tcheck> dinputHost;
     std::vector<Tgpu> dtarget;
     std::vector<Tcheck> dtargetHost;
-    std::vector<Tgpu> workspace;
 
     float alpha;
     float gamma;
@@ -239,8 +238,9 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
 
     miopenGetSigmoidFocalLossForwardWorkspaceSize(
         handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes);
-    workspace_dev =
-        std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu)));
+    // workspace_dev =
+    //     std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu)));
+    workspace_dev = std::make_unique<GPUMem>(ctx, workSpaceSizeInBytes, sizeof(std::byte));
 
     input                 = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
     target                = std::vector<Tgpu>(target_sz, static_cast<Tgpu>(0));
@@ -251,8 +251,6 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
     dinputHost            = std::vector<Tcheck>(dI_sz, static_cast<Tcheck>(0));
     dtarget               = std::vector<Tgpu>(dT_sz, static_cast<Tgpu>(0));
     dtargetHost           = std::vector<Tcheck>(dT_sz, static_cast<Tcheck>(0));
-    size_t workSpaceElems = workSpaceSizeInBytes / sizeof(Tgpu);
-    workspace             = std::vector<Tgpu>(workSpaceElems, static_cast<Tgpu>(0));
 
     float randomBound = 2;
     // For half, the random bound is smaller to avoid half overflow
@@ -291,9 +289,6 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
     if(dtarget_dev->ToGPU(GetStream(), dtarget.data()) != 0)
         std::cerr << "Error copying (dT) to GPU, size: " << dtarget_dev->GetSize() << std::endl;
 
-    if(workspace_dev->ToGPU(GetStream(), workspace.data()) != 0)
-        std::cerr << "Error copying (dI) to GPU, size: " << workspace_dev->GetSize() << std::endl;
-
     return miopenStatusSuccess;
 }
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c931fa14ed..35496f87a7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -478,7 +478,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/stride_array.hpp
         kernels/tensor_view.hpp
         kernels/utilities.inc
-        kernels/warp_shuffle.hpp
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc
@@ -521,7 +520,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenLayerNorm.cpp
         kernels/MIOpenLRNBwd.cl
         kernels/MIOpenLRNFwd.cl
-        kernels/MIOpenLossSum.cpp
         kernels/MIOpenNeuron.cl
         kernels/MIOpenPReLU.cpp
         kernels/MIOpenPooling.cl
diff --git a/src/kernels/MIOpenLossSum.cpp b/src/kernels/MIOpenLossSum.cpp
deleted file mode 100644
index 08d3a656f6..0000000000
--- a/src/kernels/MIOpenLossSum.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2024 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-
-#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
-#include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
-#endif
-
-#include "float_types.h"
-#include "warp_shuffle.hpp"
-
-#ifndef IN_OUT_TYPE
-#define IN_OUT_TYPE float
-#endif
-
-template <typename TIO>
-__device__ void losssum(const TIO* input, TIO* output, size_t N)
-{
-    auto gid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    FLOAT_ACCUM val = gid < N ? CVT_FLOAT2ACCUM(input[gid]) : static_cast<FLOAT_ACCUM>(0.0f);
-    val             = block_reduce_sum(val);
-
-    if(threadIdx.x == 0)
-        output[blockIdx.x] = CVT_ACCUM2FLOAT(val);
-}
-
-extern "C" __global__ void
-LossSum(const IN_OUT_TYPE* __restrict__ input, IN_OUT_TYPE* __restrict__ output, size_t N)
-{
-    // instantiate the kernel
-    losssum<IN_OUT_TYPE>(input, output, N);
-}
diff --git a/src/kernels/MIOpenReduceSum.cpp b/src/kernels/MIOpenReduceSum.cpp
index 5ed52008bf..a7213acc38 100644
--- a/src/kernels/MIOpenReduceSum.cpp
+++ b/src/kernels/MIOpenReduceSum.cpp
@@ -47,12 +47,12 @@ ReduceSum(const FLOAT_ACCUM* input, TO* output, uint64_t N, tensor_view_t<1> out
 }
 
 extern "C" __global__ void ReduceSum(const FLOAT_ACCUM* __restrict__ input,
-                                     OUTPUT_TYPE* __restrict__ output,
+                                     FLOAT* __restrict__ output,
                                      uint64_t N,
                                      tensor_view_t<1> output_tv)
 {
     // instantiate the kernel
-    ReduceSum<OUTPUT_TYPE>(input, output, N, output_tv);
+    ReduceSum<FLOAT>(input, output, N, output_tv);
 }
 
 extern "C" __global__ void ReduceSumFLOATACCUM(const FLOAT_ACCUM* __restrict__ input,
@@ -93,12 +93,12 @@ __device__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input,
 }
 
 extern "C" __global__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input,
-                                       OUTPUT_TYPE* __restrict__ output,
+                                       FLOAT* __restrict__ output,
                                        uint64_t output_numel,
                                        uint64_t inner_size,
                                        uint64_t outer_size,
                                        tensor_view_t<1> output_tv)
 {
     // instantiate the kernel
-    Reduce1dSum<OUTPUT_TYPE>(input, output, output_numel, inner_size, outer_size, output_tv);
+    Reduce1dSum<FLOAT>(input, output, output_numel, inner_size, outer_size, output_tv);
 }
diff --git a/src/kernels/MIOpenSigmoidFocalLoss.cpp b/src/kernels/MIOpenSigmoidFocalLoss.cpp
index 75c25c0e42..b8f3630e8d 100644
--- a/src/kernels/MIOpenSigmoidFocalLoss.cpp
+++ b/src/kernels/MIOpenSigmoidFocalLoss.cpp
@@ -47,7 +47,7 @@
 template <typename TIO>
 __device__ void sigmoidFocalLossFwd(const TIO* input,
                                     TIO* target,
-                                    TIO* workspace,
+                                    FLOAT_ACCUM* workspace,
                                     float alpha,
                                     float gamma,
                                     float divisor,
@@ -74,12 +74,12 @@ __device__ void sigmoidFocalLossFwd(const TIO* input,
         loss                = alpha_t * loss;
     }
 
-    workspace[gid] = CVT_ACCUM2FLOAT(loss / divisor);
+    workspace[gid] = loss / divisor;
 }
 
 extern "C" __global__ void SigmoidFocalLossFwd(const IN_OUT_TYPE* input,
                                                IN_OUT_TYPE* target,
-                                               IN_OUT_TYPE* workspace,
+                                               FLOAT_ACCUM* workspace,
                                                float alpha,
                                                float gamma,
                                                float divisor,
diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp
deleted file mode 100644
index c1b53ea565..0000000000
--- a/src/kernels/warp_shuffle.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2024 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-
-#ifndef GUARD_WARP_SHUFFLE_HPP
-#define GUARD_WARP_SHUFFLE_HPP
-
-#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
-#include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
-#endif
-
-#include "float_types.h"
-
-#ifndef REDUCE_SIZE
-#define REDUCE_SIZE 256
-#endif
-
-__device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val)
-{
-    if(warpSize >= 64)
-        val += __shfl_down(val, 32);
-    if(warpSize >= 32)
-        val += __shfl_down(val, 16);
-    if(warpSize >= 16)
-        val += __shfl_down(val, 8);
-    if(warpSize >= 8)
-        val += __shfl_down(val, 4);
-    if(warpSize >= 4)
-        val += __shfl_down(val, 2);
-    if(warpSize >= 2)
-        val += __shfl_down(val, 1);
-    return val;
-}
-
-__device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val)
-{
-    static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize];
-    auto lane = threadIdx.x % warpSize;
-    auto wid  = threadIdx.x / warpSize;
-
-    val = warp_reduce_sum(val);
-
-    if(lane == 0)
-        shared[wid] = val;
-    __syncthreads();
-
-    val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0;
-    if(wid == 0)
-        val = warp_reduce_sum(val);
-
-    return val;
-}
-
-#endif // GUARD_WARP_SHUFFLE_HPP
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
index f1a37fc54f..beacf73263 100644
--- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -36,7 +36,7 @@
 #include <miopen/sigmoidfocalloss/utils.hpp>
 #include <miopen/tensor_view_utils.hpp>
 
-#define LOCAL_SIZE 256
+#define LOCAL_SIZE_SIGMOIDFOCALLOSS 256
 #define LOCAL_SIZE_REDUCE 256
 
 namespace miopen {
@@ -72,21 +72,25 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
         {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
         {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
         {"TARGET_TYPE", target_dtype == "bfloat16" ? "ushort" : in_dtype},
-        {"LOCAL_SIZE", LOCAL_SIZE},
+        {"REDUCE_SIZE", LOCAL_SIZE_REDUCE},
     };
 
     /* Prepare params for loss kernel */
     result.construction_params.push_back(make_hip_kernel(
-        {LOCAL_SIZE}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params));
+        {LOCAL_SIZE_SIGMOIDFOCALLOSS}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params));
 
     /* Prepare params for reduce kernels */
     auto _size = size;
-    do
+    while(_size > LOCAL_SIZE_REDUCE)
     {
         result.construction_params.push_back(make_hip_kernel(
-            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
+            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSumFLOATACCUM", build_params));
+            // {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
         _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
-    } while(_size > 1);
+    } 
+
+    result.construction_params.push_back(make_hip_kernel(
+            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params));
 
     result.invoker_factory = [this, problem](const std::vector<Kernel>& kernels) {
         return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
@@ -143,7 +147,8 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
                 }
                 else
                 {
-                    kernel(reduceIn, params.output, size);
+                    auto output_tv        = get_inner_expanded_tv<1>(deref(params.outputDesc));
+                    kernel(reduceIn, params.output, size, output_tv);
                 }
                 size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
             }
@@ -179,7 +184,7 @@ MultiBufferWorkspaceTraits SigmoidFocalLossFwd::GetMultiBufferWorkspaceTraits(
 {
     size_t inputElements  = problem.GetInputDesc().GetElementSize();
     size_t reduceElements = (inputElements + LOCAL_SIZE_REDUCE - 1) / LOCAL_SIZE_REDUCE;
-    size_t elementSize    = get_data_size(problem.GetOutputDesc().GetType());
+    size_t elementSize    = get_data_size(miopenFloat);
 
     return MultiBufferWorkspaceTraits{inputElements * elementSize, reduceElements * elementSize};
 }
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
index db8c0aa7ca..77023364b3 100644
--- a/test/gtest/sigmoid_focal_loss.hpp
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -79,6 +79,7 @@ struct SigmoidFocalLossTestCase
 inline std::vector<SigmoidFocalLossTestCase> SigmoidFocalLossTestConfigs()
 {
     return {
+        SigmoidFocalLossTestCase({1}),                   // 1D cont
         SigmoidFocalLossTestCase({4000}),                   // 1D cont
         SigmoidFocalLossTestCase({100, 500}),               // 2D cont
         SigmoidFocalLossTestCase({100, 500}, false),        // 2D non-cont
@@ -308,9 +309,9 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
 
         size_t workspaceSizeBytes = miopen::GetSigmoidFocalLossForwardWorkspaceSize(
             handle, input.desc, target.desc, output.desc, reduction);
-        size_t workspaceElements = workspaceSizeBytes / sizeof(TIO);
+        size_t workspaceElements = workspaceSizeBytes / sizeof(float);
 
-        workspace = tensor<TIO>(workspaceElements);
+        workspace = tensor<float>(workspaceElements);
         std::fill(workspace.begin(), workspace.end(), 0);
 
         output = tensor<TIO>(1);
@@ -373,7 +374,7 @@ struct SigmoidFocalLossFwdTest : public ::testing::TestWithParam<SigmoidFocalLos
 
     tensor<TIO> input;
     tensor<TIO> target;
-    tensor<TIO> workspace;
+    tensor<float> workspace;
     tensor<TIO> output;
 
     tensor<TIO> outputHost;

From 449a51d9a07b9cc3c8058eb1cb07f6e7bcedce83 Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Tue, 27 Aug 2024 13:22:39 +0700
Subject: [PATCH 22/28] apply clang-format

---
 driver/sigmoid_focal_loss_driver.hpp          | 20 ++++++++---------
 .../forward_reduce_sigmoid_focal_loss.cpp     | 22 ++++++++++++-------
 test/gtest/sigmoid_focal_loss.hpp             |  2 +-
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/driver/sigmoid_focal_loss_driver.hpp b/driver/sigmoid_focal_loss_driver.hpp
index 23071d7f97..188336af62 100644
--- a/driver/sigmoid_focal_loss_driver.hpp
+++ b/driver/sigmoid_focal_loss_driver.hpp
@@ -238,19 +238,17 @@ int SigmoidFocalLossDriver<Tgpu, Tcheck>::AllocateBuffersAndCopy()
 
     miopenGetSigmoidFocalLossForwardWorkspaceSize(
         handle, inputDesc, targetDesc, outputDesc, reduction, &workSpaceSizeInBytes);
-    // workspace_dev =
-    //     std::unique_ptr<GPUMem>(new GPUMem(ctx, workSpaceSizeInBytes / sizeof(Tgpu), sizeof(Tgpu)));
     workspace_dev = std::make_unique<GPUMem>(ctx, workSpaceSizeInBytes, sizeof(std::byte));
 
-    input                 = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
-    target                = std::vector<Tgpu>(target_sz, static_cast<Tgpu>(0));
-    output                = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
-    outputHost            = std::vector<Tcheck>(out_sz, static_cast<Tcheck>(0));
-    doutput               = std::vector<Tgpu>(dO_sz, static_cast<Tgpu>(0));
-    dinput                = std::vector<Tgpu>(dI_sz, static_cast<Tgpu>(0));
-    dinputHost            = std::vector<Tcheck>(dI_sz, static_cast<Tcheck>(0));
-    dtarget               = std::vector<Tgpu>(dT_sz, static_cast<Tgpu>(0));
-    dtargetHost           = std::vector<Tcheck>(dT_sz, static_cast<Tcheck>(0));
+    input       = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    target      = std::vector<Tgpu>(target_sz, static_cast<Tgpu>(0));
+    output      = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+    outputHost  = std::vector<Tcheck>(out_sz, static_cast<Tcheck>(0));
+    doutput     = std::vector<Tgpu>(dO_sz, static_cast<Tgpu>(0));
+    dinput      = std::vector<Tgpu>(dI_sz, static_cast<Tgpu>(0));
+    dinputHost  = std::vector<Tcheck>(dI_sz, static_cast<Tcheck>(0));
+    dtarget     = std::vector<Tgpu>(dT_sz, static_cast<Tgpu>(0));
+    dtargetHost = std::vector<Tcheck>(dT_sz, static_cast<Tcheck>(0));
 
     float randomBound = 2;
     // For half, the random bound is smaller to avoid half overflow
diff --git a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
index beacf73263..5af00b9701 100644
--- a/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
+++ b/src/solver/sigmoidfocalloss/forward_reduce_sigmoid_focal_loss.cpp
@@ -76,21 +76,27 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
     };
 
     /* Prepare params for loss kernel */
-    result.construction_params.push_back(make_hip_kernel(
-        {LOCAL_SIZE_SIGMOIDFOCALLOSS}, {size}, "MIOpenSigmoidFocalLoss.cpp", "SigmoidFocalLossFwd", build_params));
+    result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_SIGMOIDFOCALLOSS},
+                                                         {size},
+                                                         "MIOpenSigmoidFocalLoss.cpp",
+                                                         "SigmoidFocalLossFwd",
+                                                         build_params));
 
     /* Prepare params for reduce kernels */
     auto _size = size;
     while(_size > LOCAL_SIZE_REDUCE)
     {
-        result.construction_params.push_back(make_hip_kernel(
-            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSumFLOATACCUM", build_params));
-            // {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_REDUCE},
+                                                             {_size},
+                                                             "MIOpenReduceSum.cpp",
+                                                             "ReduceSumFLOATACCUM",
+                                                             build_params));
+        // {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenLossSum.cpp", "LossSum", build_params));
         _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
-    } 
+    }
 
     result.construction_params.push_back(make_hip_kernel(
-            {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params));
+        {LOCAL_SIZE_REDUCE}, {_size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params));
 
     result.invoker_factory = [this, problem](const std::vector<Kernel>& kernels) {
         return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
@@ -147,7 +153,7 @@ ConvSolution SigmoidFocalLossFwd::GetSolution(
                 }
                 else
                 {
-                    auto output_tv        = get_inner_expanded_tv<1>(deref(params.outputDesc));
+                    auto output_tv = get_inner_expanded_tv<1>(deref(params.outputDesc));
                     kernel(reduceIn, params.output, size, output_tv);
                 }
                 size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
diff --git a/test/gtest/sigmoid_focal_loss.hpp b/test/gtest/sigmoid_focal_loss.hpp
index 77023364b3..ab59893e52 100644
--- a/test/gtest/sigmoid_focal_loss.hpp
+++ b/test/gtest/sigmoid_focal_loss.hpp
@@ -79,7 +79,7 @@ struct SigmoidFocalLossTestCase
 inline std::vector<SigmoidFocalLossTestCase> SigmoidFocalLossTestConfigs()
 {
     return {
-        SigmoidFocalLossTestCase({1}),                   // 1D cont
+        SigmoidFocalLossTestCase({1}),                      // 1D cont
         SigmoidFocalLossTestCase({4000}),                   // 1D cont
         SigmoidFocalLossTestCase({100, 500}),               // 2D cont
         SigmoidFocalLossTestCase({100, 500}, false),        // 2D non-cont

From 446322c8b3dda9b1be619db08d45bd88ae84178e Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Wed, 28 Aug 2024 10:17:39 +0700
Subject: [PATCH 23/28] fix implicitgemm_ck_util.hpp

---
 include/miopen/miopen.h                       |  2 +-
 .../miopen/solver/implicitgemm_ck_util.hpp    | 26 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 82907855ab..923906df56 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -1605,7 +1605,7 @@ miopenConvolutionBackwardWeightsGetSolution(miopenHandle_t handle,
  * as part of the
  * miopenConvSolution_t struct.
  *
- * @param handle         MIOpen handle (input
+ * @param handle         MIOpen handle (input)
  * @param dyDesc         Tensor descriptor for data tensor dy (input)
  * @param xDesc          Tensor descriptor for data tensor x (input)
  * @param convDesc       Convolution layer descriptor (input)
diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp
index e6cceaef0f..64665b2af2 100644
--- a/src/include/miopen/solver/implicitgemm_ck_util.hpp
+++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -680,7 +680,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
         internal::MakeTaggedTransposeInstances<CKArgsType>(
             result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des);
 
-    result.invoker_factory = [split_k             = split_k,
+    result.invoker_factory = [split_k,
                               ck_args             = std::move(ck_args),
                               sh_conv_ptr         = std::shared_ptr{std::move(*ptr_iter)},
                               input1_tr_inst      = std::move(_input1_tr_inst),
@@ -689,7 +689,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                               output_init_tr_inst = std::move(_output_init_tr_inst),
                               ck_buff_des =
                                   _ck_buff_des](const std::vector<Kernel>& kernels) mutable {
-        return [split_k = split_k,
+        return [split_k,
                 kernels,
                 ck_args             = std::move(ck_args),
                 sh_conv_ptr         = std::move(sh_conv_ptr),
@@ -697,8 +697,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                 input2_tr_inst      = std::move(input2_tr_inst),
                 output_tr_inst      = std::move(output_tr_inst),
                 output_init_tr_inst = std::move(output_init_tr_inst),
-                ck_buff_des         = ck_buff_des](const Handle& handle,
-                                           const AnyInvokeParams& primitive_parameters) mutable {
+                ck_buff_des](const Handle& handle,
+                             const AnyInvokeParams& primitive_parameters) mutable {
             handle.ResetKernelTime();
 
             const auto& data_ctx = primitive_parameters.CastTo<CastType>();
@@ -826,17 +826,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&,
         [[maybe_unused]] bool should_allocated_wrw_buffer =
             ShouldAllocateWorkSpaceBufferForWRW(problem);
 
-        result.invoker_factory = [split_k                     = split_k,
-                                  ck_args                     = CKArgsType{problem},
-                                  alpha_beta_case             = alpha_beta_case,
-                                  should_allocated_wrw_buffer = should_allocated_wrw_buffer,
+        result.invoker_factory = [split_k,
+                                  ck_args = CKArgsType{problem},
+                                  alpha_beta_case,
+                                  should_allocated_wrw_buffer,
                                   sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](
                                      const std::vector<Kernel>&) mutable {
-            return [split_k                     = split_k,
-                    ck_args                     = std::move(ck_args),
-                    alpha_beta_case             = alpha_beta_case,
-                    should_allocated_wrw_buffer = should_allocated_wrw_buffer,
-                    sh_conv_ptr                 = std::move(sh_conv_ptr)](
+            return [split_k,
+                    ck_args = std::move(ck_args),
+                    alpha_beta_case,
+                    should_allocated_wrw_buffer,
+                    sh_conv_ptr = std::move(sh_conv_ptr)](
                        const Handle& handle, const AnyInvokeParams& primitive_parameters) {
                 const auto& data_ctx = primitive_parameters.CastTo<CastType>();
                 std::unique_ptr<ck::tensor_operation::device::BaseArgument> argument_ptr;

From 41b9300368a3f31201711d7bef0f37702045ac6f Mon Sep 17 00:00:00 2001
From: BuiChiTrung <trungcspntl@gmail.com>
Date: Sat, 31 Aug 2024 11:29:00 +0700
Subject: [PATCH 24/28] add comment to kernel

---
 src/kernels/MIOpenSigmoidFocalLoss.cpp | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/kernels/MIOpenSigmoidFocalLoss.cpp b/src/kernels/MIOpenSigmoidFocalLoss.cpp
index b8f3630e8d..d12335a5f7 100644
--- a/src/kernels/MIOpenSigmoidFocalLoss.cpp
+++ b/src/kernels/MIOpenSigmoidFocalLoss.cpp
@@ -54,6 +54,12 @@ __device__ void sigmoidFocalLossFwd(const TIO* input,
                                     tensor_view_t<5> input_tv,
                                     tensor_view_t<5> target_tv)
 {
+    /*
+        Dim: input = target = workspace = {N, C, D, H, W}.
+        Each thread handles one element of the input and target tensors.
+        Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}.
+        Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}.
+    */
     size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
 
     tensor_layout_t<5> idx(input_tv, gid);
@@ -63,6 +69,7 @@ __device__ void sigmoidFocalLossFwd(const TIO* input,
     FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]);
     FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
 
+    /* The formula follows torchvision package: torchvision/ops/focal_loss.py */
     FLOAT_ACCUM p      = 1 / (1 + exp(-i));
     FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p));
     FLOAT_ACCUM pT     = p * t + (1 - p) * (1 - t);
@@ -105,6 +112,12 @@ __device__ void sigmoidFocalLossBwd(const TIO* input,
                                     tensor_view_t<5> dinput_tv,
                                     tensor_view_t<5> dtarget_tv)
 {
+    /*
+        Dim: input = target = doutput = dinput = dtarget = {N, C, D, H, W}.
+        Each thread handles one element of the input, target, and doutput tensors.
+        Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}.
+        Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}.
+    */
     size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
 
     tensor_layout_t<5> idx(input_tv, gid);
@@ -116,6 +129,7 @@ __device__ void sigmoidFocalLossBwd(const TIO* input,
     FLOAT_ACCUM t  = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
     FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(doIdx)]);
 
+    /* The formula is obtained by differentiating the forward formula */
     FLOAT_ACCUM p       = 1 / (1 + exp(-i));
     FLOAT_ACCUM ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
     FLOAT_ACCUM pT      = p * t + (1 - p) * (1 - t);
@@ -199,6 +213,12 @@ __device__ void sigmoidFocalLossUnreducedFwd(const TIO* input,
                                              tensor_view_t<5> target_tv,
                                              tensor_view_t<5> output_tv)
 {
+    /*
+        Dim: input = target = output = {N, C, D, H, W}.
+        Each thread handles one element of the input and target tensors.
+        Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}.
+        Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}.
+    */
     size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
 
     tensor_layout_t<5> idx(input_tv, gid);
@@ -208,6 +228,7 @@ __device__ void sigmoidFocalLossUnreducedFwd(const TIO* input,
     FLOAT_ACCUM i = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(idx)]);
     FLOAT_ACCUM t = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
 
+    /* The formula follows torchvision package: torchvision/ops/focal_loss.py */
     FLOAT_ACCUM p      = 1 / (1 + exp(-i));
     FLOAT_ACCUM ceLoss = -(t * log(p) + (1 - t) * log(1 - p));
     FLOAT_ACCUM pT     = p * t + (1 - p) * (1 - t);
@@ -249,6 +270,12 @@ __device__ void sigmoidFocalLossUnreducedBwd(const TIO* input,
                                              tensor_view_t<5> dinput_tv,
                                              tensor_view_t<5> dtarget_tv)
 {
+    /*
+        Dim: input = target = doutput = dinput = dtarget = {N, C, D, H, W}.
+        Each thread handles one element of the input, target, and doutput tensors.
+        Lws = {LOCAL_SIZE_SIGMOIDFOCALLOSS(default = 256), 1, 1}.
+        Gws = {AlignUp(N * C * D * H * W, lws.x), 1, 1}.
+    */
     size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
 
     tensor_layout_t<5> idx(input_tv, gid);
@@ -259,6 +286,7 @@ __device__ void sigmoidFocalLossUnreducedBwd(const TIO* input,
     FLOAT_ACCUM t  = CVT_FLOAT2ACCUM(target[target_tv.get_tensor_view_idx(idx)]);
     FLOAT_ACCUM dO = CVT_FLOAT2ACCUM(doutput[doutput_tv.get_tensor_view_idx(idx)]);
 
+    /* The formula is obtained by differentiating the forward formula */
     FLOAT_ACCUM p       = 1 / (1 + exp(-i));
     FLOAT_ACCUM ceLoss  = -(t * log(p) + (1 - t) * log(1 - p));
     FLOAT_ACCUM pT      = p * t + (1 - p) * (1 - t);

From d2f2dd133eca00d86f02192e6b4179d5bd59c0b2 Mon Sep 17 00:00:00 2001
From: long10024070 <long.luong@moreh.com.vn>
Date: Tue, 5 Nov 2024 04:01:36 +0000
Subject: [PATCH 25/28] undo ck changed

---
 .../miopen/solver/implicitgemm_ck_util.hpp    | 37 ++++++++++---------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp
index 64665b2af2..cf3e53a53e 100644
--- a/src/include/miopen/solver/implicitgemm_ck_util.hpp
+++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -29,6 +29,7 @@
 #include <miopen/conv/data_invoke_params.hpp>
 #include <miopen/conv/wrw_invoke_params.hpp>
 #include <miopen/batched_transpose_sol.hpp>
+#include <miopen/buffer_info.hpp>
 #include <miopen/tensor_ops.hpp>
 #include <miopen/miopen_internal.h>
 
@@ -376,9 +377,10 @@ class TransposeInstance
         Run(handle, kernels, out_ptr, buf_handle.get());
     }
 
-    void ZeroOutBuffer()
+    void ZeroOutBuffer(const Handle& handle)
     {
-        [[maybe_unused]] auto status = hipMemsetAsync(buf_handle.get(), 0, tensor_sz);
+        [[maybe_unused]] auto status =
+            hipMemsetAsync(buf_handle.get(), 0, tensor_sz, handle.GetStream());
         assert(status == hipSuccess);
     }
 
@@ -600,7 +602,8 @@ inline bool CKWrwRequireWorkspace(
     size_t K_per_group = K / G;
 
     return (alpha_beta_case == BILINEAR || alpha_beta_case == SCALE) ||
-           (data_type == miopenHalf && (is_odd(C_per_group) || is_odd(K_per_group)));
+           ((data_type == miopenHalf || data_type == miopenBFloat16) &&
+            (is_odd(C_per_group) || is_odd(K_per_group)));
 }
 
 /// \todo move to a cpp file
@@ -680,7 +683,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
         internal::MakeTaggedTransposeInstances<CKArgsType>(
             result, ctx, problem, ck_args, input1_op, input2_op, output_op, _ck_buff_des);
 
-    result.invoker_factory = [split_k,
+    result.invoker_factory = [split_k             = split_k,
                               ck_args             = std::move(ck_args),
                               sh_conv_ptr         = std::shared_ptr{std::move(*ptr_iter)},
                               input1_tr_inst      = std::move(_input1_tr_inst),
@@ -689,7 +692,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                               output_init_tr_inst = std::move(_output_init_tr_inst),
                               ck_buff_des =
                                   _ck_buff_des](const std::vector<Kernel>& kernels) mutable {
-        return [split_k,
+        return [split_k = split_k,
                 kernels,
                 ck_args             = std::move(ck_args),
                 sh_conv_ptr         = std::move(sh_conv_ptr),
@@ -697,8 +700,8 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
                 input2_tr_inst      = std::move(input2_tr_inst),
                 output_tr_inst      = std::move(output_tr_inst),
                 output_init_tr_inst = std::move(output_init_tr_inst),
-                ck_buff_des](const Handle& handle,
-                             const AnyInvokeParams& primitive_parameters) mutable {
+                ck_buff_des         = ck_buff_des](const Handle& handle,
+                                           const AnyInvokeParams& primitive_parameters) mutable {
             handle.ResetKernelTime();
 
             const auto& data_ctx = primitive_parameters.CastTo<CastType>();
@@ -734,7 +737,7 @@ ConvSolution InitInvokerFactoryNCHW(const ExecutionContext& ctx,
             /// \todo: Will need SetTensor() to properly zero out non-packed tensors
             if(output_tr_inst.GetConvOperandTag() == internal::ConvOperandTag::Weights)
             {
-                output_tr_inst.ZeroOutBuffer();
+                output_tr_inst.ZeroOutBuffer(handle);
             }
 
             std::array<internal::TransposeInstanceTagged*, 3> tr_ptrs = {
@@ -826,17 +829,17 @@ ConvSolution InitInvokerFactoryNHWC(const ExecutionContext&,
         [[maybe_unused]] bool should_allocated_wrw_buffer =
             ShouldAllocateWorkSpaceBufferForWRW(problem);
 
-        result.invoker_factory = [split_k,
-                                  ck_args = CKArgsType{problem},
-                                  alpha_beta_case,
-                                  should_allocated_wrw_buffer,
+        result.invoker_factory = [split_k                     = split_k,
+                                  ck_args                     = CKArgsType{problem},
+                                  alpha_beta_case             = alpha_beta_case,
+                                  should_allocated_wrw_buffer = should_allocated_wrw_buffer,
                                   sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](
                                      const std::vector<Kernel>&) mutable {
-            return [split_k,
-                    ck_args = std::move(ck_args),
-                    alpha_beta_case,
-                    should_allocated_wrw_buffer,
-                    sh_conv_ptr = std::move(sh_conv_ptr)](
+            return [split_k                     = split_k,
+                    ck_args                     = std::move(ck_args),
+                    alpha_beta_case             = alpha_beta_case,
+                    should_allocated_wrw_buffer = should_allocated_wrw_buffer,
+                    sh_conv_ptr                 = std::move(sh_conv_ptr)](
                        const Handle& handle, const AnyInvokeParams& primitive_parameters) {
                 const auto& data_ctx = primitive_parameters.CastTo<CastType>();
                 std::unique_ptr<ck::tensor_operation::device::BaseArgument> argument_ptr;

From 4e4e1fcb2dc3f6069d01271752cccf37cca764e2 Mon Sep 17 00:00:00 2001
From: long10024070 <long.luong@moreh.com.vn>
Date: Tue, 5 Nov 2024 04:01:51 +0000
Subject: [PATCH 26/28] code shorten

---
 test/gtest/sigmoid_focal_loss.cpp | 219 +++++-------------------------
 1 file changed, 36 insertions(+), 183 deletions(-)

diff --git a/test/gtest/sigmoid_focal_loss.cpp b/test/gtest/sigmoid_focal_loss.cpp
index fa90ceb218..48982ee8db 100644
--- a/test/gtest/sigmoid_focal_loss.cpp
+++ b/test/gtest/sigmoid_focal_loss.cpp
@@ -25,88 +25,29 @@
  *******************************************************************************/
 
 #include "sigmoid_focal_loss.hpp"
-#include "tensor_holder.hpp"
 #include <miopen/bfloat16.hpp>
-#include <miopen/env.hpp>
-
-MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
-MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
 
 namespace sigmoidfocalloss {
-
-std::string GetFloatArg()
-{
-    const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG);
-    if(tmp.empty())
-    {
-        return "";
-    }
-    return tmp;
-}
-
-struct GPU_SigmoidFocalLoss_fwd_FP32 : SigmoidFocalLossFwdTest<float>
-{
-};
-
-struct GPU_SigmoidFocalLoss_fwd_FP16 : SigmoidFocalLossFwdTest<half>
-{
-};
-
-struct GPU_SigmoidFocalLoss_fwd_BFP16 : SigmoidFocalLossFwdTest<bfloat16>
-{
-};
-
-struct GPU_SigmoidFocalLoss_bwd_FP32 : SigmoidFocalLossBwdTest<float>
-{
-};
-
-struct GPU_SigmoidFocalLoss_bwd_FP16 : SigmoidFocalLossBwdTest<half>
-{
-};
-
-struct GPU_SigmoidFocalLoss_bwd_BFP16 : SigmoidFocalLossBwdTest<bfloat16>
-{
-};
-
-struct GPU_SigmoidFocalLossUnreduced_fwd_FP32 : SigmoidFocalLossUnreducedFwdTest<float>
-{
-};
-
-struct GPU_SigmoidFocalLossUnreduced_fwd_FP16 : SigmoidFocalLossUnreducedFwdTest<half>
-{
-};
-
-struct GPU_SigmoidFocalLossUnreduced_fwd_BFP16 : SigmoidFocalLossUnreducedFwdTest<bfloat16>
-{
-};
-
-struct GPU_SigmoidFocalLossUnreduced_bwd_FP32 : SigmoidFocalLossUnreducedBwdTest<float>
-{
-};
-
-struct GPU_SigmoidFocalLossUnreduced_bwd_FP16 : SigmoidFocalLossUnreducedBwdTest<half>
-{
-};
-
-struct GPU_SigmoidFocalLossUnreduced_bwd_BFP16 : SigmoidFocalLossUnreducedBwdTest<bfloat16>
-{
-};
+using GPU_SigmoidFocalLoss_fwd_FP32           = SigmoidFocalLossFwdTest<float>;
+using GPU_SigmoidFocalLoss_fwd_FP16           = SigmoidFocalLossFwdTest<half>;
+using GPU_SigmoidFocalLoss_fwd_BFP16          = SigmoidFocalLossFwdTest<bfloat16>;
+using GPU_SigmoidFocalLoss_bwd_FP32           = SigmoidFocalLossBwdTest<float>;
+using GPU_SigmoidFocalLoss_bwd_FP16           = SigmoidFocalLossBwdTest<half>;
+using GPU_SigmoidFocalLoss_bwd_BFP16          = SigmoidFocalLossBwdTest<bfloat16>;
+using GPU_SigmoidFocalLossUnreduced_fwd_FP32  = SigmoidFocalLossUnreducedFwdTest<float>;
+using GPU_SigmoidFocalLossUnreduced_fwd_FP16  = SigmoidFocalLossUnreducedFwdTest<half>;
+using GPU_SigmoidFocalLossUnreduced_fwd_BFP16 = SigmoidFocalLossUnreducedFwdTest<bfloat16>;
+using GPU_SigmoidFocalLossUnreduced_bwd_FP32  = SigmoidFocalLossUnreducedBwdTest<float>;
+using GPU_SigmoidFocalLossUnreduced_bwd_FP16  = SigmoidFocalLossUnreducedBwdTest<half>;
+using GPU_SigmoidFocalLossUnreduced_bwd_BFP16 = SigmoidFocalLossUnreducedBwdTest<bfloat16>;
 }; // namespace sigmoidfocalloss
 
 using namespace sigmoidfocalloss;
 
 TEST_P(GPU_SigmoidFocalLoss_fwd_FP32, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -115,16 +56,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLoss_fwd_FP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -133,16 +66,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLoss_fwd_BFP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -151,16 +76,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLoss_bwd_FP32, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -169,16 +86,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLoss_bwd_FP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -187,16 +96,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLoss_bwd_BFP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -205,16 +106,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP32, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -223,16 +116,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_FP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -241,16 +126,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLossUnreduced_fwd_BFP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -259,16 +136,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP32, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -277,16 +146,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_FP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,
@@ -295,16 +156,8 @@ INSTANTIATE_TEST_SUITE_P(Full,
 
 TEST_P(GPU_SigmoidFocalLossUnreduced_bwd_BFP16, Test)
 {
-    if(!MIOPEN_TEST_ALL ||
-       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
-    {
-        RunTest();
-        Verify();
-    }
-    else
-    {
-        GTEST_SKIP();
-    }
+    RunTest();
+    Verify();
 };
 
 INSTANTIATE_TEST_SUITE_P(Full,

From a33ad209aead3dbdcd98e37df84c1ad834f979c1 Mon Sep 17 00:00:00 2001
From: long10024070 <long.luong@moreh.com.vn>
Date: Tue, 5 Nov 2024 05:06:59 +0000
Subject: [PATCH 27/28] Fig build error

---
 include/miopen/miopen.h                       | 682 +++++++++++++++---
 .../miopen/sigmoidfocalloss/solvers.hpp       |   1 +
 src/include/miopen/solver_id.hpp              |   2 +-
 src/solver.cpp                                |  41 +-
 4 files changed, 600 insertions(+), 126 deletions(-)

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 923906df56..27b5ebe327 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -2737,6 +2737,67 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle,
                                         double epsilon,
                                         void* resultSaveMean,
                                         void* resultSaveInvVariance);
+/*! @brief Execute forward training layer for batch normalization
+ *
+ * Batch normalization pass for forward training pass.
+ * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale
+ * with their descriptor.
+ *
+ * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean
+ * and inverse variance will not be used.
+ *
+ * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values
+ * for the running mean and variance will not be saved.
+ * Running averages and variances are scaled using an exponential averaging factor: \f[
+ * \mu_{old} = \mu_{new}*factor + \mu_{old}*(1-factor)
+ * \f]
+ * where \f[
+ * factor=1/(1+iteration)
+ * \f]
+ *
+ * @param handle                    MIOpen handle (input)
+ * @param bn_mode                   Batch normalization mode (input)
+ * @param alpha                     Floating point scaling factor, allocated on the host (input)
+ * @param beta                      Floating point shift factor, allocated on the host (input)
+ * @param xDesc                     Tensor descriptor for data input tensor x (input)
+ * @param x                         Data tensor x (input)
+ * @param yDesc                     Tensor descriptor for output data tensor y (input)
+ * @param y                         Data tensor y (output)
+ * @param scaleDesc                 Tensor descriptor for BN scaling
+ * @param biasVarDesc               Tensor descriptor for BN bias
+ * @param savedMeanDesc             Tensor descriptor for BN saved Mean
+ * @param savedVarDesc              Tensor descriptor for BN saved Variance
+ * @param bnScale                   Batch norm scaling, gamma, tensor (input)
+ * @param bnBias                    Batch norm bias, beta, tensor (input)
+ * @param expAvgFactor              Exponential averaging factor (input)
+ * @param resultRunningMean         Running average saved for inference (output)
+ * @param resultRunningVariance     Running variance saved for inference (output)
+ * @param epsilon                   Value to stabilize inverse variance calculation (input)
+ * @param resultSaveMean            Saved mini-batch mean for backwards pass (output)
+ * @param resultSaveInvVariance     Saved mini-batch inverse variance for backwards pass (output)
+ * @return                          miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle,
+                                           miopenBatchNormMode_t bn_mode,
+                                           void* alpha,
+                                           void* beta,
+                                           const miopenTensorDescriptor_t xDesc,
+                                           const void* x,
+                                           const miopenTensorDescriptor_t yDesc,
+                                           void* y,
+                                           const miopenTensorDescriptor_t scaleDesc,
+                                           const miopenTensorDescriptor_t biasVarDesc,
+                                           const miopenTensorDescriptor_t savedMeanDesc,
+                                           const miopenTensorDescriptor_t savedVarDesc,
+                                           void* bnScale,
+                                           void* bnBias,
+                                           double expAvgFactor,
+                                           void* resultRunningMean,
+                                           void* resultRunningVariance,
+                                           double epsilon,
+                                           void* resultSaveMean,
+                                           void* resultSaveInvVariance);
 
 /*! @brief Execute forward inference layer for batch normalization
  *
@@ -2783,6 +2844,56 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle,
                                          void* estimatedVariance,
                                          double epsilon);
 
+/*! @brief Execute forward inference layer for batch normalization
+ *
+ * Batch normalization pass for forward inference pass.
+ * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale
+ * with their descriptor.
+ *
+ * If either estimatedMean, or estimatedVariance are null pointers then the values for the mean and
+ * variance will be calculated from input data and this calculated mean and variance will be used
+ * to update input values.
+ * If variance is zero and epsilon is also zero, this function outputs NaN values. The input
+ * epsilon value should always be a non-zero positive value.
+ *
+ * @param handle                    MIOpen handle (input)
+ * @param bn_mode                   Batch normalization mode (input)
+ * @param alpha                     Floating point scaling factor, allocated on the host (input)
+ * @param beta                      Floating point shift factor, allocated on the host (input)
+ * @param xDesc                     Tensor descriptor for data input tensor x (input)
+ * @param x                         Data tensor x (input)
+ * @param yDesc                     Tensor descriptor for output data tensor y (input)
+ * @param y                         Data tensor y (output)
+ * @param scaleDesc                 Tensor descriptor for BN scaling
+ * @param biasDesc                  Tensor descriptor for BN bias
+ * @param estMeanDesc               Tensor descriptor for BN estimated Mean
+ * @param estVarianceDesc           Tensor descriptor for BN estimated Variance
+ * @param bnScale                   Batch norm scaling, gamma, tensor (input)
+ * @param bnBias                    Batch norm bias, beta, tensor (input)
+ * @param estimatedMean             Running average saved during forward training (input)
+ * @param estimatedVariance         Running variance saved during forward training (input)
+ * @param epsilon                   Value to stabilize inverse variance calculation (input)
+ * @return                          miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle,
+                                            miopenBatchNormMode_t bn_mode,
+                                            void* alpha,
+                                            void* beta,
+                                            const miopenTensorDescriptor_t xDesc,
+                                            const void* x,
+                                            const miopenTensorDescriptor_t yDesc,
+                                            void* y,
+                                            const miopenTensorDescriptor_t scaleDesc,
+                                            const miopenTensorDescriptor_t biasDesc,
+                                            const miopenTensorDescriptor_t estMeanDesc,
+                                            const miopenTensorDescriptor_t estVarianceDesc,
+                                            void* bnScale,
+                                            void* bnBias,
+                                            void* estimatedMean,
+                                            void* estimatedVariance,
+                                            double epsilon);
+
 /*! @brief Execute backwards propagation layer for batch normalization
  *
  * Batch normalization pass for backwards propagation training pass.
@@ -2838,6 +2949,68 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
                                  const void* savedMean,
                                  const void* savedInvVariance);
 
+/*! @brief Execute backwards propagation layer for batch normalization
+ *
+ * Batch normalization pass for backwards propagation training pass.
+ * The method for backwards propagation batch normalization.
+ *
+ * Takes in batch normalization mode bn_mode and input tensor data x, input activation tensor dy,
+ * output tensor dx, the learned tensors resultBNBiasDiff and resultBNScaleDiff with their
+ * descriptor.
+ *
+ * If BOTH savedMean, and savedVariance are not null pointers then the method will use the saved
+ * mean and variance calculated by the forward training phase.
+ *
+ * @param handle                    MIOpen handle (input)
+ * @param bn_mode                   Batch normalization mode (input)
+ * @param alphaDataDiff             Floating point scaling factor, allocated on the host (input)
+ * @param betaDataDiff              Floating point shift factor, allocated on the host (input)
+ * @param alphaParamDiff            Floating point scaling factor, allocated on the host (input)
+ * @param betaParamDiff             Floating point shift factor, allocated on the host (input)
+ * @param xDesc                     Tensor descriptor for data input tensor x (input)
+ * @param x                         Data tensor x (input)
+ * @param dyDesc                    Tensor descriptor for input tensor dy (input)
+ * @param dy                        Data tensor dy (input)
+ * @param dxDesc                    Tensor descriptor for output data tensor dx (input)
+ * @param dx                        Data delta tensor dx (output)
+ * @param scaleDesc                 Tensor descriptor for scaling descriptor (input)
+ * @param biasDesc                  Tensor descriptor for bias/shift descriptor (input)
+ * @param savedMeanDesc             Tensor descriptor for the saved mean tensor
+ *                                  (input)
+ * @param savedVarDesc              Tensor descriptor for the saved variance tensor
+ *                                  (input)
+ * @param bnScale                   Batch norm scaling, gamma, tensor (input)
+ * @param resultBnScaleDiff         Tensor for dscale (output)
+ * @param resultBnBiasDiff          Tensor for dbias (output)
+ * @param epsilon                   Value to stabilize inverse variance calculation (input)
+ * @param savedMean                 Saved mini-batch mean for backwards pass (input)
+ * @param savedInvVariance          Saved mini-batch inverse variance for backwards pass (input)
+ * @return                          miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenBatchNormalizationBackward_V2(miopenHandle_t handle,
+                                    miopenBatchNormMode_t bn_mode,
+                                    const void* alphaDataDiff,
+                                    const void* betaDataDiff,
+                                    const void* alphaParamDiff,
+                                    const void* betaParamDiff,
+                                    const miopenTensorDescriptor_t xDesc,
+                                    const void* x,
+                                    const miopenTensorDescriptor_t dyDesc,
+                                    const void* dy,
+                                    const miopenTensorDescriptor_t dxDesc,
+                                    void* dx,
+                                    const miopenTensorDescriptor_t scaleDesc,
+                                    const miopenTensorDescriptor_t biasDesc,
+                                    const miopenTensorDescriptor_t savedMeanDesc,
+                                    const miopenTensorDescriptor_t savedVarDesc,
+                                    const void* bnScale,
+                                    void* resultBnScaleDiff,
+                                    void* resultBnBiasDiff,
+                                    double epsilon,
+                                    const void* savedMean,
+                                    const void* savedInvVariance);
+
 /** @} */
 // CLOSEOUT BATCHNORM DOXYGEN GROUP
 
@@ -2951,6 +3124,54 @@ miopenDestroyActivationDescriptor(miopenActivationDescriptor_t activDesc);
 /** @} */
 // CLOSEOUT ACTIVATION DOXYGEN GROUP
 
+#ifdef MIOPEN_BETA_API
+/** @addtogroup activation
+ *
+ *  @{
+ */
+
+/*! @brief Execute a GLU forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Input tensor (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param output                   Output tensor (output)
+ * @param dim                      Dimension to split the input (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenGLUForward(miopenHandle_t handle,
+                                              const miopenTensorDescriptor_t inputDesc,
+                                              const void* input,
+                                              const miopenTensorDescriptor_t outputDesc,
+                                              void* output,
+                                              const uint32_t dim);
+
+/*! @brief Execute a GLU backward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Input tensor (input)
+ * @param outputGradDesc           Tensor descriptor for delta output tensor (input)
+ * @param outputGrad               Delta output tensor (input)
+ * @param inputGradDesc            Tensor descriptor for delta input tensor (input)
+ * @param inputGrad                Delta input tensor (output)
+ * @param dim                      Dimension to split the input (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenGLUBackward(miopenHandle_t handle,
+                                               const miopenTensorDescriptor_t inputDesc,
+                                               const void* input,
+                                               const miopenTensorDescriptor_t outputGradDesc,
+                                               const void* outputGrad,
+                                               const miopenTensorDescriptor_t inputGradDesc,
+                                               void* inputGrad,
+                                               const uint32_t dim);
+
+/** @} */
+// CLOSEOUT ACTIVATION DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 // Softmax APIs
 /** @addtogroup softmax
  *
@@ -5006,98 +5227,6 @@ MIOPEN_EXPORT miopenStatus_t miopenCTCLoss(miopenHandle_t handle,
                                            void* workSpace,
                                            size_t workSpaceSize);
 
-#ifdef MIOPEN_BETA_API
-
-typedef enum
-{
-    MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */
-    MIOPEN_LOSS_REDUCTION_SUM  = 1, /*!< output tensor elements are summed up */
-    MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with total
-                                       number of elements to get mean value */
-} miopenLossReductionMode_t;
-
-/*! @brief Helper function to query the minimum workspace size required by the sigmoid focal loss
- * call
- *
- * @param handle                   MIOpen Handle (input)
- * @param inputDesc                Tensor descriptor for input tensor (input)
- * @param targetDesc               Tensor descriptor for target tensor (input)
- * @param outputDesc               Tensor descriptor for output tensor (input)
- * @param reduction                Reduction (input)
- * @param sizeInBytes              Pointer to data to return the minimum workspace size
- * @return                         miopenStatus_t
- */
-MIOPEN_EXPORT miopenStatus_t
-miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle,
-                                              miopenTensorDescriptor_t inputDesc,
-                                              miopenTensorDescriptor_t targetDesc,
-                                              miopenTensorDescriptor_t outputDesc,
-                                              miopenLossReductionMode_t reduction,
-                                              size_t* sizeInBytes);
-
-/*! @brief Execute a SigmoidFocalLoss forward layer
- *
- * @param handle                   MIOpen handle (input)
- * @param workspace                Address of the allocated workspace data (input)
- * @param workspaceSizeInBytes     Size in bytes of the allocated workspace data (input)
- * @param inputDesc                Tensor descriptor for input tensor (input)
- * @param input                    Data tensor input (input)
- * @param targetDesc               Tensor descriptor for target tensor (input)
- * @param target                   Data tensor target (input)
- * @param outputDesc               Tensor descriptor for output tensor (input)
- * @param output                   Data tensor output (output)
- * @param alpha                    Alpha (input)
- * @param gamma                    Gamma (input)
- * @param reduction                Reduction (input)
- * @return                         miopenStatus_t
- */
-MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle,
-                                                           void* workspace,
-                                                           size_t workspaceSizeInBytes,
-                                                           miopenTensorDescriptor_t inputDesc,
-                                                           const void* input,
-                                                           miopenTensorDescriptor_t targetDesc,
-                                                           const void* target,
-                                                           miopenTensorDescriptor_t outputDesc,
-                                                           void* output,
-                                                           float alpha,
-                                                           float gamma,
-                                                           miopenLossReductionMode_t reduction);
-
-/*! @brief Execute a SigmoidFocalLoss backward layer
- *
- * @param handle                   MIOpen handle (input)
- * @param inputDesc                Tensor descriptor for input tensor (input)
- * @param input                    Data tensor input (input)
- * @param targetDesc               Tensor descriptor for target tensor (input)
- * @param target                   Data tensor target (input)
- * @param doutputDesc              Tensor descriptor for output gradient (input)
- * @param doutput                  Gradient of output (input)
- * @param dinputDesc               Tensor descriptor for input gradient (input)
- * @param dinput                   Gradient of input (output)
- * @param dtargetDesc              Tensor descriptor for target gradient (input)
- * @param dtarget                  Gradient of target (output)
- * @param alpha                    Alpha (input)
- * @param gamma                    Gamma (input)
- * @param reduction                Reduction (input)
- * @return                         miopenStatus_t
- */
-MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle,
-                                                            miopenTensorDescriptor_t inputDesc,
-                                                            const void* input,
-                                                            miopenTensorDescriptor_t targetDesc,
-                                                            const void* target,
-                                                            miopenTensorDescriptor_t doutputDesc,
-                                                            const void* doutput,
-                                                            miopenTensorDescriptor_t dinputDesc,
-                                                            void* dinput,
-                                                            miopenTensorDescriptor_t dtargetDesc,
-                                                            void* dtarget,
-                                                            float alpha,
-                                                            float gamma,
-                                                            miopenLossReductionMode_t reduction);
-#endif
-
 /** @} */
 // CLOSEOUT LossFunction DOXYGEN GROUP
 
@@ -5499,40 +5628,42 @@ typedef enum
     miopenTensorMhaAmaxDK             = 33,
     miopenTensorMhaAmaxDV             = 34,
     miopenTensorMhaAmaxDS             = 35,
+    miopenTensorMhaBias               = 36,
 
 #ifdef MIOPEN_BETA_API
-    miopenTensorActivationX                = 36,
-    miopenTensorActivationY                = 37,
-    miopenTensorActivationDX               = 38,
-    miopenTensorActivationDY               = 39,
-    miopenTensorBiasX                      = 40,
-    miopenTensorBiasY                      = 41,
-    miopenTensorBias                       = 42,
-    miopenTensorSoftmaxX                   = 43,
-    miopenTensorSoftmaxY                   = 44,
-    miopenTensorSoftmaxDX                  = 45,
-    miopenTensorSoftmaxDY                  = 46,
-    miopenTensorBatchnormX                 = 47,
-    miopenTensorBatchnormY                 = 48,
-    miopenTensorBatchnormRunningMean       = 49,
-    miopenTensorBatchnormRunningVariance   = 50,
-    miopenTensorBatchnormSavedMean         = 51,
-    miopenTensorBatchnormSavedVariance     = 52,
-    miopenTensorBatchnormScale             = 53,
-    miopenTensorBatchnormScaleDiff         = 54,
-    miopenTensorBatchnormEstimatedMean     = 55,
-    miopenTensorBatchnormEstimatedVariance = 56,
-    miopenTensorBatchnormBias              = 57,
-    miopenTensorBatchnormBiasDiff          = 58,
-    miopenTensorBatchnormDX                = 59,
-    miopenTensorBatchnormDY                = 60,
+    miopenTensorActivationX                = 37,
+    miopenTensorActivationY                = 38,
+    miopenTensorActivationDX               = 39,
+    miopenTensorActivationDY               = 40,
+    miopenTensorBiasX                      = 41,
+    miopenTensorBiasY                      = 42,
+    miopenTensorBias                       = 43,
+    miopenTensorSoftmaxX                   = 44,
+    miopenTensorSoftmaxY                   = 45,
+    miopenTensorSoftmaxDX                  = 46,
+    miopenTensorSoftmaxDY                  = 47,
+    miopenTensorBatchnormX                 = 48,
+    miopenTensorBatchnormY                 = 49,
+    miopenTensorBatchnormRunningMean       = 50,
+    miopenTensorBatchnormRunningVariance   = 51,
+    miopenTensorBatchnormSavedMean         = 52,
+    miopenTensorBatchnormSavedVariance     = 53,
+    miopenTensorBatchnormScale             = 54,
+    miopenTensorBatchnormScaleDiff         = 55,
+    miopenTensorBatchnormEstimatedMean     = 56,
+    miopenTensorBatchnormEstimatedVariance = 57,
+    miopenTensorBatchnormBias              = 58,
+    miopenTensorBatchnormBiasDiff          = 59,
+    miopenTensorBatchnormDX                = 60,
+    miopenTensorBatchnormDY                = 61,
 #endif
 
     miopenTensorArgumentIsScalar = 1U << 31,
 
+    miopenTensorMhaMask = miopenTensorArgumentIsScalar | 1,
 #ifdef MIOPEN_BETA_API
-    miopenScalarBatchnormExpAvgFactor = miopenTensorArgumentIsScalar | 1,
-    miopenScalarBatchnormEpsilon      = miopenTensorArgumentIsScalar | 2,
+    miopenScalarBatchnormExpAvgFactor = miopenTensorArgumentIsScalar | 2,
+    miopenScalarBatchnormEpsilon      = miopenTensorArgumentIsScalar | 3,
 #endif
 } miopenTensorArgumentId_t;
 
@@ -5564,6 +5695,15 @@ MIOPEN_EXPORT miopenStatus_t miopenCreateConvProblem(miopenProblem_t* problem,
  * @return             miopenStatus_t
  */
 
+/*! @enum miopenMhaMask_t
+ * Different masks for Mha.
+ */
+typedef enum
+{
+    miopenMhaMaskNone   = 0,
+    miopenMhaMaskCausal = 1,
+} miopenMhaMask_t;
+
 MIOPEN_EXPORT miopenStatus_t miopenCreateMhaProblem(miopenProblem_t* problem,
                                                     miopenMhaDescriptor_t operatorDesc,
                                                     miopenProblemDirection_t direction);
@@ -7768,6 +7908,40 @@ MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle,
                                                 void* dx);
 /** @} */
 // CLOSEOUT ROPE DOXYGEN GROUP
+// kthvalue APIs
+/** @addtogroup kthvalue
+ *
+ *  @{
+ */
+
+/*! @brief Execute a Kthvalue forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Data tensor input (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param output                   Data tensor output (output)
+ * @param indicesDesc              Tensor descriptor for indices tensor (input)
+ * @param indices                  Data tensor indices (output)
+ * @param k                        The k-th smallest element (input)
+ * @param dim                      The dimension to find the kth value along (Default = -1) (input)
+ * @param keepDim                  Whether the output tensor has dim retained or not (Default =
+ * false) (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenKthvalueForward(miopenHandle_t handle,
+                                                   miopenTensorDescriptor_t inputDesc,
+                                                   const void* input,
+                                                   miopenTensorDescriptor_t outputDesc,
+                                                   void* output,
+                                                   miopenTensorDescriptor_t indicesDesc,
+                                                   size_t* indices,
+                                                   size_t k,
+                                                   int32_t dim  = -1,
+                                                   bool keepDim = false);
+
+/** @} */
+// CLOSEOUT kthvalue DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
 #ifdef MIOPEN_BETA_API
@@ -7824,6 +7998,276 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle,
 // CLOSEOUT RELU DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+#ifdef MIOPEN_BETA_API
+
+/*! @ingroup LossFunction
+ * @enum miopenLossReductionMode_t
+ * Reduction mode for loss function
+ */
+typedef enum
+{
+    MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */
+    MIOPEN_LOSS_REDUCTION_SUM  = 1, /*!< output tensor elements are summed up */
+    MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided by the
+                                       total number of elements to get mean value */
+} miopenLossReductionMode_t;
+
+// SoftMarginLoss APIs
+/** @addtogroup LossFunction
+ *
+ *  @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the
+ * SoftMarginLossForward call
+ *
+ * @param [in]  handle              MIOpen Handle
+ * @param [in]  inputDesc           Tensor descriptor for input tensor
+ * @param [in]  targetDesc          Tensor descriptor for target tensor
+ * @param [in]  outputDesc          Tensor descriptor for output tensor
+ * @param [in]  reduction           Reduction mode (sum, mean). With no reduction there is no need
+ * to call this function
+ * @param [out] sizeInBytes         Pointer to data to return the minimum workspace size
+ * @return                          miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetSoftMarginLossForwardWorkspaceSize(miopenHandle_t handle,
+                                            miopenTensorDescriptor_t inputDesc,
+                                            miopenTensorDescriptor_t targetDesc,
+                                            miopenTensorDescriptor_t outputDesc,
+                                            miopenLossReductionMode_t reduction,
+                                            size_t* sizeInBytes);
+
+/*! @brief Execute a SoftMarginLoss forward layer
+ *
+ * @param [in]  handle                  MIOpen handle
+ * @param [in]  inputDesc               Tensor descriptor for input tensor
+ * @param [in]  input                   Data tensor input
+ * @param [in]  targetDesc              Tensor descriptor for target tensor
+ * @param [in]  target                  Data tensor target
+ * @param [in]  outputDesc              Tensor descriptor for output tensor
+ * @param [out] output                  Data tensor output
+ * @param [in]  reduction               Reduction mode. If reduction mode is mean or sum, you must
+ * provide param workspace and workspaceSizeInBytes. Call
+ * miopenGetSoftMarginLossForwardWorkspaceSize to get workspaceSizeInBytes
+ * @param [in]  workspace               Address of the allocated workspace data (Default = null)
+ * @param [in]  workspaceSizeInBytes    Size in bytes of the allocated workspace data (Default = 0)
+ * @return                              miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossForward(miopenHandle_t handle,
+                                                         miopenTensorDescriptor_t inputDesc,
+                                                         const void* input,
+                                                         miopenTensorDescriptor_t targetDesc,
+                                                         const void* target,
+                                                         miopenTensorDescriptor_t outputDesc,
+                                                         void* output,
+                                                         miopenLossReductionMode_t reduction,
+                                                         void* workspace             = nullptr,
+                                                         size_t workspaceSizeInBytes = 0);
+
+/*! @brief Execute a SoftMarginLoss backward layer
+ *
+ * @param [in]  handle                  MIOpen handle
+ * @param [in]  inputDesc               Tensor descriptor for input tensor
+ * @param [in]  input                   Data tensor input
+ * @param [in]  targetDesc              Tensor descriptor for target tensor
+ * @param [in]  target                  Data tensor target
+ * @param [in]  doutputDesc             Tensor descriptor for output gradient
+ * @param [in]  doutput                 Output gradient
+ * @param [in]  dinputDesc              Tensor descriptor for input gradient
+ * @param [out] dinput                  Input gradient
+ * @param [in]  reduction               Reduction mode (none, sum, mean)
+ * @return                              miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle,
+                                                          miopenTensorDescriptor_t inputDesc,
+                                                          const void* input,
+                                                          miopenTensorDescriptor_t targetDesc,
+                                                          const void* target,
+                                                          miopenTensorDescriptor_t doutputDesc,
+                                                          const void* doutput,
+                                                          miopenTensorDescriptor_t dinputDesc,
+                                                          void* dinput,
+                                                          miopenLossReductionMode_t reduction);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
+#ifdef MIOPEN_BETA_API
+// MultiMarginLoss APIs
+/** @addtogroup LossFunction
+ *
+ *  @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the
+MultiMarginLoss Forward call
+ *
+ * @param [in]  handle              MIOpen Handle
+ * @param [in]  inputDesc           Tensor descriptor for input tensor (N, C) where N is the batch
+size and C is the number of classes
+ * @param [in]  targetDesc          Tensor descriptor for target tensor, must have shape (N). Each
+value is between 0 and C - 1
+ * @param [in]  weightDesc          Tensor descriptor for weight tensor. It is a manual rescaling
+weight given to each class. It has to be a Tensor of size C
+ * @param [in]  outputDesc          Tensor descriptor for output tensor. If reduction is 'none',
+then it must have shape (N). Otherwise, it is a scalar
+ * @param [in]  p                   Has a default value of 1. The only supported values are 1 and 2
+ * @param [in]  margin              Has a default value of 1
+ * @param [in]  reduction           Reduction mode (sum, mean)
+ * @param [out] sizeInBytes         Pointer to data to return the minimum workspace size
+ * @return                          miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetMultiMarginLossForwardWorkspaceSize(miopenHandle_t handle,
+                                             miopenTensorDescriptor_t inputDesc,
+                                             miopenTensorDescriptor_t targetDesc,
+                                             miopenTensorDescriptor_t weightDesc,
+                                             miopenTensorDescriptor_t outputDesc,
+                                             long p,
+                                             float margin,
+                                             miopenLossReductionMode_t reduction,
+                                             size_t* sizeInBytes);
+
+/*! @brief Execute a MultiMarginLoss forward layer
+ *
+ * @param [in]  handle                  MIOpen handle
+ * @param [in]  inputDesc               Tensor descriptor for input tensor (N, C) where N is the
+batch size and C is the number of classes.
+ * @param [in]  input                   Data tensor input
+ * @param [in]  targetDesc              Tensor descriptor for target tensor, must have shape (N).
+Each value is between 0 and C - 1
+ * @param [in]  target                  Data tensor target
+ * @param [in]  weightDesc              Tensor descriptor for weight tensor. It is a manual
+rescaling weight given to each class. It has to be a Tensor of size C
+ * @param [in]  weight                  Data tensor weight
+ * @param [in]  outputDesc              Tensor descriptor for output tensor. If reduction is
+'none', then it must have shape (N). Otherwise, it is a scalar.
+ * @param [out] output                  Data tensor output
+ * @param [in]  p                       Has a default value of 1. The only supported values are 1
+and 2
+ * @param [in]  margin                  Has a default value of 1
+ * @param [in]  reduction               Reduction mode. If reduction mode is mean or sum, you must
+ * provide param workspace and workspaceSizeInBytes. Call
+ * miopenGetMultiMarginLossForwardWorkspaceSize to get workspaceSizeInBytes
+ * @param [in]  workspace               Address of the allocated workspace data. Set = nullptr if
+reduction = 'none'
+ * @param [in]  workspaceSizeInBytes    Size in bytes of the allocated workspace data. Set = 0 if
+reduction = 'none'
+ * @return                              miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
+                                                          miopenTensorDescriptor_t inputDesc,
+                                                          const void* input,
+                                                          miopenTensorDescriptor_t targetDesc,
+                                                          const void* target,
+                                                          miopenTensorDescriptor_t weightDesc,
+                                                          const void* weight,
+                                                          miopenTensorDescriptor_t outputDesc,
+                                                          void* output,
+                                                          long p,
+                                                          float margin,
+                                                          miopenLossReductionMode_t reduction,
+                                                          void* workspace,
+                                                          size_t workspaceSizeInBytes);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
+#ifdef MIOPEN_BETA_API
+// SigmoidFocalLoss APIs
+/** @addtogroup LossFunction
+ *
+ *  @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the SigmoidFocalLoss
+ * Forward call
+ *
+ * @param handle                   MIOpen Handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param targetDesc               Tensor descriptor for target tensor (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param reduction                Reduction mode (none, sum, mean) (input)
+ * @param sizeInBytes              Pointer to data to return the minimum workspace size (output)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetSigmoidFocalLossForwardWorkspaceSize(miopenHandle_t handle,
+                                              miopenTensorDescriptor_t inputDesc,
+                                              miopenTensorDescriptor_t targetDesc,
+                                              miopenTensorDescriptor_t outputDesc,
+                                              miopenLossReductionMode_t reduction,
+                                              size_t* sizeInBytes);
+
+/*! @brief Execute a SigmoidFocalLoss forward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param workspace                Address of the allocated workspace data (input)
+ * @param workspaceSizeInBytes     Size in bytes of the allocated workspace data (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Data tensor input (input)
+ * @param targetDesc               Tensor descriptor for target tensor (input)
+ * @param target                   Data tensor target (input)
+ * @param outputDesc               Tensor descriptor for output tensor (input)
+ * @param output                   Data tensor output (output)
+ * @param alpha                    Alpha (input)
+ * @param gamma                    Gamma (input)
+ * @param reduction                Reduction mode (none, sum, mean) (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossForward(miopenHandle_t handle,
+                                                           void* workspace,
+                                                           size_t workspaceSizeInBytes,
+                                                           miopenTensorDescriptor_t inputDesc,
+                                                           const void* input,
+                                                           miopenTensorDescriptor_t targetDesc,
+                                                           const void* target,
+                                                           miopenTensorDescriptor_t outputDesc,
+                                                           void* output,
+                                                           float alpha,
+                                                           float gamma,
+                                                           miopenLossReductionMode_t reduction);
+
+/*! @brief Execute a SigmoidFocalLoss backward layer
+ *
+ * @param handle                   MIOpen handle (input)
+ * @param inputDesc                Tensor descriptor for input tensor (input)
+ * @param input                    Data tensor input (input)
+ * @param targetDesc               Tensor descriptor for target tensor (input)
+ * @param target                   Data tensor target (input)
+ * @param doutputDesc              Tensor descriptor for output gradient (input)
+ * @param doutput                  Gradient of output (input)
+ * @param dinputDesc               Tensor descriptor for input gradient (input)
+ * @param dinput                   Gradient of input (output)
+ * @param dtargetDesc              Tensor descriptor for target gradient (input)
+ * @param dtarget                  Gradient of target (output)
+ * @param alpha                    Alpha (input)
+ * @param gamma                    Gamma (input)
+ * @param reduction                Reduction mode (none, sum, mean) (input)
+ * @return                         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenSigmoidFocalLossBackward(miopenHandle_t handle,
+                                                            miopenTensorDescriptor_t inputDesc,
+                                                            const void* input,
+                                                            miopenTensorDescriptor_t targetDesc,
+                                                            const void* target,
+                                                            miopenTensorDescriptor_t doutputDesc,
+                                                            const void* doutput,
+                                                            miopenTensorDescriptor_t dinputDesc,
+                                                            void* dinput,
+                                                            miopenTensorDescriptor_t dtargetDesc,
+                                                            void* dtarget,
+                                                            float alpha,
+                                                            float gamma,
+                                                            miopenLossReductionMode_t reduction);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif // MIOPEN_BETA_API
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/include/miopen/sigmoidfocalloss/solvers.hpp b/src/include/miopen/sigmoidfocalloss/solvers.hpp
index 9cb3bd15e8..67d566c935 100644
--- a/src/include/miopen/sigmoidfocalloss/solvers.hpp
+++ b/src/include/miopen/sigmoidfocalloss/solvers.hpp
@@ -27,6 +27,7 @@
 
 #include <miopen/sigmoidfocalloss/problem_description.hpp>
 #include <miopen/solver.hpp>
+#include <miopen/buffer_info.hpp>
 
 namespace miopen {
 
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index a5d62b6092..3524c33451 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -32,7 +32,6 @@
 #include <miopen/conv_algo_name.hpp>
 
 #include <cstdint>
-#include <unordered_map>
 
 namespace miopen {
 
@@ -62,6 +61,7 @@ enum class Primitive
     Item,
     RoPE,
     ReLU,
+    Kthvalue,
     Loss
 };
 
diff --git a/src/solver.cpp b/src/solver.cpp
index f0b2854de7..167a085872 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -24,22 +24,25 @@
  *
  *******************************************************************************/
 
-#include <miopen/solver.hpp>
-
 #include <miopen/activ/solvers.hpp>
 #include <miopen/adam/solvers.hpp>
 #include <miopen/batchnorm/solvers.hpp>
 #include <miopen/cat/solvers.hpp>
+#include <miopen/conv/solvers.hpp>
 #include <miopen/fusion/solvers.hpp>
-#include <miopen/groupnorm/solvers.hpp>
 #include <miopen/getitem/solvers.hpp>
+#include <miopen/glu/solvers.hpp>
+#include <miopen/groupnorm/solvers.hpp>
+#include <miopen/kthvalue/solvers.hpp>
 #include <miopen/layernorm/solvers.hpp>
+#include <miopen/mha/solvers.hpp>
+#include <miopen/multimarginloss/solvers.hpp>
 #include <miopen/pooling/solvers.hpp>
 #include <miopen/prelu/solvers.hpp>
 #include <miopen/reduce/solvers.hpp>
 #include <miopen/rope/solvers.hpp>
-#include <miopen/mha/solvers.hpp>
 #include <miopen/sigmoidfocalloss/solvers.hpp>
+#include <miopen/softmarginloss/solvers.hpp>
 #include <miopen/softmax/solvers.hpp>
 
 #include <miopen/conv_algo_name.hpp>
@@ -57,6 +60,14 @@
 MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS)
 
 namespace miopen {
+
+namespace debug {
+
+// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
+bool enable_deprecated_solvers = false;
+
+} // namespace debug
+
 namespace solver {
 
 std::ostream& operator<<(std::ostream& os, const KernelInfo& k)
@@ -678,9 +689,24 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
 
     Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId());
     Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId());
+
     Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId());
     Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId());
-    
+
+    Register(registry, ++id, Primitive::Kthvalue, kthvalue::KthvalueFwd{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Activation, glu::GLUForward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Activation, glu::GLUBackward{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossForward{}.SolverDbId());
+    Register(
+        registry, ++id, Primitive::Loss, softmarginloss::SoftMarginLossBackward{}.SolverDbId());
+
+    Register(
+        registry, ++id, Primitive::Loss, multimarginloss::MultiMarginLossForward{}.SolverDbId());
+
+    Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId());
+
     Register(registry,
              ++id,
              Primitive::Loss,
@@ -692,12 +718,15 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossFwd{}.SolverDbId());
     Register(registry, ++id, Primitive::Loss, sigmoidfocalloss::SigmoidFocalLossBwd{}.SolverDbId());
 
-    // IMPORTANT: New solvers should be added to the end of the function!
+    // IMPORTANT: New solvers should be added to the end of the function, and don't leave a blank
+    // line between this comment and the newly registered solver(s)!
 }
 
 bool ThisSolverIsDeprecatedStatic::IsDisabled(const ExecutionContext& ctx)
 {
     static const bool device_is_allowed = [&]() {
+        if(miopen::debug::enable_deprecated_solvers)
+            return true;
         if(env::enabled(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS))
             return true;
         const auto device = ctx.GetStream().GetTargetProperties().Name();

From 009c7853fb1ad2156d529e9dc4d6ca9bd4507016 Mon Sep 17 00:00:00 2001
From: long10024070 <long.luong@moreh.com.vn>
Date: Tue, 5 Nov 2024 10:45:26 +0000
Subject: [PATCH 28/28] fix clang tidy

---
 include/miopen/miopen.h | 4 ++--
 src/solver.cpp          | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 27b5ebe327..5524874405 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -8094,8 +8094,8 @@ MIOPEN_EXPORT miopenStatus_t miopenSoftMarginLossBackward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
-#ifdef MIOPEN_BETA_API
 // MultiMarginLoss APIs
+#ifdef MIOPEN_BETA_API
 /** @addtogroup LossFunction
  *
  *  @{
@@ -8176,8 +8176,8 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
 // CLOSEOUT LossFunction DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
-#ifdef MIOPEN_BETA_API
 // SigmoidFocalLoss APIs
+#ifdef MIOPEN_BETA_API
 /** @addtogroup LossFunction
  *
  *  @{
diff --git a/src/solver.cpp b/src/solver.cpp
index ddb206e4ab..167a085872 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -44,7 +44,6 @@
 #include <miopen/sigmoidfocalloss/solvers.hpp>
 #include <miopen/softmarginloss/solvers.hpp>
 #include <miopen/softmax/solvers.hpp>
-#include <miopen/multimarginloss/solvers.hpp>
 
 #include <miopen/conv_algo_name.hpp>
 #include <miopen/db.hpp>