diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt
index afd10b6232..1deb5188be 100644
--- a/driver/CMakeLists.txt
+++ b/driver/CMakeLists.txt
@@ -47,6 +47,7 @@ add_executable(MIOpenDriver
     dm_groupnorm.cpp
     dm_layernorm.cpp
     dm_lrn.cpp
+    dm_multimarginloss.cpp
     dm_pool.cpp
     dm_reduce.cpp
     dm_reduceextreme.cpp
diff --git a/driver/dm_multimarginloss.cpp b/driver/dm_multimarginloss.cpp
new file mode 100644
index 0000000000..1c924bb7dc
--- /dev/null
+++ b/driver/dm_multimarginloss.cpp
@@ -0,0 +1,40 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "multimarginloss_driver.hpp"
+#include "registry_driver_maker.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "multimarginloss")
+        return new MultiMarginLossDriver<float, float>();
+    if(base_arg == "multimarginlossfp16")
+        return new MultiMarginLossDriver<float16, float>();
+    if(base_arg == "multimarginlossbfp16")
+        return new MultiMarginLossDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index a36121f676..6bb0f0f0d5 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
            "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], "
            "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], "
            "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, "
-           "getitem[bfp16|fp16]\n");
+           "getitem[bfp16|fp16], multimarginloss[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -205,7 +205,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
        arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" &&
        arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" &&
-       arg != "getitemfp16" && arg != "getitembfp16" && arg != "--version")
+       arg != "getitemfp16" && arg != "getitembfp16" && arg != "multimarginloss" &&
+       arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/multimarginloss_driver.hpp b/driver/multimarginloss_driver.hpp
new file mode 100644
index 0000000000..623fbca6c9
--- /dev/null
+++ b/driver/multimarginloss_driver.hpp
@@ -0,0 +1,492 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_MULTIMARGINLOSS_DRIVER_HPP
+#define GUARD_MIOPEN_MULTIMARGINLOSS_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "random.hpp"
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <miopen/miopen.h>
+#include <miopen/tensor.hpp>
+#include <vector>
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloMultiMarginLossUnreducedForwardRunHost(miopenTensorDescriptor_t iDesc,
+                                                  miopenTensorDescriptor_t tDesc,
+                                                  miopenTensorDescriptor_t wDesc,
+                                                  miopenTensorDescriptor_t oDesc,
+                                                  long p,
+                                                  float margin,
+                                                  Tgpu* input,
+                                                  const uint64_t* target,
+                                                  Tgpu* weight,
+                                                  Tcheck* ref_output)
+{
+    auto I_tv = miopen::get_inner_expanded_tv<2>(miopen::deref(iDesc));
+    auto T_tv = miopen::get_inner_expanded_tv<1>(miopen::deref(tDesc));
+    auto W_tv = miopen::get_inner_expanded_tv<1>(miopen::deref(wDesc));
+    auto O_tv = miopen::get_inner_expanded_tv<1>(miopen::deref(oDesc));
+    auto N = I_tv.size[0], C = I_tv.size[1];
+
+    int32_t ret = 0;
+
+    for(size_t n = 0; n < N; n++)
+    {
+        double loss = 0;
+        uint64_t y  = target[T_tv.get_tensor_view_idx({n})];
+        if(y >= C)
+            continue;
+        for(size_t c = 0; c < C; c++)
+        {
+            if(y == c)
+                continue;
+            double t = margin - static_cast<double>(input[I_tv.get_tensor_view_idx({n, y})]) +
+                       static_cast<double>(input[I_tv.get_tensor_view_idx({n, c})]);
+
+            if(t < 0)
+                continue;
+            if(p == 2)
+                t = t * t;
+            t = weight[W_tv.get_tensor_view_idx({y})] * t;
+            loss += t / C;
+        }
+        ref_output[O_tv.get_tensor_view_idx({n})] = static_cast<Tcheck>(loss);
+    }
+    return ret;
+}
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloMultiMarginLossReducedForwardRunHost(miopenTensorDescriptor_t iDesc,
+                                                miopenTensorDescriptor_t tDesc,
+                                                miopenTensorDescriptor_t wDesc,
+                                                long p,
+                                                float margin,
+                                                const float divisor,
+                                                Tgpu* input,
+                                                const uint64_t* target,
+                                                Tgpu* weight,
+                                                Tcheck* ref_output)
+{
+    auto I_tv = miopen::get_inner_expanded_tv<2>(miopen::deref(iDesc));
+    auto T_tv = miopen::get_inner_expanded_tv<1>(miopen::deref(tDesc));
+    auto W_tv = miopen::get_inner_expanded_tv<1>(miopen::deref(wDesc));
+    auto N = I_tv.size[0], C = I_tv.size[1];
+
+    int32_t ret = 0;
+
+    double sum_loss = 0;
+    for(size_t n = 0; n < N; n++)
+    {
+        double loss = 0;
+        uint64_t y  = target[T_tv.get_tensor_view_idx({n})];
+        if(y >= C)
+            continue;
+        for(size_t c = 0; c < C; c++)
+        {
+            if(y == c)
+                continue;
+            double t = margin - static_cast<double>(input[I_tv.get_tensor_view_idx({n, y})]) +
+                       static_cast<double>(input[I_tv.get_tensor_view_idx({n, c})]);
+            if(t < 0)
+                continue;
+            if(p == 2)
+                t = t * t;
+            t = weight[W_tv.get_tensor_view_idx({y})] * t;
+            loss += t / C;
+        }
+        sum_loss += loss;
+    }
+    ref_output[0] = static_cast<Tcheck>(sum_loss / divisor);
+
+    return ret;
+};
+
+template <typename Tgpu, typename Tref>
+class MultiMarginLossDriver : public Driver
+{
+public:
+    MultiMarginLossDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&iDesc);
+        miopenCreateTensorDescriptor(&tDesc);
+        miopenCreateTensorDescriptor(&wDesc);
+        miopenCreateTensorDescriptor(&oDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~MultiMarginLossDriver() override
+    {
+        miopenDestroyTensorDescriptor(iDesc);
+        miopenDestroyTensorDescriptor(tDesc);
+        miopenDestroyTensorDescriptor(wDesc);
+        miopenDestroyTensorDescriptor(oDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    // forw = 0 -> run both fw, bw, = 1 -> run only fw, = 2 -> run only bw
+    int forw;
+
+    miopenTensorDescriptor_t iDesc;
+    miopenTensorDescriptor_t tDesc;
+    miopenTensorDescriptor_t wDesc;
+    miopenTensorDescriptor_t oDesc;
+
+    std::unique_ptr<GPUMem> i_dev;
+    std::unique_ptr<GPUMem> t_dev;
+    std::unique_ptr<GPUMem> w_dev;
+    std::unique_ptr<GPUMem> o_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+
+    std::vector<Tgpu> I;
+    std::vector<uint64_t> T;
+    std::vector<Tgpu> W;
+    std::vector<Tgpu> O;
+    std::vector<Tref> Ohost;
+    std::vector<Tgpu> workspace;
+
+    long p;
+    float margin;
+    miopenLossReductionMode_t reduction_mode;
+    size_t ws_sizeInBytes;
+};
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Take (Default=1)", "int");
+    inflags.AddInputFlag("dim", 'D', "41x4", "Dim of input tensor (Default=41x4)", "tensor");
+    inflags.AddInputFlag("contiguous", 'C', "1", "Tensor is contiguous or not (Default=1)", "int");
+    inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int");
+    inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+    inflags.AddInputFlag("reduce",
+                         'R',
+                         "none",
+                         "Specifies the reduction to apply to the output ('none'|'mean'|'sum') "
+                         "(Default=none to indicate no reduction)",
+                         "str");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    auto reduction = inflags.GetValueStr("reduce");
+    if(reduction != "none" && reduction != "mean" && reduction != "sum")
+        return miopenStatusInvalidValue;
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    forw = inflags.GetValueInt("forw");
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::GetandSetData()
+{
+    // Set tensor description
+    std::vector<int> in_len = inflags.GetValueTensor("dim").lengths;
+    size_t N = in_len[0], C = in_len[1];
+    if(inflags.GetValueInt("contiguous") == 1)
+    {
+        SetTensorNd(iDesc, in_len, data_type);
+
+        std::vector<int> t_len = {N};
+        SetTensorNd(tDesc, t_len, miopenInt64);
+
+        std::vector<int> w_len = {C};
+        SetTensorNd(wDesc, w_len, data_type);
+    }
+    else
+    {
+        std::vector<int> in_strides(in_len.size());
+        in_strides.back() = 1;
+        for(int i = in_len.size() - 2; i >= 0; --i)
+            in_strides[i] = in_strides[i + 1] * in_len[i + 1];
+        in_strides[0] *= 2;
+        SetTensorNd(iDesc, in_len, in_strides, data_type);
+
+        std::vector<int> t_len     = {N};
+        std::vector<int> t_strides = {2};
+        SetTensorNd(tDesc, t_len, t_strides, miopenInt64);
+
+        std::vector<int> w_lens    = {C};
+        std::vector<int> w_strides = {2};
+        SetTensorNd(wDesc, w_lens, w_strides, data_type);
+    }
+
+    // Set p and margin
+    // p = 1 or 2
+    p      = prng::gen_A_to_B<long>(static_cast<long>(1), static_cast<long>(3));
+    margin = prng::gen_A_to_B<float>(static_cast<float>(0.5), static_cast<float>(1.5));
+
+    // Set reduction_mode
+    auto reduction = inflags.GetValueStr("reduce");
+    if(reduction == "none")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_NONE;
+    else if(reduction == "mean")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_MEAN;
+    else if(reduction == "sum")
+        reduction_mode = MIOPEN_LOSS_REDUCTION_SUM;
+
+    // Set output tensor description (forw = 1 or = 0)
+    if(forw == 0 || forw == 1)
+    {
+        if(reduction == "none")
+        {
+            std::vector<int> o_lens = {N};
+            SetTensorNd(oDesc, o_lens, data_type);
+        }
+        else
+        {
+            std::vector<int> o_lens = {1};
+            SetTensorNd(oDesc, o_lens, data_type);
+        }
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    uint32_t ctx = 0;
+
+    // for unpacked tensor, we need to use GetTensorSpace instead of GetTensorSize
+    size_t i_sz = GetTensorSpace(iDesc);
+    size_t t_sz = GetTensorSpace(tDesc);
+    size_t w_sz = GetTensorSpace(wDesc);
+    i_dev       = std::make_unique<GPUMem>(ctx, i_sz, sizeof(Tgpu));
+    t_dev       = std::make_unique<GPUMem>(ctx, t_sz, sizeof(uint64_t));
+    w_dev       = std::make_unique<GPUMem>(ctx, w_sz, sizeof(Tgpu));
+    I           = std::vector<Tgpu>(i_sz);
+    T           = std::vector<uint64_t>(t_sz);
+    W           = std::vector<Tgpu>(w_sz);
+    for(int i = 0; i < i_sz; i++)
+    {
+        I[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-1), static_cast<Tgpu>(1));
+    }
+    int C = miopen::deref(iDesc).GetLengths()[1];
+    // 0 to C - 1
+    for(int i = 0; i < t_sz; i++)
+    {
+        T[i] = prng::gen_A_to_B<uint64_t>(static_cast<uint64_t>(0), static_cast<uint64_t>(C));
+    }
+    for(int i = 0; i < w_sz; i++)
+    {
+        W[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(-1), static_cast<Tgpu>(1));
+    }
+
+    if(i_dev->ToGPU(GetStream(), I.data()) != 0)
+        std::cerr << "Error copying (I) to GPU, size: " << i_dev->GetSize() << std::endl;
+
+    if(t_dev->ToGPU(GetStream(), T.data()) != 0)
+        std::cerr << "Error copying (T) to GPU, size: " << t_dev->GetSize() << std::endl;
+
+    if(w_dev->ToGPU(GetStream(), W.data()) != 0)
+        std::cerr << "Error copying (W) to GPU, size: " << w_dev->GetSize() << std::endl;
+
+    if(forw == 0 || forw == 1)
+    {
+        size_t o_sz = GetTensorSpace(oDesc);
+
+        miopenGetMultiMarginLossForwardWorkspaceSize(
+            GetHandle(), iDesc, tDesc, wDesc, oDesc, p, margin, reduction_mode, &ws_sizeInBytes);
+        if(ws_sizeInBytes == static_cast<size_t>(-1))
+        {
+            return miopenStatusAllocFailed;
+        }
+
+        o_dev = std::make_unique<GPUMem>(ctx, o_sz, sizeof(Tgpu));
+        O     = std::vector<Tgpu>(o_sz, static_cast<Tgpu>(0));
+        Ohost = std::vector<Tref>(o_sz, static_cast<Tref>(0));
+        if(o_dev->ToGPU(GetStream(), O.data()) != 0)
+            std::cerr << "Error copying (out) to GPU, size: " << o_dev->GetSize() << std::endl;
+
+        size_t ws_sz  = ws_sizeInBytes / sizeof(Tgpu);
+        workspace_dev = std::make_unique<GPUMem>(ctx, ws_sz, sizeof(Tgpu));
+        workspace     = std::vector<Tgpu>(ws_sz);
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::RunForwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenMultiMarginLossForward(GetHandle(),
+                                     iDesc,
+                                     i_dev->GetMem(),
+                                     tDesc,
+                                     t_dev->GetMem(),
+                                     wDesc,
+                                     w_dev->GetMem(),
+                                     oDesc,
+                                     o_dev->GetMem(),
+                                     p,
+                                     margin,
+                                     reduction_mode,
+                                     workspace_dev->GetMem(),
+                                     ws_sizeInBytes);
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Forward MultiMarginLoss Elapsed: "
+                      << t.gettime_ms() / iter << " ms" << std::endl;
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Forward MultiMarginLoss Elapsed: " << kernel_average_time
+                  << " ms" << std::endl;
+    }
+
+    if(o_dev->FromGPU(GetStream(), O.data()) != 0)
+        std::cerr << "Error copying (o_dev) from GPU, size: " << o_dev->GetSize() << std::endl;
+    if(workspace_dev->FromGPU(GetStream(), workspace.data()) != 0)
+        std::cerr << "Error copying (workspace_dev) from GPU, size: " << workspace_dev->GetSize()
+                  << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::RunForwardCPU()
+{
+    if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        mloMultiMarginLossUnreducedForwardRunHost(
+            iDesc, tDesc, wDesc, oDesc, p, margin, I.data(), T.data(), W.data(), Ohost.data());
+    }
+    else
+    {
+        float divisor = (reduction_mode == MIOPEN_LOSS_REDUCTION_MEAN)
+                            ? miopen::deref(iDesc).GetLengths()[0]
+                            : 1;
+        mloMultiMarginLossReducedForwardRunHost(
+            iDesc, tDesc, wDesc, p, margin, divisor, I.data(), T.data(), W.data(), Ohost.data());
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::RunBackwardGPU()
+{
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+Tref MultiMarginLossDriver<Tgpu, Tref>::GetTolerance()
+{
+    // Computation error of fp16 is ~2^13 (=8192) bigger than
+    // the one of fp32 because mantissa is shorter by 13 bits.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+
+    const Tref tolerance = GetTolerance();
+    auto error           = miopen::rms_range(Ohost, O);
+    if(!std::isfinite(error) || error > tolerance)
+    {
+        std::cout << "Forward MultiMarginLoss FAILED: " << error << " > " << tolerance << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward MultiMarginLoss Verifies OK on CPU reference (" << error << " < "
+                  << tolerance << ')' << std::endl;
+    }
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int MultiMarginLossDriver<Tgpu, Tref>::VerifyBackward()
+{
+    return miopenStatusSuccess;
+}
+
+#endif // GUARD_MIOPEN_MULTIMARGINLOSS_DRIVER_HPP
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 6b205fc99e..1e16e18ac4 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -7607,6 +7607,100 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle,
 // CLOSEOUT GETITEM DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+#ifdef MIOPEN_BETA_API
+/*! @ingroup LossFunction
+ * @enum miopenLossReductionMode_t
+ * Reduction mode for loss function
+ */
+typedef enum
+{
+    MIOPEN_LOSS_REDUCTION_NONE = 0, /*!< output tensor elements are not reduced */
+    MIOPEN_LOSS_REDUCTION_SUM  = 1, /*!< output tensor elements are summed up */
+    MIOPEN_LOSS_REDUCTION_MEAN = 2, /*!< output tensor elements are summed up and divided with total
+                                       number of elements to get mean value */
+} miopenLossReductionMode_t;
+
+// MultiMarginLoss APIs
+/** @addtogroup LossFunction
+ *
+ *  @{
+ */
+
+/*! @brief Helper function to query the minimum workspace size required by the
+MultiMarginLossForward call
+ *
+ * @param [in]  handle              MIOpen Handle
+ * @param [in]  inputDesc           Tensor descriptor for input tensor (N, C) where N is the batch
+size and C is the number of classes
+ * @param [in]  targetDesc          Tensor descriptor for target tensor, must have shape (N). Each
+value is between 0 and C - 1
+ * @param [in]  weightDesc          Tensor descriptor for weight tensor. It is a manual rescaling
+weight given to each class. It has to be a Tensor of size C
+ * @param [in]  outputDesc          Tensor descriptor for output tensor. If reduction is 'none,
+then it must have shape (N). Otherwise, it is a scalar
+ * @param [in]  p                   Has a default value of 1. The only supported values are 1 and 2
+ * @param [in]  margin              Has a default value of 1
+ * @param [in]  reduction           Reduction mode (sum, mean)
+ * @param [out] sizeInBytes         Pointer to data to return the minimum workspace size
+ * @return                          miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenGetMultiMarginLossForwardWorkspaceSize(miopenHandle_t handle,
+                                             miopenTensorDescriptor_t inputDesc,
+                                             miopenTensorDescriptor_t targetDesc,
+                                             miopenTensorDescriptor_t weightDesc,
+                                             miopenTensorDescriptor_t outputDesc,
+                                             long p,
+                                             float margin,
+                                             miopenLossReductionMode_t reduction,
+                                             size_t* sizeInBytes);
+
+/*! @brief Execute a MultiMarginLoss forward layer
+ *
+ * @param [in]  handle                  MIOpen handle
+ * @param [in]  inputDesc               Tensor descriptor for input tensor (N, C) where N is the
+batch size and C is the number of classes.
+ * @param [in]  input                   Data tensor input
+ * @param [in]  targetDesc              Tensor descriptor for target tensor, must have shape (N).
+Each value is between 0 and C - 1
+ * @param [in]  target                  Data tensor target
+ * @param [in]  weightDesc              Tensor descriptor for weight tensor. It is a manual
+rescaling weight given to each class. It has to be a Tensor of size C
+ * @param [in]  weight                  Data tensor weight
+ * @param [in]  outputDesc              Tensor descriptor for output tensor. If reduction is 'none,
+then it must have shape (N). Otherwise, it is a scalar.
+ * @param [out] output                  Data tensor output
+ * @param [in]  p                       Has a default value of 1. The only supported values are 1
+and 2
+ * @param [in]  margin                  Has a default value of 1
+ * @param [in]  reduction               Reduction mode. If reduction mode is mean or sum, you must
+ * provide param workspace and workspaceSizeInBytes. Call
+ * miopenGetMultiMarginLossForwardWorkspaceSize to get workspaceSizeInBytes
+ * @param [in]  workspace               Address of the allocated workspace data. Set = nullptr if
+reduction = 'none'
+ * @param [in]  workspaceSizeInBytes    Size in bytes of the allocated workspace data. Set = 0 if
+reduction = 'none
+ * @return                              miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
+                                                          miopenTensorDescriptor_t inputDesc,
+                                                          const void* input,
+                                                          miopenTensorDescriptor_t targetDesc,
+                                                          const void* target,
+                                                          miopenTensorDescriptor_t weightDesc,
+                                                          const void* weight,
+                                                          miopenTensorDescriptor_t outputDesc,
+                                                          void* output,
+                                                          long p,
+                                                          float margin,
+                                                          miopenLossReductionMode_t reduction,
+                                                          void* workspace,
+                                                          size_t workspaceSizeInBytes);
+
+/** @} */
+// CLOSEOUT LossFunction DOXYGEN GROUP
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ea6bbc77f0..f5d72c85ad 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -157,6 +157,8 @@ set( MIOpen_Source
     lrn_api.cpp
     mha/mha_descriptor.cpp
     mha/problem_description.cpp
+    multimarginloss/problem_description.cpp
+    multimarginloss_api.cpp
     op_args.cpp
     operator.cpp
     performance_config.cpp
@@ -289,6 +291,8 @@ set( MIOpen_Source
     solver/layernorm/forward_t5layernorm.cpp
     solver/mha/mha_solver_backward.cpp
     solver/mha/mha_solver_forward.cpp
+    solver/multimarginloss/forward_reduced_multimarginloss.cpp
+    solver/multimarginloss/forward_unreduced_multimarginloss.cpp
     solver/pooling/forward2d.cpp
     solver/pooling/forwardNaive.cpp
     solver/pooling/forwardNd.cpp
@@ -455,6 +459,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/stride_array.hpp
         kernels/tensor_view.hpp
         kernels/utilities.inc
+        kernels/warp_shuffle.hpp
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc
         kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc
@@ -494,8 +499,10 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenGroupNorm.cpp
         kernels/MIOpenGetitem.cpp
         kernels/MIOpenLayerNorm.cpp
+        kernels/MIOpenLossReduce.cpp
         kernels/MIOpenLRNBwd.cl
         kernels/MIOpenLRNFwd.cl
+        kernels/MIOpenMultiMarginLoss.cpp
         kernels/MIOpenNeuron.cl
         kernels/MIOpenPooling.cl
         kernels/MIOpenPoolingBwd.cl
@@ -621,13 +628,14 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         adam.cpp
         addlayernorm.cpp
         cat.cpp
+        exec_utils.cpp
         groupnorm.cpp
         getitem.cpp
         kernel_cache.cpp
         layernorm.cpp
         lrn.cpp
         mlo_dir_conv.cpp
-        exec_utils.cpp
+        multimarginloss.cpp
         ocl/activ_ocl.cpp
         ocl/batchnormocl.cpp
         ocl/convolutionocl.cpp
diff --git a/src/include/miopen/multimarginloss.hpp b/src/include/miopen/multimarginloss.hpp
new file mode 100644
index 0000000000..4d8d0f6566
--- /dev/null
+++ b/src/include/miopen/multimarginloss.hpp
@@ -0,0 +1,76 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_MULTIMARGINLOSS_HPP_
+#define MIOPEN_MULTIMARGINLOSS_HPP_
+
+#include "miopen/miopen.h"
+#include <miopen/common.hpp>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+MIOPEN_INTERNALS_EXPORT std::size_t
+GetMultiMarginLossForwardWorkspaceSize(Handle& handle,
+                                       const TensorDescriptor& iDesc,
+                                       const TensorDescriptor& tDesc,
+                                       const TensorDescriptor& wDesc,
+                                       const TensorDescriptor& oDesc,
+                                       long p,
+                                       float margin,
+                                       miopenLossReductionMode_t reduction);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t
+MultiMarginLossUnreducedForward(Handle& handle,
+                                const TensorDescriptor& iDesc,
+                                ConstData_t i,
+                                const TensorDescriptor& tDesc,
+                                ConstData_t t,
+                                const TensorDescriptor& wDesc,
+                                ConstData_t w,
+                                const TensorDescriptor& oDesc,
+                                Data_t o,
+                                long p,
+                                float margin);
+
+MIOPEN_INTERNALS_EXPORT miopenStatus_t MultiMarginLossForward(Handle& handle,
+                                                              Data_t workspace,
+                                                              size_t workspaceSizeInBytes,
+                                                              const TensorDescriptor& iDesc,
+                                                              ConstData_t i,
+                                                              const TensorDescriptor& tDesc,
+                                                              ConstData_t t,
+                                                              const TensorDescriptor& wDesc,
+                                                              ConstData_t w,
+                                                              const TensorDescriptor& oDesc,
+                                                              Data_t o,
+                                                              long p,
+                                                              float margin,
+                                                              miopenLossReductionMode_t reduction);
+
+} // namespace miopen
+#endif // _MIOPEN_MULTIMARGINLOSS_HPP_
diff --git a/src/include/miopen/multimarginloss/invoke_params.hpp b/src/include/miopen/multimarginloss/invoke_params.hpp
new file mode 100644
index 0000000000..24e323946f
--- /dev/null
+++ b/src/include/miopen/multimarginloss/invoke_params.hpp
@@ -0,0 +1,61 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+namespace multimarginloss {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams() = default;
+
+    const TensorDescriptor* iDesc = nullptr;
+    const TensorDescriptor* tDesc = nullptr;
+    const TensorDescriptor* wDesc = nullptr;
+    const TensorDescriptor* oDesc = nullptr;
+
+    ConstData_t i = nullptr;
+    ConstData_t t = nullptr;
+    ConstData_t w = nullptr;
+    Data_t o      = nullptr;
+
+    long p;
+    float margin;
+    float divisor = 0;
+
+    Data_t workspace           = nullptr;
+    std::size_t workspace_size = 0;
+    std::size_t GetWorkspaceSize() const { return workspace_size; }
+    Data_t GetWorkspace() const { return workspace; }
+};
+
+} // namespace multimarginloss
+
+} // namespace miopen
diff --git a/src/include/miopen/multimarginloss/problem_description.hpp b/src/include/miopen/multimarginloss/problem_description.hpp
new file mode 100644
index 0000000000..c3eeecfa08
--- /dev/null
+++ b/src/include/miopen/multimarginloss/problem_description.hpp
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include "miopen/miopen.h"
+#include <miopen/activ.hpp>
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace multimarginloss {
+
+struct ForwardProblemDescription : ProblemDescriptionBase
+{
+    ForwardProblemDescription(const TensorDescriptor& iDesc_,
+                              const TensorDescriptor& tDesc_,
+                              const TensorDescriptor& wDesc_,
+                              const TensorDescriptor& oDesc_,
+                              const long p_,
+                              const float margin_,
+                              const float divisor_)
+        : iDesc(iDesc_),
+          tDesc(tDesc_),
+          wDesc(wDesc_),
+          oDesc(oDesc_),
+          p(p_),
+          margin(margin_),
+          divisor(divisor_)
+    {
+        if(iDesc.GetType() != oDesc.GetType() || iDesc.GetType() != wDesc.GetType())
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "MultiMarginLoss: Input, output, weight tensor types do not match.");
+        }
+        if(tDesc.GetType() != miopenInt64)
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "MultiMarginLoss: Target tensor type should be miopenInt64.");
+        }
+        if(iDesc.GetNumDims() != 2)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "MultiMarginLoss: Input tensor need to be 2D tensor");
+        }
+        if(tDesc.GetNumDims() != 1 || tDesc.GetLengths()[0] != iDesc.GetLengths()[0])
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "MultiMarginLoss: Target tensor need to be 1D tensor. If input "
+                         "tensor has shape (N, C) then target tensor must have shape (N)");
+        }
+        if(wDesc.GetNumDims() != 1 || wDesc.GetLengths()[0] != iDesc.GetLengths()[1])
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "MultiMarginLoss: Weight tensor need to be 1D tensor. If input "
+                         "tensor has shape (N, C) then weight tensor must have shape (C)");
+        }
+        // Check output tensor dimension
+        if(divisor == 0)
+        {
+            // non-reduction case
+            if(oDesc.GetNumDims() != 1 || oDesc.GetLengths()[0] != iDesc.GetLengths()[0])
+            {
+                MIOPEN_THROW(miopenStatusBadParm,
+                             "MultiMarginLoss: Output tensor need to be "
+                             "1D tensor. If input "
+                             "tensor has shape (N, C) then output tensor must have shape (N)");
+            }
+        }
+        else
+        {
+            // reduction case
+            if(oDesc.GetNumDims() != 1 || oDesc.GetLengths()[0] != 1)
+            {
+                MIOPEN_THROW(miopenStatusBadParm,
+                             "MultiMarginLoss: Output tensor need to be a scalar.");
+            }
+        }
+        // Check p value
+        if(p != 1 && p != 2)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "MultiMarginLoss: p need to be equal 1 or 2.");
+        }
+    }
+
+    const TensorDescriptor& GetiDesc() const { return iDesc; }
+    const TensorDescriptor& GettDesc() const { return tDesc; }
+    const TensorDescriptor& GetwDesc() const { return wDesc; }
+    const TensorDescriptor& GetoDesc() const { return oDesc; }
+    long Getp() const { return p; }
+    float Getmargin() const { return margin; }
+    float Getdivisor() const { return divisor; }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    TensorDescriptor iDesc;
+    TensorDescriptor tDesc;
+    TensorDescriptor wDesc;
+    TensorDescriptor oDesc;
+    long p;
+    float margin;
+    float divisor;
+};
+
+} // namespace multimarginloss
+
+} // namespace miopen
diff --git a/src/include/miopen/multimarginloss/solvers.hpp b/src/include/miopen/multimarginloss/solvers.hpp
new file mode 100644
index 0000000000..078eaa0ce0
--- /dev/null
+++ b/src/include/miopen/multimarginloss/solvers.hpp
@@ -0,0 +1,82 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/multimarginloss/problem_description.hpp>
+#include <miopen/solver.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace multimarginloss {
+
+using ForwardMultiMarginLossSolver =
+    NonTunableSolverBase<ExecutionContext, miopen::multimarginloss::ForwardProblemDescription>;
+
+struct MultiMarginLossUnreducedForward final : ForwardMultiMarginLossSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<MultiMarginLossUnreducedForward>();
+    }
+    bool
+    IsImprovementOverROCm(const ExecutionContext& context,
+                          const miopen::multimarginloss::ForwardProblemDescription& problem) const;
+    bool
+    IsApplicable(const ExecutionContext& context,
+                 const miopen::multimarginloss::ForwardProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::multimarginloss::ForwardProblemDescription& problem) const override;
+};
+
+struct MultiMarginLossForward final : ForwardMultiMarginLossSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<MultiMarginLossForward>();
+    }
+    bool
+    IsImprovementOverROCm(const ExecutionContext& context,
+                          const miopen::multimarginloss::ForwardProblemDescription& problem) const;
+    bool
+    IsApplicable(const ExecutionContext& context,
+                 const miopen::multimarginloss::ForwardProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::multimarginloss::ForwardProblemDescription& problem) const override;
+    std::size_t GetWorkspaceSize(
+        const ExecutionContext& context,
+        const miopen::multimarginloss::ForwardProblemDescription& problem) const override;
+    bool MayNeedWorkspace() const override { return true; }
+};
+
+} // namespace multimarginloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 81c15f6bea..3a0327e973 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -32,7 +32,6 @@
 #include <miopen/conv_algo_name.hpp>
 
 #include <cstdint>
-#include <unordered_map>
 
 namespace miopen {
 
@@ -59,7 +58,8 @@ enum class Primitive
     Mha,
     Softmax,
     Adam,
-    Item
+    Item,
+    MultiMarginLoss
 };
 
 struct MIOPEN_INTERNALS_EXPORT Id
diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp
index 9f7430ba8a..226e33749d 100644
--- a/src/include/miopen/tensor_view_utils.hpp
+++ b/src/include/miopen/tensor_view_utils.hpp
@@ -27,8 +27,8 @@
 #ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_
 #define MIOPEN_TENSOR_VIEW_UTIL_HPP_
 
-#include <miopen/common.hpp>
 #include "../../kernels/tensor_view.hpp"
+#include <miopen/tensor.hpp>
 
 namespace miopen {
 
@@ -77,4 +77,4 @@ inline void slice_tv(tensor_view_t<N>& tensor_view, int32_t sliceCount, const in
 
 } // namespace miopen
 
-#endif // MIOPEN_TENSOR_REORDER_UTIL_HPP_
+#endif // MIOPEN_TENSOR_VIEW_UTIL_HPP_
diff --git a/src/kernels/MIOpenLossReduce.cpp b/src/kernels/MIOpenLossReduce.cpp
new file mode 100644
index 0000000000..a86839d3c7
--- /dev/null
+++ b/src/kernels/MIOpenLossReduce.cpp
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "warp_shuffle.hpp"
+
+template <typename DTYPE>
+__device__ void LossSum(const DTYPE* __restrict__ input, DTYPE* __restrict__ output, uint64_t N)
+{
+    auto gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    FLOAT_ACCUM val = gid < N ? CVT_FLOAT2ACCUM(input[gid]) : CVT_FP32_2ACCUM(0.0f);
+    val             = block_reduce_sum(val);
+
+    if(threadIdx.x == 0)
+        output[blockIdx.x] = CVT_ACCUM2FLOAT(val);
+}
+
+extern "C" __global__ void
+ReduceSumLoss(const FLOAT* __restrict__ input, FLOAT* __restrict__ output, uint64_t N)
+{
+    // instantiate the kernel
+    LossSum<FLOAT>(input, output, N);
+}
diff --git a/src/kernels/MIOpenMultiMarginLoss.cpp b/src/kernels/MIOpenMultiMarginLoss.cpp
new file mode 100644
index 0000000000..52a032cdbe
--- /dev/null
+++ b/src/kernels/MIOpenMultiMarginLoss.cpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+#include "tensor_view.hpp"
+
+template <typename DTYPE>
+__device__ void multimarginlossunreducedforward2d(const DTYPE* __restrict__ I,
+                                                  const uint64_t* __restrict__ T,
+                                                  const DTYPE* __restrict__ W,
+                                                  DTYPE* __restrict__ O,
+                                                  const long p,
+                                                  const float margin,
+                                                  tensor_view_t<2> I_tv,
+                                                  tensor_view_t<1> T_tv,
+                                                  tensor_view_t<1> W_tv,
+                                                  tensor_view_t<1> O_tv)
+{
+    const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    size_t N = I_tv.size[0], C = I_tv.size[1];
+    size_t n = gid;
+    if(n >= N)
+        return;
+
+    FLOAT_ACCUM loss = 0;
+    uint64_t y       = T[T_tv.get_tensor_view_idx({n})];
+    if(y >= C)
+    {
+        // TODO: need to handle invalid target index value
+        return;
+    }
+
+    for(size_t c = 0; c < C; c++)
+    {
+        if(y == c)
+            continue;
+        FLOAT_ACCUM t = margin - CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx({n, y})]) +
+                        CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx({n, c})]);
+        if(t < 0)
+            continue;
+        if(p == 2)
+            t = t * t;
+        t = CVT_FLOAT2ACCUM(W[W_tv.get_tensor_view_idx({y})]) * t;
+        loss += t / C;
+    }
+    O[O_tv.get_tensor_view_idx({n})] = CVT_ACCUM2FLOAT(loss);
+}
+
+extern "C" __global__ void MultiMarginLossUnreducedForward2d(const FLOAT* __restrict__ I,
+                                                             const uint64_t* __restrict__ T,
+                                                             const FLOAT* __restrict__ W,
+                                                             FLOAT* __restrict__ O,
+                                                             const long p,
+                                                             const float margin,
+                                                             tensor_view_t<2> I_tv,
+                                                             tensor_view_t<1> T_tv,
+                                                             tensor_view_t<1> W_tv,
+                                                             tensor_view_t<1> O_tv)
+{
+    // instantiate the kernel
+    multimarginlossunreducedforward2d<FLOAT>(I, T, W, O, p, margin, I_tv, T_tv, W_tv, O_tv);
+}
+
+template <typename DTYPE>
+__device__ void multimarginlossforward2d(const DTYPE* __restrict__ I,
+                                         const uint64_t* __restrict__ T,
+                                         const DTYPE* __restrict__ W,
+                                         DTYPE* __restrict__ lsum,
+                                         const long p,
+                                         const float margin,
+                                         const float divisor,
+                                         tensor_view_t<2> I_tv,
+                                         tensor_view_t<1> T_tv,
+                                         tensor_view_t<1> W_tv)
+{
+    const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    size_t N = I_tv.size[0], C = I_tv.size[1];
+    size_t n = gid;
+    if(n >= N)
+        return;
+
+    FLOAT_ACCUM loss = 0;
+    uint64_t y       = T[T_tv.get_tensor_view_idx({n})];
+    if(y >= C)
+    {
+        // TODO: need to handle invalid target index value
+        return;
+    }
+
+    for(size_t c = 0; c < C; c++)
+    {
+        if(y == c)
+            continue;
+        FLOAT_ACCUM t = margin - CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx({n, y})]) +
+                        CVT_FLOAT2ACCUM(I[I_tv.get_tensor_view_idx({n, c})]);
+        if(t < 0)
+            continue;
+        if(p == 2)
+            t = t * t;
+        t = CVT_FLOAT2ACCUM(W[W_tv.get_tensor_view_idx({y})]) * t;
+        loss += t / C;
+    }
+
+    lsum[n] = CVT_ACCUM2FLOAT(loss / divisor);
+}
+
+extern "C" __global__ void MultiMarginLossForward2d(const FLOAT* __restrict__ I,
+                                                    const uint64_t* __restrict__ T,
+                                                    const FLOAT* __restrict__ W,
+                                                    FLOAT* __restrict__ lsum,
+                                                    const long p,
+                                                    const float margin,
+                                                    const float divisor,
+                                                    tensor_view_t<2> I_tv,
+                                                    tensor_view_t<1> T_tv,
+                                                    tensor_view_t<1> W_tv)
+{
+    // instantiate the kernel
+    multimarginlossforward2d<FLOAT>(I, T, W, lsum, p, margin, divisor, I_tv, T_tv, W_tv);
+}
diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp
index d35bfd93fc..d7294d8992 100644
--- a/src/kernels/tensor_view.hpp
+++ b/src/kernels/tensor_view.hpp
@@ -33,17 +33,22 @@ struct tensor_layout_t;
 template <int N>
 struct tensor_view_t
 {
-    // Get index in tensor view at tensor layout
-    constexpr uint64_t get_tensor_view_idx(const tensor_layout_t<N>& tensor_layout)
+    constexpr uint64_t get_tensor_view_idx(const uint64_t (&layout)[N])
     {
         static_assert(N > 0);
         uint64_t idx = 0;
         for(auto i = 0; i < N; ++i)
         {
-            idx += stride[i] * tensor_layout.layout[i];
+            idx += stride[i] * layout[i];
         }
         return idx;
     }
+
+    constexpr uint64_t get_tensor_view_idx(const tensor_layout_t<N>& tensor_layout)
+    {
+        return get_tensor_view_idx(tensor_layout.layout);
+    }
+
     uint64_t stride[N];
     uint64_t size[N];
 };
diff --git a/src/kernels/warp_shuffle.hpp b/src/kernels/warp_shuffle.hpp
new file mode 100644
index 0000000000..db536bd2ae
--- /dev/null
+++ b/src/kernels/warp_shuffle.hpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef GUARD_WARP_SHUFFLE_HPP
+#define GUARD_WARP_SHUFFLE_HPP
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#endif
+
+#include "float_types.h"
+
+__device__ FLOAT_ACCUM warp_reduce_sum(FLOAT_ACCUM val)
+{
+    if(warpSize >= 64)
+        val += __shfl_down(val, 32);
+    if(warpSize >= 32)
+        val += __shfl_down(val, 16);
+    if(warpSize >= 16)
+        val += __shfl_down(val, 8);
+    if(warpSize >= 8)
+        val += __shfl_down(val, 4);
+    if(warpSize >= 4)
+        val += __shfl_down(val, 2);
+    if(warpSize >= 2)
+        val += __shfl_down(val, 1);
+    return val;
+}
+
+__device__ FLOAT_ACCUM block_reduce_sum(FLOAT_ACCUM val)
+{
+    static __shared__ FLOAT_ACCUM shared[REDUCE_SIZE / warpSize];
+    auto lane = threadIdx.x % warpSize;
+    auto wid  = threadIdx.x / warpSize;
+
+    val = warp_reduce_sum(val);
+
+    if(lane == 0)
+        shared[wid] = val;
+    __syncthreads();
+
+    val = threadIdx.x < REDUCE_SIZE / warpSize ? shared[lane] : 0;
+    if(wid == 0)
+        val = warp_reduce_sum(val);
+
+    return val;
+}
+
+#endif // GUARD_WARP_SHUFFLE_HPP
diff --git a/src/multimarginloss.cpp b/src/multimarginloss.cpp
new file mode 100644
index 0000000000..fb5c4b4df5
--- /dev/null
+++ b/src/multimarginloss.cpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/miopen.h"
+#include <miopen/datatype.hpp>
+#include <miopen/find_solution.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/kernel_cache.hpp>
+#include <miopen/multimarginloss/invoke_params.hpp>
+#include <miopen/multimarginloss/solvers.hpp>
+#include <miopen/multimarginloss.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+miopenStatus_t MultiMarginLossUnreducedForward(Handle& handle,
+                                               const TensorDescriptor& iDesc,
+                                               ConstData_t i,
+                                               const TensorDescriptor& tDesc,
+                                               ConstData_t t,
+                                               const TensorDescriptor& wDesc,
+                                               ConstData_t w,
+                                               const TensorDescriptor& oDesc,
+                                               Data_t o,
+                                               const long p,
+                                               const float margin)
+{
+    const auto problem =
+        multimarginloss::ForwardProblemDescription{iDesc, tDesc, wDesc, oDesc, p, margin, 0};
+
+    const auto invoke_params = [&]() {
+        auto tmp   = multimarginloss::InvokeParams{};
+        tmp.type   = InvokeType::Run;
+        tmp.iDesc  = &iDesc;
+        tmp.i      = i;
+        tmp.tDesc  = &tDesc;
+        tmp.t      = t;
+        tmp.wDesc  = &wDesc;
+        tmp.w      = w;
+        tmp.oDesc  = &oDesc;
+        tmp.o      = o;
+        tmp.p      = p;
+        tmp.margin = margin;
+        return tmp;
+    }();
+
+    const auto algo = AlgorithmName{"MultiMarginLossUnreducedForward"};
+    const auto solvers =
+        solver::SolverContainer<solver::multimarginloss::MultiMarginLossUnreducedForward>{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+std::size_t GetMultiMarginLossForwardWorkspaceSize(Handle& handle,
+                                                   const TensorDescriptor& iDesc,
+                                                   const TensorDescriptor& tDesc,
+                                                   const TensorDescriptor& wDesc,
+                                                   const TensorDescriptor& oDesc,
+                                                   const long p,
+                                                   const float margin,
+                                                   miopenLossReductionMode_t reduction)
+{
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        return static_cast<size_t>(0);
+    }
+    auto ctx            = ExecutionContext{&handle};
+    const float divisor = (reduction == MIOPEN_LOSS_REDUCTION_MEAN) ? iDesc.GetLengths()[0] : 1;
+    const auto problem =
+        multimarginloss::ForwardProblemDescription{iDesc, tDesc, wDesc, oDesc, p, margin, divisor};
+
+    const auto solvers = solver::SolverContainer<solver::multimarginloss::MultiMarginLossForward>{};
+
+    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);
+    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
+}
+
+miopenStatus_t MultiMarginLossForward(Handle& handle,
+                                      Data_t workspace,
+                                      size_t workspaceSizeInBytes,
+                                      const TensorDescriptor& iDesc,
+                                      ConstData_t i,
+                                      const TensorDescriptor& tDesc,
+                                      ConstData_t t,
+                                      const TensorDescriptor& wDesc,
+                                      ConstData_t w,
+                                      const TensorDescriptor& oDesc,
+                                      Data_t o,
+                                      const long p,
+                                      const float margin,
+                                      miopenLossReductionMode_t reduction)
+{
+    const float divisor = (reduction == MIOPEN_LOSS_REDUCTION_MEAN) ? iDesc.GetLengths()[0] : 1;
+    const auto problem =
+        multimarginloss::ForwardProblemDescription{iDesc, tDesc, wDesc, oDesc, p, margin, divisor};
+
+    const auto invoke_params = [&]() {
+        auto tmp           = multimarginloss::InvokeParams{};
+        tmp.type           = InvokeType::Run;
+        tmp.iDesc          = &iDesc;
+        tmp.i              = i;
+        tmp.tDesc          = &tDesc;
+        tmp.t              = t;
+        tmp.wDesc          = &wDesc;
+        tmp.w              = w;
+        tmp.oDesc          = &oDesc;
+        tmp.o              = o;
+        tmp.p              = p;
+        tmp.margin         = margin;
+        tmp.workspace      = workspace;
+        tmp.workspace_size = workspaceSizeInBytes;
+        tmp.divisor        = divisor;
+        return tmp;
+    }();
+
+    const auto algo    = AlgorithmName{"MultiMarginLossForward"};
+    const auto solvers = solver::SolverContainer<solver::multimarginloss::MultiMarginLossForward>{};
+
+    solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
+
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
diff --git a/src/multimarginloss/problem_description.cpp b/src/multimarginloss/problem_description.cpp
new file mode 100644
index 0000000000..ca094f819d
--- /dev/null
+++ b/src/multimarginloss/problem_description.cpp
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <miopen/multimarginloss/problem_description.hpp>
+#include <miopen/names.hpp>
+
+#include <sstream>
+
+namespace miopen {
+
+namespace multimarginloss {
+
+NetworkConfig ForwardProblemDescription::MakeNetworkConfig() const
+{
+    std::ostringstream ss;
+    ss << "multilmarginloss_fwd";
+    ss << "itype" << iDesc.GetType();
+    ss << "ilen";
+    auto ilen = iDesc.GetLengths();
+    for(unsigned long i : ilen)
+        ss << i << "_";
+    ss << "cont" << iDesc.IsContiguous();
+    ss << "divisor" << divisor;
+    return NetworkConfig{ss.str()};
+}
+
+} // namespace multimarginloss
+
+} // namespace miopen
diff --git a/src/multimarginloss_api.cpp b/src/multimarginloss_api.cpp
new file mode 100644
index 0000000000..cb0bdeb7e8
--- /dev/null
+++ b/src/multimarginloss_api.cpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <miopen/miopen.h>
+#include <miopen/multimarginloss.hpp>
+#include <miopen/errors.hpp>
+#include <miopen/handle.hpp>
+#include <miopen/logger.hpp>
+#include <miopen/tensor_ops.hpp>
+
+extern "C" miopenStatus_t
+miopenGetMultiMarginLossForwardWorkspaceSize(miopenHandle_t handle,
+                                             miopenTensorDescriptor_t inputDesc,
+                                             miopenTensorDescriptor_t targetDesc,
+                                             miopenTensorDescriptor_t weightDesc,
+                                             miopenTensorDescriptor_t outputDesc,
+                                             const long p,
+                                             const float margin,
+                                             miopenLossReductionMode_t reduction,
+                                             size_t* sizeInBytes)
+{
+    MIOPEN_LOG_FUNCTION(
+        handle, inputDesc, targetDesc, weightDesc, outputDesc, p, margin, reduction);
+
+    return miopen::try_([&] {
+        miopen::deref(sizeInBytes) =
+            miopen::GetMultiMarginLossForwardWorkspaceSize(miopen::deref(handle),
+                                                           miopen::deref(inputDesc),
+                                                           miopen::deref(targetDesc),
+                                                           miopen::deref(weightDesc),
+                                                           miopen::deref(outputDesc),
+                                                           p,
+                                                           margin,
+                                                           reduction);
+    });
+}
+
+miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle,
+                                            miopenTensorDescriptor_t inputDesc,
+                                            const void* input,
+                                            miopenTensorDescriptor_t targetDesc,
+                                            const void* target,
+                                            miopenTensorDescriptor_t weightDesc,
+                                            const void* weight,
+                                            miopenTensorDescriptor_t outputDesc,
+                                            void* output,
+                                            const long p,
+                                            const float margin,
+                                            miopenLossReductionMode_t reduction,
+                                            void* workspace,
+                                            size_t workspaceSizeInBytes)
+{
+    MIOPEN_LOG_FUNCTION(handle,
+                        inputDesc,
+                        input,
+                        targetDesc,
+                        target,
+                        weightDesc,
+                        weight,
+                        outputDesc,
+                        output,
+                        p,
+                        margin,
+                        reduction,
+                        workspace,
+                        workspaceSizeInBytes);
+
+    if(reduction == MIOPEN_LOSS_REDUCTION_NONE)
+    {
+        return miopen::try_([&] {
+            miopen::MultiMarginLossUnreducedForward(miopen::deref(handle),
+                                                    miopen::deref(inputDesc),
+                                                    DataCast(input),
+                                                    miopen::deref(targetDesc),
+                                                    DataCast(target),
+                                                    miopen::deref(weightDesc),
+                                                    DataCast(weight),
+                                                    miopen::deref(outputDesc),
+                                                    DataCast(output),
+                                                    p,
+                                                    margin);
+        });
+    }
+    else if(reduction == MIOPEN_LOSS_REDUCTION_SUM || reduction == MIOPEN_LOSS_REDUCTION_MEAN)
+    {
+        return miopen::try_([&] {
+            miopen::MultiMarginLossForward(miopen::deref(handle),
+                                           DataCast(workspace),
+                                           workspaceSizeInBytes,
+                                           miopen::deref(inputDesc),
+                                           DataCast(input),
+                                           miopen::deref(targetDesc),
+                                           DataCast(target),
+                                           miopen::deref(weightDesc),
+                                           DataCast(weight),
+                                           miopen::deref(outputDesc),
+                                           DataCast(output),
+                                           p,
+                                           margin,
+                                           reduction);
+        });
+    }
+    else
+    {
+        MIOPEN_THROW(miopenStatusBadParm,
+                     "miopenMultiMarginLossForward: reduction should be "
+                     "MIOPEN_LOSS_REDUCTION_NONE, "
+                     "MIOPEN_LOSS_REDUCTION_SUM or MIOPEN_LOSS_REDUCTION_MEAN.");
+    }
+}
diff --git a/src/solver.cpp b/src/solver.cpp
index 6b451ca498..63b3ecae0d 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -38,6 +38,7 @@
 #include <miopen/reduce/solvers.hpp>
 #include <miopen/mha/solvers.hpp>
 #include <miopen/softmax/solvers.hpp>
+#include <miopen/multimarginloss/solvers.hpp>
 
 #include <miopen/conv_algo_name.hpp>
 #include <miopen/db.hpp>
@@ -673,6 +674,15 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
              fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(),
              miopenConvolutionAlgoWinograd);
 
+    Register(registry,
+             ++id,
+             Primitive::MultiMarginLoss,
+             multimarginloss::MultiMarginLossUnreducedForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::MultiMarginLoss,
+             multimarginloss::MultiMarginLossForward{}.SolverDbId());
+
     // IMPORTANT: New solvers should be added to the end of the function!
 }
 
diff --git a/src/solver/multimarginloss/forward_reduced_multimarginloss.cpp b/src/solver/multimarginloss/forward_reduced_multimarginloss.cpp
new file mode 100644
index 0000000000..d00d41cd40
--- /dev/null
+++ b/src/solver/multimarginloss/forward_reduced_multimarginloss.cpp
@@ -0,0 +1,236 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/miopen.h"
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/multimarginloss/invoke_params.hpp>
+#include <miopen/multimarginloss/solvers.hpp>
+#include <miopen/multimarginloss.hpp>
+#include <miopen/target_properties.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE_MULTIMARGINLOSS 256
+#define LOCAL_SIZE_REDUCE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace multimarginloss {
+
+bool MultiMarginLossForward::IsImprovementOverROCm(
+    const ExecutionContext& /*context*/,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    if(problem.GetiDesc().GetLengths()[1] <= 30)
+        return true;
+    if((problem.GetiDesc().GetType() == miopenHalf ||
+        problem.GetiDesc().GetType() == miopenBFloat16) &&
+       problem.GetiDesc().IsContiguous() && problem.GetiDesc().GetLengths()[1] <= 40)
+        return true;
+    return false;
+}
+
+bool MultiMarginLossForward::IsApplicable(
+    const ExecutionContext& context,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    if(!IsImprovementOverROCm(context, problem))
+        return false;
+    return true;
+}
+
+ConvSolution MultiMarginLossForward::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto xgrid = problem.GetiDesc().GetLengths()[0];
+    auto dtype = problem.GetiDesc().GetType();
+
+    {
+        /* Phase 1: Calc loss for each element. */
+        size_t xlocalsize = LOCAL_SIZE_MULTIMARGINLOSS;
+        size_t xgridsize  = AlignUp(xgrid, xlocalsize);
+        size_t ylocalsize = 1;
+        size_t ygridsize  = 1;
+        size_t zlocalsize = 1;
+        size_t zgridsize  = 1;
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenMultiMarginLoss.cpp";
+        kernel.kernel_name = "MultiMarginLossForward2d";
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int32_t>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int32_t>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int32_t>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int32_t>(dtype == miopenBFloat16)},
+        };
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(ylocalsize);
+        kernel.l_wk.push_back(zlocalsize);
+
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(ygridsize);
+        kernel.g_wk.push_back(zgridsize);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    {
+        /* Phase 2: Reduce */
+        auto _size              = xgrid;
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int32_t>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int32_t>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int32_t>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int32_t>(dtype == miopenBFloat16)},
+            {"REDUCE_SIZE", LOCAL_SIZE_REDUCE},
+        };
+        do
+        {
+            size_t xlocalsize = LOCAL_SIZE_REDUCE;
+            size_t xgridsize  = AlignUp(_size, xlocalsize);
+            size_t ylocalsize = 1;
+            size_t ygridsize  = 1;
+            size_t zlocalsize = 1;
+            size_t zgridsize  = 1;
+
+            auto kernel        = KernelInfo{};
+            kernel.kernel_file = "MIOpenLossReduce.cpp";
+            kernel.kernel_name = "ReduceSumLoss";
+
+            kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+            kernel.l_wk.push_back(xlocalsize);
+            kernel.l_wk.push_back(ylocalsize);
+            kernel.l_wk.push_back(zlocalsize);
+
+            kernel.g_wk.push_back(xgridsize);
+            kernel.g_wk.push_back(ygridsize);
+            kernel.g_wk.push_back(zgridsize);
+
+            result.construction_params.push_back(kernel);
+            _size = AlignUp(_size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
+        } while(_size > 1);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) params = raw_params.CastTo<miopen::multimarginloss::InvokeParams>();
+            auto i_tv             = get_inner_expanded_tv<2>(deref(params.iDesc));
+            auto t_tv             = get_inner_expanded_tv<1>(deref(params.tDesc));
+            auto w_tv             = get_inner_expanded_tv<1>(deref(params.wDesc));
+
+            float elapsed = 0.0f;
+            HipEventPtr start;
+            HipEventPtr stop;
+
+            const bool profiling = handle_.IsProfilingEnabled();
+            if(profiling)
+            {
+                handle_.EnableProfiling(false);
+                start = miopen::make_hip_event();
+                stop  = miopen::make_hip_event();
+                hipEventRecord(start.get(), handle_.GetStream());
+            }
+            /* Phase 1: Calc loss for each element. */
+            {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                kernel(params.i,
+                       params.t,
+                       params.w,
+                       params.workspace,
+                       params.p,
+                       params.margin,
+                       params.divisor,
+                       i_tv,
+                       t_tv,
+                       w_tv);
+            }
+
+            /* Phase 2: Reduce */
+            auto size      = deref(params.iDesc).GetLengths()[0];
+            auto reduce_in = params.workspace;
+            auto reduce_out =
+                static_cast<Data_t>(static_cast<char*>(params.workspace) +
+                                    size * get_data_size(deref(params.oDesc).GetType()));
+
+            for(int i = 1; i < kernels.size(); ++i)
+            {
+                decltype(auto) kernel = handle_.Run(kernels[i]);
+                if(i + 1 != kernels.size())
+                {
+                    kernel(reduce_in, reduce_out, size);
+                    std::swap(reduce_in, reduce_out);
+                }
+                else
+                {
+                    kernel(reduce_in, params.o, size);
+                }
+                size = AlignUp(size, LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE;
+            }
+
+            if(profiling)
+            {
+                hipEventRecord(stop.get(), handle_.GetStream());
+                hipEventSynchronize(stop.get());
+                hipEventElapsedTime(&elapsed, start.get(), stop.get());
+
+                // Clean up
+                hipEventDestroy(start.get());
+                hipEventDestroy(stop.get());
+                handle_.ResetKernelTime();
+                handle_.AccumKernelTime(elapsed);
+
+                handle_.EnableProfiling(true);
+            };
+        };
+    };
+
+    return result;
+}
+
+std::size_t MultiMarginLossForward::GetWorkspaceSize(
+    const ExecutionContext& /*context*/,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    auto elem = problem.GetiDesc().GetLengths()[0];
+    return (elem + (elem + LOCAL_SIZE_REDUCE) / LOCAL_SIZE_REDUCE) *
+           get_data_size(problem.GetoDesc().GetType());
+}
+
+} // namespace multimarginloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/multimarginloss/forward_unreduced_multimarginloss.cpp b/src/solver/multimarginloss/forward_unreduced_multimarginloss.cpp
new file mode 100644
index 0000000000..729df7d431
--- /dev/null
+++ b/src/solver/multimarginloss/forward_unreduced_multimarginloss.cpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/miopen.h"
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/multimarginloss/invoke_params.hpp>
+#include <miopen/multimarginloss/solvers.hpp>
+#include <miopen/multimarginloss.hpp>
+#include <miopen/target_properties.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace multimarginloss {
+
+bool MultiMarginLossUnreducedForward::IsImprovementOverROCm(
+    const ExecutionContext& /*context*/,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    if(problem.GetiDesc().GetLengths()[1] <= 30)
+        return true;
+    if((problem.GetiDesc().GetType() == miopenHalf ||
+        problem.GetiDesc().GetType() == miopenBFloat16) &&
+       problem.GetiDesc().IsContiguous() && problem.GetiDesc().GetLengths()[1] <= 40)
+        return true;
+    return false;
+}
+
+bool MultiMarginLossUnreducedForward::IsApplicable(
+    const ExecutionContext& context,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    if(!IsImprovementOverROCm(context, problem))
+        return false;
+    return true;
+}
+
+ConvSolution MultiMarginLossUnreducedForward::GetSolution(
+    const ExecutionContext& /*context*/,
+    const miopen::multimarginloss::ForwardProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto xgrid = problem.GetiDesc().GetLengths()[0];
+
+    {
+        auto dtype        = problem.GetiDesc().GetType();
+        size_t xlocalsize = LOCAL_SIZE;
+        size_t xgridsize  = AlignUp(xgrid, xlocalsize);
+        size_t ylocalsize = 1;
+        size_t ygridsize  = 1;
+        size_t zlocalsize = 1;
+        size_t zgridsize  = 1;
+
+        auto kernel        = KernelInfo{};
+        kernel.kernel_file = "MIOpenMultiMarginLoss.cpp";
+        kernel.kernel_name = "MultiMarginLossUnreducedForward2d";
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int32_t>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int32_t>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int32_t>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int32_t>(dtype == miopenBFloat16)},
+        };
+
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(ylocalsize);
+        kernel.l_wk.push_back(zlocalsize);
+
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(ygridsize);
+        kernel.g_wk.push_back(zgridsize);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::multimarginloss::InvokeParams>();
+
+            auto i_tv = get_inner_expanded_tv<2>(deref(params.iDesc));
+            auto t_tv = get_inner_expanded_tv<1>(deref(params.tDesc));
+            auto w_tv = get_inner_expanded_tv<1>(deref(params.wDesc));
+            auto o_tv = get_inner_expanded_tv<1>(deref(params.oDesc));
+
+            kernel(params.i,
+                   params.t,
+                   params.w,
+                   params.o,
+                   params.p,
+                   params.margin,
+                   i_tv,
+                   t_tv,
+                   w_tv,
+                   o_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace multimarginloss
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/test/cpu_multimarginloss.hpp b/test/cpu_multimarginloss.hpp
new file mode 100644
index 0000000000..0bde558d62
--- /dev/null
+++ b/test/cpu_multimarginloss.hpp
@@ -0,0 +1,112 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef GUARD_CPU_MULTIMARGINLOSS_HPP
+#define GUARD_CPU_MULTIMARGINLOSS_HPP
+
+#include "tensor_holder.hpp"
+#include <miopen/tensor_view_utils.hpp>
+
+template <class T>
+void cpu_multimarginloss_unreduced_forward(tensor<T> input,
+                                           tensor<uint64_t> target,
+                                           tensor<T> weight,
+                                           tensor<T>& ref_output,
+                                           long p,
+                                           float margin)
+{
+    auto I_tv = miopen::get_inner_expanded_tv<2>(input.desc);
+    auto T_tv = miopen::get_inner_expanded_tv<1>(target.desc);
+    auto W_tv = miopen::get_inner_expanded_tv<1>(weight.desc);
+    auto O_tv = miopen::get_inner_expanded_tv<1>(ref_output.desc);
+    auto N = I_tv.size[0], C = I_tv.size[1];
+
+    for(size_t n = 0; n < N; n++)
+    {
+        double loss = 0;
+        uint64_t y  = target[T_tv.get_tensor_view_idx({n})];
+        if(y >= C)
+            continue;
+        for(size_t c = 0; c < C; c++)
+        {
+            if(y == c)
+                continue;
+            double t = margin - static_cast<double>(input[I_tv.get_tensor_view_idx({n, y})]) +
+                       static_cast<double>(input[I_tv.get_tensor_view_idx({n, c})]);
+
+            if(t < 0)
+                continue;
+            if(p == 2)
+                t = t * t;
+            t = static_cast<double>(weight[W_tv.get_tensor_view_idx({y})]) * t;
+            loss += t / C;
+        }
+        ref_output[O_tv.get_tensor_view_idx({n})] = loss;
+    }
+}
+
+template <class T>
+void cpu_multimarginloss_reduced_forward(tensor<T> input,
+                                         tensor<uint64_t> target,
+                                         tensor<T> weight,
+                                         tensor<T>& ref_output,
+                                         long p,
+                                         float margin,
+                                         const float divisor)
+{
+    auto I_tv = miopen::get_inner_expanded_tv<2>(input.desc);
+    auto T_tv = miopen::get_inner_expanded_tv<1>(target.desc);
+    auto W_tv = miopen::get_inner_expanded_tv<1>(weight.desc);
+    auto N = I_tv.size[0], C = I_tv.size[1];
+
+    double sum = 0;
+    for(size_t n = 0; n < N; n++)
+    {
+        double loss = 0;
+        uint64_t y  = target[T_tv.get_tensor_view_idx({n})];
+        if(y >= C)
+            continue;
+        for(size_t c = 0; c < C; c++)
+        {
+            if(y == c)
+                continue;
+            double t = margin - static_cast<double>(input[I_tv.get_tensor_view_idx({n, y})]) +
+                       static_cast<double>(input[I_tv.get_tensor_view_idx({n, c})]);
+
+            if(t < 0)
+                continue;
+            if(p == 2)
+                t = t * t;
+            t = static_cast<double>(weight[W_tv.get_tensor_view_idx({y})]) * t;
+            loss += t / C;
+        }
+        sum += loss;
+    }
+    sum /= divisor;
+    ref_output[0] = static_cast<T>(sum);
+}
+
+#endif
diff --git a/test/gtest/multimarginloss.cpp b/test/gtest/multimarginloss.cpp
new file mode 100644
index 0000000000..df2653af41
--- /dev/null
+++ b/test/gtest/multimarginloss.cpp
@@ -0,0 +1,113 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "multimarginloss.hpp"
+#include <miopen/env.hpp>
+
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace multimarginloss {
+
+std::string GetFloatArg()
+{
+    const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG);
+    if(tmp.empty())
+    {
+        return "";
+    }
+    return tmp;
+}
+
+struct GPU_MultiMarginLoss_FP32 : MultiMarginLossForwardTest<float>
+{
+};
+
+struct GPU_MultiMarginLoss_FP16 : MultiMarginLossForwardTest<half_float::half>
+{
+};
+
+struct GPU_MultiMarginLoss_BFP16 : MultiMarginLossForwardTest<bfloat16>
+{
+};
+
+} // namespace multimarginloss
+
+using multimarginloss::GPU_MultiMarginLoss_BFP16;
+using multimarginloss::GPU_MultiMarginLoss_FP16;
+using multimarginloss::GPU_MultiMarginLoss_FP32;
+
+TEST_P(GPU_MultiMarginLoss_FP32, Test)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_MultiMarginLoss_FP16, Test)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_MultiMarginLoss_BFP16, Test)
+{
+    if(!MIOPEN_TEST_ALL ||
+       (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_MultiMarginLoss_FP32,
+                         testing::ValuesIn(MultiMarginLossTestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_MultiMarginLoss_FP16,
+                         testing::ValuesIn(MultiMarginLossFp16TestConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_MultiMarginLoss_BFP16,
+                         testing::ValuesIn(MultiMarginLossTestConfigs()));
diff --git a/test/gtest/multimarginloss.hpp b/test/gtest/multimarginloss.hpp
new file mode 100644
index 0000000000..d69f98e0f3
--- /dev/null
+++ b/test/gtest/multimarginloss.hpp
@@ -0,0 +1,267 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "cpu_multimarginloss.hpp"
+#include "get_handle.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/multimarginloss.hpp>
+
+struct MultiMarginLossTestCase
+{
+    std::vector<size_t> dims;
+    bool cont;
+    miopenLossReductionMode_t reduction_mode;
+    long p;
+
+    friend std::ostream& operator<<(std::ostream& os, const MultiMarginLossTestCase& tc)
+    {
+        os << "dims:";
+        os << tc.dims[0];
+        for(int i = 1; i < tc.dims.size(); i++)
+            os << "x" << tc.dims[i];
+        os << " cont:" << tc.cont << " reduction_mode:" << tc.reduction_mode << " p:" << tc.p;
+        return os;
+    }
+};
+
+inline std::vector<MultiMarginLossTestCase> MultiMarginLossTestConfigs()
+{
+    // clang-format off
+    return {
+        {{22, 12}, true, MIOPEN_LOSS_REDUCTION_MEAN, 1}, 
+        {{22, 12}, false, MIOPEN_LOSS_REDUCTION_SUM, 1}, 
+        {{22, 12}, true, MIOPEN_LOSS_REDUCTION_NONE, 1}, 
+        {{9456, 13}, false, MIOPEN_LOSS_REDUCTION_MEAN, 2 }, 
+        {{9456, 13}, true, MIOPEN_LOSS_REDUCTION_SUM, 2 }, 
+        {{9456, 13}, false, MIOPEN_LOSS_REDUCTION_NONE, 2 }, 
+        {{543210, 7}, true, MIOPEN_LOSS_REDUCTION_MEAN, 2 }, 
+        {{543210, 7}, false, MIOPEN_LOSS_REDUCTION_SUM, 2 }, 
+        {{543210, 7}, true, MIOPEN_LOSS_REDUCTION_NONE, 2 }, 
+        {{3995776, 6}, true, MIOPEN_LOSS_REDUCTION_MEAN, 1 }, 
+        {{3995776, 6}, true, MIOPEN_LOSS_REDUCTION_SUM, 1 }, 
+        {{3995776, 6}, true, MIOPEN_LOSS_REDUCTION_NONE, 1 }, 
+    };
+    // clang-format on
+}
+
+// Remove big tests with reduction from FP16 test because the result will be overflow/ underflow
+inline std::vector<MultiMarginLossTestCase> MultiMarginLossFp16TestConfigs()
+{
+    // clang-format off
+    return {
+        {{22, 12}, true, MIOPEN_LOSS_REDUCTION_MEAN, 1}, 
+        {{22, 12}, false, MIOPEN_LOSS_REDUCTION_SUM, 1}, 
+        {{22, 12}, true, MIOPEN_LOSS_REDUCTION_NONE, 1}, 
+        {{9456, 13}, false, MIOPEN_LOSS_REDUCTION_MEAN, 2 }, 
+        {{9456, 13}, true, MIOPEN_LOSS_REDUCTION_SUM, 2 }, 
+        {{9456, 13}, false, MIOPEN_LOSS_REDUCTION_NONE, 2 }, 
+        {{543210, 7}, true, MIOPEN_LOSS_REDUCTION_NONE, 2 }, 
+        {{3995776, 6}, true, MIOPEN_LOSS_REDUCTION_NONE, 1 }, 
+    };
+    // clang-format on
+}
+
+template <typename T = float>
+struct MultiMarginLossForwardTest : public ::testing::TestWithParam<MultiMarginLossTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        auto in_dims   = config.dims;
+        reduction_mode = config.reduction_mode;
+        p              = config.p;
+        margin         = prng::gen_A_to_B<float>(0.5, 1.5);
+        size_t N = in_dims[0], C = in_dims[1];
+
+        if(config.cont)
+        {
+            input = tensor<T>{in_dims};
+            // input is (N, C) -> target is (N)
+            target = tensor<uint64_t>{std::vector<size_t>{N}};
+            // input is (N, C) -> weight is (C)
+            weight = tensor<T>{std::vector<size_t>{C}};
+        }
+        else
+        {
+            std::vector<size_t> in_strides(in_dims.size());
+            in_strides.back() = 1;
+            for(int i = in_dims.size() - 2; i >= 0; --i)
+                in_strides[i] = in_strides[i + 1] * in_dims[i + 1];
+            in_strides[0] *= 2;
+            input = tensor<T>{in_dims, in_strides};
+
+            std::vector<size_t> t_len     = {N};
+            std::vector<size_t> t_strides = {2};
+            target                        = tensor<uint64_t>{t_len, t_strides};
+
+            std::vector<size_t> w_lens    = {C};
+            std::vector<size_t> w_strides = {2};
+            weight                        = tensor<T>{w_lens, w_strides};
+        }
+
+        auto gen_in_value = [](auto...) {
+            return prng::gen_A_to_B<T>(static_cast<T>(-1), static_cast<T>(1));
+        };
+        std::generate(input.begin(), input.end(), gen_in_value);
+        input_dev = handle.Write(input.data);
+
+        for(auto& ptr : target)
+        {
+            ptr = prng::gen_A_to_B<uint64_t>(0, C);
+        }
+        target_dev = handle.Write(target.data);
+
+        auto gen_weight_value = [](auto...) {
+            return prng::gen_A_to_B<T>(static_cast<T>(-1), static_cast<T>(1));
+        };
+        std::generate(weight.begin(), weight.end(), gen_weight_value);
+        weight_dev = handle.Write(weight.data);
+
+        if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            // input is (N, C) -> output is (N)
+            output     = tensor<T>{std::vector<size_t>{N}};
+            ref_output = tensor<T>{std::vector<size_t>{N}};
+        }
+        else
+        {
+            // Tensor with 1 element to store result after reduce
+            output     = tensor<T>{std::vector<size_t>{1}};
+            ref_output = tensor<T>{std::vector<size_t>{1}};
+        }
+        std::fill(output.begin(), output.end(), 0);
+        std::fill(ref_output.begin(), ref_output.end(), 0);
+        output_dev = handle.Write(output.data);
+
+        if(reduction_mode != MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            ws_sizeInBytes = miopen::GetMultiMarginLossForwardWorkspaceSize(handle,
+                                                                            input.desc,
+                                                                            target.desc,
+                                                                            weight.desc,
+                                                                            output.desc,
+                                                                            p,
+                                                                            margin,
+                                                                            reduction_mode);
+            if(ws_sizeInBytes == static_cast<size_t>(-1))
+                GTEST_FAIL() << "Call GetMultiMarginLossForwardWorkspaceSize failed!";
+            workspace = tensor<T>{std::vector<size_t>{ws_sizeInBytes / sizeof(T)}};
+            std::fill(workspace.begin(), workspace.end(), 0);
+            workspace_dev = handle.Write(workspace.data);
+        }
+    }
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+        if(reduction_mode == MIOPEN_LOSS_REDUCTION_NONE)
+        {
+            cpu_multimarginloss_unreduced_forward<T>(input, target, weight, ref_output, p, margin);
+
+            status = miopen::MultiMarginLossUnreducedForward(handle,
+                                                             input.desc,
+                                                             input_dev.get(),
+                                                             target.desc,
+                                                             target_dev.get(),
+                                                             weight.desc,
+                                                             weight_dev.get(),
+                                                             output.desc,
+                                                             output_dev.get(),
+                                                             p,
+                                                             margin);
+        }
+        else
+        {
+            cpu_multimarginloss_reduced_forward<T>(
+                input,
+                target,
+                weight,
+                ref_output,
+                p,
+                margin,
+                (reduction_mode == MIOPEN_LOSS_REDUCTION_MEAN) ? input.desc.GetLengths()[0] : 1);
+
+            status = miopen::MultiMarginLossForward(handle,
+                                                    workspace_dev.get(),
+                                                    ws_sizeInBytes,
+                                                    input.desc,
+                                                    input_dev.get(),
+                                                    target.desc,
+                                                    target_dev.get(),
+                                                    weight.desc,
+                                                    weight_dev.get(),
+                                                    output.desc,
+                                                    output_dev.get(),
+                                                    p,
+                                                    margin,
+                                                    reduction_mode);
+        }
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        // Write from GPU to CPU
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        auto tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error = miopen::rms_range(ref_output, output);
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        EXPECT_LT(error, tolerance);
+    }
+    MultiMarginLossTestCase config;
+
+    tensor<T> input;
+    tensor<uint64_t> target;
+    tensor<T> weight;
+    tensor<T> output;
+    tensor<T> workspace;
+
+    tensor<T> ref_output;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr target_dev;
+    miopen::Allocator::ManageDataPtr weight_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    long p;
+    float margin;
+    miopenLossReductionMode_t reduction_mode;
+    size_t ws_sizeInBytes;
+};
diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp
index 8ff00e6532..5428ce54cd 100644
--- a/test/tensor_holder.hpp
+++ b/test/tensor_holder.hpp
@@ -137,6 +137,11 @@ struct miopen_type<uint16_t> : std::integral_constant<miopenDataType_t, miopenHa
 {
 };
 
+template <>
+struct miopen_type<uint64_t> : std::integral_constant<miopenDataType_t, miopenInt64>
+{
+};
+
 template <class T>
 struct tensor
 {