diff --git a/.clang-tidy b/.clang-tidy index 8ed34ebb0f..39e6d96b87 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -71,6 +71,19 @@ # -modernize-type-traits # -performance-avoid-endl # +# TODO Code Quality WORKAROUND ROCm 6.2 +# -readability-container-size-empty +# -bugprone-inc-dec-in-conditions +# -readability-avoid-nested-conditional-operator +# -performance-enum-size +# -readability-redundant-member-init +# -bugprone-multi-level-implicit-pointer-conversion +# -clang-analyzer-optin.core.EnumCastOutOfRange +# -cppcoreguidelines-macro-to-enum +# -cppcoreguidelines-owning-memory +# -readability-avoid-return-with-void-value +# -bugprone-unused-local-non-trivial-variable +# Checks: >- *, -abseil-*, @@ -78,9 +91,12 @@ Checks: >- -android-cloexec-fopen, -bugprone-easily-swappable-parameters, -bugprone-exception-escape, + -bugprone-inc-dec-in-conditions, -bugprone-lambda-function-name, -bugprone-macro-parentheses, + -bugprone-multi-level-implicit-pointer-conversion, -bugprone-narrowing-conversions, + -bugprone-unused-local-non-trivial-variable, -bugprone-use-after-move, -cert-env33-c, -cert-err33-c, @@ -90,6 +106,7 @@ Checks: >- -cert-msc51-cpp, -clang-analyzer-alpha.core.CastToStruct, -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.performance.Padding, -clang-diagnostic-extern-c-compat, -clang-diagnostic-unused-command-line-argument, @@ -100,11 +117,13 @@ Checks: >- -cppcoreguidelines-avoid-magic-numbers, -cppcoreguidelines-explicit-virtual-functions, -cppcoreguidelines-init-variables, + -cppcoreguidelines-macro-to-enum, -cppcoreguidelines-macro-usage, -cppcoreguidelines-misleading-capture-default-by-value, -cppcoreguidelines-missing-std-forward, -cppcoreguidelines-narrowing-conversions, -cppcoreguidelines-non-private-member-variables-in-classes, + -cppcoreguidelines-owning-memory, -cppcoreguidelines-prefer-member-initializer, -cppcoreguidelines-pro-bounds-array-to-pointer-decay, 
-cppcoreguidelines-pro-bounds-constant-array-index, @@ -168,11 +187,15 @@ Checks: >- -modernize-concat-nested-namespaces, -modernize-unary-static-assert, -performance-avoid-endl, + -performance-enum-size, + -readability-avoid-nested-conditional-operator, -performance-no-automatic-move, -performance-unnecessary-copy-initialization, -performance-unnecessary-value-param, + -readability-avoid-return-with-void-value, -readability-avoid-unconditional-preprocessor-if, -readability-container-data-pointer, + -readability-container-size-empty, -readability-convert-member-functions-to-static, -readability-else-after-return, -readability-function-cognitive-complexity, @@ -182,6 +205,7 @@ Checks: >- -readability-named-parameter, -readability-qualified-auto, -readability-redundant-declaration, + -readability-redundant-member-init, -readability-redundant-string-init, -readability-simplify-boolean-expr, -readability-suspicious-call-argument, diff --git a/CMakeLists.txt b/CMakeLists.txt index 98201516ad..21f96a0d77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -rocm_setup_version(VERSION 3.2.0) +rocm_setup_version(VERSION 3.3.0) list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) include(TargetFlags) diff --git a/Dockerfile b/Dockerfile index 89c8065f48..fb272596aa 100755 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,6 @@ RUN dpkg --add-architecture i386 # Install preliminary dependencies RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)" \ apt-utils \ ca-certificates \ curl \ @@ -15,17 +14,21 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ gnupg2 \ wget +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + "linux-headers-$(uname -r)" 
"linux-modules-extra-$(uname -r)" + #Add gpg keys ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/jammy/amdgpu-install_6.1.60100-1_all.deb --no-check-certificate +RUN wget https://repo.radeon.com/amdgpu-install/6.2/ubuntu/jammy/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - ./amdgpu-install_6.1.60100-1_all.deb + ./amdgpu-install_6.2.60200-1_all.deb # Add rocm repository -RUN export ROCM_APT_VER=6.1;\ +RUN export ROCM_APT_VER=6.2;\ echo $ROCM_APT_VER &&\ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCM_APT_VER/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list' &&\ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/$ROCM_APT_VER jammy main > /etc/apt/sources.list.d/rocm.list' @@ -85,6 +88,7 @@ ADD dev-requirements.txt /dev-requirements.txt # Install dependencies # TODO: Add --std=c++14 ARG GPU_ARCH=";" +# install to /opt/rocm will cause permission issue ARG PREFIX=/usr/local ARG USE_FIN="OFF" ARG CCACHE_SECONDARY_STORAGE="" @@ -101,9 +105,12 @@ RUN ccache -s # purge existing composable kernel installed with ROCm # hence cannot use autoremove since it will remove more components +# even purge will remove some other components which is not ideal RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ - composablekernel-dev + composablekernel-dev \ + miopen-hip + ARG COMPILER_LAUNCHER="" # rbuild is used to trigger build of requirements.txt, dev-requirements.txt RUN if [ "$USE_FIN" = "ON" ]; then \ @@ -120,4 +127,7 @@ RUN pip3 install -r /doc-requirements.txt # Composable Kernel requires this version cmake RUN 
pip3 install --upgrade cmake==3.27.5 +# groupadd can add one group a time RUN groupadd -f render +RUN groupadd -f video +RUN usermod -a -G render,video root \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 424c1d8a2f..6d85a6f7d1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -33,14 +33,21 @@ def cmake_build(Map conf=[:]){ def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined -Wno-option-ignored " + conf.get("extradebugflags", "") def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","") def prefixpath = conf.get("prefixpath","/opt/rocm") + def build_type_debug = (conf.get("build_type",'release') == 'debug') + def code_conv_enabled = conf.get("codecov", false) + def mlir_args = " -DMIOPEN_USE_MLIR=" + conf.get("mlir_build", "ON") + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + if (build_type_debug || code_conv_enabled) + { + mlir_args = " -DMIOPEN_USE_MLIR=OFF" + } + def setup_args = mlir_args + " -DMIOPEN_GPU_SYNC=Off " + conf.get("setup_flags","") def build_fin = conf.get("build_fin", "OFF") setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " - def build_type_debug = (conf.get("build_type",'release') == 'debug') - //cmake_env can overwrite default CXX variables. 
def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") @@ -72,7 +79,7 @@ def cmake_build(Map conf=[:]){ test_flags = " --disable-verification-cache " + test_flags } - if(conf.get("codecov", false)){ //Need + if(code_conv_enabled){ //Need setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags} -fprofile-arcs -ftest-coverage' -DCODECOV_TEST=On " + setup_args }else if(build_type_debug){ setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args @@ -194,9 +201,9 @@ def getDockerImage(Map conf=[:]) { env.DOCKER_BUILDKIT=1 def prefixpath = conf.get("prefixpath", "/opt/rocm") // one image for each prefix 1: /usr/local 2:/opt/rocm - def gpu_arch = "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" // prebuilt dockers should have all the architectures enabled so one image can be used for all stages + def gpu_arch = "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200" // prebuilt dockers should have all the architectures enabled so one image can be used for all stages def mlir_build = conf.get("mlir_build", "ON") // always ON - def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg USE_MLIR='${mlir_build}' " + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg GPU_TARGETS='${gpu_arch}' --build-arg USE_MLIR='${mlir_build}' " if(env.CCACHE_HOST) { def check_host = sh(script:"""(printf "PING\r\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim() @@ -284,7 +291,7 @@ def buildHipClangJob(Map conf=[:]){ } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 210, unit:'MINUTES') + timeout(time: 300, unit:'MINUTES') { if (lfs_pull) { sh "git lfs pull --exclude=" @@ -648,7 +655,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - 
buildHipClangJobAndReboot(make_targets: Smoke_targets) + buildHipClangJobAndReboot(make_targets: Smoke_targets, build_install: "true") } } stage('Fp32 Hip Debug gfx90a') { @@ -661,7 +668,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets) + buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, build_install: "true") } } stage('Fp32 Hip Debug gfx908') { @@ -674,7 +681,7 @@ pipeline { } agent{ label rocmnode("gfx908") } steps{ - buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets) + buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, build_install: "true") } } stage('Fp32 Hip Debug gfx94X') { @@ -687,7 +694,7 @@ pipeline { } agent{ label rocmnode("gfx94X") } steps{ - buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, needs_reboot:false) + buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, needs_reboot:false, build_install: "true") } } } @@ -711,7 +718,7 @@ pipeline { NOCOMGR_build_cmd = "CTEST_PARALLEL_LEVEL=4 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check" } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOCOMGR_flags, build_cmd: NOCOMGR_build_cmd, test_flags: ' --verbose ') + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOCOMGR_flags, build_cmd: NOCOMGR_build_cmd, test_flags: ' --verbose ', build_install: "true") } } stage('Fp32 Hip Debug NOMLIR gfx90a') { @@ -728,7 +735,7 @@ pipeline { NOMLIR_build_cmd = "CTEST_PARALLEL_LEVEL=4 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check" } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOMLIR_flags, build_cmd: NOMLIR_build_cmd, test_flags: ' --verbose ') + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOMLIR_flags, build_cmd: NOMLIR_build_cmd, test_flags: ' --verbose ', build_install: "true") } } stage('Fp32 Hip Debug NOCK gfx90a Build-Only') { @@ -741,7 +748,7 
@@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: "-DMIOPEN_USE_COMPOSABLEKERNEL=Off", make_targets: "") + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: "-DMIOPEN_USE_COMPOSABLEKERNEL=Off", make_targets: "", build_install: "true") } } stage('Fp32 Hip Debug Embedded Vega20') { @@ -757,7 +764,7 @@ pipeline { Embedded_flags = "-DMIOPEN_EMBED_DB='gfx906_60'" } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: Embedded_flags, build_env: extra_log_env, test_flags: ' --verbose ') + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: Embedded_flags, build_env: extra_log_env, test_flags: ' --verbose ', build_install: "true") } } stage('Fp32 Hip Static gfx90a') { @@ -770,7 +777,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot( setup_flags: "-DBUILD_SHARED_LIBS=Off", mlir_build: 'OFF') + buildHipClangJobAndReboot( setup_flags: "-DBUILD_SHARED_LIBS=Off", mlir_build: 'OFF', build_install: "true") } } stage('Fp32 Hip Normal-Find gfx90a') { @@ -787,7 +794,7 @@ pipeline { execute_cmd = "bin/test_conv2d --disable-verification-cache" } steps{ - buildHipClangJobAndReboot(make_targets: make_targets, execute_cmd: execute_cmd, find_mode: "Normal") + buildHipClangJobAndReboot(make_targets: make_targets, execute_cmd: execute_cmd, find_mode: "Normal", build_install: "true") } } stage('Fp32 Hip Fast-Find gfx90a') { @@ -804,7 +811,7 @@ pipeline { execute_cmd = "MIOPEN_FIND_MODE=2 CTEST_PARALLEL_LEVEL=4 bin/test_conv2d --disable-verification-cache" } steps{ - buildHipClangJobAndReboot( make_targets: make_targets, execute_cmd: execute_cmd) + buildHipClangJobAndReboot( make_targets: make_targets, execute_cmd: execute_cmd, build_install: "true") } } stage('Fp32 Hip gfx90a') { @@ -830,7 +837,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot(make_targets: Smoke_targets, setup_flags: 
"-DMIOPEN_USE_SQLITE_PERF_DB=On") + buildHipClangJobAndReboot(make_targets: Smoke_targets, setup_flags: "-DMIOPEN_USE_SQLITE_PERF_DB=On", build_install: "true") } } } @@ -850,7 +857,7 @@ pipeline { } agent{ label rocmnode("vega20") } steps{ - buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Bf16 Hip Vega20') { @@ -863,7 +870,7 @@ pipeline { } agent{ label rocmnode("vega20") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Fp16 Hip gfx908') { @@ -876,7 +883,7 @@ pipeline { } agent{ label rocmnode("gfx908") } steps{ - buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Bf16 Hip gfx908') { @@ -889,7 +896,7 @@ pipeline { } agent{ label rocmnode("gfx908") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Fp16 Hip gfx90a') { @@ -902,7 +909,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Bf16 Hip gfx90a') { @@ -915,7 +922,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Fp16 Hip gfx94X') { @@ -928,7 +935,7 @@ pipeline { } agent{ label rocmnode("gfx94X") } steps{ 
- buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, needs_reboot:false) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, needs_reboot:false, build_install: "true") } } stage('Bf16 Hip gfx94X') { @@ -941,7 +948,7 @@ pipeline { } agent{ label rocmnode("gfx94X") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, needs_reboot:false) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, needs_reboot:false, build_install: "true") } } } diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 2574dbbf5e..34a045bca8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -36,3 +36,4 @@ The MIOpen API library is structured as follows: * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) + * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index bb4f98b6a5..72eac33929 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==1.6.2 +rocm-docs-core[api_reference]==1.7.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index d3060ed33b..f8ad206428 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -92,7 +92,7 @@ requests==2.32.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.6.2 +rocm-docs-core[api-reference]==1.7.1 # via -r requirements.in smmap==5.0.1 # via gitdb diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 1e841ee5e5..2ba219dd5e 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -49,6 +49,7 @@ add_executable(MIOpenDriver dm_lrn.cpp dm_multimarginloss.cpp dm_pool.cpp + 
dm_prelu.cpp dm_reduce.cpp dm_reduceextreme.cpp dm_reducecalculation.cpp diff --git a/driver/dm_prelu.cpp b/driver/dm_prelu.cpp new file mode 100644 index 0000000000..06ec99e8a5 --- /dev/null +++ b/driver/dm_prelu.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "prelu_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "prelu") + return new PReLUDriver(); + if(base_arg == "prelufp16") + return new PReLUDriver(); + if(base_arg == "prelubfp16") + return new PReLUDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 8d7cbe758a..b4cca0356a 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -176,6 +176,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " + "prelu[bfp16|fp16], " "multimarginloss[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -208,8 +209,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "multimarginloss" && - arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "prelu" && arg != "prelufp16" && + arg != "prelubfp16" && arg != "multimarginloss" && arg != "multimarginlossfp16" && + arg != "multimarginlossbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloPReLUHost.hpp b/driver/mloPReLUHost.hpp new file mode 100644 index 0000000000..c3460b6425 --- /dev/null +++ b/driver/mloPReLUHost.hpp @@ -0,0 +1,104 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <../test/ford.hpp> + +#include +#include +#include + +template +int32_t mloPReLUBackwardRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + const miopenTensorDescriptor_t doutputDesc, + const miopenTensorDescriptor_t dinputDesc, + const Tgpu* input, + const Tgpu* weight, + const Tgpu* doutput, + Tcheck* dinput_host, + Tcheck* dweight_host) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + + auto input_sz = miopen::deref(inputDesc).GetElementSize(); + auto weight_sz = miopen::deref(weightDesc).GetElementSize(); + auto weight_grad_collector = std::vector(input_sz); + + par_ford(input_sz)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = static_cast(doutput[doutput_tv.get_tensor_view_idx(tensor_layout)]); + + if(dinput_host) + { + float weight_v; + if(weight_sz == 1) + weight_v = static_cast(weight[0]); + else + weight_v = static_cast(weight[tensor_layout.layout[1]]); + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + dinput_host[dinput_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(dweight_host) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(dweight_host) + { + if(weight_sz == 1) + { + double sum = 0; + for(int i = 0; i < input_sz; ++i) + sum += static_cast(weight_grad_collector[i]); + dweight_host[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight_sz)([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + dweight_host[i] = static_cast(sum); + }); + } + } + + return miopenStatusSuccess; +} diff --git a/driver/prelu_driver.hpp b/driver/prelu_driver.hpp new file mode 100644 index 0000000000..e304f27465 --- /dev/null +++ b/driver/prelu_driver.hpp @@ -0,0 +1,394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "driver.hpp" +#include "mloPReLUHost.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/verify.hpp> + +#include + +template +class PReLUDriver : public Driver +{ +public: + PReLUDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&weightDesc); + miopenCreateTensorDescriptor(&doutputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&dweightDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~PReLUDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(weightDesc); + miopenDestroyTensorDescriptor(doutputDesc); + miopenDestroyTensorDescriptor(dinputDesc); + miopenDestroyTensorDescriptor(dweightDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t weightDesc; + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + miopenTensorDescriptor_t dweightDesc; + + std::unique_ptr input_dev; + std::unique_ptr weight_dev; + std::unique_ptr doutput_dev; + 
std::unique_ptr dinput_dev; + std::unique_ptr dweight_dev; + std::unique_ptr workspace_dev; + + std::vector input; + std::vector weight; + std::vector doutput; + std::vector dinput; + std::vector dweight; + + std::vector dinput_host; + std::vector dweight_host; + + size_t ws_sizeInBytes; +}; + +template +int PReLUDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + if(input_length.empty()) + { + std::cout << "Tensor must not be empty"; + return miopenStatusBadParm; + } + + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + if(weight_length[0] != 1 && (input_length.size() == 1 || weight_length[0] != input_length[1])) + { + std::cout << "NumParameters must be 1 or the second dim of DimLengths"; + return miopenStatusBadParm; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + + if(SetTensorNd(inputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + if(SetTensorNd(dinputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input gradient tensor"); + + if(SetTensorNd(weightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight tensor"); + if(SetTensorNd(dweightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight gradient tensor"); + + if(SetTensorNd(doutputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output gradient tensor"); + + return miopenStatusSuccess; +} + +template +int 
PReLUDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward PReLU (Default=1)", "int"); + inflags.AddTensorFlag("input", 'D', "256x4x1x1x8723", "input tensor descriptor"); + inflags.AddInputFlag( + "NumParameters", + 'P', + "1", + "Number of weight to learn. Although it takes an int as input, there is only two values " + "are legitimate: 1, or the number of channels (the second dim) at input (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int PReLUDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t weight_sz = GetTensorSize(weightDesc); + + miopenGetPReLUBackwardWorkspaceSize(GetHandle(), inputDesc, weightDesc, &ws_sizeInBytes); + + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + doutput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dweight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + input = std::vector(input_sz); + weight = std::vector(weight_sz); + doutput = std::vector(input_sz, static_cast(1.0f)); + dinput = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + dweight = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + dinput_host = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + 
dweight_host = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + for(int i = 0; i < weight_sz; i++) + weight[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + if(input_dev->ToGPU(GetStream(), input.data()) != 0) + { + std::cerr << "Error copying (input) to GPU, size: " << input_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) + { + std::cerr << "Error copying (weight) to GPU, size: " << weight_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(doutput_dev->ToGPU(GetStream(), doutput.data()) != 0) + { + std::cerr << "Error copying (out grad) to GPU, size: " << doutput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dinput_dev->ToGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (input grad) to GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dweight_dev->ToGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (weight grad) to GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunForwardGPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunForwardCPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenPReLUBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + input_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + dinputDesc, + dinput_dev->GetMem(), + dweightDesc, + dweight_dev->GetMem()); + + float time = 0.0; + 
miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward PReLU Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward PReLU Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + if(dweight_dev->FromGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (dweight_dev) from GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunBackwardCPU() +{ + return mloPReLUBackwardRunHost(inputDesc, + weightDesc, + doutputDesc, + dinputDesc, + input.data(), + weight.data(), + doutput.data(), + dinput_host.data(), + dweight_host.data()); +} + +template +Tref PReLUDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int PReLUDriver::VerifyForward() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + auto error_dweight = miopen::rms_range(dweight_host, dweight); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward PReLU Input Gradient FAILED: " << error_dinput << " > " << tolerance + << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Input Gradient Verifies OK on CPU reference (" << error_dinput + << " < " << tolerance << ')' << std::endl; + } + + if(!std::isfinite(error_dweight) || error_dweight > tolerance) + { + std::cout << "Backward PReLU Weight Gradient FAILED: " << error_dweight << " > " + << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Weight Gradient Verifies OK on CPU reference (" + << error_dweight << " < " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 75fe5df342..d5c9c7cea8 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -71,6 +71,7 @@ * @defgroup getitem * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings + * @defgroup ReLU * */ @@ -7678,6 +7679,61 @@ MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle, #endif // MIOPEN_BETA_API #ifdef MIOPEN_BETA_API +/** @addtogroup ReLU + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the PReLU backward call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes); + +/*! @brief Execute a PReLU backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param weight Data tensor weight (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dweightDesc Tensor descriptor for weight gradient (input) + * @param dweight Gradient of weight (output) + */ +MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t weightDesc, + const void* weight, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dweightDesc, + void* dweight); + +/** @} */ +// CLOSEOUT RELU DOXYGEN GROUP +#endif // MIOPEN_BETA_API + +#ifdef MIOPEN_BETA_API + /*! 
@ingroup LossFunction * @enum miopenLossReductionMode_t * Reduction mode for loss function @@ -7769,7 +7825,7 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, /** @} */ // CLOSEOUT LossFunction DOXYGEN GROUP -#endif +#endif // MIOPEN_BETA_API #ifdef __cplusplus } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9dd357540d..32da2a6229 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -165,6 +165,8 @@ set( MIOpen_Source performance_config.cpp pooling/problem_description.cpp pooling_api.cpp + prelu/problem_description.cpp + prelu_api.cpp problem.cpp process.cpp ramdb.cpp @@ -306,6 +308,9 @@ set( MIOpen_Source solver/pooling/forwardNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/prelu/backward_prelu_multi_weights.cpp + solver/prelu/backward_prelu_single_weight.cpp + solver/prelu/utils.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -515,6 +520,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenLRNFwd.cl kernels/MIOpenMultiMarginLoss.cpp kernels/MIOpenNeuron.cl + kernels/MIOpenPReLU.cpp kernels/MIOpenPooling.cl kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl @@ -670,6 +676,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN t5layernorm.cpp ocl/fusionopconvocl.cpp ocl/fusionopbiasbnactivocl.cpp + prelu.cpp reducecalculation.cpp reduceextreme.cpp rope.cpp diff --git a/src/comgr.cpp b/src/comgr.cpp index aa53b71bb5..b508bced2f 100644 --- a/src/comgr.cpp +++ b/src/comgr.cpp @@ -701,6 +701,7 @@ void BuildAsm(const std::string& name, #define WORKAROUND_ISSUE_HIPRTC_HIPRTC_HEADER_H 1 // See SWDEV-307838, issue #1648. 
#define WORKAROUND_ISSUE_1674 (HIP_PACKAGE_VERSION_FLAT >= 5003022305ULL) +#define WORKAROUND_ISSUE_3188 (HIP_PACKAGE_VERSION_FLAT >= 6002041133ULL) // See WORKAROUND_SWDEV_413293 in ./CmakeLists.txt #define WORKAROUND_SWDEV_413293 MIOPEN_HIP_COMPILER_HAS_OPTION_OFFLOAD_UNIFORM_BLOCK @@ -975,6 +976,9 @@ void BuildHip(const std::string& name, #endif #if WORKAROUND_ISSUE_1674 opts.push_back("-Wno-gnu-line-marker"); +#endif +#if WORKAROUND_ISSUE_3188 + opts.push_back("-Wno-pass-failed"); #endif opts.push_back("-Wno-cuda-compat"); opts.push_back("-fno-gpu-rdc"); diff --git a/src/include/miopen/graphapi/util.hpp b/src/include/miopen/graphapi/util.hpp index f9415a3630..02af1c9131 100644 --- a/src/include/miopen/graphapi/util.hpp +++ b/src/include/miopen/graphapi/util.hpp @@ -36,13 +36,12 @@ namespace miopen { namespace graphapi { -inline std::string tensorIdAsStr(const Tensor* tens_ptr) +inline std::string tensorIdAsStr(int64_t tens_id) { - int64_t id = tens_ptr->getId(); - char* b = reinterpret_cast(&id); + char* b = reinterpret_cast(&tens_id); - return {b, sizeof(id)}; + return {b, sizeof(tens_id)}; } template diff --git a/src/include/miopen/prelu.hpp b/src/include/miopen/prelu.hpp new file mode 100644 index 0000000000..35e5e7f611 --- /dev/null +++ b/src/include/miopen/prelu.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight); + +} // namespace miopen diff --git a/src/include/miopen/prelu/invoke_params.hpp b/src/include/miopen/prelu/invoke_params.hpp new file mode 100644 index 0000000000..1acc2dc143 --- /dev/null +++ b/src/include/miopen/prelu/invoke_params.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace prelu { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* weightDesc = nullptr; + const TensorDescriptor* doutputDesc = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* dweightDesc = nullptr; + + ConstData_t input = nullptr; + ConstData_t weight = nullptr; + ConstData_t doutput = nullptr; + Data_t dinput = nullptr; + Data_t dweight = nullptr; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/problem_description.hpp b/src/include/miopen/prelu/problem_description.hpp new file mode 100644 index 0000000000..ce10d96046 --- /dev/null +++ b/src/include/miopen/prelu/problem_description.hpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y); +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +struct BackwardProblemDescription : ProblemDescriptionBase +{ + + BackwardProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& doutputDesc_, + const TensorDescriptor& dinputDesc_, + const TensorDescriptor& dweightDesc_) + : inputDesc(inputDesc_), + weightDesc(weightDesc_), + doutputDesc(doutputDesc_), + dinputDesc(dinputDesc_), + dweightDesc(dweightDesc_) + { + IsSameType(); + IsRightLength(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetdOuputDesc() const { return doutputDesc; } + const TensorDescriptor& GetdInputDesc() const { return dinputDesc; } + const TensorDescriptor& GetdWeightDesc() const { return dweightDesc; } + + bool IsSameType() const + { + if(!checkSameType(inputDesc, weightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Input and Weight tensor must have same type."); + if(!checkSameType(inputDesc, dinputDesc) || !checkSameType(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Gradient tensors (excluding Output gradient) must share a same " + "type with Input and Weight tensor."); + return true; + } + + bool IsRightLength() const + { + if(!checkSameLength(inputDesc, doutputDesc) || !checkSameLength(inputDesc, dinputDesc)) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Input and Output Gradient tensors sizes must match with Input tensor."); + if(weightDesc.GetNumDims() != 1) + MIOPEN_THROW(miopenStatusBadParm, "PReLU: Weight tensor must have 1 dimension."); + 
if(weightDesc.GetElementSize() != 1 && + (inputDesc.GetNumDims() == 1 || + weightDesc.GetElementSize() != inputDesc.GetLengths()[1])) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Weight size must be 1 or equal to the second dim of Input tensor."); + if(!checkSameLength(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Weight Gradient tensors sizes must match with Weight tensor."); + return true; + } + + bool IsSingleWeight() const + { + if(weightDesc.GetElementSize() > 1) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor weightDesc; + TensorDescriptor doutputDesc; + TensorDescriptor dinputDesc; + TensorDescriptor dweightDesc; +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/solvers.hpp b/src/include/miopen/prelu/solvers.hpp new file mode 100644 index 0000000000..a2bec477f0 --- /dev/null +++ b/src/include/miopen/prelu/solvers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +using BackwardSolverBase = + NonTunableSolverBase; + +struct Backward : BackwardSolverBase +{ + bool MayNeedWorkspace() const override { return true; } +}; + +struct SingleWeightBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +struct MultiWeightsBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +} // namespace prelu + +} // namespace solver + +} // namespace miopen 
diff --git a/src/include/miopen/prelu/utils.hpp b/src/include/miopen/prelu/utils.hpp new file mode 100644 index 0000000000..b7551a4135 --- /dev/null +++ b/src/include/miopen/prelu/utils.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { +namespace solver { +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params); + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 485b7a24d8..fdfb857319 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,6 +60,7 @@ enum class Primitive Adam, Item, RoPE, + ReLU, MultiMarginLoss }; diff --git a/src/kernels/MIOpenPReLU.cpp b/src/kernels/MIOpenPReLU.cpp new file mode 100644 index 0000000000..750b226e7f --- /dev/null +++ b/src/kernels/MIOpenPReLU.cpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void PReLUBackward(const TI* __restrict__ input, + const TI* __restrict__ weight, + const TO* __restrict__ output_grad, + TI* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= N) + return; + + auto tensor_layout = tensor_layout_t(input_tv, gid); + FLOAT_ACCUM input_v = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(tensor_layout)]); + FLOAT_ACCUM grad_v = + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(input_grad != nullptr) + { + FLOAT_ACCUM weight_v = CVT_FLOAT2ACCUM( + weight[SingleWeight ? 0 : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + FLOAT_ACCUM input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + CVT_ACCUM2FLOAT(input_grad_v); + } + if(weight_grad_collector != nullptr) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } +} + +extern "C" __global__ void PReLUSWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} + +extern "C" __global__ void PReLUMWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d7294d8992..8cdab4b942 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -77,6 +79,15 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(std::initializer_list layout_) + { + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } + } + uint64_t layout[N]; }; diff --git a/src/prelu.cpp b/src/prelu.cpp new file mode 100644 index 0000000000..3966c1f45f --- /dev/null +++ b/src/prelu.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +namespace miopen { + +size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = + prelu::BackwardProblemDescription{inputDesc, weightDesc, inputDesc, inputDesc, weightDesc}; + + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? 
static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight) +{ + const auto problem = prelu::BackwardProblemDescription{ + inputDesc, weightDesc, doutputDesc, inputDesc, dweightDesc}; + + const auto invoke_params = [&]() { + auto tmp = prelu::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.weightDesc = &weightDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinputDesc = &dinputDesc; + tmp.dweightDesc = &dweightDesc; + tmp.input = input; + tmp.weight = weight; + tmp.doutput = doutput; + tmp.dinput = dinput; + tmp.dweight = dweight; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + return tmp; + }(); + + const auto algo = AlgorithmName{"PReLUBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/prelu/problem_description.cpp b/src/prelu/problem_description.cpp new file mode 100644 index 0000000000..f364c60d19 --- /dev/null +++ b/src/prelu/problem_description.cpp @@ -0,0 +1,76 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetType() != y.GetType()) + return false; + return true; +} + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetNumDims() != y.GetNumDims()) + return false; + for(int32_t i = 0; i < x.GetNumDims(); ++i) + { + if(x.GetLengths()[i] != y.GetLengths()[i]) + return false; + } + return true; +} + +NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto num_params = weightDesc.GetElementSize(); + + std::ostringstream ss; + + ss << "prelu_bwd"; + ss << "idtype" << input_dtype; + ss << "odtype" << output_dtype; + ss << "ndim" << inputDesc.GetNumDims(); + ss << "size" << size; + ss << "num_params" << num_params; + + return NetworkConfig{ss.str()}; +} + +} // namespace prelu + +} // namespace miopen diff --git a/src/prelu_api.cpp b/src/prelu_api.cpp new file mode 100644 index 0000000000..5e670a0a2b --- /dev/null +++ b/src/prelu_api.cpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdPReLU(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(inputDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "prelufp16"; + } + else if(dtype == miopenFloat) + { + ss << "prelufp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "prelubfp16"; + } + + MIOPEN_LOG_FUNCTION(inputDesc, weightDesc); + ss << " --input " << miopen::deref(inputDesc).GetLengths(); + ss << " --weight " << miopen::deref(weightDesc).GetLengths(); + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, weightDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetPReLUBackwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(weightDesc)); + }); +} + +extern "C" miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + const size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t dweightDesc, + void* dweight) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + 
inputDesc, + input, + weightDesc, + weight, + doutputDesc, + doutput, + dinputDesc, + dinput, + dweightDesc, + dweight); + + LogCmdPReLU(inputDesc, weightDesc, false); + return miopen::try_([&] { + miopen::PReLUBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dweightDesc), + DataCast(dweight)); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index 16d7ed92e4..11ada34965 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -677,6 +678,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::MultiMarginLoss, diff --git a/src/solver/prelu/backward_prelu_multi_weights.cpp b/src/solver/prelu/backward_prelu_multi_weights.cpp new file mode 100644 index 0000000000..5fed375a2b --- /dev/null +++ b/src/solver/prelu/backward_prelu_multi_weights.cpp @@ -0,0 +1,183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define warpSizeCTX (context.GetStream().GetWavefrontWidth()) +#define LOCAL_SIZE_MW_BWD 256 +#define LOCAL_SIZE_MW_REDUCE_BWD warpSizeCTX + +namespace miopen { + +namespace solver { + +namespace prelu { + +bool MultiWeightsBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +MultiWeightsBackward::GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_MW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUMWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. 
*/ + { + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_MW_REDUCE_BWD}, + }; + result.construction_params.push_back( + make_hip_kernel({LOCAL_SIZE_MW_REDUCE_BWD}, + {problem.GetdWeightDesc().GetElementSize() * LOCAL_SIZE_MW_REDUCE_BWD}, + "MIOpenReduceSum.cpp", + "Reduce1dSum", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. */ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + params.workspace, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. 
*/ + { + uint64_t output_numel = deref(params.weightDesc).GetElementSize(); + uint64_t outer_size = deref(params.inputDesc).GetLengths()[0]; + uint64_t inner_size = + deref(params.inputDesc).GetElementSize() / outer_size / output_numel; + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.workspace, + params.dweight, + output_numel, + inner_size, + outer_size, + weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t MultiWeightsBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto size = problem.GetdInputDesc().GetElementSize(); + size *= get_data_size(miopenFloat); + return size; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/backward_prelu_single_weight.cpp b/src/solver/prelu/backward_prelu_single_weight.cpp new file mode 100644 index 0000000000..2f8ba825f4 --- /dev/null +++ b/src/solver/prelu/backward_prelu_single_weight.cpp @@ -0,0 +1,202 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define LOCAL_SIZE_SW_BWD 256 +#define LOCAL_SIZE_SW_REDUCE_BWD 256 + +namespace miopen { + +namespace solver { + +namespace prelu { + +namespace { +MultiBufferWorkspaceTraits GetMultiBufferWorkspaceTraits(const TensorDescriptor& inputDesc) +{ + auto size = inputDesc.GetElementSize(); + return MultiBufferWorkspaceTraits{size * get_data_size(miopenFloat), + AlignUp(size, LOCAL_SIZE_SW_REDUCE_BWD) / + LOCAL_SIZE_SW_REDUCE_BWD * get_data_size(miopenFloat)}; +} +} // namespace + +bool SingleWeightBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(!problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +SingleWeightBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUSWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_SW_REDUCE_BWD}, + }; + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_SW_REDUCE_BWD}, + {size}, + "MIOpenReduceSum.cpp", + "ReduceSumFLOATACCUM", + build_params)); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + } + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_REDUCE_BWD}, {size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); + } + + auto getBuffPart = [ws = GetMultiBufferWorkspaceTraits(problem.GetdInputDesc())]( + void* buffer, size_t part_idx) { + return static_cast(static_cast(buffer) + ws.GetOffset(part_idx)); + }; + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + auto work_a = getBuffPart(params.GetWorkspace(), 0); + auto work_b = getBuffPart(params.GetWorkspace(), 1); + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. 
*/ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + work_a, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. */ + { + uint64_t size = deref(params.inputDesc).GetElementSize(); + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + auto kernel = handle_.Run(kernels[kernelCnt++]); + kernel(work_a, work_b, size); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + std::swap(work_a, work_b); + } + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + handle_.Run(kernels[kernelCnt++])(work_a, params.dweight, size, weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t SingleWeightBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + return GetMultiBufferWorkspaceTraits(problem.GetdInputDesc()).GetSize(); +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/utils.cpp b/src/solver/prelu/utils.cpp new file mode 100644 index 0000000000..d67e0bdac5 --- /dev/null +++ b/src/solver/prelu/utils.cpp @@ -0,0 +1,53 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) +{ + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/test/cpu_prelu.hpp b/test/cpu_prelu.hpp new file mode 100644 index 0000000000..952099c3f6 --- /dev/null +++ b/test/cpu_prelu.hpp @@ -0,0 +1,100 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "tensor_holder.hpp" +#include + +template +void cpu_prelu_backward(const tensor input, + const tensor weight, + const tensor output_grad, + tensor& ref_input_grad, + tensor& ref_weight_grad, + const bool has_dinput = true, + const bool has_dweight = true) +{ + auto N = input.desc.GetElementSize(); + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto weight_tv = miopen::get_inner_expanded_tv<1>(weight.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(ref_input_grad.desc); + auto weight_grad_tv = miopen::get_inner_expanded_tv<1>(ref_weight_grad.desc); + + auto weight_grad_collector = std::vector(N); + + par_ford(N)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(has_dinput) + { + float weight_v; + weight_v = static_cast( + weight[weight.desc.GetElementSize() == 1 + ? 0 + : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + ref_input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(has_dweight) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(has_dweight) + { + if(weight.desc.GetElementSize() == 1) + { + double sum = 0; + for(int i = 0; i < N; ++i) + sum += static_cast(weight_grad_collector[i]); + ref_weight_grad[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight.desc.GetElementSize())([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + ref_weight_grad[weight_grad_tv.get_tensor_view_idx({i})] = static_cast(sum); + }); + } + } +} diff --git a/test/gtest/graphapi_mha_bwd.cpp b/test/gtest/graphapi_mha_bwd.cpp index 35bf97f3d0..13d78cd424 100644 --- a/test/gtest/graphapi_mha_bwd.cpp +++ b/test/gtest/graphapi_mha_bwd.cpp @@ -39,6 +39,7 @@ class MhaBwdGraphTest : public MhaGraphTestBase mGraphBuilder = std::make_unique(); std::vector nhsd = {n, h, s, d}; + std::vector nhds = {n, h, d, s}; std::vector nhss = {n, h, s, s}; std::vector nhs1 = {n, h, s, 1}; std::vector all1s = {1, 1, 1, 1}; @@ -49,8 +50,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase MAKE_TENSOR_F(dO, nhsd, false); MAKE_TENSOR_F(O, nhsd, false); + MAKE_TENSOR_F(K_T, nhds, true); MAKE_TENSOR_F(T_MM_0, nhss, true); - addNode("OP_MATMUL", {Q, K}, {T_MM_0}); + + addNode("OP_RESHAPE", {K}, {K_T}); + addNode("OP_MATMUL", {Q, K_T}, {T_MM_0}); MAKE_TENSOR_F(T_SCL_0, nhss, true); // MAKE_TENSOR_F(ATN_SCL, all1s, false); @@ -101,9 +105,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase MAKE_TENSOR_F(SCL_S, all1s, false); addNode("OP_POINTWISE:MUL", {T_SCL_3, SCL_S}, {T_SCL_4}); - // NOTE(Amber): omitting the Reshape transpose node here + MAKE_TENSOR_F(SCL_4T, nhss, true); + addNode("OP_RESHAPE", {T_SCL_4}, {SCL_4T}); + MAKE_TENSOR_F(T_MM_1, nhsd, true); - addNode("OP_MATMUL", {T_SCL_4, dO}, {T_MM_1}); + 
addNode("OP_MATMUL", {SCL_4T, dO}, {T_MM_1}); MAKE_TENSOR_F(T_SCL_5, nhsd, true); MAKE_TENSOR_F(DSCL_S, all1s, false); @@ -122,8 +128,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase ////////////////// Center-top ////////////////////////////////// + MAKE_TENSOR_F(V_T, nhds, true); + addNode("OP_RESHAPE", {V}, {V_T}); + MAKE_TENSOR_F(T_MM_2, nhss, true); - addNode("OP_MATMUL", {dO, V}, {T_MM_2}); + addNode("OP_MATMUL", {dO, V_T}, {T_MM_2}); MAKE_TENSOR_F(T_SCL_7, nhss, true); addNode("OP_POINTWISE:MUL", {T_MM_2, DSCL_dO}, {T_SCL_7}); @@ -192,9 +201,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase addNode("OP_REDUCTION:MAX", {T_SCL_17}, {AMax_dQ}); ////////////////// Right Bottom ////////////////////////////////// - // XXX(Amber): Reshape transpose node goes here + MAKE_TENSOR_F(SCL_15T, nhss, true); + addNode("OP_RESHAPE", {T_SCL_15}, {SCL_15T}); + MAKE_TENSOR_F(T_MM_4, nhsd, true); - addNode("OP_MATMUL", {T_SCL_15, Q}, {T_MM_4}); + addNode("OP_MATMUL", {SCL_15T, Q}, {T_MM_4}); MAKE_TENSOR_F(T_SCL_18, nhsd, true); addNode("OP_POINTWISE:MUL", {T_MM_4, DSCL_dS}, {T_SCL_18}); @@ -295,6 +306,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase // softmax needed for calling cpu backward mha mSoftMax = std::move(out.mSoftMax); + + // Remove "SCL_O" here so that it doesn't pollute the variant pack used for + // execution + size_t cnt = mFilledTensors.erase("SCL_O"); + ASSERT_EQ(cnt, 1); } void runCpuVerify(size_t n, size_t h, size_t s, size_t d) override @@ -350,16 +366,13 @@ class MhaBwdGraphTest : public MhaGraphTestBase checkTensor("dK", dK_ref); checkTensor("dV", dV_ref); } - - /// \todo remove once backward mha is ready to execute - void executeMhaGraph() override {} }; } // end namespace mha_graph_test // using namespace mha_graph_test; -TEST_P(MhaBwdGraphTest, MhaBwdGraph) { Run(); } +TEST_P(MhaBwdGraphTest, MhaBwdGraph) { Run(MhaDir::Bwd); } INSTANTIATE_TEST_SUITE_P(MhaGraphBwdSuite, MhaBwdGraphTest, diff --git a/test/gtest/graphapi_mha_cpp_common.hpp 
b/test/gtest/graphapi_mha_cpp_common.hpp index 5f35855bd4..ec385ab360 100644 --- a/test/gtest/graphapi_mha_cpp_common.hpp +++ b/test/gtest/graphapi_mha_cpp_common.hpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -271,6 +272,13 @@ class MhaGraphTestBase .setOutput(out_tensors[0]) .build())); } + else if(name == "OP_RESHAPE") + { + assert(in_tensors.size() == 1); + assert(out_tensors.size() == 1); + mGraphBuilder->addNode(mAlloc.allocate( + gr::OperationReshapeBuilder{}.setX(in_tensors[0]).setY(out_tensors[0]).build())); + } else { FAIL() << "Unknown graph node type: " << name; @@ -423,7 +431,13 @@ class MhaGraphTestBase } public: - void Run() + enum class MhaDir + { + Fwd, + Bwd + }; + + void Run(MhaDir direction) { auto [n, h, s, d, p] = GetParam(); std::cout << "n:" << n << ", h:" << h << ", s:" << s << ", d:" << d << ", p:" << p @@ -431,10 +445,17 @@ class MhaGraphTestBase mProbDropout = p; auto& handle = get_handle(); - if((p > 0.0f) && (s % handle.GetWavefrontWidth() != 0)) + if(direction == MhaDir::Fwd && (p > 0.0f) && (s % handle.GetWavefrontWidth() != 0)) { - GTEST_SKIP() << "CPU Dropout currently supprorts only fully occupied warps"; + GTEST_SKIP() + << "CPU Fwd pass with Dropout currently supports only fully occupied warps"; } + + if(direction == MhaDir::Bwd && p > 0.0f) + { + GTEST_SKIP() << "CPU backward pass with Dropout is not supported currently"; + } + createMhaGraph(n, h, s, d); initInputs(n, h, s, d); executeMhaGraph(); diff --git a/test/gtest/graphapi_mha_fwd_f8.cpp b/test/gtest/graphapi_mha_fwd.cpp similarity index 99% rename from test/gtest/graphapi_mha_fwd_f8.cpp rename to test/gtest/graphapi_mha_fwd.cpp index a1897feae9..d64477e557 100644 --- a/test/gtest/graphapi_mha_fwd_f8.cpp +++ b/test/gtest/graphapi_mha_fwd.cpp @@ -201,7 +201,7 @@ class GPU_MhaFwdGraph_FP32 : public MhaGraphTestBase using namespace mha_graph_test; -TEST_P(GPU_MhaFwdGraph_FP32, MhaFwdGraph) { Run(); } +TEST_P(GPU_MhaFwdGraph_FP32,
MhaFwdGraph) { Run(MhaDir::Fwd); } INSTANTIATE_TEST_SUITE_P(Unit, GPU_MhaFwdGraph_FP32, diff --git a/test/gtest/prelu.cpp b/test/gtest/prelu.cpp new file mode 100644 index 0000000000..1d0cd4d431 --- /dev/null +++ b/test/gtest/prelu.cpp @@ -0,0 +1,118 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "prelu.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace prelu { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +bool CheckFloatArg(std::string arg) +{ + if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == arg)) + { + return true; + } + return false; +} + +struct GPU_PReLU_bwd_FP32 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_FP16 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_BFP16 : PReLUTest +{ +}; + +} // namespace prelu +using namespace prelu; + +TEST_P(GPU_PReLU_bwd_FP32, Test) +{ + if(CheckFloatArg("--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_FP16, Test) +{ + if(CheckFloatArg("--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_BFP16, Test) +{ + if(CheckFloatArg("--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUSmokeTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUFullTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_BFP16, 
testing::ValuesIn(PReLUPerfTestConfigs())); diff --git a/test/gtest/prelu.hpp b/test/gtest/prelu.hpp new file mode 100644 index 0000000000..510cfb85f4 --- /dev/null +++ b/test/gtest/prelu.hpp @@ -0,0 +1,247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "cpu_prelu.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct PReLUTestCase +{ + std::vector lengths; + bool full_params; + bool contiguous; + + friend std::ostream& operator<<(std::ostream& os, const PReLUTestCase& tc) + { + return os << " Lengths:" << tc.lengths + << " Full_num_params:" << (tc.full_params ? "True" : "False") + << " Contiguous:" << (tc.contiguous ? "True" : "False"); + } +}; + +inline std::vector PReLUSmokeTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 4}, true, true}); + tcs.push_back({{64, 4}, true, false}); + tcs.push_back({{64, 4}, false, true}); + tcs.push_back({{64, 4}, false, false}); + + tcs.push_back({{64, 112}, true, true}); + tcs.push_back({{64, 112}, true, false}); + tcs.push_back({{64, 112}, false, true}); + tcs.push_back({{64, 112}, false, false}); + + return tcs; +} + +inline std::vector PReLUPerfTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 112, 50}, true, true}); + tcs.push_back({{64, 112, 50}, true, false}); + tcs.push_back({{64, 112, 50}, false, true}); + tcs.push_back({{64, 112, 50}, false, false}); + + return tcs; +} + +inline std::vector PReLUFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = PReLUSmokeTestConfigs(); + auto perf_test = PReLUPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + 
std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +struct PReLUTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + prelu_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 99); }; + + auto lengths = prelu_config.lengths; + + auto input_strides = GetStrides(lengths, prelu_config.contiguous); + input = tensor{lengths, input_strides}.generate(gen_value1); + + std::vector weight_length = {1}; + if(prelu_config.full_params) + weight_length = {lengths[1]}; + std::vector weight_strides = {prelu_config.contiguous ? 1 : 2}; + weight = tensor{weight_length, weight_strides}.generate(gen_value2); + + ws_sizeInBytes = miopen::GetPReLUBackwardWorkspaceSize(handle, input.desc, weight.desc); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + if(ws_sizeInBytes != 0) + { + std::vector workspace_dims; + workspace_dims.push_back(ws_sizeInBytes / sizeof(float)); + + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), 0.0f); + + workspace_dev = handle.Write(workspace.data); + } + + dinput = tensor{lengths, input_strides}; + ref_dinput = tensor{lengths, input_strides}; + std::fill(dinput.begin(), dinput.end(), static_cast(0.0f)); + std::fill(ref_dinput.begin(), ref_dinput.end(), static_cast(0.0f)); + + dweight = tensor{weight_length, weight_strides}; + ref_dweight = tensor{weight_length, weight_strides}; + std::fill(dweight.begin(), dweight.end(), static_cast(0.0f)); + std::fill(ref_dweight.begin(), ref_dweight.end(), static_cast(0.0f)); + + auto out_strides = GetStrides(lengths, true); + doutput = tensor{lengths, out_strides}; + 
std::fill(doutput.begin(), doutput.end(), static_cast(1.0f)); + + input_dev = handle.Write(input.data); + weight_dev = handle.Write(weight.data); + doutput_dev = handle.Write(doutput.data); + dinput_dev = handle.Write(dinput.data); + dweight_dev = handle.Write(dweight.data); + } + + void RunTest() + { + cpu_prelu_backward(input, weight, doutput, ref_dinput, ref_dweight); + + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::PReLUBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + input.desc, + input_dev.get(), + weight.desc, + weight_dev.get(), + doutput.desc, + doutput_dev.get(), + dinput.desc, + dinput_dev.get(), + dweight.desc, + dweight_dev.get()); + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + dweight.data = handle.Read(dweight_dev, dweight.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + + auto error_dinput = miopen::rms_range(ref_dinput, dinput); + auto error_dweight = miopen::rms_range(ref_dweight, dweight); + ASSERT_EQ(miopen::range_distance(ref_dinput), miopen::range_distance(dinput)); + ASSERT_EQ(miopen::range_distance(ref_dweight), miopen::range_distance(dweight)); + EXPECT_LT(error_dinput, tolerance) + << "Error backward Input Gradient beyond tolerance Error: " << error_dinput + << ", Tolerance: " << tolerance; + EXPECT_LT(error_dweight, tolerance) + << "Error backward Weight Gradient beyond tolerance Error: " << error_dweight + << ", Tolerance: " << tolerance; + } + + PReLUTestCase prelu_config; + + tensor input; + tensor weight; + tensor doutput; + tensor dinput; + tensor dweight; + tensor workspace; + + tensor ref_dinput; + tensor ref_dweight; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr doutput_dev; + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr dweight_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t ws_sizeInBytes; +}; diff --git a/test/nightlies/JenkinsfileNightlyAux b/test/nightlies/JenkinsfileNightlyAux index d1b7e70951..feb6ab5dc7 100644 --- a/test/nightlies/JenkinsfileNightlyAux +++ b/test/nightlies/JenkinsfileNightlyAux @@ -46,7 +46,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. 
+ sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyConv2D b/test/nightlies/JenkinsfileNightlyConv2D index 5b67507376..809cfff95c 100644 --- a/test/nightlies/JenkinsfileNightlyConv2D +++ b/test/nightlies/JenkinsfileNightlyConv2D @@ -47,7 +47,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=6 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyConv2Daux b/test/nightlies/JenkinsfileNightlyConv2Daux index e6439313f7..f967352b21 100644 --- a/test/nightlies/JenkinsfileNightlyConv2Daux +++ b/test/nightlies/JenkinsfileNightlyConv2Daux @@ -48,7 +48,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=6 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyFastFullConv2D b/test/nightlies/JenkinsfileNightlyFastFullConv2D index eaa703b3b1..a94992002b 100644 --- a/test/nightlies/JenkinsfileNightlyFastFullConv2D +++ b/test/nightlies/JenkinsfileNightlyFastFullConv2D @@ -43,7 +43,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 12, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_ENABLE_LOGGING_CMD=1 MIOPEN_LOG_LEVEL=6 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyFusions b/test/nightlies/JenkinsfileNightlyFusions index 84d8329c6a..269480baf7 100644 --- a/test/nightlies/JenkinsfileNightlyFusions +++ b/test/nightlies/JenkinsfileNightlyFusions @@ -46,7 +46,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyRNN b/test/nightlies/JenkinsfileNightlyRNN index b299f832cc..33ac860053 100644 --- a/test/nightlies/JenkinsfileNightlyRNN +++ b/test/nightlies/JenkinsfileNightlyRNN @@ -47,7 +47,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=Off -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyReduce b/test/nightlies/JenkinsfileNightlyReduce index a7d6c6becb..c2b14e8f79 100644 --- a/test/nightlies/JenkinsfileNightlyReduce +++ b/test/nightlies/JenkinsfileNightlyReduce @@ -47,7 +47,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 6, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyTensorOps b/test/nightlies/JenkinsfileNightlyTensorOps index e19a0a9f18..d326c64f9f 100644 --- a/test/nightlies/JenkinsfileNightlyTensorOps +++ b/test/nightlies/JenkinsfileNightlyTensorOps @@ -48,7 +48,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 6, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=Off -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}"