Skip to content

Commit

Permalink
Matrix Mult bench vs Intel MKL-DNN (#70)
Browse files Browse the repository at this point in the history
* Add the Intel deep-learning library MKL-DNN / DNNL as a submodule for benchmarking

* Use an older MKL-DNN (1.0.4) that did not entangle everything with cpu_engine.cpp

* Add Intel MKL-DNN AVX2 and AVX512 benchmarks
  • Loading branch information
mratsim authored Dec 27, 2019
1 parent 3239bc1 commit 77f431e
Show file tree
Hide file tree
Showing 7 changed files with 323 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@
[submodule "benchmarks/vendor/tasking-2.0"]
path = benchmarks/vendor/tasking-2.0
url = https://github.com/aprell/tasking-2.0
[submodule "benchmarks/vendor/mkl-dnn"]
path = benchmarks/vendor/mkl-dnn
url = https://github.com/intel/mkl-dnn
59 changes: 59 additions & 0 deletions benchmarks/matmul_gemm_blas/mkldnn_gemm_jit_avx2.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# GEMM (GEneralized Matrix Multiplication) using MKL-DNN / DNNL
# Intel Deep Neural Network Library.

import
./gemm_bench_common, ./gemm_bench_config,
../vendor/mkldnn


proc benchMKLDNN(a, b: seq[float32], ashape, bshape: MatrixShape, nb_samples: int): seq[float32] =
  ## Benchmark single-precision GEMM through MKL-DNN's JIT AVX kernel and
  ## return the freshly allocated MxN output buffer.
  ##
  ## `a` and `b` are the input matrices; `ashape`/`bshape` are used to compute
  ## the required-ops count and the output shape only — the dimensions fed to
  ## the kernel come from the global config constants M, N, K.
  ## NOTE(review): `nb_samples` is never read in this body — presumably the
  ## `bench` template picks the sample count up from shared config; confirm.
  let req_ops = gemm_required_ops(ashape, bshape)
  let out_shape = gemm_out_shape(ashape, bshape)
  let out_size = out_shape.M * out_shape.N

  result = newSeq[float32](out_size)
  var # MKL-DNN wants pointers as inputs
    trans = 'N'    # 'N': neither A nor B is transposed
    m = int32 M
    n = int32 N
    k = int32 K
    alpha = 1'f32  # C = alpha*A*B + beta*C -> plain product with beta = 0
    lda = int32 K  # leading dims for row-major A (MxK), B (KxN), C (MxN)
    ldb = int32 N
    beta = 0'f32
    ldc = int32 N

  bench("Intel MKL-DNN / DNNL JIT AVX benchmark", req_ops):
    # Initialisation, not measured apart for the "Collected n samples in ... seconds"
    zeroMem(result[0].addr, out_size * sizeof(float32)) # We zero memory between computation
    do:
      # Main work — the returned MkldnnStatus is deliberately ignored here.
      discard mkldnn_jit_avx_gemm_f32(
        trans.addr, trans.addr,
        m.addr, n.addr, k.addr,
        alpha.addr, a[0].unsafeaddr, lda.addr,
        b[0].unsafeAddr, ldb.addr,
        beta.addr, result[0].addr, ldc.addr,
        bias = nil
      )

# Bench
when isMainModule:
  import std/[random, sequtils]

  # Fixed seed: every benchmark run sees identical random inputs.
  randomize(42)
  # warmup()
  reportConfig("Intel MKL-DNN JIT AVX", float32, (M, K), (K, N))

  block:
    # Inputs drawn uniformly from [-0.1, 0.1], shaped by the shared config.
    let
      matA = newSeqWith(M*K, float32 rand(-0.1..0.1))
      matB = newSeqWith(K*N, float32 rand(-0.1..0.1))
    let gemmOut = benchMKLDNN(matA, matB, (M,K), (K,N), NbSamples)
59 changes: 59 additions & 0 deletions benchmarks/matmul_gemm_blas/mkldnn_gemm_jit_avx512.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# GEMM (GEneralized Matrix Multiplication) using MKL-DNN / DNNL
# Intel Deep Neural Network Library.

import
./gemm_bench_common, ./gemm_bench_config,
../vendor/mkldnn


proc benchMKLDNN(a, b: seq[float32], ashape, bshape: MatrixShape, nb_samples: int): seq[float32] =
  ## Benchmark single-precision GEMM through MKL-DNN's JIT AVX512 kernel and
  ## return the freshly allocated MxN output buffer.
  ##
  ## `a` and `b` are the input matrices; `ashape`/`bshape` are used to compute
  ## the required-ops count and the output shape only — the dimensions fed to
  ## the kernel come from the global config constants M, N, K.
  ## NOTE(review): `nb_samples` is never read in this body — presumably the
  ## `bench` template picks the sample count up from shared config; confirm.
  let req_ops = gemm_required_ops(ashape, bshape)
  let out_shape = gemm_out_shape(ashape, bshape)
  let out_size = out_shape.M * out_shape.N

  result = newSeq[float32](out_size)
  var # MKL-DNN wants pointers as inputs
    trans = 'N'    # 'N': neither A nor B is transposed
    m = int32 M
    n = int32 N
    k = int32 K
    alpha = 1'f32  # C = alpha*A*B + beta*C -> plain product with beta = 0
    lda = int32 K  # leading dims for row-major A (MxK), B (KxN), C (MxN)
    ldb = int32 N
    beta = 0'f32
    ldc = int32 N

  bench("Intel MKL-DNN / DNNL JIT AVX512 benchmark", req_ops):
    # Initialisation, not measured apart for the "Collected n samples in ... seconds"
    zeroMem(result[0].addr, out_size * sizeof(float32)) # We zero memory between computation
    do:
      # Main work — the returned MkldnnStatus is deliberately ignored here.
      discard mkldnn_jit_avx512_common_gemm_f32(
        trans.addr, trans.addr,
        m.addr, n.addr, k.addr,
        alpha.addr, a[0].unsafeaddr, lda.addr,
        b[0].unsafeAddr, ldb.addr,
        beta.addr, result[0].addr, ldc.addr,
        bias = nil
      )

# Bench
when isMainModule:
  import std/[random, sequtils]

  # Fixed seed: every benchmark run sees identical random inputs.
  randomize(42)
  # warmup()
  reportConfig("Intel MKL-DNN JIT AVX512", float32, (M, K), (K, N))

  block:
    # Inputs drawn uniformly from [-0.1, 0.1], shaped by the shared config.
    let
      matA = newSeqWith(M*K, float32 rand(-0.1..0.1))
      matB = newSeqWith(K*N, float32 rand(-0.1..0.1))
    let gemmOut = benchMKLDNN(matA, matB, (M,K), (K,N), NbSamples)
1 change: 1 addition & 0 deletions benchmarks/vendor/mkl-dnn
Submodule mkl-dnn added at a0a87d
84 changes: 84 additions & 0 deletions benchmarks/vendor/mkldnn.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Weave
# Copyright (c) 2019 Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import os, strutils

# Directory of this source file, used to anchor all vendored-source paths.
const cSourcesPath = currentSourcePath.rsplit(DirSep, 1)[0] & '/'

# MKL-DNN's CPU GEMM kernels are parallelised with OpenMP.
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}

# Include paths: this directory first (for the checked-in mkldnn_config.h /
# mkldnn_version.h stubs), then the vendored MKL-DNN source tree.
{.passC:"-I" & cSourcesPath.}
{.passC:"-I" & cSourcesPath & "mkl-dnn/include".}
{.passC:"-I" & cSourcesPath & "mkl-dnn/src/common".}
{.passC:"-I" & cSourcesPath & "mkl-dnn/src/cpu".}
{.passC:"-I" & cSourcesPath & "mkl-dnn/src/cpu/gemm/f32".}
# {.passC:"-std=c++11".}

# Compile only the minimal subset of MKL-DNN needed for the f32 GEMM
# benchmarks: common utilities, JIT support, and the reference / JIT AVX /
# JIT AVX512 kernels.
{.compile: cSourcesPath & "mkl-dnn/src/common/utils.cpp".}
{.compile: cSourcesPath & "mkl-dnn/src/cpu/jit_utils/jit_utils.cpp".}
{.compile: cSourcesPath & "mkl-dnn/src/cpu/jit_utils/jitprofiling/jitprofiling.c".}
{.compile: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/gemm_utils_f32.cpp".}
{.compile: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.cpp".}
{.compile: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.cpp".}
{.compile: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.cpp".}


# Status codes returned by MKL-DNN entry points; mirrors the C enum
# `mkldnn_status_t` declared in the vendored MKL-DNN headers.
type MkldnnStatus {.importc: "mkldnn_status_t".} = enum
  # The operation was successful
  MkldnnSuccess = 0,
  # The operation failed due to an out-of-memory condition
  MkldnnOutOfMemory = 1,
  # The operation failed and should be retried
  MkldnnTryAgain = 2,
  # The operation failed because of incorrect function arguments
  MkldnnInvalidArguments = 3,
  # The operation failed because a primitive was not ready for execution
  MkldnnNotReady = 4,
  # The operation failed because requested functionality is not implemented
  MkldnnUnimplemented = 5,
  # Primitive iterator passed over last primitive descriptor
  MkldnnIteratorEnds = 6,
  # Primitive or engine failed on execution
  MkldnnRuntimeError = 7,
  # Queried element is not required for given primitive
  MkldnnNotRequired = 8

# Reference (non-JIT) GEMM: C = alpha * op(A) * op(B) + beta * C, plus an
# optional per-row bias (pass nil for none). All scalars are passed by
# pointer, BLAS-style. The `'*6` in the importcpp pattern instantiates the
# C++ template from the 6th parameter's dereferenced type (alpha: ptr T -> T).
proc mkldnn_ref_gemm*[T](
  transa: ptr char, transb: ptr char,
  M, N, K: ptr int32,
  alpha, A: ptr T, lda: ptr int32,
  B: ptr T, ldb: ptr int32,
  beta, C: ptr T, ldc: ptr int32,
  bias: ptr T
): MkldnnStatus {.
  importcpp:"mkldnn::impl::cpu::ref_gemm<'*6>(@)",
  header: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/ref_gemm_f32.hpp"
.}

# JIT-compiled AVX single-precision GEMM:
# C = alpha * op(A) * op(B) + beta * C, with optional bias (nil for none).
# Same BLAS-style pointer calling convention as `mkldnn_ref_gemm`.
proc mkldnn_jit_avx_gemm_f32*(
  transa: ptr char, transb: ptr char,
  M, N, K: ptr int32,
  alpha, A: ptr float32, lda: ptr int32,
  B: ptr float32, ldb: ptr int32,
  beta, C: ptr float32, ldc: ptr int32,
  bias: ptr float32
): MkldnnStatus {.
  importcpp:"mkldnn::impl::cpu::jit_avx_gemm_f32(@)",
  header: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/jit_avx_gemm_f32.hpp"
.}

# JIT-compiled AVX512 single-precision GEMM:
# C = alpha * op(A) * op(B) + beta * C, with optional bias (nil for none).
# Same BLAS-style pointer calling convention as `mkldnn_ref_gemm`.
proc mkldnn_jit_avx512_common_gemm_f32*(
  transa: ptr char, transb: ptr char,
  M, N, K: ptr int32,
  alpha, A: ptr float32, lda: ptr int32,
  B: ptr float32, ldb: ptr int32,
  beta, C: ptr float32, ldc: ptr int32,
  bias: ptr float32
): MkldnnStatus {.
  importcpp:"mkldnn::impl::cpu::jit_avx512_common_gemm_f32(@)",
  header: cSourcesPath & "mkl-dnn/src/cpu/gemm/f32/jit_avx512_common_gemm_f32.hpp"
.}
85 changes: 85 additions & 0 deletions benchmarks/vendor/mkldnn_config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/* NOTE(review): pre-generated MKL-DNN build-configuration header (CMake would
 * normally produce it), checked in so the Nim build can compile the vendored
 * sources directly. CPU runtime is pinned to OpenMP; GPU runtime is disabled. */

#ifndef MKLDNN_CONFIG_H
#define MKLDNN_CONFIG_H

#ifndef DOXYGEN_SHOULD_SKIP_THIS

// All symbols shall be internal unless marked as MKLDNN_API
#if defined _WIN32 || defined __CYGWIN__
# define MKLDNN_HELPER_DLL_IMPORT __declspec(dllimport)
# define MKLDNN_HELPER_DLL_EXPORT __declspec(dllexport)
#else
# if __GNUC__ >= 4
#  define MKLDNN_HELPER_DLL_IMPORT __attribute__((visibility("default")))
#  define MKLDNN_HELPER_DLL_EXPORT __attribute__((visibility("default")))
# else
#  define MKLDNN_HELPER_DLL_IMPORT
#  define MKLDNN_HELPER_DLL_EXPORT
# endif
#endif

#ifdef MKLDNN_DLL
# ifdef MKLDNN_DLL_EXPORTS
#  define MKLDNN_API MKLDNN_HELPER_DLL_EXPORT
# else
#  define MKLDNN_API MKLDNN_HELPER_DLL_IMPORT
# endif
#else
# define MKLDNN_API
#endif

#if defined (__GNUC__)
# define MKLDNN_DEPRECATED __attribute__((deprecated))
#elif defined(_MSC_VER)
# define MKLDNN_DEPRECATED __declspec(deprecated)
#else
# define MKLDNN_DEPRECATED
#endif
#endif // DOXYGEN_SHOULD_SKIP_THIS

// No runtime (disabled)
#define MKLDNN_RUNTIME_NONE 0u
// Sequential runtime (CPU only)
#define MKLDNN_RUNTIME_SEQ 1u
// OpenMP runtime (CPU only)
#define MKLDNN_RUNTIME_OMP 2u
// TBB runtime (CPU only)
#define MKLDNN_RUNTIME_TBB 4u
// OpenCL runtime
#define MKLDNN_RUNTIME_OCL 256u

// MKL-DNN CPU engine runtime
#define MKLDNN_CPU_RUNTIME MKLDNN_RUNTIME_OMP

// MKL-DNN GPU engine runtime
#define MKLDNN_GPU_RUNTIME MKLDNN_RUNTIME_NONE

// Sanity checks: the CPU runtime must be a CPU-capable one, and the GPU
// runtime must be either disabled or OpenCL.
#if defined(MKLDNN_CPU_RUNTIME) && defined(MKLDNN_GPU_RUNTIME)
# if (MKLDNN_CPU_RUNTIME == MKLDNN_RUNTIME_NONE) \
        || (MKLDNN_CPU_RUNTIME == MKLDNN_RUNTIME_OCL)
#  error "Unexpected MKLDNN_CPU_RUNTIME"
# endif
# if (MKLDNN_GPU_RUNTIME != MKLDNN_RUNTIME_NONE) \
        && (MKLDNN_GPU_RUNTIME != MKLDNN_RUNTIME_OCL)
#  error "Unexpected MKLDNN_GPU_RUNTIME"
# endif
#else
# error "BOTH MKLDNN_CPU_RUNTIME and MKLDNN_GPU_RUNTIME must be defined"
#endif

#endif // MKLDNN_CONFIG_H
32 changes: 32 additions & 0 deletions benchmarks/vendor/mkldnn_version.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/* NOTE(review): pre-generated MKL-DNN version header (CMake would normally
 * produce it). The values below must stay in sync with the commit the
 * mkl-dnn submodule is pinned to (v1.0.4, hash a0a87d66...). */

#ifndef MKLDNN_VERSION_H
#define MKLDNN_VERSION_H

/// Major version
#define MKLDNN_VERSION_MAJOR 1

/// Minor version
#define MKLDNN_VERSION_MINOR 0

/// Patch version
#define MKLDNN_VERSION_PATCH 4

/// Git commit hash
#define MKLDNN_VERSION_HASH "a0a87d662edeef38d01db4ac5dd25f59a1f0881f"

#endif // MKLDNN_VERSION_H

0 comments on commit 77f431e

Please sign in to comment.