Skip to content

Commit

Permalink
Merge branch 'v1.0.0-rc.3'
Browse files Browse the repository at this point in the history
  • Loading branch information
Alfio Lazzaro committed Dec 11, 2018
2 parents 18cb726 + 188da68 commit 96f626d
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 93 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ if (USE_CUDA)
set(CUDA_ARCH_NUMBER_K80 37)
set(CUDA_ARCH_NUMBER_P100 60)
set(CUDA_ARCH_NUMBER ${CUDA_ARCH_NUMBER_${WITH_GPU}})

# assume that the backend compiler for nvcc understands the -std=c++11 flag
set(CMAKE_CUDA_FLAGS "-std=c++11 -arch=sm_${CUDA_ARCH_NUMBER} --cudart static")
endif ()


Expand All @@ -48,6 +51,7 @@ find_package(PkgConfig)

# Locate MPI for the languages enabled in this project (only when USE_MPI is set).
if (USE_MPI)
# Seed the list of MPI components from the global ENABLED_LANGUAGES property.
get_property(REQUIRED_MPI_COMPONENTS GLOBAL PROPERTY ENABLED_LANGUAGES)
list(REMOVE_ITEM REQUIRED_MPI_COMPONENTS CUDA) # CUDA does not have a MPI component
# REQUIRED: configuration stops with an error if MPI cannot be found for these components.
find_package(MPI COMPONENTS ${REQUIRED_MPI_COMPONENTS} REQUIRED)
endif ()

Expand Down Expand Up @@ -85,6 +89,7 @@ endif ()
get_filename_component(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME)

if (Fortran_COMPILER_NAME MATCHES "gfortran.*")
set(CMAKE_CXX_FLAGS "-std=c++11")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops")
set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb")
Expand Down
4 changes: 2 additions & 2 deletions VERSION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MAJOR = 1
MINOR = 0
PATCH = 0-rc.2
PATCH = 0-rc.3
# A specific DATE (Month/Year) fixes an official release; otherwise
# it is considered a development version.
DATE = 2018-12-04
DATE = 2018-12-11
2 changes: 0 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,6 @@ function(CUDA_CONVERT_FLAGS EXISTING_TARGET)
endfunction()

if (USE_CUDA)
set(CMAKE_CUDA_FLAGS "-arch=sm_${CUDA_ARCH_NUMBER} --cudart static")

target_link_libraries(dbcsr PRIVATE cuda)

# add libcusmm which has its own CMakeLists.txt due to code generation
Expand Down
31 changes: 11 additions & 20 deletions src/acc/cublaswrap/cublas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ extern "C" int cublas_destroy(cublasHandle_t *handle)
}

/****************************************************************************/
extern "C" int cublas_dgemm_loop(cublasHandle_t *handle, char transa, char transb,
int *stack_params, int ps_width, int stack_size,
double *a_data, double *b_data, double *c_data,
double alpha, double beta, cudaStream_t *stream)
extern "C" int cublas_dgemm(cublasHandle_t *handle, char transa, char transb,
int m, int n, int k,
int a_offset, int b_offset, int c_offset,
double *a_data, double *b_data, double *c_data,
double alpha, double beta, cudaStream_t *stream)
{
cublasStatus_t cStatus = cublasSetStream(*handle, *stream);
if (cStatus != CUBLAS_STATUS_SUCCESS) {
Expand All @@ -52,27 +53,17 @@ extern "C" int cublas_dgemm_loop(cublasHandle_t *handle, char transa, char trans
}
cublasOperation_t cTransa = transa=='N' ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cTransb = transb=='N' ? CUBLAS_OP_N : CUBLAS_OP_T;
int m, n, k;
int &lda = transa=='N' ? m : k;
int &ldb = transb=='N' ? k : n;

for (int ii = 0; ii < stack_size; ii++) {
// get mnk from stack data
m = stack_params[ ps_width * ii ];
n = stack_params[ ps_width * ii + 1];
k = stack_params[ ps_width * ii + 2];

// get first element of data, index - 1 because data comes from Fortran
double *a_mat = &a_data[ stack_params[ ps_width * ii + 3 ] - 1 ];
double *b_mat = &b_data[ stack_params[ ps_width * ii + 4 ] - 1 ];
double *c_mat = &c_data[ stack_params[ ps_width * ii + 5 ] - 1 ];

cublasStatus_t stat = cublasDgemm(*handle, cTransa, cTransb, m, n, k, &alpha, a_mat, lda, b_mat, ldb, &beta, c_mat, lda);
if (stat != CUBLAS_STATUS_SUCCESS) return(-1);
}
cublasStatus_t stat = cublasDgemm(*handle, cTransa, cTransb,
m, n, k,
&alpha, &a_data[ a_offset ], lda,
&b_data[ b_offset], ldb,
&beta, &c_data[ c_offset], lda);
if (stat != CUBLAS_STATUS_SUCCESS) return(-1);
if (cuda_error_check(cudaGetLastError())) return(-1);
return(0);
}

#endif

15 changes: 7 additions & 8 deletions src/acc/cublaswrap/dbcsr_cublas.F
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ MODULE dbcsr_cublas
PUBLIC :: cublas_handle_type

#if (__DBCSR_ACC == 2)
PUBLIC :: cublas_dgemm_loop_cu
PUBLIC :: cublas_dgemm_cu
#endif

TYPE cublas_handle_type
Expand All @@ -50,21 +50,20 @@ FUNCTION cublas_destroy_cu(handle) &
INTEGER(KIND=C_INT) :: istat
END FUNCTION cublas_destroy_cu

FUNCTION cublas_dgemm_loop_cu(handle, transa, transb, &
stack_params, ps_width, stack_size, &
a_data, b_data, c_data, alpha, beta, stream) &
FUNCTION cublas_dgemm_cu(handle, transa, transb, &
m, n, k, a_offset, b_offset, c_offset, &
a_data, b_data, c_data, alpha, beta, stream) &
RESULT(istat) &
BIND(C, name="cublas_dgemm_loop")
BIND(C, name="cublas_dgemm")
IMPORT
TYPE(C_PTR), INTENT(IN), VALUE :: handle
CHARACTER(KIND=C_CHAR), INTENT(IN), VALUE :: transa, transb
TYPE(C_PTR), INTENT(IN), VALUE :: stack_params
INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ps_width, stack_size
INTEGER(KIND=C_INT), INTENT(IN), VALUE :: m, n, k, a_offset, b_offset, c_offset
TYPE(C_PTR), INTENT(IN), VALUE :: a_data, b_data, c_data
REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: alpha, beta
TYPE(C_PTR), INTENT(IN), VALUE :: stream
INTEGER(KIND=C_INT) :: istat
END FUNCTION cublas_dgemm_loop_cu
END FUNCTION cublas_dgemm_cu

END INTERFACE
#endif
Expand Down
2 changes: 2 additions & 0 deletions src/acc/dbcsr_acc_devmem.F
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,8 @@ SUBROUTINE dev2host_raw(this, hostmem_cptr, n_bytes, stream)
INTEGER :: istat
TYPE(C_PTR) :: stream_cptr

IF (.NOT. acc_devmem_allocated(this)) RETURN

IF (this%size_in_bytes < n_bytes) &
DBCSR_ABORT("acc_devmem_dev2host: this too small")

Expand Down
2 changes: 1 addition & 1 deletion src/acc/libsmm_acc/libcusmm/libcusmm_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ int libcusmm_benchmark_transpose(libcusmm_benchmark_t* handle,
exit(1);
}

int errors;
int errors = 0;
errors += libcusmm_benchmark_transpose_(handle->n_stack_trs_a, handle->stack_trs_a, handle->d_stack_trs_a,
handle->mat_a, handle->mat_trs_a, handle->d_mat_a,
handle->n_a, mat_m, mat_n,
Expand Down
16 changes: 0 additions & 16 deletions src/acc/libsmm_acc/libcusmm/parameters_K20X.json
Original file line number Diff line number Diff line change
Expand Up @@ -1005,7 +1005,6 @@
{"m": 9, "n": 9, "k": 32, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 159.535},
{"m": 9, "n": 9, "k": 45, "tile_m": 1, "tile_n": 1, "w": 16, "v": 8, "threads": 160, "grouping": 16, "minblocks": 1, "algorithm": "largeDB2", "perf": 160.12},
{"m": 9, "n": 9, "k": 64, "tile_m": 1, "tile_n": 1, "w": 14, "v": 6, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 160.483},
{"m": 9, "n": 9, "k": 81, "tile_m": 1, "tile_n": 1, "w": 14, "v": 6, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 164.81},
{"m": 9, "n": 13, "k": 4, "tile_m": 1, "tile_n": 1, "w": 2, "v": 12, "threads": 128, "grouping": 16, "minblocks": 1, "algorithm": "largeDB2", "perf": 68.546},
{"m": 9, "n": 13, "k": 5, "threads": 128, "grouping": 16, "minblocks": 1, "algorithm": "tiny", "perf": 81.5548},
{"m": 9, "n": 13, "k": 6, "tile_m": 1, "tile_n": 1, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "small", "perf": 86.6047},
Expand Down Expand Up @@ -1158,24 +1157,17 @@
{"m": 9, "n": 64, "k": 16, "tile_m": 3, "tile_n": 2, "w": 6, "v": 48, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 262.958},
{"m": 9, "n": 64, "k": 22, "tile_m": 3, "tile_n": 2, "w": 6, "v": 32, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 285.617},
{"m": 9, "n": 64, "k": 64, "tile_m": 3, "tile_n": 2, "w": 10, "v": 64, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "largeDB1", "perf": 319.273},
{"m": 9, "n": 81, "k": 9, "tile_m": 5, "tile_n": 2, "w": 2, "v": 52, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 202.245},
{"m": 10, "n": 4, "k": 4, "tile_m": 1, "tile_n": 1, "w": 2, "v": 4, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "largeDB2", "perf": 28.1008},
{"m": 10, "n": 4, "k": 10, "tile_m": 1, "tile_n": 1, "threads": 128, "grouping": 16, "minblocks": 4, "algorithm": "small", "perf": 54.1832},
{"m": 10, "n": 4, "k": 15, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 70.4501},
{"m": 10, "n": 10, "k": 4, "threads": 128, "grouping": 16, "minblocks": 1, "algorithm": "tiny", "perf": 74.7338},
{"m": 10, "n": 10, "k": 10, "tile_m": 1, "tile_n": 1, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 116.741},
{"m": 10, "n": 10, "k": 15, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 141.061},
{"m": 10, "n": 10, "k": 100, "tile_m": 2, "tile_n": 2, "w": 20, "v": 10, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 193.259},
{"m": 10, "n": 15, "k": 4, "tile_m": 1, "tile_n": 2, "w": 2, "v": 8, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "largeDB2", "perf": 81.7434},
{"m": 10, "n": 15, "k": 10, "tile_m": 2, "tile_n": 1, "w": 4, "v": 8, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "largeDB2", "perf": 137.343},
{"m": 10, "n": 15, "k": 15, "tile_m": 2, "tile_n": 1, "w": 6, "v": 8, "threads": 96, "grouping": 16, "minblocks": 1, "algorithm": "largeDB2", "perf": 168.752},
{"m": 10, "n": 100, "k": 10, "tile_m": 2, "tile_n": 4, "threads": 128, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 250.611},
{"m": 11, "n": 11, "k": 11, "tile_m": 1, "tile_n": 1, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 148.474},
{"m": 11, "n": 11, "k": 121, "tile_m": 3, "tile_n": 2, "w": 16, "v": 6, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 206.461},
{"m": 11, "n": 121, "k": 11, "tile_m": 3, "tile_n": 2, "threads": 256, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 264.615},
{"m": 12, "n": 12, "k": 12, "tile_m": 2, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "small", "perf": 154.228},
{"m": 12, "n": 12, "k": 144, "tile_m": 3, "tile_n": 2, "w": 16, "v": 8, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 249.09},
{"m": 12, "n": 144, "k": 12, "tile_m": 3, "tile_n": 3, "w": 6, "v": 80, "threads": 256, "grouping": 16, "minblocks": 4, "algorithm": "largeDB1", "perf": 273.287},
{"m": 13, "n": 4, "k": 4, "tile_m": 1, "tile_n": 1, "w": 2, "v": 4, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "largeDB2", "perf": 35.8537},
{"m": 13, "n": 4, "k": 5, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 43.1223},
{"m": 13, "n": 4, "k": 6, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 50.5155},
Expand Down Expand Up @@ -1280,7 +1272,6 @@
{"m": 13, "n": 13, "k": 28, "tile_m": 2, "tile_n": 2, "w": 14, "v": 12, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB2", "perf": 218.849},
{"m": 13, "n": 13, "k": 32, "tile_m": 2, "tile_n": 2, "w": 16, "v": 8, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB2", "perf": 223.821},
{"m": 13, "n": 13, "k": 45, "tile_m": 2, "tile_n": 2, "w": 16, "v": 12, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB2", "perf": 232.303},
{"m": 13, "n": 13, "k": 169, "tile_m": 2, "tile_n": 2, "w": 18, "v": 8, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 258.16},
{"m": 13, "n": 16, "k": 4, "tile_m": 2, "tile_n": 2, "threads": 64, "grouping": 16, "minblocks": 4, "algorithm": "small", "perf": 100.974},
{"m": 13, "n": 16, "k": 5, "tile_m": 1, "tile_n": 2, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 114.075},
{"m": 13, "n": 16, "k": 6, "tile_m": 2, "tile_n": 1, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 131.042},
Expand Down Expand Up @@ -1412,12 +1403,10 @@
{"m": 13, "n": 45, "k": 28, "tile_m": 2, "tile_n": 3, "w": 14, "v": 36, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "largeDB2", "perf": 338.61},
{"m": 13, "n": 45, "k": 32, "tile_m": 2, "tile_n": 5, "w": 12, "v": 38, "threads": 96, "grouping": 16, "minblocks": 8, "algorithm": "largeDB2", "perf": 342.906},
{"m": 13, "n": 45, "k": 45, "tile_m": 2, "tile_n": 5, "w": 12, "v": 44, "threads": 96, "grouping": 16, "minblocks": 8, "algorithm": "largeDB2", "perf": 366.795},
{"m": 13, "n": 169, "k": 13, "tile_m": 4, "tile_n": 3, "w": 6, "v": 96, "threads": 256, "grouping": 16, "minblocks": 1, "algorithm": "largeDB1", "perf": 241.485},
{"m": 14, "n": 14, "k": 14, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 197.885},
{"m": 14, "n": 14, "k": 16, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 216.145},
{"m": 14, "n": 14, "k": 29, "tile_m": 2, "tile_n": 2, "threads": 128, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 218.838},
{"m": 14, "n": 14, "k": 32, "tile_m": 2, "tile_n": 2, "w": 16, "v": 14, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 224.281},
{"m": 14, "n": 14, "k": 196, "tile_m": 2, "tile_n": 2, "w": 16, "v": 12, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 289.084},
{"m": 14, "n": 16, "k": 14, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 213.982},
{"m": 14, "n": 16, "k": 16, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 230.421},
{"m": 14, "n": 16, "k": 29, "tile_m": 2, "tile_n": 2, "w": 12, "v": 16, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 232.968},
Expand All @@ -1428,7 +1417,6 @@
{"m": 14, "n": 32, "k": 14, "tile_m": 2, "tile_n": 2, "threads": 160, "grouping": 16, "minblocks": 8, "algorithm": "medium", "perf": 258.209},
{"m": 14, "n": 32, "k": 29, "tile_m": 2, "tile_n": 2, "w": 10, "v": 32, "threads": 160, "grouping": 16, "minblocks": 8, "algorithm": "largeDB1", "perf": 311.422},
{"m": 14, "n": 32, "k": 32, "tile_m": 2, "tile_n": 2, "w": 16, "v": 32, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "largeDB1", "perf": 326.197},
{"m": 14, "n": 196, "k": 14, "tile_m": 2, "tile_n": 6, "threads": 256, "grouping": 16, "minblocks": 1, "algorithm": "medium", "perf": 278.411},
{"m": 15, "n": 4, "k": 4, "tile_m": 1, "tile_n": 1, "w": 2, "v": 4, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "largeDB2", "perf": 41.0244},
{"m": 15, "n": 4, "k": 10, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 76.5643},
{"m": 15, "n": 4, "k": 15, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 101.102},
Expand All @@ -1438,8 +1426,6 @@
{"m": 15, "n": 15, "k": 4, "tile_m": 2, "tile_n": 2, "threads": 64, "grouping": 16, "minblocks": 1, "algorithm": "small", "perf": 119.369},
{"m": 15, "n": 15, "k": 10, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 184.444},
{"m": 15, "n": 15, "k": 15, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 226.439},
{"m": 15, "n": 15, "k": 225, "tile_m": 2, "tile_n": 2, "w": 16, "v": 10, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 311.643},
{"m": 15, "n": 225, "k": 15, "tile_m": 3, "tile_n": 3, "w": 4, "v": 150, "threads": 384, "grouping": 16, "minblocks": 1, "algorithm": "largeDB1", "perf": 258.751},
{"m": 16, "n": 4, "k": 4, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 43.2513},
{"m": 16, "n": 4, "k": 5, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 52.8509},
{"m": 16, "n": 4, "k": 6, "threads": 128, "grouping": 16, "minblocks": 1, "algorithm": "tiny", "perf": 56.0623},
Expand Down Expand Up @@ -1539,7 +1525,6 @@
{"m": 16, "n": 16, "k": 32, "tile_m": 2, "tile_n": 2, "w": 14, "v": 14, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 276.949},
{"m": 16, "n": 16, "k": 55, "tile_m": 2, "tile_n": 2, "w": 14, "v": 16, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 303.184},
{"m": 16, "n": 16, "k": 64, "tile_m": 2, "tile_n": 2, "w": 12, "v": 12, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 312.221},
{"m": 16, "n": 16, "k": 256, "tile_m": 2, "tile_n": 2, "w": 14, "v": 12, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "largeDB1", "perf": 341.89},
{"m": 16, "n": 17, "k": 4, "tile_m": 2, "tile_n": 2, "threads": 96, "grouping": 16, "minblocks": 4, "algorithm": "medium", "perf": 113.047},
{"m": 16, "n": 17, "k": 5, "tile_m": 1, "tile_n": 3, "threads": 96, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 135.825},
{"m": 16, "n": 17, "k": 6, "tile_m": 1, "tile_n": 3, "threads": 128, "grouping": 16, "minblocks": 12, "algorithm": "medium", "perf": 152.914},
Expand Down Expand Up @@ -1630,7 +1615,6 @@
{"m": 16, "n": 64, "k": 16, "tile_m": 2, "tile_n": 4, "w": 8, "v": 40, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "largeDB1", "perf": 374.147},
{"m": 16, "n": 64, "k": 22, "tile_m": 2, "tile_n": 4, "w": 8, "v": 40, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "largeDB1", "perf": 409.267},
{"m": 16, "n": 64, "k": 64, "tile_m": 2, "tile_n": 4, "w": 8, "v": 44, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "largeDB1", "perf": 500.346},
{"m": 16, "n": 256, "k": 16, "tile_m": 2, "tile_n": 6, "w": 6, "v": 168, "threads": 384, "grouping": 16, "minblocks": 1, "algorithm": "largeDB1", "perf": 309.179},
{"m": 17, "n": 4, "k": 4, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 1, "algorithm": "medium", "perf": 44.4021},
{"m": 17, "n": 4, "k": 5, "tile_m": 1, "tile_n": 1, "threads": 96, "grouping": 16, "minblocks": 1, "algorithm": "medium", "perf": 54.1312},
{"m": 17, "n": 4, "k": 6, "tile_m": 1, "tile_n": 1, "threads": 128, "grouping": 16, "minblocks": 8, "algorithm": "small", "perf": 55.1151},
Expand Down
Loading

0 comments on commit 96f626d

Please sign in to comment.