From 101a98a2635125622a8cdf6ed3c198371353053a Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Wed, 13 Jul 2022 09:42:05 +0800 Subject: [PATCH 01/15] add memcpy2d example --- memcpy_2d_example_gfx1030/main.cpp | 185 +++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 memcpy_2d_example_gfx1030/main.cpp diff --git a/memcpy_2d_example_gfx1030/main.cpp b/memcpy_2d_example_gfx1030/main.cpp new file mode 100644 index 0000000..bfbd56f --- /dev/null +++ b/memcpy_2d_example_gfx1030/main.cpp @@ -0,0 +1,185 @@ +#include +#include +#include +#include + +#define HIP_CALL(call) do{ \ + hipError_t err = call; \ + if(err != hipSuccess){ \ + printf("[hiperror](%d) fail to call %s",(int)err,#call); \ + exit(0); \ + } \ +} while(0) + +#define HSACO "memcpy_2d_x4_example_gfx1030.hsaco" +#define HSA_KERNEL "memcpy_2d_x4_example_gfx1030" + +#define PER_PIXEL_CHECK +#define ASSERT_ON_FAIL + +#ifndef ABS +#define ABS(x) ((x) > 0 ? (x) : -1 * (x)) +#endif + +template +void rand_vec(T * seq, size_t rows, size_t cols, size_t padding) +{ + static std::random_device rd; // seed + static std::mt19937 mt(rd()); + static std::uniform_real_distribution dist(-10.0, 10.0); + + for (size_t i = 0; i < rows; ++i) + for (size_t j = 0; j < cols + padding; ++j) { + if (j >= cols) + continue; + int offset = i * (cols + padding) + j; + seq[offset] = dist(mt); + } +} + +static inline bool valid_vector(const float* ref, const float* pred, int rows, int cols, int padding, double nrms = 1e-6) +{ + double s0 = 0.0; + double s1 = 0.0; +#ifdef PER_PIXEL_CHECK + int pp_err = 0; +#endif + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols + padding; ++j) { + if (j >= cols) + continue; + double ri = (double)ref[i * (cols + padding) + j]; + double pi = (double)pred[i * (cols + padding) + j]; + double d = ri - pi; + double dd = d * d; + double rr = 2.0 * ri * ri; + s0 += dd; + s1 += rr; +#ifdef PER_PIXEL_CHECK + double delta = ABS(ri - pi) / ri; + if(delta > 3e-5) { +#ifdef ASSERT_ON_FAIL + if(pp_err < 100) + printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", i, ri, pi,((uint32_t *)pred)[i], delta); +#endif + pp_err++; + } +#endif + } + } + //printf("nrms:%lf, s0:%lf, s1:%lf\n",sqrt(s0/s1),s0,s1); + return (sqrt(s0/s1)= 1000) + num_cu *= 2; + // std::cout << "num_cu: " << num_cu << std::endl; + } + + int total_loop = 4; + int warm_ups = 2; + int i; + + // initial blockDim, gridDim + int bdx = 256; + int gdx = num_cu; + + // initial matrix parameters + int rows = 512; + int cols = bdx * gdx * 8; + int padding = 256; + + float *host_in, *host_out, *dev_in, *dev_out; + + int total_floats = rows * (cols + padding); + + host_in = new float[total_floats]; + host_out = new float[total_floats]; + HIP_CALL(hipMalloc(&dev_in, sizeof(float) * total_floats)); + HIP_CALL(hipMalloc(&dev_out, sizeof(float) * total_floats)); + + rand_vec(host_in, rows, cols, padding); + + HIP_CALL(hipMemcpy(dev_in, host_in, sizeof(float) * total_floats, hipMemcpyHostToDevice)); + + printf("memcpy, input:%p, output:%p, floats:%d\n",dev_in, dev_out, total_floats); + + struct __attribute__((packed)) { + float *input; + float *output; + int rows; + int gdx; + int bdx; + int padding; + } args; + size_t arg_size = sizeof(args); + args.input = dev_in; + args.output = dev_out; + args.rows = rows; + args.gdx = gdx; + args.bdx = bdx; + args.padding = padding; + + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, + &arg_size, HIP_LAUNCH_PARAM_END}; + + for (i = 0; i < warm_ups; i++) + HIP_CALL(hipModuleLaunchKernel( kernel_func, gdx,1,1, bdx,1,1, 0, 0, NULL, (void**)&config )); + + hipEventCreate(&evt_00); + hipEventCreate(&evt_11); + + hipDeviceSynchronize(); + hipEventRecord(evt_00, NULL); + for(i=0;i Date: Wed, 13 Jul 2022 10:03:44 +0800 Subject: [PATCH 02/15] add memcpy2d example --- memcpy_2d_example_gfx1030/build_clanghipcc.h | 13 ++ .../memcpy_2d_example_gfx1030.s | 129 ++++++++++++++++++ .../memcpy_2d_x4_example_gfx1030.s | 114 ++++++++++++++++ 3 files changed, 256 insertions(+) create mode 100755 memcpy_2d_example_gfx1030/build_clanghipcc.h create mode 100644 memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s create mode 100644 memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s diff --git a/memcpy_2d_example_gfx1030/build_clanghipcc.h b/memcpy_2d_example_gfx1030/build_clanghipcc.h new file mode 100755 index 0000000..56c6274 --- /dev/null +++ b/memcpy_2d_example_gfx1030/build_clanghipcc.h @@ -0,0 +1,13 @@ +#!/bin/sh + +KSRC=memcpy_2d_x4_example_gfx1030.s +KOUT=memcpy_2d_x4_example_gfx1030.hsaco +SRC=main.cpp +TARGET=out.exe + +# pre-delete the previous KOUT +rm -rf $KOUT +/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT + +rm -rf $TARGET +/opt/rocm/hip/bin/hipcc $SRC -mcpu=gfx1030 -o $TARGET diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s new file mode 100644 index 0000000..ace62b6 --- /dev/null +++ b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s @@ -0,0 +1,129 @@ +.text +.global memcpy_2d_example_gfx1030 +.p2align 8 +.type memcpy_2d_example_gfx1030,@function +memcpy_2d_example_gfx1030: +; This is just an example, not the optimal one +.set s_karg, 0 ; kernel argument +.set s_bx, 2 ; blockIdx + +.set s_ptr_in, 4 +.set s_ptr_out, 6 +.set s_rows, 8 +.set s_gdx, 10 +.set s_bdx, 20 +.set s_padding, 14 +.set s_stride_block, 16 +.set s_tmp, 18 + +.set v_buf, 0 +.set v_offset, 16 +.set v_tmp, 32 + + ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm + ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12 + ; s_waitcnt lgkmcnt(0) + ; s_lshr_b32 s[s_gdx], s[s_gdx], 8 + ; s_mov_b32 s[s_gdx], 72 ; num_cu + + s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0 + s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8 + s_load_dword s[s_rows], s[s_karg:s_karg+1], 16 + s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20 + s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 + s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 + + s_mul_i32 s[s_bdx+1], 256, 4 ; blockDim * 4 + s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 + ; s_mul_i32 s[s_tmp+1], s[s_bx], 256*4 ; blockIdx * blockDim * 4 + v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 + v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 + + s_waitcnt lgkmcnt(0) + + s_mul_i32 s[s_tmp], s[s_gdx], 256*4 ; gridDim * blockDim * 4 + v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0] + v_add_nc_u32 v[v_offset+2], s[s_tmp], v[v_offset+1] + v_add_nc_u32 v[v_offset+3], s[s_tmp], v[v_offset+2] + v_add_nc_u32 v[v_offset+4], s[s_tmp], v[v_offset+3] + v_add_nc_u32 v[v_offset+5], s[s_tmp], v[v_offset+4] + v_add_nc_u32 v[v_offset+6], s[s_tmp], v[v_offset+5] + v_add_nc_u32 v[v_offset+7], s[s_tmp], v[v_offset+6] + s_lshl_b32 s[s_stride_block], s[s_tmp], 3 ; unroll 8, gridDim*blockDim*4*workload + +label_memcopy_start: + global_load_dword v[v_buf+0], v[v_offset+0], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+1], v[v_offset+1], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+2], v[v_offset+2], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+3], v[v_offset+3], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+4], v[v_offset+4], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+5], v[v_offset+5], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+6], v[v_offset+6], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+7], v[v_offset+7], s[s_ptr_in:s_ptr_in+1] + + ; add stride padding + s_mul_i32 s[s_padding+1], s[s_padding], 4 ; padding * 4 + s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block] ; gridDim*blockDim*4*workload + padding*4 + s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in] + s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0 + + s_waitcnt vmcnt(0) + + global_store_dword v[v_offset+0], v[v_buf+0], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+1], v[v_buf+1], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+2], v[v_buf+2], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+3], v[v_buf+3], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+4], v[v_buf+4], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+5], v[v_buf+5], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+6], v[v_buf+6], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+7], v[v_buf+7], s[s_ptr_out:s_ptr_out+1] + + s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out] + s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0 + + s_sub_u32 s[s_rows], s[s_rows], 1 + s_cmp_eq_u32 s[s_rows], 0 + s_waitcnt vmcnt(0) + s_cbranch_scc0 label_memcopy_start + s_endpgm + +.rodata +.p2align 6 +.amdhsa_kernel memcpy_2d_example_gfx1030 + .amdhsa_group_segment_fixed_size 0 + .amdhsa_user_sgpr_dispatch_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 32 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 + .amdhsa_wavefront_size32 1 + .amdhsa_workgroup_processor_mode 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: memcpy_2d_example_gfx1030 + .symbol: memcpy_2d_example_gfx1030.kd + .sgpr_count: 32 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 32 + .group_segment_fixed_size: 0 + .private_segment_fixed_size: 0 + .wavefront_size: 32 ;warpsize + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 ;gridsize + .args: + - { .name: input, .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: output, .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: rows, .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32} + - { .name: gdx, .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32} + - { .name: bdx, .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s new file mode 100644 index 0000000..b2bba99 --- /dev/null +++ b/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s @@ -0,0 +1,114 @@ +.text +.global memcpy_2d_x4_example_gfx1030 +.p2align 8 +.type memcpy_2d_x4_example_gfx1030,@function +memcpy_2d_x4_example_gfx1030: +; This is just an example, not the optimal one +.set s_karg, 0 ; kernel argument +.set s_bx, 2 ; blockIdx + +.set s_ptr_in, 4 +.set s_ptr_out, 6 +.set s_rows, 8 +.set s_gdx, 10 +.set s_bdx, 12 +.set s_padding, 14 +.set s_stride_block, 16 +.set s_tmp, 18 + +.set v_buf, 0 +.set v_offset, 16 +.set v_tmp, 32 + + ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm + ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12 + ; s_waitcnt lgkmcnt(0) + ; s_lshr_b32 s[s_gdx], s[s_gdx], 8 + ; s_mov_b32 s[s_gdx], 72 ; num_cu + + s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0 + s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8 + s_load_dword s[s_rows], s[s_karg:s_karg+1], 16 + s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20 + s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 + s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 + + s_mul_i32 s[s_bdx+1], 256, 4 ; blockDim * 4 + s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 + ; s_mul_i32 s[s_tmp+1], s[s_bx], 256*4 ; blockIdx * blockDim * 4 + v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 + v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 + v_lshlrev_b32 v[v_offset+0], 2, v[v_offset+0] + + s_waitcnt lgkmcnt(0) + + s_mul_i32 s[s_tmp], s[s_gdx], 256*4*4 ; gridDim * blockDim * 4 * 4Dwords + v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0] + + s_mul_i32 s[s_tmp], s[s_gdx], 256*4 ; gridDim * blockDim * 4 + s_lshl_b32 s[s_stride_block], s[s_tmp], 3 ; unroll 8, gridDim*blockDim*4*workload + +label_memcopy_start: + global_load_dwordx4 v[v_buf+0 : v_buf+3], v[v_offset+0], s[s_ptr_in:s_ptr_in+1] + global_load_dwordx4 v[v_buf+4 : v_buf+7], v[v_offset+1], s[s_ptr_in:s_ptr_in+1] + + ; add stride padding + s_mul_i32 s[s_padding+1], s[s_padding], 4 ; padding * 4 + s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block] ; gridDim*blockDim*4*workload + padding*4 + s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in] + s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0 + + s_waitcnt vmcnt(0) + + global_store_dwordx4 v[v_offset+0], v[v_buf+0 : v_buf+3], s[s_ptr_out:s_ptr_out+1] + global_store_dwordx4 v[v_offset+1], v[v_buf+4 : v_buf+7], s[s_ptr_out:s_ptr_out+1] + + s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out] + s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0 + + s_sub_u32 s[s_rows], s[s_rows], 1 + s_cmp_eq_u32 s[s_rows], 0 + s_waitcnt vmcnt(0) + s_cbranch_scc0 label_memcopy_start + s_endpgm + +.rodata +.p2align 6 +.amdhsa_kernel memcpy_2d_x4_example_gfx1030 + .amdhsa_group_segment_fixed_size 0 + .amdhsa_user_sgpr_dispatch_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 32 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 + .amdhsa_wavefront_size32 1 + .amdhsa_workgroup_processor_mode 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: memcpy_2d_x4_example_gfx1030 + .symbol: memcpy_2d_x4_example_gfx1030.kd + .sgpr_count: 32 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 32 + .group_segment_fixed_size: 0 + .private_segment_fixed_size: 0 + .wavefront_size: 32 ;warpsize + .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 ;gridsize + .args: + - { .name: input, .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: output, .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: rows, .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32} + - { .name: gdx, .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32} + - { .name: bdx, .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata From ce38f8ed4aa02b139914e742c468a4894d6b4ea9 Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Wed, 13 Jul 2022 10:41:50 +0800 Subject: [PATCH 03/15] update asm for gfx1030 --- memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s | 5 +++-- memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s index ace62b6..f1327f5 100644 --- a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s +++ b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s @@ -33,9 +33,10 @@ memcpy_2d_example_gfx1030: s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 - s_mul_i32 s[s_bdx+1], 256, 4 ; blockDim * 4 + s_waitcnt lgkmcnt(0) + + s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 - ; s_mul_i32 s[s_tmp+1], s[s_bx], 256*4 ; blockIdx * blockDim * 4 v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s index b2bba99..d4e8849 100644 --- a/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s +++ b/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s @@ -33,9 +33,10 @@ memcpy_2d_x4_example_gfx1030: s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 - s_mul_i32 s[s_bdx+1], 256, 4 ; blockDim * 4 + s_waitcnt lgkmcnt(0) + + s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 - ; s_mul_i32 s[s_tmp+1], s[s_bx], 256*4 ; blockIdx * blockDim * 4 v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 v_lshlrev_b32 v[v_offset+0], 2, v[v_offset+0] From 4e089e43e3783ac3e3e38d1ec90f59e63362b4d6 Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Thu, 14 Jul 2022 15:52:45 +0800 Subject: [PATCH 04/15] add memcpy_2d_1Dword benchmark --- memcpy_2d_example_gfx1030/benchmark/README.md | 28 +++ .../benchmark/benchmark.cpp | 203 ++++++++++++++++++ memcpy_2d_example_gfx1030/benchmark/config.h | 51 +++++ .../benchmark/memcpy_2d_example_gfx1030.s | 131 +++++++++++ .../benchmark/params.config | 5 + .../benchmark/parser.cpp | 46 ++++ memcpy_2d_example_gfx1030/benchmark/run.sh | 17 ++ 7 files changed, 481 insertions(+) create mode 100644 memcpy_2d_example_gfx1030/benchmark/README.md create mode 100644 memcpy_2d_example_gfx1030/benchmark/benchmark.cpp create mode 100644 memcpy_2d_example_gfx1030/benchmark/config.h create mode 100644 memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s create mode 100644 memcpy_2d_example_gfx1030/benchmark/params.config create mode 100644 memcpy_2d_example_gfx1030/benchmark/parser.cpp create mode 100755 memcpy_2d_example_gfx1030/benchmark/run.sh diff --git a/memcpy_2d_example_gfx1030/benchmark/README.md b/memcpy_2d_example_gfx1030/benchmark/README.md new file mode 100644 index 0000000..f7318c8 --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/README.md @@ -0,0 +1,28 @@ +# memcpy 2d benchmark + +This is a benchmark for memcpy-2d-1Dword example on gfx1030. You can specify matrix parameters (rows, cols, padding) by modifying params.config. +Here rows and padding can pass in any value, while cols currently only support multiples of 2048 (2K). + +## build and run +Go to the benchmark root and build by +''' +$ ./run.sh +''' +Then you can run by +''' +$ ./out.exe params.config +''' + +## conclusion +I have tested in different paramter combination, which shows: +| Rows | Cols | Padding | GBPS | +| :--: | :--: | :-----: | :--: | +| 128 | 147456 | 1024 | 254.202 | +| 256 | 147456 | 1024 | 309.121 | +| 512 | 147456 | 1024 | 368.616 | +| 512 | 73728 | 1024 | 316.32 | +| 512 | 184320 | 1024 | 349.346 | +| 512 | 147456 | 0 | 365.193 | +| 512 | 147456 | 1 | 226.867 | +| 512 | 147456 | 8192 | 367.966 | +| 512 | 147456 | 32768 | 363.132 | \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp b/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp new file mode 100644 index 0000000..da9b8ea --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp @@ -0,0 +1,203 @@ +#include +#include +#include +#include +#include +#include + +#include "config.h" + +#define HIP_CALL(call) do{ \ + hipError_t err = call; \ + if(err != hipSuccess){ \ + printf("[hiperror](%d) fail to call %s",(int)err,#call); \ + exit(0); \ + } \ +} while(0) + +#define HSACO "memcpy_2d_example_gfx1030.hsaco" +#define HSA_KERNEL "memcpy_2d_example_gfx1030" + +#define ABS(x) ((x) > 0 ? (x) : -1 * (x)) + +template +void rand_vec(Matrix_2d &matrix) +{ + static std::random_device rd; // seed + static std::mt19937 mt(rd()); + static std::uniform_real_distribution dist(-10.0, 10.0); + + for (size_t i = 0; i < matrix.rows; ++i) + for (size_t j = 0; j < matrix.cols + matrix.padding; ++j) { + if (j >= matrix.cols) + continue; + int offset = i * (matrix.cols + matrix.padding) + j; + matrix.data[offset] = dist(mt); + } +} + +template +static inline bool valid_vector(const Matrix_2d &host_in, const Matrix_2d &host_out, double nrms = 1e-6) +{ + double s0 = 0.0; + double s1 = 0.0; + int pp_err = 0; + + int rows = host_in.rows; + int cols = host_in.cols; + int padding = host_in.padding; + + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols + padding; ++j) { + if (j >= cols) + continue; + double ri = (double)host_in.data[i * (cols + padding) + j]; + double pi = (double)host_out.data[i * (cols + padding) + j]; + double d = ri - pi; + double dd = d * d; + double rr = 2.0 * ri * ri; + s0 += dd; + s1 += rr; + double delta = ABS(ri - pi) / ri; + + if(delta > 3e-5) { + if(pp_err < 100) + printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", i, ri, pi,((uint32_t *)host_out.data)[i], delta); + pp_err++; + } + } + } + //printf("nrms:%lf, s0:%lf, s1:%lf\n",sqrt(s0/s1),s0,s1); + return (sqrt (s0 / s1) < nrms) && (pp_err==0); +} + +int main(int argc, char **argv) +{ + if (argc <= 1) { + std::cout << "no input file! please enter params.config" << std::endl; + return -1; + } + // parse params.config + Config config; + config.parseConfigFile(argv[1]); + + // add key-value to myStruct + assert(config.m_contents.find("rows") != config.m_contents.end() && "[error!] failed to parse rows!"); + assert(config.m_contents.find("cols") != config.m_contents.end() && "[error!] failed to parse cols!"); + assert(config.m_contents.find("padding") != config.m_contents.end() && "[error!] failed to parse padding!"); + + int rows = std::stoi(config.m_contents["rows"]); + int cols = std::stoi(config.m_contents["cols"]); + int padding = std::stoi(config.m_contents["padding"]); + + // judge cols legality + assert(cols % (256*8) == 0 && "[!]Only supports cols which is multiples of 2048(2K)"); + + Matrix_2d matrix_host_in(rows, cols, padding, 1); + Matrix_2d matrix_host_out(rows, cols, padding, 1); + Matrix_2d matrix_dev_in(rows, cols, padding, 0); + Matrix_2d matrix_dev_out(rows, cols, padding, 0); + + // kernel preparation + hipModule_t module; + hipFunction_t kernel_func; + hipEvent_t evt_00, evt_11; + HIP_CALL(hipSetDevice(0)); + + HIP_CALL(hipModuleLoad(&module, HSACO)); + HIP_CALL(hipModuleGetFunction(&kernel_func, module, HSA_KERNEL)); + + int num_cu; + int gcn_arch; + { + hipDeviceProp_t dev_prop; + hipDevice_t dev; + HIP_CALL(hipGetDevice(&dev)); + HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); + num_cu = dev_prop.multiProcessorCount; + gcn_arch = dev_prop.gcnArch; + if (gcn_arch >= 1000) + num_cu *= 2; + } + + int total_loop = 4; + int warm_ups = 2; + + // initial blockDim, gridDim + int bdx = 256; + int gdx = matrix_host_in.cols / (8 * bdx); + HIP_CALL(hipMalloc(&matrix_dev_in.data, sizeof(float) * matrix_dev_in.length)); + HIP_CALL(hipMalloc(&matrix_dev_out.data, sizeof(float) * matrix_dev_out.length)); + + // initial host in data + rand_vec(matrix_host_in); + + // memcpy data from host to device + HIP_CALL(hipMemcpy(matrix_dev_in.data, matrix_host_in.data, sizeof(float) * matrix_dev_in.length, hipMemcpyHostToDevice)); + printf("memcpy, input: %p, output: %p, floats: %d\n",matrix_dev_in.data, matrix_dev_out.data, matrix_dev_in.length); + + struct __attribute__((packed)) + { + float *input; + float *output; + int rows; + int gdx; + int bdx; + int padding; + } args; + + size_t arg_size = sizeof(args); + args.input = matrix_dev_in.data; + args.output = matrix_dev_out.data; + args.rows = rows; + args.gdx = gdx; + args.bdx = bdx; + args.padding = padding; + + void* config_kernel[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size, HIP_LAUNCH_PARAM_END}; + + // warm up kernel + for (int i = 0; i < warm_ups; i++) + HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); + + hipEventCreate(&evt_00); + hipEventCreate(&evt_11); + hipDeviceSynchronize(); + hipEventRecord(evt_00, NULL); + + // launch kernel + for(int i = 0; i < total_loop; i++) + HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); + + float elapsed_ms; + hipEventRecord(evt_11, NULL); + hipEventSynchronize(evt_11); + hipDeviceSynchronize(); + hipEventElapsedTime(&elapsed_ms, evt_00, evt_11); + hipEventDestroy(evt_00); + hipEventDestroy(evt_11); + + HIP_CALL(hipMemcpy(matrix_host_out.data, matrix_dev_out.data, sizeof(float) * matrix_host_out.length, hipMemcpyDeviceToHost)); + + // verification + bool is_valid = valid_vector(matrix_host_in, matrix_host_out); + if(!is_valid) + printf("Data not valid, please check\n"); + else + printf("Data is valid :)\n"); + + // evaluation + float time_per_loop_ms = elapsed_ms / total_loop; + float gbps = (matrix_host_out.rows * matrix_host_out.cols) * 2 * sizeof(float) / time_per_loop_ms / 1000 / 1000; + + std::cout << "---- MEMCPY 2D EXAMPLE EVALUATION ----" << std::endl; + std::cout << " rows: " << matrix_host_in.rows << '\t' << "cols: " << matrix_host_in.cols << '\t' << "padding: " << matrix_host_in.padding << std::endl; + std::cout << " gdx: " << gdx << '\t' << "bdx " << bdx << std::endl; + std::cout << " gbps: " << gbps << std::endl; + std::cout << "---- FINISH EVALUATION ----" << std::endl; + + hipFree(matrix_dev_in.data); + hipFree(matrix_dev_out.data); + + return 0; +} \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/config.h b/memcpy_2d_example_gfx1030/benchmark/config.h new file mode 100644 index 0000000..a679238 --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/config.h @@ -0,0 +1,51 @@ +#ifndef CONFIG_ +#define CONFIG_ + +#include +#include + +template +class Matrix_2d +{ +public: + int rows; + int cols; + int padding; + int length; + bool type; //Host: 1; Device: 0 + T *data; +public: + Matrix_2d(): rows(0), cols(0), padding(0), length(0), type(0), data(nullptr) {} + Matrix_2d(int r, int c, int p, bool t): rows(r), cols(c), padding(p), type(t), length(r * (c + p)), data(new T[r * (c + p)]) {} + ~Matrix_2d() { + if (data != nullptr && type) { + delete [] data; + } + } + void initMem() { + if (!data) + data = new T[rows * (cols + padding)]; + } +}; + +class Config +{ +public: + char m_delimiter; + std::unordered_map m_contents; +public: + struct FILE_NOT_FOUND + { + std::string filename; + FILE_NOT_FOUND(const std::string& filename_ = std::string()): filename(filename_) {} + }; +public: + Config(): m_delimiter('=') {} + void parseConfigFile(const char *fileName); + void split(const std::string &line); +private: + void deleteSpace(std::string &str); +}; + + +#endif \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s new file mode 100644 index 0000000..6b3007e --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s @@ -0,0 +1,131 @@ +.text +.global memcpy_2d_example_gfx1030 +.p2align 8 +.type memcpy_2d_example_gfx1030,@function +memcpy_2d_example_gfx1030: +; This is just an example, not the optimal one +.set s_karg, 0 ; kernel argument +.set s_bx, 2 ; blockIdx + +.set s_ptr_in, 4 +.set s_ptr_out, 6 +.set s_rows, 8 +.set s_gdx, 10 +.set s_bdx, 12 +.set s_padding, 14 +.set s_stride_block, 16 +.set s_tmp, 18 + +.set v_buf, 0 +.set v_offset, 16 +.set v_tmp, 32 + + ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm + ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12 + ; s_waitcnt lgkmcnt(0) + ; s_lshr_b32 s[s_gdx], s[s_gdx], 8 + ; s_mov_b32 s[s_gdx], 72 ; num_cu + + s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0 + s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8 + s_load_dword s[s_rows], s[s_karg:s_karg+1], 16 + s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20 + s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 + s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 + + s_waitcnt lgkmcnt(0) + + s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 + s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 + v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 + v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 + + s_waitcnt lgkmcnt(0) + + + s_mul_i32 s[s_tmp], s[s_gdx], s[s_bdx+1] ; gridDim * blockDim * 4 + v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0] + v_add_nc_u32 v[v_offset+2], s[s_tmp], v[v_offset+1] + v_add_nc_u32 v[v_offset+3], s[s_tmp], v[v_offset+2] + v_add_nc_u32 v[v_offset+4], s[s_tmp], v[v_offset+3] + v_add_nc_u32 v[v_offset+5], s[s_tmp], v[v_offset+4] + v_add_nc_u32 v[v_offset+6], s[s_tmp], v[v_offset+5] + v_add_nc_u32 v[v_offset+7], s[s_tmp], v[v_offset+6] + s_lshl_b32 s[s_stride_block], s[s_tmp], 3 ; unroll 8, gridDim*blockDim*4*workload + +label_memcopy_start: + global_load_dword v[v_buf+0], v[v_offset+0], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+1], v[v_offset+1], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+2], v[v_offset+2], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+3], v[v_offset+3], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+4], v[v_offset+4], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+5], v[v_offset+5], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+6], v[v_offset+6], s[s_ptr_in:s_ptr_in+1] + global_load_dword v[v_buf+7], v[v_offset+7], s[s_ptr_in:s_ptr_in+1] + + ; add stride padding + s_mul_i32 s[s_padding+1], s[s_padding], 4 ; padding * 4 + s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block] ; gridDim*blockDim*4*workload + padding*4 + s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in] + s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0 + + s_waitcnt vmcnt(0) + + global_store_dword v[v_offset+0], v[v_buf+0], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+1], v[v_buf+1], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+2], v[v_buf+2], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+3], v[v_buf+3], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+4], v[v_buf+4], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+5], v[v_buf+5], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+6], v[v_buf+6], s[s_ptr_out:s_ptr_out+1] + global_store_dword v[v_offset+7], v[v_buf+7], s[s_ptr_out:s_ptr_out+1] + + s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out] + s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0 + + s_sub_u32 s[s_rows], s[s_rows], 1 + s_cmp_eq_u32 s[s_rows], 0 + s_waitcnt vmcnt(0) + s_cbranch_scc0 label_memcopy_start + s_endpgm + +.rodata +.p2align 6 +.amdhsa_kernel memcpy_2d_example_gfx1030 + .amdhsa_group_segment_fixed_size 0 + .amdhsa_user_sgpr_dispatch_ptr 0 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_system_sgpr_workgroup_id_x 1 + .amdhsa_system_vgpr_workitem_id 0 + .amdhsa_next_free_vgpr 64 + .amdhsa_next_free_sgpr 32 + .amdhsa_ieee_mode 0 + .amdhsa_dx10_clamp 0 + .amdhsa_wavefront_size32 1 + .amdhsa_workgroup_processor_mode 0 +.end_amdhsa_kernel + +.amdgpu_metadata +--- +amdhsa.version: [ 1, 0 ] +amdhsa.kernels: + - .name: memcpy_2d_example_gfx1030 + .symbol: memcpy_2d_example_gfx1030.kd + .sgpr_count: 32 + .vgpr_count: 64 + .kernarg_segment_align: 8 + .kernarg_segment_size: 32 + .group_segment_fixed_size: 0 + .private_segment_fixed_size: 0 + .wavefront_size: 32 ;warpsize + ; .reqd_workgroup_size : [256, 1, 1] + .max_flat_workgroup_size: 256 ;gridsize + .args: + - { .name: input, .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: output, .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} + - { .name: rows, .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32} + - { .name: gdx, .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32} + - { .name: bdx, .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} + - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} +... +.end_amdgpu_metadata diff --git a/memcpy_2d_example_gfx1030/benchmark/params.config b/memcpy_2d_example_gfx1030/benchmark/params.config new file mode 100644 index 0000000..90a5acb --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/params.config @@ -0,0 +1,5 @@ +[2d_matrix_paramters] + +rows = 512 +cols = 147456 +padding = 32768 \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/parser.cpp b/memcpy_2d_example_gfx1030/benchmark/parser.cpp new file mode 100644 index 0000000..d2dd054 --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/parser.cpp @@ -0,0 +1,46 @@ +#include +#include +#include + +#include "config.h" + +void Config::parseConfigFile(const char *fileName) +{ + std::ifstream in(fileName); + if (!in) + throw FILE_NOT_FOUND(); + + std::string line; + while (!in.eof()) { + getline(in, line); + if (line.find('[') == 0 || line.find(' ') == 0) + continue; + split(line); + } +} + +void Config::split(const std::string &line) +{ + std::size_t previous = 0; + std::size_t current = line.find(this->m_delimiter); + std::size_t length = line.length(); + if (current != std::string::npos) { + std::string key = line.substr(previous, current - previous); + std::string value = line.substr(current + 1, length); + deleteSpace(key); + deleteSpace(value); + this->m_contents[key] = value; + } +} + +void Config::deleteSpace(std::string &str) +{ + int j = 0; + for (int i = 0; i < str.length(); i++) { + if (str[i] == ' ') + continue; + str[j++] = str[i]; + } + str.resize(j); +} + diff --git a/memcpy_2d_example_gfx1030/benchmark/run.sh b/memcpy_2d_example_gfx1030/benchmark/run.sh new file mode 100755 index 0000000..cd16d98 --- /dev/null +++ b/memcpy_2d_example_gfx1030/benchmark/run.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +# g++ -std=c++11 benchmark.cpp parser.cpp -o out + +KSRC=memcpy_2d_example_gfx1030.s +KOUT=memcpy_2d_example_gfx1030.hsaco +SRC1=benchmark.cpp +SRC2=parser.cpp + +TARGET=out.exe + +# pre-delete the previous KOUT +rm -rf $KOUT +/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT + +rm -rf $TARGET +/opt/rocm/hip/bin/hipcc $SRC1 $SRC2 -mcpu=gfx1030 -o $TARGET \ No newline at end of file From 3e4b8a43c48e546d04e96820bd9ab1241ee4b27b Mon Sep 17 00:00:00 2001 From: zyGao1126 <57161470+zyGao1126@users.noreply.github.com> Date: Thu, 14 Jul 2022 15:55:02 +0800 Subject: [PATCH 05/15] Update README.md --- memcpy_2d_example_gfx1030/benchmark/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/memcpy_2d_example_gfx1030/benchmark/README.md b/memcpy_2d_example_gfx1030/benchmark/README.md index f7318c8..29ee08a 100644 --- a/memcpy_2d_example_gfx1030/benchmark/README.md +++ b/memcpy_2d_example_gfx1030/benchmark/README.md @@ -5,13 +5,13 @@ Here rows and padding can pass in any value, while cols currently only support m ## build and run Go to the benchmark root and build by -''' +``` $ ./run.sh -''' +``` Then you can run by -''' +``` $ ./out.exe params.config -''' +``` ## conclusion I have tested in different paramter combination, which shows: @@ -25,4 +25,4 @@ I have tested in different paramter combination, which shows: | 512 | 147456 | 0 | 365.193 | | 512 | 147456 | 1 | 226.867 | | 512 | 147456 | 8192 | 367.966 | -| 512 | 147456 | 32768 | 363.132 | \ No newline at end of file +| 512 | 147456 | 32768 | 363.132 | From 580174006c3a3a21c88be1f42cf4189061fe5917 Mon Sep 17 00:00:00 2001 From: zyGao1126 <57161470+zyGao1126@users.noreply.github.com> Date: Thu, 14 Jul 2022 15:55:45 +0800 Subject: [PATCH 06/15] Update README.md --- memcpy_2d_example_gfx1030/benchmark/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/memcpy_2d_example_gfx1030/benchmark/README.md b/memcpy_2d_example_gfx1030/benchmark/README.md index 29ee08a..5e0dbf8 100644 --- a/memcpy_2d_example_gfx1030/benchmark/README.md +++ b/memcpy_2d_example_gfx1030/benchmark/README.md @@ -1,9 +1,9 @@ -# memcpy 2d benchmark +# Memcpy 2D Benchmark This is a benchmark for memcpy-2d-1Dword example on gfx1030. You can specify matrix parameters (rows, cols, padding) by modifying params.config. Here rows and padding can pass in any value, while cols currently only support multiples of 2048 (2K). -## build and run +## Build and Run Go to the benchmark root and build by ``` $ ./run.sh @@ -13,7 +13,7 @@ Then you can run by $ ./out.exe params.config ``` -## conclusion +## Conclusion I have tested in different paramter combination, which shows: | Rows | Cols | Padding | GBPS | | :--: | :--: | :-----: | :--: | From e4c885a0c292721a8cee6586c4d04188ece160af Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Thu, 14 Jul 2022 15:59:58 +0800 Subject: [PATCH 07/15] Update README.md --- memcpy_2d_example_gfx1030/README.md | 1 + memcpy_2d_example_gfx1030/benchmark/README.md | 28 --- .../benchmark/benchmark.cpp | 203 ------------------ memcpy_2d_example_gfx1030/benchmark/config.h | 51 ----- .../benchmark/memcpy_2d_example_gfx1030.s | 131 ----------- .../benchmark/params.config | 5 - .../benchmark/parser.cpp | 46 ---- memcpy_2d_example_gfx1030/benchmark/run.sh | 17 -- 8 files changed, 1 insertion(+), 481 deletions(-) create mode 100644 memcpy_2d_example_gfx1030/README.md delete mode 100644 memcpy_2d_example_gfx1030/benchmark/README.md delete mode 100644 memcpy_2d_example_gfx1030/benchmark/benchmark.cpp delete mode 100644 memcpy_2d_example_gfx1030/benchmark/config.h delete mode 100644 memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s delete mode 100644 memcpy_2d_example_gfx1030/benchmark/params.config delete mode 100644 memcpy_2d_example_gfx1030/benchmark/parser.cpp delete mode 100755 memcpy_2d_example_gfx1030/benchmark/run.sh diff --git a/memcpy_2d_example_gfx1030/README.md b/memcpy_2d_example_gfx1030/README.md new file mode 100644 index 0000000..c267824 --- /dev/null +++ b/memcpy_2d_example_gfx1030/README.md @@ -0,0 +1 @@ +# See README in benchmark \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/README.md b/memcpy_2d_example_gfx1030/benchmark/README.md deleted file mode 100644 index f7318c8..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# memcpy 2d benchmark - -This is a benchmark for memcpy-2d-1Dword example on gfx1030. You can specify matrix parameters (rows, cols, padding) by modifying params.config. -Here rows and padding can pass in any value, while cols currently only support multiples of 2048 (2K). - -## build and run -Go to the benchmark root and build by -''' -$ ./run.sh -''' -Then you can run by -''' -$ ./out.exe params.config -''' - -## conclusion -I have tested in different paramter combination, which shows: -| Rows | Cols | Padding | GBPS | -| :--: | :--: | :-----: | :--: | -| 128 | 147456 | 1024 | 254.202 | -| 256 | 147456 | 1024 | 309.121 | -| 512 | 147456 | 1024 | 368.616 | -| 512 | 73728 | 1024 | 316.32 | -| 512 | 184320 | 1024 | 349.346 | -| 512 | 147456 | 0 | 365.193 | -| 512 | 147456 | 1 | 226.867 | -| 512 | 147456 | 8192 | 367.966 | -| 512 | 147456 | 32768 | 363.132 | \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp b/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp deleted file mode 100644 index da9b8ea..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#define HIP_CALL(call) do{ \ - hipError_t err = call; \ - if(err != hipSuccess){ \ - printf("[hiperror](%d) fail to call %s",(int)err,#call); \ - exit(0); \ - } \ -} while(0) - -#define HSACO "memcpy_2d_example_gfx1030.hsaco" -#define HSA_KERNEL "memcpy_2d_example_gfx1030" - -#define ABS(x) ((x) > 0 ? (x) : -1 * (x)) - -template -void rand_vec(Matrix_2d &matrix) -{ - static std::random_device rd; // seed - static std::mt19937 mt(rd()); - static std::uniform_real_distribution dist(-10.0, 10.0); - - for (size_t i = 0; i < matrix.rows; ++i) - for (size_t j = 0; j < matrix.cols + matrix.padding; ++j) { - if (j >= matrix.cols) - continue; - int offset = i * (matrix.cols + matrix.padding) + j; - matrix.data[offset] = dist(mt); - } -} - -template -static inline bool valid_vector(const Matrix_2d &host_in, const Matrix_2d &host_out, double nrms = 1e-6) -{ - double s0 = 0.0; - double s1 = 0.0; - int pp_err = 0; - - int rows = host_in.rows; - int cols = host_in.cols; - int padding = host_in.padding; - - for (int i = 0; i < rows; ++i) { - for (int j = 0; j < cols + padding; ++j) { - if (j >= cols) - continue; - double ri = (double)host_in.data[i * (cols + padding) + j]; - double pi = (double)host_out.data[i * (cols + padding) + j]; - double d = ri - pi; - double dd = d * d; - double rr = 2.0 * ri * ri; - s0 += dd; - s1 += rr; - double delta = ABS(ri - pi) / ri; - - if(delta > 3e-5) { - if(pp_err < 100) - printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", i, ri, pi,((uint32_t *)host_out.data)[i], delta); - pp_err++; - } - } - } - //printf("nrms:%lf, s0:%lf, s1:%lf\n",sqrt(s0/s1),s0,s1); - return (sqrt (s0 / s1) < nrms) && (pp_err==0); -} - -int main(int argc, char **argv) -{ - if (argc <= 1) { - std::cout << "no input file! please enter params.config" << std::endl; - return -1; - } - // parse params.config - Config config; - config.parseConfigFile(argv[1]); - - // add key-value to myStruct - assert(config.m_contents.find("rows") != config.m_contents.end() && "[error!] failed to parse rows!"); - assert(config.m_contents.find("cols") != config.m_contents.end() && "[error!] failed to parse cols!"); - assert(config.m_contents.find("padding") != config.m_contents.end() && "[error!] failed to parse padding!"); - - int rows = std::stoi(config.m_contents["rows"]); - int cols = std::stoi(config.m_contents["cols"]); - int padding = std::stoi(config.m_contents["padding"]); - - // judge cols legality - assert(cols % (256*8) == 0 && "[!]Only supports cols which is multiples of 2048(2K)"); - - Matrix_2d matrix_host_in(rows, cols, padding, 1); - Matrix_2d matrix_host_out(rows, cols, padding, 1); - Matrix_2d matrix_dev_in(rows, cols, padding, 0); - Matrix_2d matrix_dev_out(rows, cols, padding, 0); - - // kernel preparation - hipModule_t module; - hipFunction_t kernel_func; - hipEvent_t evt_00, evt_11; - HIP_CALL(hipSetDevice(0)); - - HIP_CALL(hipModuleLoad(&module, HSACO)); - HIP_CALL(hipModuleGetFunction(&kernel_func, module, HSA_KERNEL)); - - int num_cu; - int gcn_arch; - { - hipDeviceProp_t dev_prop; - hipDevice_t dev; - HIP_CALL(hipGetDevice(&dev)); - HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); - num_cu = dev_prop.multiProcessorCount; - gcn_arch = dev_prop.gcnArch; - if (gcn_arch >= 1000) - num_cu *= 2; - } - - int total_loop = 4; - int warm_ups = 2; - - // initial blockDim, gridDim - int bdx = 256; - int gdx = matrix_host_in.cols / (8 * bdx); - HIP_CALL(hipMalloc(&matrix_dev_in.data, sizeof(float) * matrix_dev_in.length)); - HIP_CALL(hipMalloc(&matrix_dev_out.data, sizeof(float) * matrix_dev_out.length)); - - // initial host in data - rand_vec(matrix_host_in); - - // memcpy data from host to device - HIP_CALL(hipMemcpy(matrix_dev_in.data, matrix_host_in.data, sizeof(float) * matrix_dev_in.length, hipMemcpyHostToDevice)); - printf("memcpy, input: %p, output: %p, floats: %d\n",matrix_dev_in.data, matrix_dev_out.data, matrix_dev_in.length); - - struct __attribute__((packed)) - { - float *input; - float *output; - int rows; - int gdx; - int bdx; - int padding; - } args; - - size_t arg_size = sizeof(args); - args.input = matrix_dev_in.data; - args.output = matrix_dev_out.data; - args.rows = rows; - args.gdx = gdx; - args.bdx = bdx; - args.padding = padding; - - void* config_kernel[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size, HIP_LAUNCH_PARAM_END}; - - // warm up kernel - for (int i = 0; i < warm_ups; i++) - HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); - - hipEventCreate(&evt_00); - hipEventCreate(&evt_11); - hipDeviceSynchronize(); - hipEventRecord(evt_00, NULL); - - // launch kernel - for(int i = 0; i < total_loop; i++) - HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); - - float elapsed_ms; - hipEventRecord(evt_11, NULL); - hipEventSynchronize(evt_11); - hipDeviceSynchronize(); - hipEventElapsedTime(&elapsed_ms, evt_00, evt_11); - hipEventDestroy(evt_00); - hipEventDestroy(evt_11); - - HIP_CALL(hipMemcpy(matrix_host_out.data, matrix_dev_out.data, sizeof(float) * matrix_host_out.length, hipMemcpyDeviceToHost)); - - // verification - bool is_valid = valid_vector(matrix_host_in, matrix_host_out); - if(!is_valid) - printf("Data not valid, please check\n"); - else - printf("Data is valid :)\n"); - - // evaluation - float time_per_loop_ms = elapsed_ms / total_loop; - float gbps = (matrix_host_out.rows * matrix_host_out.cols) * 2 * sizeof(float) / time_per_loop_ms / 1000 / 1000; - - std::cout << "---- MEMCPY 2D EXAMPLE EVALUATION ----" << std::endl; - std::cout << " rows: " << matrix_host_in.rows << '\t' << "cols: " << matrix_host_in.cols << '\t' << "padding: " << matrix_host_in.padding << std::endl; - std::cout << " gdx: " << gdx << '\t' << "bdx " << bdx << std::endl; - std::cout << " gbps: " << gbps << std::endl; - std::cout << "---- FINISH EVALUATION ----" << std::endl; - - hipFree(matrix_dev_in.data); - hipFree(matrix_dev_out.data); - - return 0; -} \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/config.h b/memcpy_2d_example_gfx1030/benchmark/config.h deleted file mode 100644 index a679238..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/config.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef CONFIG_ -#define CONFIG_ - -#include -#include - -template -class Matrix_2d -{ -public: - int rows; - int cols; - int padding; - int length; - bool type; //Host: 1; Device: 0 - T *data; -public: - Matrix_2d(): rows(0), cols(0), padding(0), length(0), type(0), data(nullptr) {} - Matrix_2d(int r, int c, int p, bool t): rows(r), cols(c), padding(p), type(t), length(r * (c + p)), data(new T[r * (c + p)]) {} - ~Matrix_2d() { - if (data != nullptr && type) { - delete [] data; - } - } - void initMem() { - if (!data) - data = new T[rows * (cols + padding)]; - } -}; - -class Config -{ -public: - char m_delimiter; - std::unordered_map m_contents; -public: - struct FILE_NOT_FOUND - { - std::string filename; - FILE_NOT_FOUND(const std::string& filename_ = std::string()): filename(filename_) {} - }; -public: - Config(): m_delimiter('=') {} - void parseConfigFile(const char *fileName); - void split(const std::string &line); -private: - void deleteSpace(std::string &str); -}; - - -#endif \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s deleted file mode 100644 index 6b3007e..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s +++ /dev/null @@ -1,131 +0,0 @@ -.text -.global memcpy_2d_example_gfx1030 -.p2align 8 -.type memcpy_2d_example_gfx1030,@function -memcpy_2d_example_gfx1030: -; This is just an example, not the optimal one -.set s_karg, 0 ; kernel argument -.set s_bx, 2 ; blockIdx - -.set s_ptr_in, 4 -.set s_ptr_out, 6 -.set s_rows, 8 -.set s_gdx, 10 -.set s_bdx, 12 -.set s_padding, 14 -.set s_stride_block, 16 -.set s_tmp, 18 - -.set v_buf, 0 -.set v_offset, 16 -.set v_tmp, 32 - - ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm - ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12 - ; s_waitcnt lgkmcnt(0) - ; s_lshr_b32 s[s_gdx], s[s_gdx], 8 - ; s_mov_b32 s[s_gdx], 72 ; num_cu - - s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0 - s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8 - s_load_dword s[s_rows], s[s_karg:s_karg+1], 16 - s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20 - s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 - s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 - - s_waitcnt lgkmcnt(0) - - s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 - s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 - v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 - v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 - - s_waitcnt lgkmcnt(0) - - - s_mul_i32 s[s_tmp], s[s_gdx], s[s_bdx+1] ; gridDim * blockDim * 4 - v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0] - v_add_nc_u32 v[v_offset+2], s[s_tmp], v[v_offset+1] - v_add_nc_u32 v[v_offset+3], s[s_tmp], v[v_offset+2] - v_add_nc_u32 v[v_offset+4], s[s_tmp], v[v_offset+3] - v_add_nc_u32 v[v_offset+5], s[s_tmp], v[v_offset+4] - v_add_nc_u32 v[v_offset+6], s[s_tmp], v[v_offset+5] - v_add_nc_u32 v[v_offset+7], s[s_tmp], v[v_offset+6] - s_lshl_b32 s[s_stride_block], s[s_tmp], 3 ; unroll 8, gridDim*blockDim*4*workload - -label_memcopy_start: - global_load_dword v[v_buf+0], v[v_offset+0], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+1], v[v_offset+1], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+2], v[v_offset+2], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+3], v[v_offset+3], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+4], v[v_offset+4], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+5], v[v_offset+5], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+6], v[v_offset+6], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+7], v[v_offset+7], s[s_ptr_in:s_ptr_in+1] - - ; add stride padding - s_mul_i32 s[s_padding+1], s[s_padding], 4 ; padding * 4 - s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block] ; gridDim*blockDim*4*workload + padding*4 - s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in] - s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0 - - s_waitcnt vmcnt(0) - - global_store_dword v[v_offset+0], v[v_buf+0], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+1], v[v_buf+1], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+2], v[v_buf+2], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+3], v[v_buf+3], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+4], v[v_buf+4], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+5], v[v_buf+5], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+6], v[v_buf+6], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+7], v[v_buf+7], s[s_ptr_out:s_ptr_out+1] - - s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out] - s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0 - - s_sub_u32 s[s_rows], s[s_rows], 1 - s_cmp_eq_u32 s[s_rows], 0 - s_waitcnt vmcnt(0) - s_cbranch_scc0 label_memcopy_start - s_endpgm - -.rodata -.p2align 6 -.amdhsa_kernel memcpy_2d_example_gfx1030 - .amdhsa_group_segment_fixed_size 0 - .amdhsa_user_sgpr_dispatch_ptr 0 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 32 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 - .amdhsa_wavefront_size32 1 - .amdhsa_workgroup_processor_mode 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: memcpy_2d_example_gfx1030 - .symbol: memcpy_2d_example_gfx1030.kd - .sgpr_count: 32 - .vgpr_count: 64 - .kernarg_segment_align: 8 - .kernarg_segment_size: 32 - .group_segment_fixed_size: 0 - .private_segment_fixed_size: 0 - .wavefront_size: 32 ;warpsize - ; .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 ;gridsize - .args: - - { .name: input, .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: output, .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: rows, .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32} - - { .name: gdx, .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32} - - { .name: bdx, .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/memcpy_2d_example_gfx1030/benchmark/params.config b/memcpy_2d_example_gfx1030/benchmark/params.config deleted file mode 100644 index 90a5acb..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/params.config +++ /dev/null @@ -1,5 +0,0 @@ -[2d_matrix_paramters] - -rows = 512 -cols = 147456 -padding = 32768 \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/parser.cpp b/memcpy_2d_example_gfx1030/benchmark/parser.cpp deleted file mode 100644 index d2dd054..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/parser.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include -#include - -#include "config.h" - -void Config::parseConfigFile(const char *fileName) -{ - std::ifstream in(fileName); - if (!in) - throw FILE_NOT_FOUND(); - - std::string line; - while (!in.eof()) { - getline(in, line); - if (line.find('[') == 0 || line.find(' ') == 0) - continue; - split(line); - } -} - -void Config::split(const std::string &line) -{ - std::size_t previous = 0; - std::size_t current = line.find(this->m_delimiter); - std::size_t length = line.length(); - if (current != std::string::npos) { - std::string key = line.substr(previous, current - previous); - std::string value = line.substr(current + 1, length); - deleteSpace(key); - deleteSpace(value); - this->m_contents[key] = value; - } -} - -void Config::deleteSpace(std::string &str) -{ - int j = 0; - for (int i = 0; i < str.length(); i++) { - if (str[i] == ' ') - continue; - str[j++] = str[i]; - } - str.resize(j); -} - diff --git a/memcpy_2d_example_gfx1030/benchmark/run.sh b/memcpy_2d_example_gfx1030/benchmark/run.sh deleted file mode 100755 index cd16d98..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh - -# g++ -std=c++11 benchmark.cpp parser.cpp -o out - -KSRC=memcpy_2d_example_gfx1030.s -KOUT=memcpy_2d_example_gfx1030.hsaco -SRC1=benchmark.cpp -SRC2=parser.cpp - -TARGET=out.exe - -# pre-delete the previous KOUT -rm -rf $KOUT -/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT - -rm -rf $TARGET -/opt/rocm/hip/bin/hipcc $SRC1 $SRC2 -mcpu=gfx1030 -o $TARGET \ No newline at end of file From ac6590e71ffd485d97f5c2f2a6e3f4d95c7807de Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Thu, 14 Jul 2022 15:59:58 +0800 Subject: [PATCH 08/15] Update README.md --- memcpy_2d_example_gfx1030/README.md | 1 + memcpy_2d_example_gfx1030/benchmark/README.md | 28 --- .../benchmark/benchmark.cpp | 203 ------------------ memcpy_2d_example_gfx1030/benchmark/config.h | 51 ----- .../benchmark/memcpy_2d_example_gfx1030.s | 131 ----------- .../benchmark/params.config | 5 - .../benchmark/parser.cpp | 46 ---- memcpy_2d_example_gfx1030/benchmark/run.sh | 17 -- memcpy_2d_example_gfx1030/main.cpp | 5 +- 9 files changed, 4 insertions(+), 483 deletions(-) create mode 100644 memcpy_2d_example_gfx1030/README.md delete mode 100644 memcpy_2d_example_gfx1030/benchmark/README.md delete mode 100644 memcpy_2d_example_gfx1030/benchmark/benchmark.cpp delete mode 100644 memcpy_2d_example_gfx1030/benchmark/config.h delete mode 100644 memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s delete mode 100644 memcpy_2d_example_gfx1030/benchmark/params.config delete mode 100644 memcpy_2d_example_gfx1030/benchmark/parser.cpp delete mode 100755 memcpy_2d_example_gfx1030/benchmark/run.sh diff --git a/memcpy_2d_example_gfx1030/README.md b/memcpy_2d_example_gfx1030/README.md new file mode 100644 index 0000000..c267824 --- /dev/null +++ b/memcpy_2d_example_gfx1030/README.md @@ -0,0 +1 @@ +# See README in benchmark \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/README.md b/memcpy_2d_example_gfx1030/benchmark/README.md deleted file mode 100644 index f7318c8..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# memcpy 2d benchmark - -This is a benchmark for memcpy-2d-1Dword example on gfx1030. You can specify matrix parameters (rows, cols, padding) by modifying params.config. -Here rows and padding can pass in any value, while cols currently only support multiples of 2048 (2K). - -## build and run -Go to the benchmark root and build by -''' -$ ./run.sh -''' -Then you can run by -''' -$ ./out.exe params.config -''' - -## conclusion -I have tested in different paramter combination, which shows: -| Rows | Cols | Padding | GBPS | -| :--: | :--: | :-----: | :--: | -| 128 | 147456 | 1024 | 254.202 | -| 256 | 147456 | 1024 | 309.121 | -| 512 | 147456 | 1024 | 368.616 | -| 512 | 73728 | 1024 | 316.32 | -| 512 | 184320 | 1024 | 349.346 | -| 512 | 147456 | 0 | 365.193 | -| 512 | 147456 | 1 | 226.867 | -| 512 | 147456 | 8192 | 367.966 | -| 512 | 147456 | 32768 | 363.132 | \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp b/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp deleted file mode 100644 index da9b8ea..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/benchmark.cpp +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#define HIP_CALL(call) do{ \ - hipError_t err = call; \ - if(err != hipSuccess){ \ - printf("[hiperror](%d) fail to call %s",(int)err,#call); \ - exit(0); \ - } \ -} while(0) - -#define HSACO "memcpy_2d_example_gfx1030.hsaco" -#define HSA_KERNEL "memcpy_2d_example_gfx1030" - -#define ABS(x) ((x) > 0 ? (x) : -1 * (x)) - -template -void rand_vec(Matrix_2d &matrix) -{ - static std::random_device rd; // seed - static std::mt19937 mt(rd()); - static std::uniform_real_distribution dist(-10.0, 10.0); - - for (size_t i = 0; i < matrix.rows; ++i) - for (size_t j = 0; j < matrix.cols + matrix.padding; ++j) { - if (j >= matrix.cols) - continue; - int offset = i * (matrix.cols + matrix.padding) + j; - matrix.data[offset] = dist(mt); - } -} - -template -static inline bool valid_vector(const Matrix_2d &host_in, const Matrix_2d &host_out, double nrms = 1e-6) -{ - double s0 = 0.0; - double s1 = 0.0; - int pp_err = 0; - - int rows = host_in.rows; - int cols = host_in.cols; - int padding = host_in.padding; - - for (int i = 0; i < rows; ++i) { - for (int j = 0; j < cols + padding; ++j) { - if (j >= cols) - continue; - double ri = (double)host_in.data[i * (cols + padding) + j]; - double pi = (double)host_out.data[i * (cols + padding) + j]; - double d = ri - pi; - double dd = d * d; - double rr = 2.0 * ri * ri; - s0 += dd; - s1 += rr; - double delta = ABS(ri - pi) / ri; - - if(delta > 3e-5) { - if(pp_err < 100) - printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", i, ri, pi,((uint32_t *)host_out.data)[i], delta); - pp_err++; - } - } - } - //printf("nrms:%lf, s0:%lf, s1:%lf\n",sqrt(s0/s1),s0,s1); - return (sqrt (s0 / s1) < nrms) && (pp_err==0); -} - -int main(int argc, char **argv) -{ - if (argc <= 1) { - std::cout << "no input file! please enter params.config" << std::endl; - return -1; - } - // parse params.config - Config config; - config.parseConfigFile(argv[1]); - - // add key-value to myStruct - assert(config.m_contents.find("rows") != config.m_contents.end() && "[error!] failed to parse rows!"); - assert(config.m_contents.find("cols") != config.m_contents.end() && "[error!] failed to parse cols!"); - assert(config.m_contents.find("padding") != config.m_contents.end() && "[error!] failed to parse padding!"); - - int rows = std::stoi(config.m_contents["rows"]); - int cols = std::stoi(config.m_contents["cols"]); - int padding = std::stoi(config.m_contents["padding"]); - - // judge cols legality - assert(cols % (256*8) == 0 && "[!]Only supports cols which is multiples of 2048(2K)"); - - Matrix_2d matrix_host_in(rows, cols, padding, 1); - Matrix_2d matrix_host_out(rows, cols, padding, 1); - Matrix_2d matrix_dev_in(rows, cols, padding, 0); - Matrix_2d matrix_dev_out(rows, cols, padding, 0); - - // kernel preparation - hipModule_t module; - hipFunction_t kernel_func; - hipEvent_t evt_00, evt_11; - HIP_CALL(hipSetDevice(0)); - - HIP_CALL(hipModuleLoad(&module, HSACO)); - HIP_CALL(hipModuleGetFunction(&kernel_func, module, HSA_KERNEL)); - - int num_cu; - int gcn_arch; - { - hipDeviceProp_t dev_prop; - hipDevice_t dev; - HIP_CALL(hipGetDevice(&dev)); - HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); - num_cu = dev_prop.multiProcessorCount; - gcn_arch = dev_prop.gcnArch; - if (gcn_arch >= 1000) - num_cu *= 2; - } - - int total_loop = 4; - int warm_ups = 2; - - // initial blockDim, gridDim - int bdx = 256; - int gdx = matrix_host_in.cols / (8 * bdx); - HIP_CALL(hipMalloc(&matrix_dev_in.data, sizeof(float) * matrix_dev_in.length)); - HIP_CALL(hipMalloc(&matrix_dev_out.data, sizeof(float) * matrix_dev_out.length)); - - // initial host in data - rand_vec(matrix_host_in); - - // memcpy data from host to device - HIP_CALL(hipMemcpy(matrix_dev_in.data, matrix_host_in.data, sizeof(float) * matrix_dev_in.length, hipMemcpyHostToDevice)); - printf("memcpy, input: %p, output: %p, floats: %d\n",matrix_dev_in.data, matrix_dev_out.data, matrix_dev_in.length); - - struct __attribute__((packed)) - { - float *input; - float *output; - int rows; - int gdx; - int bdx; - int padding; - } args; - - size_t arg_size = sizeof(args); - args.input = matrix_dev_in.data; - args.output = matrix_dev_out.data; - args.rows = rows; - args.gdx = gdx; - args.bdx = bdx; - args.padding = padding; - - void* config_kernel[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size, HIP_LAUNCH_PARAM_END}; - - // warm up kernel - for (int i = 0; i < warm_ups; i++) - HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); - - hipEventCreate(&evt_00); - hipEventCreate(&evt_11); - hipDeviceSynchronize(); - hipEventRecord(evt_00, NULL); - - // launch kernel - for(int i = 0; i < total_loop; i++) - HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); - - float elapsed_ms; - hipEventRecord(evt_11, NULL); - hipEventSynchronize(evt_11); - hipDeviceSynchronize(); - hipEventElapsedTime(&elapsed_ms, evt_00, evt_11); - hipEventDestroy(evt_00); - hipEventDestroy(evt_11); - - HIP_CALL(hipMemcpy(matrix_host_out.data, matrix_dev_out.data, sizeof(float) * matrix_host_out.length, hipMemcpyDeviceToHost)); - - // verification - bool is_valid = valid_vector(matrix_host_in, matrix_host_out); - if(!is_valid) - printf("Data not valid, please check\n"); - else - printf("Data is valid :)\n"); - - // evaluation - float time_per_loop_ms = elapsed_ms / total_loop; - float gbps = (matrix_host_out.rows * matrix_host_out.cols) * 2 * sizeof(float) / time_per_loop_ms / 1000 / 1000; - - std::cout << "---- MEMCPY 2D EXAMPLE EVALUATION ----" << std::endl; - std::cout << " rows: " << matrix_host_in.rows << '\t' << "cols: " << matrix_host_in.cols << '\t' << "padding: " << matrix_host_in.padding << std::endl; - std::cout << " gdx: " << gdx << '\t' << "bdx " << bdx << std::endl; - std::cout << " gbps: " << gbps << std::endl; - std::cout << "---- FINISH EVALUATION ----" << std::endl; - - hipFree(matrix_dev_in.data); - hipFree(matrix_dev_out.data); - - return 0; -} \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/config.h b/memcpy_2d_example_gfx1030/benchmark/config.h deleted file mode 100644 index a679238..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/config.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef CONFIG_ -#define CONFIG_ - -#include -#include - -template -class Matrix_2d -{ -public: - int rows; - int cols; - int padding; - int length; - bool type; //Host: 1; Device: 0 - T *data; -public: - Matrix_2d(): rows(0), cols(0), padding(0), length(0), type(0), data(nullptr) {} - Matrix_2d(int r, int c, int p, bool t): rows(r), cols(c), padding(p), type(t), length(r * (c + p)), data(new T[r * (c + p)]) {} - ~Matrix_2d() { - if (data != nullptr && type) { - delete [] data; - } - } - void initMem() { - if (!data) - data = new T[rows * (cols + padding)]; - } -}; - -class Config -{ -public: - char m_delimiter; - std::unordered_map m_contents; -public: - struct FILE_NOT_FOUND - { - std::string filename; - FILE_NOT_FOUND(const std::string& filename_ = std::string()): filename(filename_) {} - }; -public: - Config(): m_delimiter('=') {} - void parseConfigFile(const char *fileName); - void split(const std::string &line); -private: - void deleteSpace(std::string &str); -}; - - -#endif \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s deleted file mode 100644 index 6b3007e..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/memcpy_2d_example_gfx1030.s +++ /dev/null @@ -1,131 +0,0 @@ -.text -.global memcpy_2d_example_gfx1030 -.p2align 8 -.type memcpy_2d_example_gfx1030,@function -memcpy_2d_example_gfx1030: -; This is just an example, not the optimal one -.set s_karg, 0 ; kernel argument -.set s_bx, 2 ; blockIdx - -.set s_ptr_in, 4 -.set s_ptr_out, 6 -.set s_rows, 8 -.set s_gdx, 10 -.set s_bdx, 12 -.set s_padding, 14 -.set s_stride_block, 16 -.set s_tmp, 18 - -.set v_buf, 0 -.set v_offset, 16 -.set v_tmp, 32 - - ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm - ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12 - ; s_waitcnt lgkmcnt(0) - ; s_lshr_b32 s[s_gdx], s[s_gdx], 8 - ; s_mov_b32 s[s_gdx], 72 ; num_cu - - s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0 - s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8 - s_load_dword s[s_rows], s[s_karg:s_karg+1], 16 - s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20 - s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24 - s_load_dword s[s_padding], s[s_karg:s_karg+1], 28 - - s_waitcnt lgkmcnt(0) - - s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 - s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 - v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 - v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 - - s_waitcnt lgkmcnt(0) - - - s_mul_i32 s[s_tmp], s[s_gdx], s[s_bdx+1] ; gridDim * blockDim * 4 - v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0] - v_add_nc_u32 v[v_offset+2], s[s_tmp], v[v_offset+1] - v_add_nc_u32 v[v_offset+3], s[s_tmp], v[v_offset+2] - v_add_nc_u32 v[v_offset+4], s[s_tmp], v[v_offset+3] - v_add_nc_u32 v[v_offset+5], s[s_tmp], v[v_offset+4] - v_add_nc_u32 v[v_offset+6], s[s_tmp], v[v_offset+5] - v_add_nc_u32 v[v_offset+7], s[s_tmp], v[v_offset+6] - s_lshl_b32 s[s_stride_block], s[s_tmp], 3 ; unroll 8, gridDim*blockDim*4*workload - -label_memcopy_start: - global_load_dword v[v_buf+0], v[v_offset+0], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+1], v[v_offset+1], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+2], v[v_offset+2], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+3], v[v_offset+3], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+4], v[v_offset+4], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+5], v[v_offset+5], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+6], v[v_offset+6], s[s_ptr_in:s_ptr_in+1] - global_load_dword v[v_buf+7], v[v_offset+7], s[s_ptr_in:s_ptr_in+1] - - ; add stride padding - s_mul_i32 s[s_padding+1], s[s_padding], 4 ; padding * 4 - s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block] ; gridDim*blockDim*4*workload + padding*4 - s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in] - s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0 - - s_waitcnt vmcnt(0) - - global_store_dword v[v_offset+0], v[v_buf+0], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+1], v[v_buf+1], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+2], v[v_buf+2], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+3], v[v_buf+3], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+4], v[v_buf+4], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+5], v[v_buf+5], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+6], v[v_buf+6], s[s_ptr_out:s_ptr_out+1] - global_store_dword v[v_offset+7], v[v_buf+7], s[s_ptr_out:s_ptr_out+1] - - s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out] - s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0 - - s_sub_u32 s[s_rows], s[s_rows], 1 - s_cmp_eq_u32 s[s_rows], 0 - s_waitcnt vmcnt(0) - s_cbranch_scc0 label_memcopy_start - s_endpgm - -.rodata -.p2align 6 -.amdhsa_kernel memcpy_2d_example_gfx1030 - .amdhsa_group_segment_fixed_size 0 - .amdhsa_user_sgpr_dispatch_ptr 0 - .amdhsa_user_sgpr_kernarg_segment_ptr 1 - .amdhsa_system_sgpr_workgroup_id_x 1 - .amdhsa_system_vgpr_workitem_id 0 - .amdhsa_next_free_vgpr 64 - .amdhsa_next_free_sgpr 32 - .amdhsa_ieee_mode 0 - .amdhsa_dx10_clamp 0 - .amdhsa_wavefront_size32 1 - .amdhsa_workgroup_processor_mode 0 -.end_amdhsa_kernel - -.amdgpu_metadata ---- -amdhsa.version: [ 1, 0 ] -amdhsa.kernels: - - .name: memcpy_2d_example_gfx1030 - .symbol: memcpy_2d_example_gfx1030.kd - .sgpr_count: 32 - .vgpr_count: 64 - .kernarg_segment_align: 8 - .kernarg_segment_size: 32 - .group_segment_fixed_size: 0 - .private_segment_fixed_size: 0 - .wavefront_size: 32 ;warpsize - ; .reqd_workgroup_size : [256, 1, 1] - .max_flat_workgroup_size: 256 ;gridsize - .args: - - { .name: input, .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: output, .size: 8, .offset: 8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} - - { .name: rows, .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32} - - { .name: gdx, .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32} - - { .name: bdx, .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32} - - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32} -... -.end_amdgpu_metadata diff --git a/memcpy_2d_example_gfx1030/benchmark/params.config b/memcpy_2d_example_gfx1030/benchmark/params.config deleted file mode 100644 index 90a5acb..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/params.config +++ /dev/null @@ -1,5 +0,0 @@ -[2d_matrix_paramters] - -rows = 512 -cols = 147456 -padding = 32768 \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/benchmark/parser.cpp b/memcpy_2d_example_gfx1030/benchmark/parser.cpp deleted file mode 100644 index d2dd054..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/parser.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include -#include - -#include "config.h" - -void Config::parseConfigFile(const char *fileName) -{ - std::ifstream in(fileName); - if (!in) - throw FILE_NOT_FOUND(); - - std::string line; - while (!in.eof()) { - getline(in, line); - if (line.find('[') == 0 || line.find(' ') == 0) - continue; - split(line); - } -} - -void Config::split(const std::string &line) -{ - std::size_t previous = 0; - std::size_t current = line.find(this->m_delimiter); - std::size_t length = line.length(); - if (current != std::string::npos) { - std::string key = line.substr(previous, current - previous); - std::string value = line.substr(current + 1, length); - deleteSpace(key); - deleteSpace(value); - this->m_contents[key] = value; - } -} - -void Config::deleteSpace(std::string &str) -{ - int j = 0; - for (int i = 0; i < str.length(); i++) { - if (str[i] == ' ') - continue; - str[j++] = str[i]; - } - str.resize(j); -} - diff --git a/memcpy_2d_example_gfx1030/benchmark/run.sh b/memcpy_2d_example_gfx1030/benchmark/run.sh deleted file mode 100755 index cd16d98..0000000 --- a/memcpy_2d_example_gfx1030/benchmark/run.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh - -# g++ -std=c++11 benchmark.cpp parser.cpp -o out - -KSRC=memcpy_2d_example_gfx1030.s -KOUT=memcpy_2d_example_gfx1030.hsaco -SRC1=benchmark.cpp -SRC2=parser.cpp - -TARGET=out.exe - -# pre-delete the previous KOUT -rm -rf $KOUT -/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT - -rm -rf $TARGET -/opt/rocm/hip/bin/hipcc $SRC1 $SRC2 -mcpu=gfx1030 -o $TARGET \ No newline at end of file diff --git a/memcpy_2d_example_gfx1030/main.cpp b/memcpy_2d_example_gfx1030/main.cpp index bfbd56f..e008e6f 100644 --- a/memcpy_2d_example_gfx1030/main.cpp +++ b/memcpy_2d_example_gfx1030/main.cpp @@ -11,8 +11,8 @@ } \ } while(0) -#define HSACO "memcpy_2d_x4_example_gfx1030.hsaco" -#define HSA_KERNEL "memcpy_2d_x4_example_gfx1030" +#define HSACO "memcpy_2d_example_gfx1030.hsaco" +#define HSA_KERNEL "memcpy_2d_example_gfx1030" #define PER_PIXEL_CHECK #define ASSERT_ON_FAIL @@ -175,6 +175,7 @@ int main(int argc, char **argv) } float time_per_loop_ms = elapsed_ms / total_loop; + float total_time = elapsed_ms; float gbps = total_floats * 2 * sizeof(float) / time_per_loop_ms / 1000 / 1000; printf("gbps:%f\n",gbps); From 3f14b0107acdabae18ed73258f79bc4e4c33acaf Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Fri, 15 Jul 2022 18:33:04 +0800 Subject: [PATCH 09/15] delete readme --- memcpy_2d_example_gfx1030/README.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 memcpy_2d_example_gfx1030/README.md diff --git a/memcpy_2d_example_gfx1030/README.md b/memcpy_2d_example_gfx1030/README.md deleted file mode 100644 index c267824..0000000 --- a/memcpy_2d_example_gfx1030/README.md +++ /dev/null @@ -1 +0,0 @@ -# See README in benchmark \ No newline at end of file From 4f55dc37380f273b79df6bd35f1612a70634b09a Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Mon, 18 Jul 2022 12:00:48 +0800 Subject: [PATCH 10/15] add command line parsing --- memcpy_2d_example_gfx1030/main.cpp | 294 +++++++++++++++++++---------- 1 file changed, 198 insertions(+), 96 deletions(-) diff --git a/memcpy_2d_example_gfx1030/main.cpp b/memcpy_2d_example_gfx1030/main.cpp index e008e6f..2f1b004 100644 --- a/memcpy_2d_example_gfx1030/main.cpp +++ b/memcpy_2d_example_gfx1030/main.cpp @@ -1,135 +1,136 @@ -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#define HIP_CALL(call) do{ \ - hipError_t err = call; \ - if(err != hipSuccess){ \ - printf("[hiperror](%d) fail to call %s",(int)err,#call); \ - exit(0); \ - } \ -} while(0) +#include "config.h" #define HSACO "memcpy_2d_example_gfx1030.hsaco" #define HSA_KERNEL "memcpy_2d_example_gfx1030" -#define PER_PIXEL_CHECK -#define ASSERT_ON_FAIL - -#ifndef ABS #define ABS(x) ((x) > 0 ? (x) : -1 * (x)) -#endif template -void rand_vec(T * seq, size_t rows, size_t cols, size_t padding) +void rand_vec(Matrix_2d &matrix) { static std::random_device rd; // seed static std::mt19937 mt(rd()); static std::uniform_real_distribution dist(-10.0, 10.0); - for (size_t i = 0; i < rows; ++i) - for (size_t j = 0; j < cols + padding; ++j) { - if (j >= cols) + for (size_t i = 0; i < matrix.rows; ++i) + for (size_t j = 0; j < matrix.cols + matrix.padding; ++j) { + if (j >= matrix.cols) continue; - int offset = i * (cols + padding) + j; - seq[offset] = dist(mt); + int offset = i * (matrix.cols + matrix.padding) + j; + matrix.data[offset] = dist(mt); } } -static inline bool valid_vector(const float* ref, const float* pred, int rows, int cols, int padding, double nrms = 1e-6) +template +static inline bool valid_vector(const Matrix_2d &host_in, const Matrix_2d &host_out, double nrms = 1e-6) { double s0 = 0.0; double s1 = 0.0; -#ifdef PER_PIXEL_CHECK int pp_err = 0; -#endif + + int rows = host_in.rows; + int cols = host_in.cols; + int padding = host_in.padding; + for (int i = 0; i < rows; ++i) { for (int j = 0; j < cols + padding; ++j) { if (j >= cols) continue; - double ri = (double)ref[i * (cols + padding) + j]; - double pi = (double)pred[i * (cols + padding) + j]; + double ri = (double)host_in.data[i * (cols + padding) + j]; + double pi = (double)host_out.data[i * (cols + padding) + j]; double d = ri - pi; double dd = d * d; double rr = 2.0 * ri * ri; s0 += dd; s1 += rr; -#ifdef PER_PIXEL_CHECK double delta = ABS(ri - pi) / ri; + if(delta > 3e-5) { -#ifdef ASSERT_ON_FAIL if(pp_err < 100) - printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", i, ri, pi,((uint32_t *)pred)[i], delta); -#endif - pp_err++; - } -#endif + printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", i, ri, pi,((uint32_t *)host_out.data)[i], delta); + pp_err++; + } } } //printf("nrms:%lf, s0:%lf, s1:%lf\n",sqrt(s0/s1),s0,s1); - return (sqrt(s0/s1) matrix_host_in(rows, cols, padding, 1); + Matrix_2d matrix_host_out(rows, cols, padding, 1); + Matrix_2d matrix_dev_in(rows, cols, padding, 0); + Matrix_2d matrix_dev_out(rows, cols, padding, 0); + + // kernel preparation hipModule_t module; hipFunction_t kernel_func; - hipEvent_t evt_00, evt_11; + hipEvent_t evt_00, evt_11; HIP_CALL(hipSetDevice(0)); - // loads code object from file into a hipModule_t HIP_CALL(hipModuleLoad(&module, HSACO)); - // extract HSA_KERNEL in module - HIP_CALL(hipModuleGetFunction(&kernel_func, module, HSA_KERNEL)); - + HIP_CALL(hipModuleGetFunction(&kernel_func, module, HSA_KERNEL)); + int num_cu; int gcn_arch; { hipDeviceProp_t dev_prop; hipDevice_t dev; - HIP_CALL(hipGetDevice( &dev )); - HIP_CALL(hipGetDeviceProperties( &dev_prop, dev )); + HIP_CALL(hipGetDevice(&dev)); + HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); num_cu = dev_prop.multiProcessorCount; gcn_arch = dev_prop.gcnArch; if (gcn_arch >= 1000) num_cu *= 2; - // std::cout << "num_cu: " << num_cu << std::endl; } int total_loop = 4; int warm_ups = 2; - int i; - + // initial blockDim, gridDim int bdx = 256; - int gdx = num_cu; + int gdx = matrix_host_in.cols / (8 * bdx); + HIP_CALL(hipMalloc(&matrix_dev_in.data, sizeof(float) * matrix_dev_in.length)); + HIP_CALL(hipMalloc(&matrix_dev_out.data, sizeof(float) * matrix_dev_out.length)); - // initial matrix parameters - int rows = 512; - int cols = bdx * gdx * 8; - int padding = 256; + // initial host in data + rand_vec(matrix_host_in); - float *host_in, *host_out, *dev_in, *dev_out; + // memcpy data from host to device + HIP_CALL(hipMemcpy(matrix_dev_in.data, matrix_host_in.data, sizeof(float) * matrix_dev_in.length, hipMemcpyHostToDevice)); + // printf("Memcpy: input: %p, output: %p, floats: %d\n",matrix_dev_in.data, matrix_dev_out.data, matrix_dev_in.length); - int total_floats = rows * (cols + padding); - - host_in = new float[total_floats]; - host_out = new float[total_floats]; - HIP_CALL(hipMalloc(&dev_in, sizeof(float) * total_floats)); - HIP_CALL(hipMalloc(&dev_out, sizeof(float) * total_floats)); - - rand_vec(host_in, rows, cols, padding); - - HIP_CALL(hipMemcpy(dev_in, host_in, sizeof(float) * total_floats, hipMemcpyHostToDevice)); - - printf("memcpy, input:%p, output:%p, floats:%d\n",dev_in, dev_out, total_floats); - - struct __attribute__((packed)) { + struct __attribute__((packed)) + { float *input; float *output; int rows; @@ -137,50 +138,151 @@ int main(int argc, char **argv) int bdx; int padding; } args; + size_t arg_size = sizeof(args); - args.input = dev_in; - args.output = dev_out; + args.input = matrix_dev_in.data; + args.output = matrix_dev_out.data; args.rows = rows; args.gdx = gdx; args.bdx = bdx; - args.padding = padding; + args.padding = padding; - void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, - &arg_size, HIP_LAUNCH_PARAM_END}; + void* config_kernel[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size, HIP_LAUNCH_PARAM_END}; - for (i = 0; i < warm_ups; i++) - HIP_CALL(hipModuleLaunchKernel( kernel_func, gdx,1,1, bdx,1,1, 0, 0, NULL, (void**)&config )); + // warm up kernel + for (int i = 0; i < warm_ups; i++) + HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel)); hipEventCreate(&evt_00); - hipEventCreate(&evt_11); - + hipEventCreate(&evt_11); hipDeviceSynchronize(); hipEventRecord(evt_00, NULL); - for(i=0;i> default_setting = { + {1, 2048, 0}, // 8Kb, no padding + {2, 131072, 0}, // 1Mb, no padding + {256, 131072, 0}, // 128Mb, no padding + {512, 262144, 0}, // 512Mb, no padding + {1024, 262144, 0}, // 1G, no padding + + {1, 2048, 5120}, // 8kb, add padding + {2, 131072, 5120}, // 1Mb, no padding + {256, 131072, 5120}, // 128Mb, add padding + {512, 262144, 5120}, // 512Mb, add padding + {1024, 262144, 5120}, // 1G, add padding +}; + + +int main(int argc, char **argv) +{ + int rows; + int cols; + int padding; + + if (argc <= 1) { + std::cout << " No Input Parameters! Running Benchmark With Default Settings " << std::endl; + std::cout << " ---- MEMCPY 2D EXAMPLE BENCHMARK ---- " << std::endl; + + Result res; + + for (int i = 0; i < default_setting.size(); ++i) { + rows = default_setting[i][0]; + cols = default_setting[i][1]; + padding = default_setting[i][2]; + + run_kernel(rows, cols, padding, res); + + int length = rows * cols * sizeof(float); + + if (length >= 1024 && length < 1024 * 1024) + std::cout << " Input Size: " << length / 1024 << " Kb" << std::endl; + else if (length >= 1024 * 1024 && length < 1024 * 1024 * 1024) + std::cout << " Input Size: " << length / 1024 / 1024 << " Mb" << std::endl; + else if (length >= 1024 * 1024 * 1024) + std::cout << " Input Size: " << length / 1024 / 1024 / 1024 <<" Gb" << std::endl; + + std::cout << " Settings: rows: " << rows << '\t' << "cols: " << cols << '\t' << "padding: " << padding << std::endl; + std::cout << " gdx: " << res.gdx << '\t' << "bdx: " << res.bdx << std::endl; + if (res.isValid) + std::cout << " Result: " << "Is Valid"; + else + std::cout << " Result: " << "Not Valid"; + std::cout << " gbps: " << res.gbps << std::endl; + std::cout << std::endl; + } + + std::cout << " ---- END OF BENCHMARK ---- " << std::endl; + } + + else { + const char *optString = "r::c::p::"; + const struct option long_options[] = + { {"help", 0, NULL, 'h'}, + {"rows", optional_argument, NULL, 'r'}, + {"cols", optional_argument, NULL, 'c'}, + {"padding", optional_argument, NULL,'p'}, + {NULL, 0, NULL, 0} + }; + + int ch; + + while((ch = getopt_long(argc, argv, optString, long_options, NULL)) != -1){ + switch(ch) { + case 'h': print_usage(stdout, 0); break; + case 'r': rows = atoi(optarg); break; + case 'c': cols = atoi(optarg); break; + case 'p': padding = atoi(optarg); break; + case -1: break; + default: abort(); + } + } + + Result res; + run_kernel(rows, cols, padding, res); + + std::cout << std::endl; + std::cout << " ---- MEMCPY 2D EXAMPLE BENCHMARK ---- " << std::endl; + std::cout << " Input Size: " << rows * cols * sizeof(float) <<" bytes" << std::endl; + std::cout << " Settings: rows: " << rows << '\t' << "cols: " << cols << '\t' << "padding: " << padding << std::endl; + std::cout << " gdx: " << res.gdx << '\t' << "bdx: " << res.bdx << std::endl; + if (res.isValid) + std::cout << " Result: " << "Is Valid"; + else + std::cout << " Result: " << "Not Valid"; + std::cout << " gbps: " << res.gbps << std::endl; + std::cout << " ---- END OF BENCHMARK ---- " << std::endl; + } + + return 0; +} \ No newline at end of file From 1aaa2dca9b8416e9d3de4ba26d2eb84f6c4ab25d Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Mon, 18 Jul 2022 12:03:31 +0800 Subject: [PATCH 11/15] --amend --- memcpy_2d_example_gfx1030/build_clanghipcc.h | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100755 memcpy_2d_example_gfx1030/build_clanghipcc.h diff --git a/memcpy_2d_example_gfx1030/build_clanghipcc.h b/memcpy_2d_example_gfx1030/build_clanghipcc.h deleted file mode 100755 index 56c6274..0000000 --- a/memcpy_2d_example_gfx1030/build_clanghipcc.h +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh - -KSRC=memcpy_2d_x4_example_gfx1030.s -KOUT=memcpy_2d_x4_example_gfx1030.hsaco -SRC=main.cpp -TARGET=out.exe - -# pre-delete the previous KOUT -rm -rf $KOUT -/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT - -rm -rf $TARGET -/opt/rocm/hip/bin/hipcc $SRC -mcpu=gfx1030 -o $TARGET From 29af1277c9dccc64dc54615664c21b3d2e24be07 Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Mon, 18 Jul 2022 12:04:45 +0800 Subject: [PATCH 12/15] update asm for gfx1030 --- memcpy_2d_example_gfx1030/build_clanghipcc.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100755 memcpy_2d_example_gfx1030/build_clanghipcc.sh diff --git a/memcpy_2d_example_gfx1030/build_clanghipcc.sh b/memcpy_2d_example_gfx1030/build_clanghipcc.sh new file mode 100755 index 0000000..ca64652 --- /dev/null +++ b/memcpy_2d_example_gfx1030/build_clanghipcc.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +KSRC=memcpy_2d_example_gfx1030.s +KOUT=memcpy_2d_example_gfx1030.hsaco +SRC=main.cpp +TARGET=out.exe + +# pre-delete the previous KOUT +rm -rf $KOUT +/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT + +rm -rf $TARGET +/opt/rocm/hip/bin/hipcc $SRC -mcpu=gfx1030 -o $TARGET From ae68de2c7b3f3036c5406e9d050b00db820d3865 Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Mon, 18 Jul 2022 12:07:39 +0800 Subject: [PATCH 13/15] add config class --- memcpy_2d_example_gfx1030/config.h | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 memcpy_2d_example_gfx1030/config.h diff --git a/memcpy_2d_example_gfx1030/config.h b/memcpy_2d_example_gfx1030/config.h new file mode 100644 index 0000000..46a922e --- /dev/null +++ b/memcpy_2d_example_gfx1030/config.h @@ -0,0 +1,46 @@ +#ifndef CONFIG +#define CONFIG + +#include + +#define HIP_CALL(call) do{ \ + hipError_t err = call; \ + if(err != hipSuccess){ \ + printf("[hiperror](%d) fail to call %s",(int)err,#call); \ + exit(0); \ + } \ +} while(0) + +template +class Matrix_2d +{ +public: + int rows; + int cols; + int padding; + int length; + bool type; //Host: 1; Device: 0 + T *data; +public: + Matrix_2d(): rows(0), cols(0), padding(0), length(0), type(0), data(nullptr) {} + Matrix_2d(int r, int c, int p, bool t): rows(r), cols(c), padding(p), type(t), length(r * (c + p)), data(new T[r * (c + p)]) {} + ~Matrix_2d() { + if (data != nullptr && type) { + delete [] data; + } + } + void initMem() { + if (!data) + data = new T[rows * (cols + padding)]; + } +}; + +typedef struct Result +{ + bool isValid; + int bdx; + int gdx; + float gbps; +} Result; + +#endif From b2363730861b6d3519d16af71ab98b6e63e48112 Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Mon, 18 Jul 2022 12:09:29 +0800 Subject: [PATCH 14/15] --amend --- memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s index f1327f5..6b3007e 100644 --- a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s +++ b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s @@ -11,7 +11,7 @@ memcpy_2d_example_gfx1030: .set s_ptr_out, 6 .set s_rows, 8 .set s_gdx, 10 -.set s_bdx, 20 +.set s_bdx, 12 .set s_padding, 14 .set s_stride_block, 16 .set s_tmp, 18 @@ -42,7 +42,8 @@ memcpy_2d_example_gfx1030: s_waitcnt lgkmcnt(0) - s_mul_i32 s[s_tmp], s[s_gdx], 256*4 ; gridDim * blockDim * 4 + + s_mul_i32 s[s_tmp], s[s_gdx], s[s_bdx+1] ; gridDim * blockDim * 4 v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0] v_add_nc_u32 v[v_offset+2], s[s_tmp], v[v_offset+1] v_add_nc_u32 v[v_offset+3], s[s_tmp], v[v_offset+2] @@ -117,7 +118,7 @@ amdhsa.kernels: .group_segment_fixed_size: 0 .private_segment_fixed_size: 0 .wavefront_size: 32 ;warpsize - .reqd_workgroup_size : [256, 1, 1] + ; .reqd_workgroup_size : [256, 1, 1] .max_flat_workgroup_size: 256 ;gridsize .args: - { .name: input, .size: 8, .offset: 0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false} From 10b57ee5185766b82354d8afd4abb07bc8376bb4 Mon Sep 17 00:00:00 2001 From: zyGao1126 <2388921475@qq.com> Date: Mon, 18 Jul 2022 12:11:40 +0800 Subject: [PATCH 15/15] update asm for gdx1030 --- memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s index 6b3007e..9554f5b 100644 --- a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s +++ b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s @@ -35,9 +35,9 @@ memcpy_2d_example_gfx1030: s_waitcnt lgkmcnt(0) - s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 - s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 - v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 + s_mul_i32 s[s_bdx+1], s[s_bdx], 4 ; blockDim * 4 + s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1] ; blockIdx * blockDim * 4 + v_lshlrev_b32 v[v_tmp], 2, v0 ; threadIdx * 4 v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp] ; (blockIdx*blockDim + threadIdx) * 4 s_waitcnt lgkmcnt(0)