diff --git a/memcpy_2d_example_gfx1030/build_clanghipcc.sh b/memcpy_2d_example_gfx1030/build_clanghipcc.sh
new file mode 100755
index 0000000..ca64652
--- /dev/null
+++ b/memcpy_2d_example_gfx1030/build_clanghipcc.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+KSRC=memcpy_2d_example_gfx1030.s
+KOUT=memcpy_2d_example_gfx1030.hsaco
+SRC=main.cpp
+TARGET=out.exe
+
+# delete the previous KOUT before re-assembling
+rm -rf $KOUT
+/opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 $KSRC -o $KOUT
+
+rm -rf $TARGET
+/opt/rocm/hip/bin/hipcc $SRC -mcpu=gfx1030 -o $TARGET
diff --git a/memcpy_2d_example_gfx1030/config.h b/memcpy_2d_example_gfx1030/config.h
new file mode 100644
index 0000000..46a922e
--- /dev/null
+++ b/memcpy_2d_example_gfx1030/config.h
@@ -0,0 +1,46 @@
+#ifndef CONFIG
+#define CONFIG
+
+#include <hip/hip_runtime.h>
+
+#define HIP_CALL(call) do{ \
+    hipError_t err = call; \
+    if(err != hipSuccess){ \
+        printf("[hiperror](%d) fail to call %s\n",(int)err,#call); \
+        exit(0); \
+    } \
+} while(0)
+
+template <typename T>
+class Matrix_2d
+{
+public:
+    int rows;
+    int cols;
+    int padding;
+    int length;
+    bool type;      // Host: 1; Device: 0
+    T *data;
+public:
+    Matrix_2d(): rows(0), cols(0), padding(0), length(0), type(0), data(nullptr) {}
+    Matrix_2d(int r, int c, int p, bool t): rows(r), cols(c), padding(p), length(r * (c + p)), type(t), data(t ? new T[r * (c + p)] : nullptr) {}
+    ~Matrix_2d() {
+        if (data != nullptr && type) {
+            delete [] data;
+        }
+    }
+    void initMem() {
+        if (!data)
+            data = new T[rows * (cols + padding)];
+    }
+};
+
+typedef struct Result
+{
+    bool isValid;
+    int bdx;
+    int gdx;
+    float gbps;
+} Result;
+
+#endif
diff --git a/memcpy_2d_example_gfx1030/main.cpp b/memcpy_2d_example_gfx1030/main.cpp
new file mode 100644
index 0000000..2f1b004
--- /dev/null
+++ b/memcpy_2d_example_gfx1030/main.cpp
@@ -0,0 +1,288 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cmath>
+#include <cassert>
+#include <random>
+#include <vector>
+#include <iostream>
+#include <getopt.h>
+#include <hip/hip_runtime.h>
+
+#include "config.h"
+
+#define HSACO "memcpy_2d_example_gfx1030.hsaco"
+#define HSA_KERNEL "memcpy_2d_example_gfx1030"
+
+#define ABS(x) ((x) > 0 ? (x) : -1 * (x))
+
+template <typename T>
+void rand_vec(Matrix_2d<T> &matrix)
+{
+    static std::random_device rd;   // seed
+    static std::mt19937 mt(rd());
+    static std::uniform_real_distribution<T> dist(-10.0, 10.0);
+
+    for (int i = 0; i < matrix.rows; ++i)
+        for (int j = 0; j < matrix.cols + matrix.padding; ++j) {
+            if (j >= matrix.cols)
+                continue;
+            int offset = i * (matrix.cols + matrix.padding) + j;
+            matrix.data[offset] = dist(mt);
+        }
+}
+
+template <typename T>
+static inline bool valid_vector(const Matrix_2d<T> &host_in, const Matrix_2d<T> &host_out, double nrms = 1e-6)
+{
+    double s0 = 0.0;
+    double s1 = 0.0;
+    int pp_err = 0;
+
+    int rows = host_in.rows;
+    int cols = host_in.cols;
+    int padding = host_in.padding;
+
+    for (int i = 0; i < rows; ++i) {
+        for (int j = 0; j < cols + padding; ++j) {
+            if (j >= cols)
+                continue;
+            int offset = i * (cols + padding) + j;
+            double ri = (double)host_in.data[offset];
+            double pi = (double)host_out.data[offset];
+            double d = ri - pi;
+            double dd = d * d;
+            double rr = 2.0 * ri * ri;
+            s0 += dd;
+            s1 += rr;
+            double delta = ABS(ri - pi) / ABS(ri);
+
+            if(delta > 3e-5) {
+                if(pp_err < 100)
+                    printf("diff at %4d, ref:%lf, pred:%lf(0x%08x), d:%lf\n", offset, ri, pi, ((uint32_t *)host_out.data)[offset], delta);
+                pp_err++;
+            }
+        }
+    }
+    //printf("nrms:%lf, s0:%lf, s1:%lf\n",sqrt(s0/s1),s0,s1);
+    return (sqrt(s0 / s1) < nrms) && (pp_err == 0);
+}
+
+
+void print_usage(FILE* stream, int exit_code)
+{
+    fprintf(stream,
+            "Usage: \n");
+    fprintf(stream,
+            "    -h --help      Display this usage information.\n"
+            "    -r --rows      Rows.\n"
+            "    -c --cols      Cols.\n"
+            "    -p --padding   Padding.\n\n");
+    fprintf(stream,
+            "Note: if no params are provided, the benchmark will run several typical cases.\n");
+    exit (exit_code);
+}
+
+void run_kernel(const int &rows, const int &cols, const int &padding, Result &res){
+    // check that cols is legal for the kernel's fixed unroll (256 threads * 8 floats)
+    assert(cols % (256 * 8) == 0 && "[!] cols must be a multiple of 2048 (2K)");
+
+    Matrix_2d<float> matrix_host_in(rows, cols, padding, 1);
+    Matrix_2d<float> matrix_host_out(rows, cols, padding, 1);
+    Matrix_2d<float> matrix_dev_in(rows, cols, padding, 0);
+    Matrix_2d<float> matrix_dev_out(rows, cols, padding, 0);
+
+    // kernel preparation
+    hipModule_t module;
+    hipFunction_t kernel_func;
+    hipEvent_t evt_00, evt_11;
+    HIP_CALL(hipSetDevice(0));
+
+    HIP_CALL(hipModuleLoad(&module, HSACO));
+    HIP_CALL(hipModuleGetFunction(&kernel_func, module, HSA_KERNEL));
+
+    int num_cu;
+    int gcn_arch;
+    {
+        hipDeviceProp_t dev_prop;
+        hipDevice_t dev;
+        HIP_CALL(hipGetDevice(&dev));
+        HIP_CALL(hipGetDeviceProperties(&dev_prop, dev));
+        num_cu = dev_prop.multiProcessorCount;
+        gcn_arch = dev_prop.gcnArch;
+        if (gcn_arch >= 1000)
+            num_cu *= 2;
+    }
+
+    int total_loop = 4;
+    int warm_ups = 2;
+
+    // initial blockDim, gridDim
+    int bdx = 256;
+    int gdx = matrix_host_in.cols / (8 * bdx);
+    HIP_CALL(hipMalloc(&matrix_dev_in.data, sizeof(float) * matrix_dev_in.length));
+    HIP_CALL(hipMalloc(&matrix_dev_out.data, sizeof(float) * matrix_dev_out.length));
+
+    // initialize host input data
+    rand_vec(matrix_host_in);
+
+    // memcpy data from host to device
+    HIP_CALL(hipMemcpy(matrix_dev_in.data, matrix_host_in.data, sizeof(float) * matrix_dev_in.length, hipMemcpyHostToDevice));
+    // printf("Memcpy: input: %p, output: %p, floats: %d\n", matrix_dev_in.data, matrix_dev_out.data, matrix_dev_in.length);
+
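+    // this packed struct must match the kernarg layout that memcpy_2d_example_gfx1030.s
+    // loads (and its .args metadata): input @ 0, output @ 8, rows @ 16, gdx @ 20,
+    // bdx @ 24, padding @ 28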
+    struct __attribute__((packed))
+    {
+        float *input;
+        float *output;
+        int rows;
+        int gdx;
+        int bdx;
+        int padding;
+    } args;
+
+    size_t arg_size = sizeof(args);
+    args.input = matrix_dev_in.data;
+    args.output = matrix_dev_out.data;
+    args.rows = rows;
+    args.gdx = gdx;
+    args.bdx = bdx;
+    args.padding = padding;
+
+    void* config_kernel[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_size, HIP_LAUNCH_PARAM_END};
+
+    // warm up kernel
+    for (int i = 0; i < warm_ups; i++)
+        HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel));
+
+    hipEventCreate(&evt_00);
+    hipEventCreate(&evt_11);
+    hipDeviceSynchronize();
+    hipEventRecord(evt_00, NULL);
+
+    // launch kernel
+    for(int i = 0; i < total_loop; i++)
+        HIP_CALL(hipModuleLaunchKernel(kernel_func, gdx, 1, 1, bdx, 1, 1, 0, 0, NULL, (void**)&config_kernel));
+
+    float elapsed_ms;
+    hipEventRecord(evt_11, NULL);
+    hipEventSynchronize(evt_11);
+    hipDeviceSynchronize();
+    hipEventElapsedTime(&elapsed_ms, evt_00, evt_11);
+    hipEventDestroy(evt_00);
+    hipEventDestroy(evt_11);
+
+    HIP_CALL(hipMemcpy(matrix_host_out.data, matrix_dev_out.data, sizeof(float) * matrix_host_out.length, hipMemcpyDeviceToHost));
+
+    // verification
+    bool isValid = valid_vector(matrix_host_in, matrix_host_out);
+    res.isValid = isValid;
+
+    // evaluation: read + write traffic in GB/s
+    float time_per_loop_ms = elapsed_ms / total_loop;
+    float gbps = (matrix_host_out.rows * matrix_host_out.cols) * 2 * sizeof(float) / time_per_loop_ms / 1000 / 1000;
+
+    res.gbps = gbps;
+    res.gdx = gdx;
+    res.bdx = bdx;
+
+    hipFree(matrix_dev_in.data);
+    hipFree(matrix_dev_out.data);
+}
+
+/* set rows, cols, padding */
+std::vector<std::vector<int>> default_setting = {
+    {1,    2048,   0},       // 8KB,   no padding
+    {2,    131072, 0},       // 1MB,   no padding
+    {256,  131072, 0},       // 128MB, no padding
+    {512,  262144, 0},       // 512MB, no padding
+    {1024, 262144, 0},       // 1GB,   no padding
+
+    {1,    2048,   5120},    // 8KB,   with padding
+    {2,    131072, 5120},    // 1MB,   with padding
+    {256,  131072, 5120},    // 128MB, with padding
+    {512,  262144, 5120},    // 512MB, with padding
+    {1024, 262144, 5120},    // 1GB,   with padding
+};
+
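+/* For example, the default case {256, 131072, 0} launches with bdx = 256 and
+ * gdx = cols / (8 * bdx) = 131072 / 2048 = 64 workgroups; each thread copies
+ * 8 floats per row, the kernel loops rows = 256 times, and the copy moves
+ * 256 * 131072 * 4 B = 128 MB in each direction. */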
+
+int main(int argc, char **argv)
+{
+    int rows = 1;          // defaults used when a flag is omitted
+    int cols = 2048;
+    int padding = 0;
+
+    if (argc <= 1) {
+        std::cout << " No Input Parameters! Running Benchmark With Default Settings " << std::endl;
+        std::cout << " ---- MEMCPY 2D EXAMPLE BENCHMARK ---- " << std::endl;
+
+        Result res;
+
+        for (size_t i = 0; i < default_setting.size(); ++i) {
+            rows = default_setting[i][0];
+            cols = default_setting[i][1];
+            padding = default_setting[i][2];
+
+            run_kernel(rows, cols, padding, res);
+
+            size_t length = rows * cols * sizeof(float);
+
+            if (length >= 1024 && length < 1024 * 1024)
+                std::cout << "  Input Size: " << length / 1024 << " KB" << std::endl;
+            else if (length >= 1024 * 1024 && length < 1024 * 1024 * 1024)
+                std::cout << "  Input Size: " << length / 1024 / 1024 << " MB" << std::endl;
+            else if (length >= 1024 * 1024 * 1024)
+                std::cout << "  Input Size: " << length / 1024 / 1024 / 1024 << " GB" << std::endl;
+
+            std::cout << "  Settings: rows: " << rows << '\t' << "cols: " << cols << '\t' << "padding: " << padding << std::endl;
+            std::cout << "  gdx: " << res.gdx << '\t' << "bdx: " << res.bdx << std::endl;
+            if (res.isValid)
+                std::cout << "  Result: " << "Is Valid";
+            else
+                std::cout << "  Result: " << "Not Valid";
+            std::cout << "  gbps: " << res.gbps << std::endl;
+            std::cout << std::endl;
+        }
+
+        std::cout << " ---- END OF BENCHMARK ---- " << std::endl;
+    }
+
+    else {
+        const char *optString = "hr:c:p:";
+        const struct option long_options[] =
+        {   {"help",    no_argument,       NULL, 'h'},
+            {"rows",    required_argument, NULL, 'r'},
+            {"cols",    required_argument, NULL, 'c'},
+            {"padding", required_argument, NULL, 'p'},
+            {NULL, 0, NULL, 0}
+        };
+
+        int ch;
+
+        while((ch = getopt_long(argc, argv, optString, long_options, NULL)) != -1){
+            switch(ch) {
+                case 'h': print_usage(stdout, 0); break;
+                case 'r': rows = atoi(optarg); break;
+                case 'c': cols = atoi(optarg); break;
+                case 'p': padding = atoi(optarg); break;
+                default: abort();
+            }
+        }
+
+        Result res;
+        run_kernel(rows, cols, padding, res);
+
+        std::cout << std::endl;
+        std::cout << " ---- MEMCPY 2D EXAMPLE BENCHMARK ---- " << std::endl;
+        std::cout << "  Input Size: " << rows * cols * sizeof(float) << " bytes" << std::endl;
+        std::cout << "  Settings: rows: " << rows << '\t' << "cols: " << cols << '\t' << "padding: " << padding << std::endl;
+        std::cout << "  gdx: " << res.gdx << '\t' << "bdx: " << res.bdx << std::endl;
+        if (res.isValid)
+            std::cout << "  Result: " << "Is Valid";
+        else
+            std::cout << "  Result: " << "Not Valid";
+        std::cout << "  gbps: " << res.gbps << std::endl;
+        std::cout << " ---- END OF BENCHMARK ---- " << std::endl;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s
new file mode 100644
index 0000000..9554f5b
--- /dev/null
+++ b/memcpy_2d_example_gfx1030/memcpy_2d_example_gfx1030.s
@@ -0,0 +1,131 @@
+.text
+.global memcpy_2d_example_gfx1030
+.p2align 8
+.type memcpy_2d_example_gfx1030,@function
+memcpy_2d_example_gfx1030:
+; This is just an example, not the optimal one
+.set s_karg, 0          ; kernel argument
+.set s_bx, 2            ; blockIdx
+
+.set s_ptr_in, 4
+.set s_ptr_out, 6
+.set s_rows, 8
+.set s_gdx, 10
+.set s_bdx, 12
+.set s_padding, 14
+.set s_stride_block, 16
+.set s_tmp, 18
+
+.set v_buf, 0
+.set v_offset, 16
+.set v_tmp, 32
+
+    ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm
+    ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12
+    ; s_waitcnt lgkmcnt(0)
+    ; s_lshr_b32 s[s_gdx], s[s_gdx], 8
+    ; s_mov_b32 s[s_gdx], 72              ; num_cu
+
+    s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0
+    s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8
+    s_load_dword s[s_rows], s[s_karg:s_karg+1], 16
+    s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20
+    s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24
+    s_load_dword s[s_padding], s[s_karg:s_karg+1], 28
+
+    s_waitcnt lgkmcnt(0)
+
+    s_mul_i32 s[s_bdx+1], s[s_bdx], 4                   ; blockDim * 4
+    s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1]           ; blockIdx * blockDim * 4
+    v_lshlrev_b32 v[v_tmp], 2, v0                       ; threadIdx * 4
+    v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp]    ; (blockIdx*blockDim + threadIdx) * 4
+
+    s_waitcnt lgkmcnt(0)
+
+
+    s_mul_i32 s[s_tmp], s[s_gdx], s[s_bdx+1]            ; gridDim * blockDim * 4
+    v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0]
+    v_add_nc_u32 v[v_offset+2], s[s_tmp], v[v_offset+1]
+    v_add_nc_u32 v[v_offset+3], s[s_tmp], v[v_offset+2]
+    v_add_nc_u32 v[v_offset+4], s[s_tmp], v[v_offset+3]
+    v_add_nc_u32 v[v_offset+5], s[s_tmp], v[v_offset+4]
+    v_add_nc_u32 v[v_offset+6], s[s_tmp], v[v_offset+5]
+    v_add_nc_u32 v[v_offset+7], s[s_tmp], v[v_offset+6]
+    s_lshl_b32 s[s_stride_block], s[s_tmp], 3           ; unroll 8, gridDim*blockDim*4*workload
+
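+    ; One loop iteration copies one row of cols floats. With the host launch in
+    ; main.cpp (bdx = 256, gdx = cols / (8 * bdx)), the per-row stride
+    ; s_stride_block = gridDim*blockDim*4*8 bytes is exactly cols*4 bytes, and
+    ; s_tmp+2 below adds padding*4 bytes on top before advancing both pointers.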
+label_memcopy_start:
+    global_load_dword v[v_buf+0], v[v_offset+0], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+1], v[v_offset+1], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+2], v[v_offset+2], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+3], v[v_offset+3], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+4], v[v_offset+4], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+5], v[v_offset+5], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+6], v[v_offset+6], s[s_ptr_in:s_ptr_in+1]
+    global_load_dword v[v_buf+7], v[v_offset+7], s[s_ptr_in:s_ptr_in+1]
+
+    ; add stride padding
+    s_mul_i32 s[s_padding+1], s[s_padding], 4                    ; padding * 4
+    s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block]      ; gridDim*blockDim*4*workload + padding*4
+    s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in]
+    s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0
+
+    s_waitcnt vmcnt(0)
+
+    global_store_dword v[v_offset+0], v[v_buf+0], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+1], v[v_buf+1], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+2], v[v_buf+2], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+3], v[v_buf+3], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+4], v[v_buf+4], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+5], v[v_buf+5], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+6], v[v_buf+6], s[s_ptr_out:s_ptr_out+1]
+    global_store_dword v[v_offset+7], v[v_buf+7], s[s_ptr_out:s_ptr_out+1]
+
+    s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out]
+    s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0
+
+    s_sub_u32 s[s_rows], s[s_rows], 1
+    s_cmp_eq_u32 s[s_rows], 0
+    s_waitcnt vmcnt(0)
+    s_cbranch_scc0 label_memcopy_start
+    s_endpgm
+
+.rodata
+.p2align 6
+.amdhsa_kernel memcpy_2d_example_gfx1030
+    .amdhsa_group_segment_fixed_size 0
+    .amdhsa_user_sgpr_dispatch_ptr 0
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 64
+    .amdhsa_next_free_sgpr 32
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+    .amdhsa_wavefront_size32 1
+    .amdhsa_workgroup_processor_mode 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: memcpy_2d_example_gfx1030
+    .symbol: memcpy_2d_example_gfx1030.kd
+    .sgpr_count: 32
+    .vgpr_count: 64
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 32
+    .group_segment_fixed_size: 0
+    .private_segment_fixed_size: 0
+    .wavefront_size: 32                  # wave (warp) size
+    # .reqd_workgroup_size: [256, 1, 1]
+    .max_flat_workgroup_size: 256        # max workgroup (block) size
+    .args:
+    - { .name: input,   .size: 8, .offset:  0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: output,  .size: 8, .offset:  8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: rows,    .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32}
+    - { .name: gdx,     .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32}
+    - { .name: bdx,     .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
diff --git a/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s b/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s
new file mode 100644
index 0000000..d4e8849
--- /dev/null
+++ b/memcpy_2d_example_gfx1030/memcpy_2d_x4_example_gfx1030.s
@@ -0,0 +1,115 @@
+.text
+.global memcpy_2d_x4_example_gfx1030
+.p2align 8
+.type memcpy_2d_x4_example_gfx1030,@function
+memcpy_2d_x4_example_gfx1030:
+; This is just an example, not the optimal one
+.set s_karg, 0          ; kernel argument
+.set s_bx, 2            ; blockIdx
+
+.set s_ptr_in, 4
+.set s_ptr_out, 6
+.set s_rows, 8
+.set s_gdx, 10
+.set s_bdx, 12
+.set s_padding, 14
+.set s_stride_block, 16
+.set s_tmp, 18
+
+.set v_buf, 0
+.set v_offset, 16
+.set v_tmp, 32
+
+    ; http://www.hsafoundation.com/html/Content/Runtime/Topics/02_Core/hsa_kernel_dispatch_packet_t.htm
+    ; s_load_dword s[s_gdx], s[s_dptr:s_dptr+1], 12
+    ; s_waitcnt lgkmcnt(0)
+    ; s_lshr_b32 s[s_gdx], s[s_gdx], 8
+    ; s_mov_b32 s[s_gdx], 72              ; num_cu
+
+    s_load_dwordx2 s[s_ptr_in:s_ptr_in+1], s[s_karg:s_karg+1], 0
+    s_load_dwordx2 s[s_ptr_out:s_ptr_out+1], s[s_karg:s_karg+1], 8
+    s_load_dword s[s_rows], s[s_karg:s_karg+1], 16
+    s_load_dword s[s_gdx], s[s_karg:s_karg+1], 20
+    s_load_dword s[s_bdx], s[s_karg:s_karg+1], 24
+    s_load_dword s[s_padding], s[s_karg:s_karg+1], 28
+
+    s_waitcnt lgkmcnt(0)
+
+    s_mul_i32 s[s_bdx+1], s[s_bdx], 4                   ; blockDim * 4
+    s_mul_i32 s[s_tmp+1], s[s_bx], s[s_bdx+1]           ; blockIdx * blockDim * 4
+    v_lshlrev_b32 v[v_tmp], 2, v0                       ; threadIdx * 4
+    v_add_nc_u32 v[v_offset+0], s[s_tmp+1], v[v_tmp]    ; (blockIdx*blockDim + threadIdx) * 4
+    v_lshlrev_b32 v[v_offset+0], 2, v[v_offset+0]
+
+    s_waitcnt lgkmcnt(0)
+
+    s_mul_i32 s[s_tmp], s[s_gdx], 256*4*4               ; gridDim * blockDim * 4 * 4Dwords
+    v_add_nc_u32 v[v_offset+1], s[s_tmp], v[v_offset+0]
+
+    s_mul_i32 s[s_tmp], s[s_gdx], 256*4                 ; gridDim * blockDim * 4
+    s_lshl_b32 s[s_stride_block], s[s_tmp], 3           ; unroll 8, gridDim*blockDim*4*workload
+
+label_memcopy_start:
+    global_load_dwordx4 v[v_buf+0 : v_buf+3], v[v_offset+0], s[s_ptr_in:s_ptr_in+1]
+    global_load_dwordx4 v[v_buf+4 : v_buf+7], v[v_offset+1], s[s_ptr_in:s_ptr_in+1]
+
+    ; add stride padding
+    s_mul_i32 s[s_padding+1], s[s_padding], 4                    ; padding * 4
+    s_add_u32 s[s_tmp+2], s[s_padding+1], s[s_stride_block]      ; gridDim*blockDim*4*workload + padding*4
+    s_add_u32 s[s_ptr_in], s[s_tmp+2], s[s_ptr_in]
+    s_addc_u32 s[s_ptr_in+1], s[s_ptr_in+1], 0
+
+    s_waitcnt vmcnt(0)
+
+    global_store_dwordx4 v[v_offset+0], v[v_buf+0 : v_buf+3], s[s_ptr_out:s_ptr_out+1]
+    global_store_dwordx4 v[v_offset+1], v[v_buf+4 : v_buf+7], s[s_ptr_out:s_ptr_out+1]
+
+    s_add_u32 s[s_ptr_out], s[s_tmp+2], s[s_ptr_out]
+    s_addc_u32 s[s_ptr_out+1], s[s_ptr_out+1], 0
+
+    s_sub_u32 s[s_rows], s[s_rows], 1
+    s_cmp_eq_u32 s[s_rows], 0
+    s_waitcnt vmcnt(0)
+    s_cbranch_scc0 label_memcopy_start
+    s_endpgm
+
+.rodata
+.p2align 6
+.amdhsa_kernel memcpy_2d_x4_example_gfx1030
+    .amdhsa_group_segment_fixed_size 0
+    .amdhsa_user_sgpr_dispatch_ptr 0
+    .amdhsa_user_sgpr_kernarg_segment_ptr 1
+    .amdhsa_system_sgpr_workgroup_id_x 1
+    .amdhsa_system_vgpr_workitem_id 0
+    .amdhsa_next_free_vgpr 64
+    .amdhsa_next_free_sgpr 32
+    .amdhsa_ieee_mode 0
+    .amdhsa_dx10_clamp 0
+    .amdhsa_wavefront_size32 1
+    .amdhsa_workgroup_processor_mode 0
+.end_amdhsa_kernel
+
+.amdgpu_metadata
+---
+amdhsa.version: [ 1, 0 ]
+amdhsa.kernels:
+  - .name: memcpy_2d_x4_example_gfx1030
+    .symbol: memcpy_2d_x4_example_gfx1030.kd
+    .sgpr_count: 32
+    .vgpr_count: 64
+    .kernarg_segment_align: 8
+    .kernarg_segment_size: 32
+    .group_segment_fixed_size: 0
+    .private_segment_fixed_size: 0
+    .wavefront_size: 32                  # wave (warp) size
+    .reqd_workgroup_size: [256, 1, 1]
+    .max_flat_workgroup_size: 256        # max workgroup (block) size
+    .args:
+    - { .name: input,   .size: 8, .offset:  0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: output,  .size: 8, .offset:  8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
+    - { .name: rows,    .size: 4, .offset: 16, .value_kind: by_value, .value_type: i32}
+    - { .name: gdx,     .size: 4, .offset: 20, .value_kind: by_value, .value_type: i32}
+    - { .name: bdx,     .size: 4, .offset: 24, .value_kind: by_value, .value_type: i32}
+    - { .name: padding, .size: 4, .offset: 28, .value_kind: by_value, .value_type: i32}
+...
+.end_amdgpu_metadata
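
Note: the patch also adds memcpy_2d_x4_example_gfx1030.s (a dwordx4 variant of the same copy loop), but build_clanghipcc.sh and main.cpp only assemble and launch the plain-dword kernel. A minimal sketch of the extra build step, assuming the same toolchain paths as build_clanghipcc.sh (the x4 .hsaco name and the changed defines are illustrative, not part of this patch):

    /opt/rocm/llvm/bin/clang++ -x assembler -target amdgcn--amdhsa -mcpu=gfx1030 \
        memcpy_2d_x4_example_gfx1030.s -o memcpy_2d_x4_example_gfx1030.hsaco

    # then point the host code at the new code object in main.cpp:
    #   #define HSACO      "memcpy_2d_x4_example_gfx1030.hsaco"
    #   #define HSA_KERNEL "memcpy_2d_x4_example_gfx1030"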