diff --git a/Makefile.am b/Makefile.am index d48ff4195..39da01b13 100644 --- a/Makefile.am +++ b/Makefile.am @@ -12,7 +12,7 @@ SUBDIRS = lib submodules ccan sph bin_PROGRAMS = sgminer -sgminer_CPPFLAGS = $(PTHREAD_FLAGS) -std=gnu99 $(JANSSON_CPPFLAGS) +sgminer_CPPFLAGS = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_CPPFLAGS) sgminer_LDFLAGS = $(PTHREAD_FLAGS) sgminer_LDADD = $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \ @OPENCL_LIBS@ @NCURSES_LIBS@ @PDCURSES_LIBS@ @WS2_LIBS@ \ @@ -44,6 +44,7 @@ sgminer_SOURCES += pool.c pool.h sgminer_SOURCES += algorithm.c algorithm.h sgminer_SOURCES += config_parser.c config_parser.h sgminer_SOURCES += events.c events.h +sgminer_SOURCES += ocl/patch_kernel.c ocl/patch_kernel.h sgminer_SOURCES += ocl/build_kernel.c ocl/build_kernel.h sgminer_SOURCES += ocl/binary_kernel.c ocl/binary_kernel.h diff --git a/ocl.c b/ocl.c index 65e34e750..c7244938d 100644 --- a/ocl.c +++ b/ocl.c @@ -168,6 +168,21 @@ static float get_opencl_version(cl_device_id device) return version; } +static bool get_opencl_bit_align_support(cl_device_id *device) /* Reports whether the CL_DEVICE_EXTENSIONS string of the device mentions cl_amd_media_ops; a failed clGetDeviceInfo query is reported as false. */ +{ + char extensions[1024]; + const char * camo = "cl_amd_media_ops"; + char *find; + cl_int status; + + status = clGetDeviceInfo(*device, CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); /* NOTE(review): fixed 1024-byte buffer; a longer extension list presumably makes this query fail, which would read as bitalign unsupported -- confirm */ + if (status != CL_SUCCESS) { + return false; + } + find = strstr(extensions, camo); /* plain substring search of the extension string */ + return !!find; /* collapse the pointer into a strict true or false */ +} + static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, cl_command_queue_properties cq_properties) { cl_int status; @@ -247,6 +262,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg return NULL; } + clState->hasBitAlign = get_opencl_bit_align_support(&devices[gpu]); /* keep the probe result in clState; it is copied into build_data later in initCl */ + status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status); @@ -527,7 +544,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg build_data->kernel_path = (*opt_kernel_path) ? opt_kernel_path : NULL; build_data->work_size = clState->wsize; + build_data->has_bit_align = clState->hasBitAlign; build_data->opencl_version = get_opencl_version(devices[gpu]); + build_data->patch_bfi = needs_bfi_patch(build_data); strcpy(build_data->binary_filename, filename); build_data->binary_filename[strlen(filename) - 3] = 0x00; // And one NULL terminator, cutting off the .cl suffix. @@ -553,13 +572,23 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg return NULL; } - // If it doesn't work, oh well, build it again next run - save_opencl_kernel(build_data, clState->program); + if (save_opencl_kernel(build_data, clState->program)) { + /* Program needs to be rebuilt, because the binary was patched */ + if (build_data->patch_bfi) { + clReleaseProgram(clState->program); + clState->program = load_opencl_binary_kernel(build_data); + } + } + else { + if (build_data->patch_bfi) + quit(1, "Could not save kernel to file, but it is necessary to apply BFI patch"); + } } // Load kernels - applog(LOG_NOTICE, "Initialising kernel %s with nfactor %d, n %d", - filename, algorithm->nfactor, algorithm->n); + applog(LOG_NOTICE, "Initialising kernel %s with%s bitalign, %spatched BFI, nfactor %d, n %d", + filename, clState->hasBitAlign ? "" : "out", build_data->patch_bfi ? 
"" : "un", + algorithm->nfactor, algorithm->n); /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); @@ -568,6 +597,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg return NULL; } + clState->n_extra_kernels = algorithm->n_extra_kernels; if (clState->n_extra_kernels > 0) { unsigned int i; diff --git a/ocl.h b/ocl.h index 0950d2c03..502119e09 100644 --- a/ocl.h +++ b/ocl.h @@ -24,6 +24,7 @@ typedef struct __clState { cl_mem MidstateBuf; cl_mem padbuffer8; unsigned char cldata[80]; + bool hasBitAlign; bool goffset; cl_uint vwidth; size_t max_work_size; diff --git a/ocl/build_kernel.c b/ocl/build_kernel.c index 2e1b73838..29a99e186 100644 --- a/ocl/build_kernel.c +++ b/ocl/build_kernel.c @@ -1,5 +1,6 @@ #include #include "build_kernel.h" +#include "patch_kernel.h" #include "miner.h" static char *file_contents(const char *filename, int *length) @@ -51,7 +52,6 @@ static char *file_contents(const char *filename, int *length) return (char*)buffer; } -// This should NOT be in here! 
-- Wolf9466 void set_base_compiler_options(build_kernel_data *data) { char buf[255]; @@ -61,6 +61,12 @@ void set_base_compiler_options(build_kernel_data *data) sprintf(buf, "w%dl%d", (int)data->work_size, (int)sizeof(long)); strcat(data->binary_filename, buf); + + if (data->has_bit_align) { /* devices reporting cl_amd_media_ops get -D BITALIGN appended to the compiler options */ + strcat(data->compiler_options, " -D BITALIGN"); + applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN"); + } else + applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN"); if (data->kernel_path) { strcat(data->compiler_options, " -I \""); @@ -68,10 +74,38 @@ void set_base_compiler_options(build_kernel_data *data) strcat(data->compiler_options, "\""); } + if (data->patch_bfi) { /* builds flagged for BFI patching are compiled with -D BFI_INT; the binary itself is patched later in save_opencl_kernel */ + strcat(data->compiler_options, " -D BFI_INT"); + applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT"); + } else + applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); + if (data->opencl_version < 1.1) strcat(data->compiler_options, " -D OCL1"); } +bool needs_bfi_patch(build_kernel_data *data) /* Decides whether the built binary must be BFI_INT-patched: true only when has_bit_align is set, opencl_version is below 1.2 and data->platform contains one of the names listed below; false otherwise. NOTE(review): the listed names look like GPU device code names yet are matched against the platform field -- confirm what the caller stores there. */ +{ + if (data->has_bit_align && + (data->opencl_version < 1.2) && + (strstr(data->platform, "Cedar") || + strstr(data->platform, "Redwood") || + strstr(data->platform, "Juniper") || + strstr(data->platform, "Cypress" ) || + strstr(data->platform, "Hemlock" ) || + strstr(data->platform, "Caicos" ) || + strstr(data->platform, "Turks" ) || + strstr(data->platform, "Barts" ) || + strstr(data->platform, "Cayman" ) || + strstr(data->platform, "Antilles" ) || + strstr(data->platform, "Wrestler" ) || + strstr(data->platform, "Zacate" ) || + strstr(data->platform, "WinterPark" ))) + return true; + else + return false; +} + cl_program build_opencl_kernel(build_kernel_data *data, const char *filename) { int pl; @@ -164,10 +198,18 @@ bool save_opencl_kernel(build_kernel_data *data, cl_program program) goto out; } + /* Patch the kernel if the hardware supports BFI_INT but it needs to + * be hacked in */ + if (data->patch_bfi) { + if 
(kernel_bfi_patch(binaries[slot], binary_sizes[slot]) != 0) { /* NOTE(review): kernel_bfi_patch is declared bool and returns true after patching, so this != 0 test takes the quit path on success and lets a false (marker not found) result pass -- confirm the intended polarity */ + quit(1, "Could not patch BFI_INT, please report this issue."); + } + } + /* Save the binary to be loaded next time */ binaryfile = fopen(data->binary_filename, "wb"); if (!binaryfile) { - /* Not fatal, just means we build it again next time */ + /* Not fatal, just means we build it again next time, unless BFI patch is needed */ applog(LOG_DEBUG, "Unable to create file %s", data->binary_filename); goto out; } else { diff --git a/ocl/build_kernel.h b/ocl/build_kernel.h index 89fb8db8a..92de074ae 100644 --- a/ocl/build_kernel.h +++ b/ocl/build_kernel.h @@ -23,9 +23,12 @@ typedef struct _build_kernel_data { char sgminer_path[255]; const char *kernel_path; size_t work_size; + bool has_bit_align; /* device advertises cl_amd_media_ops */ + bool patch_bfi; /* result of needs_bfi_patch for this build */ float opencl_version; } build_kernel_data; +bool needs_bfi_patch(build_kernel_data *data); cl_program build_opencl_kernel(build_kernel_data *data, const char *filename); bool save_opencl_kernel(build_kernel_data *data, cl_program program); void set_base_compiler_options(build_kernel_data *data); diff --git a/ocl/patch_kernel.c b/ocl/patch_kernel.c new file mode 100644 index 000000000..7c72cebc9 --- /dev/null +++ b/ocl/patch_kernel.c @@ -0,0 +1,97 @@ +#include "patch_kernel.h" +#include "logging.h" +#include +#include /* NOTE(review): the header names of these two includes are missing in this copy of the patch; memmem, strlen, memcpy and uint64_t are used below, so presumably string.h and stdint.h -- confirm */ + +static int advance(char **area, unsigned *remaining, const char *marker) /* Moves the area pointer forward to the first occurrence of marker within the remaining byte count and shrinks that count by the distance skipped. Returns 1 on a hit; returns 0 after a debug log when the marker is absent, leaving both outputs untouched. */ +{ + char *find = (char *)memmem(*area, *remaining, (void *)marker, strlen(marker)); /* memmem is an extension rather than ISO C -- NOTE(review): confirm it is available on every build target */ + + if (!find) { + applog(LOG_DEBUG, "Marker \"%s\" not found", marker); + return 0; + } + *remaining -= find - *area; /* bytes skipped to reach the marker no longer count as remaining */ + *area = find; + return 1; +} + +#define OP3_INST_BFE_UINT 4ULL /* this and the following defines are values compared against, or written into, the 5-bit alu_inst field handled in patch_opcodes */ +#define OP3_INST_BFE_INT 5ULL +#define OP3_INST_BFI_INT 6ULL +#define OP3_INST_BIT_ALIGN_INT 12ULL +#define OP3_INST_BYTE_ALIGN_INT 13ULL + +static void patch_opcodes(char *w, unsigned remaining) /* Walks the buffer in 8-byte words, counts BFE_INT and BFE_UINT candidates, rewrites every BYTE_ALIGN_INT candidate into BFI_INT in place and logs the totals. */ +{ + uint64_t *opcode = (uint64_t *)w; /* NOTE(review): w is reinterpreted as uint64_t with no alignment guarantee visible here; the Makefile.am hunk of this patch adds -fno-strict-aliasing */ + int patched = 0; + int count_bfe_int = 0; + int count_bfe_uint = 0; + int count_byte_align = 0; + while (42) { /* NOTE(review): the first word is read before remaining is checked, so a length below 8 still dereferences a full 8 bytes */ 
+ int clamp = (*opcode >> (32 + 31)) & 0x1; + int dest_rel = (*opcode >> (32 + 28)) & 0x1; + int alu_inst = (*opcode >> (32 + 13)) & 0x1f; + int s2_neg = (*opcode >> (32 + 12)) & 0x1; + int s2_rel = (*opcode >> (32 + 9)) & 0x1; + int pred_sel = (*opcode >> 29) & 0x3; + if (!clamp && !dest_rel && !s2_neg && !s2_rel && !pred_sel) { /* only words whose clamp, dest_rel, s2_neg, s2_rel and pred_sel fields are all zero are considered */ + if (alu_inst == OP3_INST_BFE_INT) { + count_bfe_int++; + } else if (alu_inst == OP3_INST_BFE_UINT) { + count_bfe_uint++; + } else if (alu_inst == OP3_INST_BYTE_ALIGN_INT) { + count_byte_align++; + // patch this instruction to BFI_INT + *opcode &= 0xfffc1fffffffffffULL; /* clears the 5-bit alu_inst field (bits 45-49) so the new opcode can be OR-ed in */ + *opcode |= OP3_INST_BFI_INT << (32 + 13); + patched++; + } + } + if (remaining <= 8) /* NOTE(review): the loop continues whenever more than 8 bytes remain, so a length that is not a multiple of 8 ends with a partial-word read */ + break; + opcode++; + remaining -= 8; + } + applog(LOG_DEBUG, "Potential OP3 instructions identified: " + "%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN", + count_bfe_int, count_bfe_uint, count_byte_align); + applog(LOG_DEBUG, "Patched a total of %i BFI_INT instructions", patched); +} + +bool kernel_bfi_patch(char *binary, unsigned binary_size) /* Locates the code region inside a program binary and hands it to patch_opcodes, which modifies the buffer in place. Returns false when no .text marker or no ELF marker is found at all, true otherwise. */ +{ + unsigned remaining = binary_size; + char *w = binary; + unsigned int start, length; + + /* Find 2nd incidence of .text, and copy the program's + * position and length at a fixed offset from that. Then go + * back and find the 2nd incidence of \x7fELF (rewind by one + * from ELF) and then patch the opcodes */ + if (!advance(&w, &remaining, ".text")) + return false; + w++; remaining--; + if (!advance(&w, &remaining, ".text")) { + /* 32 bit builds only one ELF */ + w--; remaining++; + } + memcpy(&start, w + 285, 4); /* NOTE(review): offsets 285 and 289 are read without checking that remaining covers them */ + memcpy(&length, w + 289, 4); + w = binary; remaining = binary_size; + if (!advance(&w, &remaining, "ELF")) + return false; + w++; remaining--; + if (!advance(&w, &remaining, "ELF")) { + /* 32 bit builds only one ELF */ + w--; remaining++; + } + w--; remaining++; + w += start; remaining -= start; /* NOTE(review): start and length come straight from the binary and are not validated against remaining before use */ + applog(LOG_DEBUG, "At %p (%u rem. 
bytes), to begin patching", + w, remaining); + patch_opcodes(w, length); + + return true; +} diff --git a/ocl/patch_kernel.h b/ocl/patch_kernel.h new file mode 100644 index 000000000..d13b1869b --- /dev/null +++ b/ocl/patch_kernel.h @@ -0,0 +1,10 @@ +#ifndef PATCH_KERNEL_H +#define PATCH_KERNEL_H + +#include + +bool kernel_bfi_patch(char *binary, unsigned binary_size); + +#endif /* PATCH_KERNEL_H */ + +