Replaced division instructions with IMUL_RCP

tevador · Feb 22, 2019 · f3b114a · f3b114a
1 parent 9d5f621
commit f3b114a
Show file tree

Hide file tree

Showing 14 changed files with 766 additions and 890 deletions.
diff --git a/doc/isa-ops.md b/doc/isa-ops.md
@@ -19,8 +19,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
 |1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`|
 |4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)|
 |1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)|
-|4/256|IDIV_C|R|-|-|`dst = dst + dst / imm32`|
-|4/256|ISDIV_C|R|-|-|`dst = dst + dst / imm32` (signed)|
+|8/256|IMUL_RCP|R|-|-|<code>dst = (2<sup>x</sup> / imm32) * dst</code>|
 |2/256|INEG_R|R|-|-|`dst = -dst`|
 |16/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`|
 |4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`|
@@ -30,8 +29,8 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
 #### IMULH and ISMULH
 These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`).
 
-#### IDIV_C and ISDIV_C
-The division instructions use a constant divisor, so they can be optimized into a [multiplication by fixed-point reciprocal](https://en.wikipedia.org/wiki/Division_algorithm#Division_by_a_constant). `IDIV_C` performs unsigned division (`imm32` is zero-extended to 64 bits), while `ISDIV_C` performs signed division. In the case of division by zero, the instructions become a no-op. In the very rare case of signed overflow, the destination register is set to zero.
+#### IMUL_RCP
+This instruction multiplies the destination register by the reciprocal of `imm32`. The reciprocal is calculated as <code>rcp = 2<sup>x</sup> / imm32</code>, where `x` is the largest integer such that <code>rcp &lt; 2<sup>64</sup></code>. If `imm32` equals 0, this instruction is a no-op.
 
 #### ISWAP_R
 This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op.
@@ -54,7 +53,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
 |6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`|
 
 #### FSCAL_R
-This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{-1, +1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31.
+This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{+1, -1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31.
 
 The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x81F0000000000000`.
 

diff --git a/makefile b/makefile
@@ -9,7 +9,7 @@ OBJDIR=obj
 LDFLAGS=-lpthread
 CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp  src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c
 TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
-ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o hashAes1Rx4.o)
+ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o)
 ifeq ($(PLATFORM),amd64)
     ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
     CXXFLAGS += -maes
@@ -53,7 +53,7 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak
 $(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR)
 	$(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@
 
-$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h divideByConstantCodegen.h Program.hpp) | $(OBJDIR)
+$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@
 
 $(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR)
@@ -65,13 +65,13 @@ $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachin
 $(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@
 
-$(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR)
-	$(CC) $(CCFLAGS) -c $(SRCDIR)/divideByConstantCodegen.c -o $@
+$(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OBJDIR)
+	$(CC) $(CCFLAGS) -c $(SRCDIR)/reciprocal.c -o $@
 
 $(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@
 
-$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp divideByConstantCodegen.h virtualMemory.hpp) | $(OBJDIR)
+$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp) | $(OBJDIR)
 	$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
 
 $(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR)

diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
@@ -17,12 +17,10 @@ You should have received a copy of the GNU General Public License
 along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 */
 //#define TRACE
-#define MAGIC_DIVISION
+
 #include "AssemblyGeneratorX86.hpp"
 #include "common.hpp"
-#ifdef MAGIC_DIVISION
-#include "divideByConstantCodegen.h"
-#endif
+#include "reciprocal.h"
 #include "Program.hpp"
 
 namespace RandomX {
@@ -276,38 +274,12 @@ namespace RandomX {
 		traceint(instr);
 	}
 
-	//~6 uOPs
-	void AssemblyGeneratorX86::h_IDIV_C(Instruction& instr, int i) {
+	//2 uOPs
+	void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) {
 		if (instr.imm32 != 0) {
 			uint32_t divisor = instr.imm32;
-			if (divisor & (divisor - 1)) {
-				magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
-				if (mi.pre_shift == 0 && !mi.increment) {
-					asmCode << "\tmov rax, " << mi.multiplier << std::endl;
-					asmCode << "\tmul " << regR[instr.dst] << std::endl;
-				}
-				else {
-					asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
-					if (mi.pre_shift > 0)
-						asmCode << "\tshr rax, " << mi.pre_shift << std::endl;
-					if (mi.increment) {
-						asmCode << "\tadd rax, 1" << std::endl;
-						asmCode << "\tsbb rax, 0" << std::endl;
-					}
-					asmCode << "\tmov rcx, " << mi.multiplier << std::endl;
-					asmCode << "\tmul rcx" << std::endl;
-				}
-				if (mi.post_shift > 0)
-					asmCode << "\tshr rdx, " << mi.post_shift << std::endl;
-				asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
-			}
-			else { //divisor is a power of two
-				int shift = 0;
-				while (divisor >>= 1)
-					++shift;
-				if(shift > 0)
-					asmCode << "\tshr " << regR[instr.dst] << ", " << shift << std::endl;
-			}
+			asmCode << "\tmov rax, " << reciprocal(instr.imm32) << std::endl;
+			asmCode << "\timul " << regR[instr.dst] << ", rax" << std::endl;
 			traceint(instr);
 		}
 		else {
@@ -317,59 +289,7 @@ namespace RandomX {
 
 	//~8.5 uOPs
 	void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) {
-		int64_t divisor = (int32_t)instr.imm32;
-		if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
-			asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
-			// +/- power of two
-			bool negative = divisor < 0;
-			if (negative)
-				divisor = -divisor;
-			int shift = 0;
-			uint64_t unsignedDivisor = divisor;
-			while (unsignedDivisor >>= 1)
-				++shift;
-			if (shift > 0) {
-				asmCode << "\tmov rcx, rax" << std::endl;
-				asmCode << "\tsar rcx, 63" << std::endl;
-				uint32_t mask = (1ULL << shift) + 0xFFFFFFFF;
-				asmCode << "\tand ecx, 0" << std::hex << mask << std::dec << "h" << std::endl;
-				asmCode << "\tadd rax, rcx" << std::endl;
-				asmCode << "\tsar rax, " << shift << std::endl;
-			}
-			if (negative)
-				asmCode << "\tneg rax" << std::endl;
-			asmCode << "\tadd " << regR[instr.dst] << ", rax" << std::endl;
-			traceint(instr);
-		}
-		else if (divisor != 0) {
-			magics_info mi = compute_signed_magic_info(divisor);
-			asmCode << "\tmov rax, " << mi.multiplier << std::endl;
-			asmCode << "\timul " << regR[instr.dst] << std::endl;
-			//asmCode << "\tmov rax, rdx" << std::endl;
-			asmCode << "\txor eax, eax" << std::endl;
-			bool haveSF = false;
-			if (divisor > 0 && mi.multiplier < 0) {
-				asmCode << "\tadd rdx, " << regR[instr.dst] << std::endl;
-				haveSF = true;
-			}
-			if (divisor < 0 && mi.multiplier > 0) {
-				asmCode << "\tsub rdx, " << regR[instr.dst] << std::endl;
-				haveSF = true;
-			}
-			if (mi.shift > 0) {
-				asmCode << "\tsar rdx, " << mi.shift << std::endl;
-				haveSF = true;
-			}
-			if (!haveSF)
-				asmCode << "\ttest rdx, rdx" << std::endl;
-			asmCode << "\tsets al" << std::endl;
-			asmCode << "\tadd rdx, rax" << std::endl;
-			asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
-			traceint(instr);
-		}
-		else {
-			tracenop(instr);
-		}
+		tracenop(instr);
 	}
 
 	//2 uOPs
@@ -570,7 +490,7 @@ namespace RandomX {
 		INST_HANDLE(IMULH_M)
 		INST_HANDLE(ISMULH_R)
 		INST_HANDLE(ISMULH_M)
-		INST_HANDLE(IDIV_C)
+		INST_HANDLE(IMUL_RCP)
 		INST_HANDLE(ISDIV_C)
 		INST_HANDLE(INEG_R)
 		INST_HANDLE(IXOR_R)

diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
@@ -61,7 +61,7 @@ namespace RandomX {
 		void  h_IMULH_M(Instruction&, int);
 		void  h_ISMULH_R(Instruction&, int);
 		void  h_ISMULH_M(Instruction&, int);
-		void  h_IDIV_C(Instruction&, int);
+		void  h_IMUL_RCP(Instruction&, int);
 		void  h_ISDIV_C(Instruction&, int);
 		void  h_INEG_R(Instruction&, int);
 		void  h_IXOR_R(Instruction&, int);

diff --git a/src/Instruction.cpp b/src/Instruction.cpp
@@ -193,7 +193,7 @@ namespace RandomX {
 		}
 	}
 
-	void Instruction::h_IDIV_C(std::ostream& os) const {
+	void Instruction::h_IMUL_RCP(std::ostream& os) const {
 		os << "r" << (int)dst << ", " << imm32 << std::endl;
 	}
 
@@ -345,7 +345,7 @@ namespace RandomX {
 		INST_NAME(IMULH_M)
 		INST_NAME(ISMULH_R)
 		INST_NAME(ISMULH_M)
-		INST_NAME(IDIV_C)
+		INST_NAME(IMUL_RCP)
 		INST_NAME(ISDIV_C)
 		INST_NAME(INEG_R)
 		INST_NAME(IXOR_R)
@@ -396,7 +396,7 @@ namespace RandomX {
 		INST_HANDLE(IMULH_M)
 		INST_HANDLE(ISMULH_R)
 		INST_HANDLE(ISMULH_M)
-		INST_HANDLE(IDIV_C)
+		INST_HANDLE(IMUL_RCP)
 		INST_HANDLE(ISDIV_C)
 		INST_HANDLE(INEG_R)
 		INST_HANDLE(IXOR_R)

diff --git a/src/Instruction.hpp b/src/Instruction.hpp
@@ -41,8 +41,8 @@ namespace RandomX {
 		constexpr int IMULH_M = 9;
 		constexpr int ISMULH_R = 10;
 		constexpr int ISMULH_M = 11;
-		constexpr int IDIV_C = 12;
-		constexpr int ISDIV_C = 13;
+		constexpr int IMUL_RCP = 12;
+		//constexpr int ISDIV_C = 13;
 		constexpr int INEG_R = 14;
 		constexpr int IXOR_R = 15;
 		constexpr int IXOR_M = 16;
@@ -103,7 +103,7 @@ namespace RandomX {
 		void  h_IMULH_M(std::ostream&) const;
 		void  h_ISMULH_R(std::ostream&) const;
 		void  h_ISMULH_M(std::ostream&) const;
-		void  h_IDIV_C(std::ostream&) const;
+		void  h_IMUL_RCP(std::ostream&) const;
 		void  h_ISDIV_C(std::ostream&) const;
 		void  h_INEG_R(std::ostream&) const;
 		void  h_IXOR_R(std::ostream&) const;

diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
@@ -30,6 +30,7 @@ along with RandomX.  If not, see<http://www.gnu.org/licenses/>.
 #include <cfloat>
 #include <thread>
 #include "intrinPortable.h"
+#include "reciprocal.h"
 #ifdef STATS
 #include <algorithm>
 #endif
@@ -136,7 +137,7 @@ namespace RandomX {
 				*ibc.idst += 8 * *ibc.idst + ibc.imm;
 			} break;
 
-			case InstructionType::IMUL_R: {
+			case InstructionType::IMUL_R: { //also handles IMUL_RCP
 				*ibc.idst *= *ibc.isrc;
 			} break;
 
@@ -160,24 +161,6 @@ namespace RandomX {
 				*ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask))));
 			} break;
 
-			case InstructionType::IDIV_C: {
-				uint64_t dividend = *ibc.idst;
-				uint64_t quotient = dividend / ibc.imm;
-				*ibc.idst += quotient;
-			} break;
-
-			case InstructionType::ISDIV_C: {
-				if (ibc.simm != -1) {
-					int64_t dividend = unsigned64ToSigned2sCompl(*ibc.idst);
-					int64_t quotient = dividend / ibc.simm;
-					*ibc.idst += quotient;
-				}
-				else {
-					uint64_t quotient = ~(*ibc.idst) + 1;
-					*ibc.idst += quotient;
-				}
-			} break;
-
 			case InstructionType::INEG_R: {
 				*ibc.idst = ~(*ibc.idst) + 1; //two's complement negative
 			} break;
@@ -568,30 +551,22 @@ namespace RandomX {
 					}
 				} break;
 
-				CASE_REP(IDIV_C) {
+				CASE_REP(IMUL_RCP) {
 					uint32_t divisor = instr.imm32;
 					if (divisor != 0) {
 						auto dst = instr.dst % RegistersCount;
-						ibc.type = InstructionType::IDIV_C;
+						ibc.type = InstructionType::IMUL_R;
 						ibc.idst = &r[dst];
-						ibc.imm = divisor;
+						ibc.imm = reciprocal(divisor);
+						ibc.isrc = &ibc.imm;
 					}
 					else {
 						ibc.type = InstructionType::NOP;
 					}
 				} break;
 
 				CASE_REP(ISDIV_C) {
-					int32_t divisor = unsigned32ToSigned2sCompl(instr.imm32);
-					if (divisor != 0) {
-						auto dst = instr.dst % RegistersCount;
-						ibc.type = InstructionType::ISDIV_C;
-						ibc.idst = &r[dst];
-						ibc.simm = divisor;
-					}
-					else {
-						ibc.type = InstructionType::NOP;
-					}
+					ibc.type = InstructionType::NOP;
 				} break;
 
 				CASE_REP(INEG_R) {