diff --git a/doc/isa-ops.md b/doc/isa-ops.md
index 79ac3072..d403bda9 100644
--- a/doc/isa-ops.md
+++ b/doc/isa-ops.md
@@ -19,8 +19,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
|1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`|
|4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)|
|1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)|
-|4/256|IDIV_C|R|-|-|`dst = dst + dst / imm32`|
-|4/256|ISDIV_C|R|-|-|`dst = dst + dst / imm32` (signed)|
+|8/256|IMUL_RCP|R|-|-|<code>dst = 2<sup>x</sup> / imm32 * dst</code>|
|2/256|INEG_R|R|-|-|`dst = -dst`|
|16/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`|
|4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`|
@@ -30,8 +29,8 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
#### IMULH and ISMULH
These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`).
-#### IDIV_C and ISDIV_C
-The division instructions use a constant divisor, so they can be optimized into a [multiplication by fixed-point reciprocal](https://en.wikipedia.org/wiki/Division_algorithm#Division_by_a_constant). `IDIV_C` performs unsigned division (`imm32` is zero-extended to 64 bits), while `ISDIV_C` performs signed division. In the case of division by zero, the instructions become a no-op. In the very rare case of signed overflow, the destination register is set to zero.
+#### IMUL_RCP
+This instruction multiplies the destination register by a reciprocal of `imm32`. The reciprocal is calculated as <code>rcp = 2<sup>x</sup> / imm32</code> by choosing the largest integer `x` such that <code>rcp &lt; 2<sup>64</sup></code>. If `imm32` equals 0, this instruction is a no-op.
#### ISWAP_R
This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op.
@@ -54,7 +53,7 @@ Memory operands are loaded as 8-byte values from the address indicated by `src`.
|6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`|
#### FSCAL_R
-This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{-1, +1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31.
+This instruction negates the number and multiplies it by <code>2<sup>x</sup></code>. `x` is calculated by taking the 5 least significant digits of the biased exponent and interpreting them as a binary number using the digit set `{+1, -1}` as opposed to the traditional `{0, 1}`. The possible values of `x` are all odd numbers from -31 to +31.
The mathematical operation described above is equivalent to a bitwise XOR of the binary representation with the value of `0x81F0000000000000`.
diff --git a/makefile b/makefile
index 7ad52314..159eb2af 100644
--- a/makefile
+++ b/makefile
@@ -9,7 +9,7 @@ OBJDIR=obj
LDFLAGS=-lpthread
CPPSRC=src/argon2_core.c src/Cache.cpp src/divideByConstantCodegen.c src/Instruction.cpp src/JitCompilerX86.cpp src/Program.cpp src/VirtualMachine.cpp src/argon2_ref.c src/CompiledVirtualMachine.cpp src/executeProgram-linux.cpp src/instructionsPortable.cpp src/LightClientAsyncWorker.cpp src/softAes.cpp src/virtualMemory.cpp src/AssemblyGeneratorX86.cpp src/dataset.cpp src/hashAes1Rx4.cpp src/InterpretedVirtualMachine.cpp src/main.cpp src/TestAluFpu.cpp src/blake2/blake2b.c
TOBJS=$(addprefix $(OBJDIR)/,instructionsPortable.o TestAluFpu.o)
-ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o divideByConstantCodegen.o LightClientAsyncWorker.o hashAes1Rx4.o)
+ROBJS=$(addprefix $(OBJDIR)/,argon2_core.o argon2_ref.o AssemblyGeneratorX86.o blake2b.o CompiledVirtualMachine.o dataset.o JitCompilerX86.o instructionsPortable.o Instruction.o InterpretedVirtualMachine.o main.o Program.o softAes.o VirtualMachine.o Cache.o virtualMemory.o reciprocal.o LightClientAsyncWorker.o hashAes1Rx4.o)
ifeq ($(PLATFORM),amd64)
ROBJS += $(OBJDIR)/JitCompilerX86-static.o $(OBJDIR)/squareHash.o
CXXFLAGS += -maes
@@ -53,7 +53,7 @@ $(OBJDIR)/argon2_core.o: $(addprefix $(SRCDIR)/,argon2_core.c argon2_core.h blak
$(OBJDIR)/argon2_ref.o: $(addprefix $(SRCDIR)/,argon2_ref.c argon2.h argon2_core.h blake2/blake2.h blake2/blake2-impl.h blake2/blamka-round-ref.h blake2/endian.h) | $(OBJDIR)
$(CC) $(CCFLAGS) -c $(SRCDIR)/argon2_ref.c -o $@
-$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h divideByConstantCodegen.h Program.hpp) | $(OBJDIR)
+$(OBJDIR)/AssemblyGeneratorX86.o: $(addprefix $(SRCDIR)/,AssemblyGeneratorX86.cpp AssemblyGeneratorX86.hpp Instruction.hpp common.hpp instructionWeights.hpp blake2/endian.h reciprocal.h Program.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/AssemblyGeneratorX86.cpp -o $@
$(OBJDIR)/blake2b.o: $(addprefix $(SRCDIR)/blake2/,blake2b.c blake2.h blake2-impl.h endian.h) | $(OBJDIR)
@@ -65,13 +65,13 @@ $(OBJDIR)/CompiledVirtualMachine.o: $(addprefix $(SRCDIR)/,CompiledVirtualMachin
$(OBJDIR)/dataset.o: $(addprefix $(SRCDIR)/,dataset.cpp common.hpp blake2/endian.h dataset.hpp intrinPortable.h Cache.hpp virtualMemory.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/dataset.cpp -o $@
-$(OBJDIR)/divideByConstantCodegen.o: $(addprefix $(SRCDIR)/,divideByConstantCodegen.c divideByConstantCodegen.h) | $(OBJDIR)
- $(CC) $(CCFLAGS) -c $(SRCDIR)/divideByConstantCodegen.c -o $@
+$(OBJDIR)/reciprocal.o: $(addprefix $(SRCDIR)/,reciprocal.c reciprocal.h) | $(OBJDIR)
+ $(CC) $(CCFLAGS) -c $(SRCDIR)/reciprocal.c -o $@
$(OBJDIR)/hashAes1Rx4.o: $(addprefix $(SRCDIR)/,hashAes1Rx4.cpp softAes.h intrinPortable.h blake2/endian.h) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/hashAes1Rx4.cpp -o $@
-$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp divideByConstantCodegen.h virtualMemory.hpp) | $(OBJDIR)
+$(OBJDIR)/JitCompilerX86.o: $(addprefix $(SRCDIR)/,JitCompilerX86.cpp JitCompilerX86.hpp Instruction.hpp instructionWeights.hpp common.hpp blake2/endian.h Program.hpp reciprocal.h virtualMemory.hpp) | $(OBJDIR)
$(CXX) $(CXXFLAGS) -c $(SRCDIR)/JitCompilerX86.cpp -o $@
$(OBJDIR)/JitCompilerX86-static.o: $(addprefix $(SRCDIR)/,JitCompilerX86-static.S $(addprefix asm/program_, prologue_linux.inc prologue_load.inc epilogue_linux.inc epilogue_store.inc read_dataset.inc loop_load.inc loop_store.inc xmm_constants.inc)) | $(OBJDIR)
diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp
index c20138e2..15a196b7 100644
--- a/src/AssemblyGeneratorX86.cpp
+++ b/src/AssemblyGeneratorX86.cpp
@@ -17,12 +17,10 @@ You should have received a copy of the GNU General Public License
along with RandomX. If not, see <http://www.gnu.org/licenses/>.
*/
//#define TRACE
-#define MAGIC_DIVISION
+
#include "AssemblyGeneratorX86.hpp"
#include "common.hpp"
-#ifdef MAGIC_DIVISION
-#include "divideByConstantCodegen.h"
-#endif
+#include "reciprocal.h"
#include "Program.hpp"
namespace RandomX {
@@ -276,38 +274,12 @@ namespace RandomX {
traceint(instr);
}
- //~6 uOPs
- void AssemblyGeneratorX86::h_IDIV_C(Instruction& instr, int i) {
+ //2 uOPs
+ void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) {
if (instr.imm32 != 0) {
uint32_t divisor = instr.imm32;
- if (divisor & (divisor - 1)) {
- magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
- if (mi.pre_shift == 0 && !mi.increment) {
- asmCode << "\tmov rax, " << mi.multiplier << std::endl;
- asmCode << "\tmul " << regR[instr.dst] << std::endl;
- }
- else {
- asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
- if (mi.pre_shift > 0)
- asmCode << "\tshr rax, " << mi.pre_shift << std::endl;
- if (mi.increment) {
- asmCode << "\tadd rax, 1" << std::endl;
- asmCode << "\tsbb rax, 0" << std::endl;
- }
- asmCode << "\tmov rcx, " << mi.multiplier << std::endl;
- asmCode << "\tmul rcx" << std::endl;
- }
- if (mi.post_shift > 0)
- asmCode << "\tshr rdx, " << mi.post_shift << std::endl;
- asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
- }
- else { //divisor is a power of two
- int shift = 0;
- while (divisor >>= 1)
- ++shift;
- if(shift > 0)
- asmCode << "\tshr " << regR[instr.dst] << ", " << shift << std::endl;
- }
+ asmCode << "\tmov rax, " << reciprocal(instr.imm32) << std::endl;
+ asmCode << "\timul " << regR[instr.dst] << ", rax" << std::endl;
traceint(instr);
}
else {
@@ -317,59 +289,7 @@ namespace RandomX {
//~8.5 uOPs
void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) {
- int64_t divisor = (int32_t)instr.imm32;
- if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
- asmCode << "\tmov rax, " << regR[instr.dst] << std::endl;
- // +/- power of two
- bool negative = divisor < 0;
- if (negative)
- divisor = -divisor;
- int shift = 0;
- uint64_t unsignedDivisor = divisor;
- while (unsignedDivisor >>= 1)
- ++shift;
- if (shift > 0) {
- asmCode << "\tmov rcx, rax" << std::endl;
- asmCode << "\tsar rcx, 63" << std::endl;
- uint32_t mask = (1ULL << shift) + 0xFFFFFFFF;
- asmCode << "\tand ecx, 0" << std::hex << mask << std::dec << "h" << std::endl;
- asmCode << "\tadd rax, rcx" << std::endl;
- asmCode << "\tsar rax, " << shift << std::endl;
- }
- if (negative)
- asmCode << "\tneg rax" << std::endl;
- asmCode << "\tadd " << regR[instr.dst] << ", rax" << std::endl;
- traceint(instr);
- }
- else if (divisor != 0) {
- magics_info mi = compute_signed_magic_info(divisor);
- asmCode << "\tmov rax, " << mi.multiplier << std::endl;
- asmCode << "\timul " << regR[instr.dst] << std::endl;
- //asmCode << "\tmov rax, rdx" << std::endl;
- asmCode << "\txor eax, eax" << std::endl;
- bool haveSF = false;
- if (divisor > 0 && mi.multiplier < 0) {
- asmCode << "\tadd rdx, " << regR[instr.dst] << std::endl;
- haveSF = true;
- }
- if (divisor < 0 && mi.multiplier > 0) {
- asmCode << "\tsub rdx, " << regR[instr.dst] << std::endl;
- haveSF = true;
- }
- if (mi.shift > 0) {
- asmCode << "\tsar rdx, " << mi.shift << std::endl;
- haveSF = true;
- }
- if (!haveSF)
- asmCode << "\ttest rdx, rdx" << std::endl;
- asmCode << "\tsets al" << std::endl;
- asmCode << "\tadd rdx, rax" << std::endl;
- asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl;
- traceint(instr);
- }
- else {
- tracenop(instr);
- }
+ tracenop(instr);
}
//2 uOPs
@@ -570,7 +490,7 @@ namespace RandomX {
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
- INST_HANDLE(IDIV_C)
+ INST_HANDLE(IMUL_RCP)
INST_HANDLE(ISDIV_C)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp
index 9968ebe2..216e4929 100644
--- a/src/AssemblyGeneratorX86.hpp
+++ b/src/AssemblyGeneratorX86.hpp
@@ -61,7 +61,7 @@ namespace RandomX {
void h_IMULH_M(Instruction&, int);
void h_ISMULH_R(Instruction&, int);
void h_ISMULH_M(Instruction&, int);
- void h_IDIV_C(Instruction&, int);
+ void h_IMUL_RCP(Instruction&, int);
void h_ISDIV_C(Instruction&, int);
void h_INEG_R(Instruction&, int);
void h_IXOR_R(Instruction&, int);
diff --git a/src/Instruction.cpp b/src/Instruction.cpp
index 4296c884..205aaaac 100644
--- a/src/Instruction.cpp
+++ b/src/Instruction.cpp
@@ -193,7 +193,7 @@ namespace RandomX {
}
}
- void Instruction::h_IDIV_C(std::ostream& os) const {
+ void Instruction::h_IMUL_RCP(std::ostream& os) const {
os << "r" << (int)dst << ", " << imm32 << std::endl;
}
@@ -345,7 +345,7 @@ namespace RandomX {
INST_NAME(IMULH_M)
INST_NAME(ISMULH_R)
INST_NAME(ISMULH_M)
- INST_NAME(IDIV_C)
+ INST_NAME(IMUL_RCP)
INST_NAME(ISDIV_C)
INST_NAME(INEG_R)
INST_NAME(IXOR_R)
@@ -396,7 +396,7 @@ namespace RandomX {
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
- INST_HANDLE(IDIV_C)
+ INST_HANDLE(IMUL_RCP)
INST_HANDLE(ISDIV_C)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
diff --git a/src/Instruction.hpp b/src/Instruction.hpp
index a38e3e66..543dfbf5 100644
--- a/src/Instruction.hpp
+++ b/src/Instruction.hpp
@@ -41,8 +41,8 @@ namespace RandomX {
constexpr int IMULH_M = 9;
constexpr int ISMULH_R = 10;
constexpr int ISMULH_M = 11;
- constexpr int IDIV_C = 12;
- constexpr int ISDIV_C = 13;
+ constexpr int IMUL_RCP = 12;
+ //constexpr int ISDIV_C = 13;
constexpr int INEG_R = 14;
constexpr int IXOR_R = 15;
constexpr int IXOR_M = 16;
@@ -103,7 +103,7 @@ namespace RandomX {
void h_IMULH_M(std::ostream&) const;
void h_ISMULH_R(std::ostream&) const;
void h_ISMULH_M(std::ostream&) const;
- void h_IDIV_C(std::ostream&) const;
+ void h_IMUL_RCP(std::ostream&) const;
void h_ISDIV_C(std::ostream&) const;
void h_INEG_R(std::ostream&) const;
void h_IXOR_R(std::ostream&) const;
diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp
index 48722139..6a97d7d9 100644
--- a/src/InterpretedVirtualMachine.cpp
+++ b/src/InterpretedVirtualMachine.cpp
@@ -30,6 +30,7 @@ along with RandomX. If not, see.
#include
#include
#include "intrinPortable.h"
+#include "reciprocal.h"
#ifdef STATS
#include
#endif
@@ -136,7 +137,7 @@ namespace RandomX {
*ibc.idst += 8 * *ibc.idst + ibc.imm;
} break;
- case InstructionType::IMUL_R: {
+ case InstructionType::IMUL_R: { //also handles IMUL_RCP
*ibc.idst *= *ibc.isrc;
} break;
@@ -160,24 +161,6 @@ namespace RandomX {
*ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask))));
} break;
- case InstructionType::IDIV_C: {
- uint64_t dividend = *ibc.idst;
- uint64_t quotient = dividend / ibc.imm;
- *ibc.idst += quotient;
- } break;
-
- case InstructionType::ISDIV_C: {
- if (ibc.simm != -1) {
- int64_t dividend = unsigned64ToSigned2sCompl(*ibc.idst);
- int64_t quotient = dividend / ibc.simm;
- *ibc.idst += quotient;
- }
- else {
- uint64_t quotient = ~(*ibc.idst) + 1;
- *ibc.idst += quotient;
- }
- } break;
-
case InstructionType::INEG_R: {
*ibc.idst = ~(*ibc.idst) + 1; //two's complement negative
} break;
@@ -568,13 +551,14 @@ namespace RandomX {
}
} break;
- CASE_REP(IDIV_C) {
+ CASE_REP(IMUL_RCP) {
uint32_t divisor = instr.imm32;
if (divisor != 0) {
auto dst = instr.dst % RegistersCount;
- ibc.type = InstructionType::IDIV_C;
+ ibc.type = InstructionType::IMUL_R;
ibc.idst = &r[dst];
- ibc.imm = divisor;
+ ibc.imm = reciprocal(divisor);
+ ibc.isrc = &ibc.imm;
}
else {
ibc.type = InstructionType::NOP;
@@ -582,16 +566,7 @@ namespace RandomX {
} break;
CASE_REP(ISDIV_C) {
- int32_t divisor = unsigned32ToSigned2sCompl(instr.imm32);
- if (divisor != 0) {
- auto dst = instr.dst % RegistersCount;
- ibc.type = InstructionType::ISDIV_C;
- ibc.idst = &r[dst];
- ibc.simm = divisor;
- }
- else {
- ibc.type = InstructionType::NOP;
- }
+ ibc.type = InstructionType::NOP;
} break;
CASE_REP(INEG_R) {
diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp
index 543632e3..5293b050 100644
--- a/src/JitCompilerX86.cpp
+++ b/src/JitCompilerX86.cpp
@@ -21,7 +21,7 @@ along with RandomX. If not, see.
#include
#include "JitCompilerX86.hpp"
#include "Program.hpp"
-#include "divideByConstantCodegen.h"
+#include "reciprocal.h"
#include "virtualMemory.hpp"
namespace RandomX {
@@ -395,106 +395,17 @@ namespace RandomX {
emitByte(0xc2 + 8 * instr.dst);
}
- void JitCompilerX86::h_IDIV_C(Instruction& instr) {
+ void JitCompilerX86::h_IMUL_RCP(Instruction& instr) {
if (instr.imm32 != 0) {
- uint32_t divisor = instr.imm32;
- if (divisor & (divisor - 1)) {
- magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8);
- if (mi.pre_shift == 0 && !mi.increment) {
- emit(MOV_RAX_I);
- emit64(mi.multiplier);
- emit(REX_MUL_R);
- emitByte(0xe0 + instr.dst);
- }
- else {
- emit(REX_MOV_RR64);
- emitByte(0xc0 + instr.dst);
- if (mi.pre_shift > 0) {
- emit(REX_SHR_RAX);
- emitByte(mi.pre_shift);
- }
- if (mi.increment) {
- emit(RAX_ADD_SBB_1);
- }
- emit(MOV_RCX_I);
- emit64(mi.multiplier);
- emit(MUL_RCX);
- }
- if (mi.post_shift > 0) {
- emit(REX_SHR_RDX);
- emitByte(mi.post_shift);
- }
- emit(REX_ADD_RM);
- emitByte(0xc2 + 8 * instr.dst);
- }
- else { //divisor is a power of two
- int shift = 0;
- while (divisor >>= 1)
- ++shift;
- if (shift > 0) {
- emit(REX_SH);
- emitByte(0xe8 + instr.dst);
- }
- }
+ emit(MOV_RAX_I);
+ emit64(reciprocal(instr.imm32));
+ emit(REX_IMUL_RM);
+ emitByte(0xc0 + 8 * instr.dst);
}
}
void JitCompilerX86::h_ISDIV_C(Instruction& instr) {
- int64_t divisor = (int32_t)instr.imm32;
- if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) {
- emit(REX_MOV_RR64);
- emitByte(0xc0 + instr.dst);
- // +/- power of two
- bool negative = divisor < 0;
- if (negative)
- divisor = -divisor;
- int shift = 0;
- uint64_t unsignedDivisor = divisor;
- while (unsignedDivisor >>= 1)
- ++shift;
- if (shift > 0) {
- emit(MOV_RCX_RAX_SAR_RCX_63);
- uint32_t mask = (1ULL << shift) - 1;
- emit(AND_ECX_I);
- emit32(mask);
- emit(ADD_RAX_RCX);
- emit(SAR_RAX_I8);
- emitByte(shift);
- }
- if (negative)
- emit(NEG_RAX);
- emit(ADD_R_RAX);
- emitByte(0xc0 + instr.dst);
- }
- else if (divisor != 0) {
- magics_info mi = compute_signed_magic_info(divisor);
- emit(MOV_RAX_I);
- emit64(mi.multiplier);
- emit(REX_MUL_R);
- emitByte(0xe8 + instr.dst);
- emit(XOR_EAX_EAX);
- bool haveSF = false;
- if (divisor > 0 && mi.multiplier < 0) {
- emit(ADD_RDX_R);
- emitByte(0xc2 + 8 * instr.dst);
- haveSF = true;
- }
- if (divisor < 0 && mi.multiplier > 0) {
- emit(SUB_RDX_R);
- emitByte(0xc2 + 8 * instr.dst);
- haveSF = true;
- }
- if (mi.shift > 0) {
- emit(SAR_RDX_I8);
- emitByte(mi.shift);
- haveSF = true;
- }
- if (!haveSF)
- emit(TEST_RDX_RDX);
- emit(SETS_AL_ADD_RDX_RAX);
- emit(ADD_R_RAX);
- emitByte(0xc2 + 8 * instr.dst);
- }
+
}
void JitCompilerX86::h_INEG_R(Instruction& instr) {
@@ -748,7 +659,7 @@ namespace RandomX {
INST_HANDLE(IMULH_M)
INST_HANDLE(ISMULH_R)
INST_HANDLE(ISMULH_M)
- INST_HANDLE(IDIV_C)
+ INST_HANDLE(IMUL_RCP)
INST_HANDLE(ISDIV_C)
INST_HANDLE(INEG_R)
INST_HANDLE(IXOR_R)
diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp
index 5936dcfb..fed3a8a3 100644
--- a/src/JitCompilerX86.hpp
+++ b/src/JitCompilerX86.hpp
@@ -101,7 +101,7 @@ namespace RandomX {
void h_IMULH_M(Instruction&);
void h_ISMULH_R(Instruction&);
void h_ISMULH_M(Instruction&);
- void h_IDIV_C(Instruction&);
+ void h_IMUL_RCP(Instruction&);
void h_ISDIV_C(Instruction&);
void h_INEG_R(Instruction&);
void h_IXOR_R(Instruction&);
diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp
index 74b6211b..31f0c546 100644
--- a/src/instructionWeights.hpp
+++ b/src/instructionWeights.hpp
@@ -32,8 +32,8 @@ along with RandomX. If not, see.
#define WT_IMULH_M 1
#define WT_ISMULH_R 4
#define WT_ISMULH_M 1
-#define WT_IDIV_C 4
-#define WT_ISDIV_C 4
+#define WT_IMUL_RCP 8
+#define WT_ISDIV_C 0
#define WT_INEG_R 2
#define WT_IXOR_R 16
#define WT_IXOR_M 4
@@ -71,7 +71,7 @@ along with RandomX. If not, see.
constexpr int wtSum = WT_IADD_R + WT_IADD_M + WT_IADD_RC + WT_ISUB_R + \
WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \
-WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \
+WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IMUL_RCP + WT_ISDIV_C + \
WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \
WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \
WT_FSCAL_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \
diff --git a/src/main.cpp b/src/main.cpp
index bb2a52cc..0b6a0fa9 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -341,7 +341,7 @@ int main(int argc, char** argv) {
std::cout << "Calculated result: ";
result.print(std::cout);
if(programCount == 1000)
- std::cout << "Reference result: fe31e8fd7ed1cec773e87c0684b66b38e58b23ab255e8f9c6b62745e43a26851" << std::endl;
+ std::cout << "Reference result: d3ae5a9365196ed48bb98ebfc3316498e29443ea7f056ecbd272f749c6af7730" << std::endl;
if (!miningMode) {
std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl;
}
diff --git a/src/program.inc b/src/program.inc
index 3c73b240..8a18fe46 100644
--- a/src/program.inc
+++ b/src/program.inc
@@ -1,740 +1,720 @@
- ; COND_M r1, sg(L1[r3], -2004237569)
- xor ecx, ecx
+ ; IMULH_R r1, r0
+ mov rax, r9
+ mul r8
+ mov r9, rdx
+ ; IMULH_R r4, r5
+ mov rax, r12
+ mul r13
+ mov r12, rdx
+ ; FMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; IMUL_9C r6, 933674225
+ lea r14, [r14+r14*8+933674225]
+ ; IROR_R r7, r6
+ mov ecx, r14d
+ ror r15, cl
+ ; FSQRT_R e1
+ sqrtpd xmm5, xmm5
+ ; IADD_R r1, r0
+ add r9, r8
+ ; FSCAL_R f1
+ xorps xmm1, xmm15
+ ; IMUL_R r6, r5
+ imul r14, r13
+ ; FSCAL_R f3
+ xorps xmm3, xmm15
+ ; IADD_M r5, L1[r0]
+ mov eax, r8d
+ and eax, 16376
+ add r13, qword ptr [rsi+rax]
+ ; IMUL_RCP r0, 3332750793
+ mov rax, 11886301652177618669
+ imul r8, rax
+ ; ISTORE L1[r3], r0
mov eax, r11d
and eax, 16376
- cmp dword ptr [rsi+rax], -2004237569
- sets cl
- add r9, rcx
- ; IXOR_R r7, -1379425991
- xor r15, -1379425991
- ; IXOR_R r2, r6
- xor r10, r14
- ; FSWAP_R f3
- shufpd xmm3, xmm3, 1
- ; FADD_R f1, a1
- addpd xmm1, xmm9
- ; IMUL_R r0, r5
- imul r8, r13
+ mov qword ptr [rsi+rax], r8
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; ISUB_R r1, r3
+ sub r9, r11
+ ; ISMULH_R r4, r1
+ mov rax, r12
+ imul r9
+ mov r12, rdx
+ ; IADD_RC r3, r0, 1262539428
+ lea r11, [r11+r8+1262539428]
+ ; FSWAP_R e1
+ shufpd xmm5, xmm5, 1
; FMUL_R e1, a3
mulpd xmm5, xmm11
- ; IADD_R r3, r2
- add r11, r10
- ; COND_M r1, ab(L2[r6], -724006934)
+ ; FMUL_R e3, a3
+ mulpd xmm7, xmm11
+ ; ISWAP_R r0, r2
+ xchg r8, r10
+ ; COND_R r5, of(r4, 137305269)
xor ecx, ecx
- mov eax, r14d
+ cmp r12d, 137305269
+ seto cl
+ add r13, rcx
+ ; IMUL_R r6, r4
+ imul r14, r12
+ ; FMUL_R e3, a0
+ mulpd xmm7, xmm8
+ ; FSCAL_R f0
+ xorps xmm0, xmm15
+ ; FADD_R f1, a0
+ addpd xmm1, xmm8
+ ; IADD_R r6, r3
+ add r14, r11
+ ; ISMULH_M r1, L3[777112]
+ mov rax, r9
+ imul qword ptr [rsi+777112]
+ mov r9, rdx
+ ; FADD_R f1, a1
+ addpd xmm1, xmm9
+ ; FSUB_M f2, L2[r3]
+ mov eax, r11d
and eax, 262136
- cmp dword ptr [rsi+rax], -724006934
- seta cl
- add r9, rcx
- ; IADD_RC r2, r7, -854121467
- lea r10, [r10+r15-854121467]
- ; IADD_RC r5, r6, 1291744030
- lea r13, [r13+r14+1291744030]
- ; ISTORE L2[r6], r4
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm2, xmm12
+ ; IMUL_R r5, r7
+ imul r13, r15
+ ; ISUB_M r1, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ sub r9, qword ptr [rsi+rax]
+ ; IXOR_M r1, L1[r6]
mov eax, r14d
+ and eax, 16376
+ xor r9, qword ptr [rsi+rax]
+ ; COND_R r2, ns(r3, 1727033430)
+ xor ecx, ecx
+ cmp r11d, 1727033430
+ setns cl
+ add r10, rcx
+ ; FADD_R f3, a1
+ addpd xmm3, xmm9
+ ; FADD_R f2, a2
+ addpd xmm2, xmm10
+ ; IADD_R r5, -1048707993
+ add r13, -1048707993
+ ; COND_R r2, ge(r5, -1016934677)
+ xor ecx, ecx
+ cmp r13d, -1016934677
+ setge cl
+ add r10, rcx
+ ; FSUB_R f2, a3
+ subpd xmm2, xmm11
+ ; ISUB_M r1, L2[r4]
+ mov eax, r12d
+ and eax, 262136
+ sub r9, qword ptr [rsi+rax]
+ ; IMUL_R r5, r3
+ imul r13, r11
+ ; FSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; IROR_R r1, r3
+ mov ecx, r11d
+ ror r9, cl
+ ; FADD_R f3, a2
+ addpd xmm3, xmm10
+ ; ISUB_R r0, -28376526
+ sub r8, -28376526
+ ; IROR_R r6, r0
+ mov ecx, r8d
+ ror r14, cl
+ ; FADD_R f1, a0
+ addpd xmm1, xmm8
+ ; FMUL_R e1, a0
+ mulpd xmm5, xmm8
+ ; IXOR_R r2, r4
+ xor r10, r12
+ ; FSUB_M f1, L1[r2]
+ mov eax, r10d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm1, xmm12
+ ; FSWAP_R f3
+ shufpd xmm3, xmm3, 1
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; ISUB_R r7, r6
+ sub r15, r14
+ ; FADD_R f3, a1
+ addpd xmm3, xmm9
+ ; ISUB_R r1, r7
+ sub r9, r15
+ ; IADD_M r5, L2[r7]
+ mov eax, r15d
and eax, 262136
- mov qword ptr [rsi+rax], r12
- ; IMUL_R r6, r7
- imul r14, r15
+ add r13, qword ptr [rsi+rax]
+ ; IADD_RC r1, r3, 145589392
+ lea r9, [r9+r11+145589392]
+ ; FADD_R f2, a1
+ addpd xmm2, xmm9
+ ; FSUB_R f1, a1
+ subpd xmm1, xmm9
+ ; FADD_M f0, L1[r3]
+ mov eax, r11d
+ and eax, 16376
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm0, xmm12
+ ; FADD_R f3, a1
+ addpd xmm3, xmm9
; FSUB_R f0, a3
subpd xmm0, xmm11
- ; IADD_M r3, L1[r0]
- mov eax, r8d
- and eax, 16376
- add r11, qword ptr [rsi+rax]
- ; ISDIV_C r4, -692911499
- mov rax, -893288710803585809
- imul r12
- xor eax, eax
- sar rdx, 25
- sets al
- add rdx, rax
- add r12, rdx
- ; FMUL_R e0, a0
- mulpd xmm4, xmm8
- ; FDIV_M e1, L1[r0]
- mov eax, r8d
+ ; FMUL_R e2, a2
+ mulpd xmm6, xmm10
+ ; FADD_R f2, a1
+ addpd xmm2, xmm9
+ ; IXOR_R r7, r4
+ xor r15, r12
+ ; FSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; IMUL_RCP r0, 3339947118
+ mov rax, 11860691159940745144
+ imul r8, rax
+ ; FSCAL_R f2
+ xorps xmm2, xmm15
+ ; IMUL_9C r0, 850304074
+ lea r8, [r8+r8*8+850304074]
+ ; IADD_R r2, r4
+ add r10, r12
+ ; IADD_R r0, -1929760745
+ add r8, -1929760745
+ ; ISTORE L2[r4], r7
+ mov eax, r12d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; IROR_R r2, r7
+ mov ecx, r15d
+ ror r10, cl
+ ; FMUL_R e1, a1
+ mulpd xmm5, xmm9
+ ; FSQRT_R e3
+ sqrtpd xmm7, xmm7
+ ; IXOR_R r0, -1150923249
+ xor r8, -1150923249
+ ; IMUL_9C r7, 586146619
+ lea r15, [r15+r15*8+586146619]
+ ; FSWAP_R f2
+ shufpd xmm2, xmm2, 1
+ ; FSUB_M f3, L1[r6]
+ mov eax, r14d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
- andps xmm12, xmm14
- divpd xmm5, xmm12
- maxpd xmm5, xmm13
- ; FMUL_R e0, a1
- mulpd xmm4, xmm9
- ; COND_M r0, no(L1[r1], -540292380)
+ subpd xmm3, xmm12
+ ; IXOR_R r0, 292938237
+ xor r8, 292938237
+ ; COND_R r6, no(r6, -2142285576)
xor ecx, ecx
+ cmp r14d, -2142285576
+ setno cl
+ add r14, rcx
+ ; IMUL_RCP r3, 670137279
+ mov rax, 14778345608621248183
+ imul r11, rax
+ ; ISTORE L1[r1], r5
mov eax, r9d
and eax, 16376
- cmp dword ptr [rsi+rax], -540292380
- setno cl
- add r8, rcx
- ; FSUB_R f1, a1
- subpd xmm1, xmm9
- ; IADD_RC r0, r2, 310371682
- lea r8, [r8+r10+310371682]
- ; COND_R r3, lt(r0, -1067603143)
+ mov qword ptr [rsi+rax], r13
+ ; COND_R r3, sg(r1, 1638220289)
xor ecx, ecx
- cmp r8d, -1067603143
- setl cl
+ cmp r9d, 1638220289
+ sets cl
add r11, rcx
- ; FMUL_R e0, a0
- mulpd xmm4, xmm8
- ; FADD_R f0, a3
- addpd xmm0, xmm11
- ; COND_R r4, sg(r3, -389806289)
+ ; IXOR_R r4, r2
+ xor r12, r10
+ ; COND_R r2, be(r2, 1131588253)
+ xor ecx, ecx
+ cmp r10d, 1131588253
+ setbe cl
+ add r10, rcx
+ ; IMULH_R r3, r1
+ mov rax, r11
+ mul r9
+ mov r11, rdx
+ ; COND_R r3, sg(r6, 1528901692)
xor ecx, ecx
- cmp r11d, -389806289
+ cmp r14d, 1528901692
sets cl
- add r12, rcx
+ add r11, rcx
+ ; IMUL_M r6, L2[r4]
+ mov eax, r12d
+ and eax, 262136
+ imul r14, qword ptr [rsi+rax]
+ ; ISMULH_M r1, L1[r2]
+ mov ecx, r10d
+ and ecx, 16376
+ mov rax, r9
+ imul qword ptr [rsi+rcx]
+ mov r9, rdx
+ ; ISUB_M r5, L1[r4]
+ mov eax, r12d
+ and eax, 16376
+ sub r13, qword ptr [rsi+rax]
+ ; IMUL_RCP r1, 1612208358
+ mov rax, 12285658072842024305
+ imul r9, rax
+ ; COND_R r2, lt(r6, -1712049035)
+ xor ecx, ecx
+ cmp r14d, -1712049035
+ setl cl
+ add r10, rcx
+ ; IMUL_RCP r2, 2888266520
+ mov rax, 13715521397634789187
+ imul r10, rax
+ ; IADD_M r1, L2[r6]
+ mov eax, r14d
+ and eax, 262136
+ add r9, qword ptr [rsi+rax]
; FMUL_R e0, a3
mulpd xmm4, xmm11
- ; ISTORE L2[r7], r4
+ ; ISTORE L1[r7], r1
mov eax, r15d
- and eax, 262136
- mov qword ptr [rsi+rax], r12
- ; IADD_RC r4, r2, 1888908452
- lea r12, [r12+r10+1888908452]
- ; IADD_R r1, r2
- add r9, r10
- ; IXOR_R r6, r5
- xor r14, r13
- ; IADD_M r7, L1[r0]
+ and eax, 16376
+ mov qword ptr [rsi+rax], r9
+ ; ISTORE L1[r0], r3
mov eax, r8d
and eax, 16376
- add r15, qword ptr [rsi+rax]
- ; IADD_R r5, r6
- add r13, r14
+ mov qword ptr [rsi+rax], r11
; FSUB_R f0, a1
subpd xmm0, xmm9
- ; IMULH_R r5, r4
- mov rax, r13
- mul r12
- mov r13, rdx
- ; IMUL_9C r7, 753606235
- lea r15, [r15+r15*8+753606235]
- ; FSWAP_R e2
- shufpd xmm6, xmm6, 1
- ; IMUL_M r7, L1[r1]
- mov eax, r9d
- and eax, 16376
- imul r15, qword ptr [rsi+rax]
- ; IMUL_R r5, 1431156245
- imul r13, 1431156245
- ; IADD_RC r4, r2, 1268508410
- lea r12, [r12+r10+1268508410]
- ; FSWAP_R f2
- shufpd xmm2, xmm2, 1
- ; ISDIV_C r0, -845194077
- mov rax, -5858725577819591251
- imul r8
- xor eax, eax
- sar rdx, 28
- sets al
- add rdx, rax
- add r8, rdx
- ; COND_R r0, ab(r5, 1644043355)
- xor ecx, ecx
- cmp r13d, 1644043355
- seta cl
- add r8, rcx
- ; COND_R r5, lt(r0, 1216385844)
- xor ecx, ecx
- cmp r8d, 1216385844
- setl cl
- add r13, rcx
- ; IMUL_R r5, r2
- imul r13, r10
- ; ISTORE L1[r4], r6
- mov eax, r12d
- and eax, 16376
- mov qword ptr [rsi+rax], r14
- ; IXOR_R r4, r3
- xor r12, r11
- ; IXOR_R r6, r2
- xor r14, r10
+ ; FADD_R f2, a2
+ addpd xmm2, xmm10
+ ; FMUL_R e0, a1
+ mulpd xmm4, xmm9
+ ; FMUL_R e2, a0
+ mulpd xmm6, xmm8
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IROR_R r5, 21
+ ror r13, 21
; FSQRT_R e1
sqrtpd xmm5, xmm5
- ; COND_R r5, be(r1, 1781435695)
- xor ecx, ecx
- cmp r9d, 1781435695
- setbe cl
- add r13, rcx
- ; ISDIV_C r0, 1367038890
- mov rax, 1811126293978922977
- imul r8
- xor eax, eax
- sar rdx, 27
- sets al
- add rdx, rax
- add r8, rdx
- ; FDIV_M e1, L1[r3]
+ ; ISTORE L1[r3], r1
mov eax, r11d
and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- andps xmm12, xmm14
- divpd xmm5, xmm12
- maxpd xmm5, xmm13
- ; FMUL_R e2, a0
- mulpd xmm6, xmm8
- ; ISTORE L1[r5], r4
- mov eax, r13d
- and eax, 16376
- mov qword ptr [rsi+rax], r12
- ; IXOR_R r0, r4
- xor r8, r12
- ; IMUL_R r5, r1
- imul r13, r9
- ; FDIV_M e0, L1[r2]
- mov eax, r10d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- andps xmm12, xmm14
- divpd xmm4, xmm12
- maxpd xmm4, xmm13
- ; IMUL_R r6, r1
- imul r14, r9
- ; FSUB_M f1, L1[r0]
- mov eax, r8d
+ mov qword ptr [rsi+rax], r9
+ ; IMUL_9C r2, -290275273
+ lea r10, [r10+r10*8-290275273]
+ ; ISUB_M r7, L1[r3]
+ mov eax, r11d
and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; COND_R r2, ns(r1, 392878356)
- xor ecx, ecx
- cmp r9d, 392878356
- setns cl
- add r10, rcx
- ; IADD_R r6, r5
- add r14, r13
- ; FMUL_R e2, a0
- mulpd xmm6, xmm8
- ; ISTORE L1[r0], r3
+ sub r15, qword ptr [rsi+rax]
+ ; IMUL_R r6, 1301522739
+ imul r14, 1301522739
+ ; ISWAP_R r2, r4
+ xchg r10, r12
+ ; FMUL_R e3, a2
+ mulpd xmm7, xmm10
+ ; IMUL_9C r2, 877307769
+ lea r10, [r10+r10*8+877307769]
+ ; IMUL_R r0, r3
+ imul r8, r11
+ ; IMUL_9C r0, 1293318220
+ lea r8, [r8+r8*8+1293318220]
+ ; FSQRT_R e0
+ sqrtpd xmm4, xmm4
+ ; ISTORE L1[r0], r2
mov eax, r8d
and eax, 16376
- mov qword ptr [rsi+rax], r11
- ; IMUL_R r1, r3
- imul r9, r11
- ; IMUL_R r5, r2
- imul r13, r10
- ; FADD_R f0, a0
- addpd xmm0, xmm8
- ; FADD_R f0, a1
- addpd xmm0, xmm9
- ; FSUB_R f0, a0
- subpd xmm0, xmm8
- ; IMUL_R r3, r5
- imul r11, r13
- ; IADD_R r1, r5
- add r9, r13
- ; IXOR_M r0, L1[r5]
- mov eax, r13d
+ mov qword ptr [rsi+rax], r10
+ ; IMUL_RCP r5, 2071364883
+ mov rax, 9562313618003962461
+ imul r13, rax
+ ; FMUL_R e1, a2
+ mulpd xmm5, xmm10
+ ; FSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; FSUB_R f0, a1
+ subpd xmm0, xmm9
+ ; IMULH_R r6, r1
+ mov rax, r14
+ mul r9
+ mov r14, rdx
+ ; ISTORE L1[r6], r5
+ mov eax, r14d
and eax, 16376
- xor r8, qword ptr [rsi+rax]
- ; FSCAL_R f2
- xorps xmm2, xmm15
- ; IDIV_C r5, 2577129788
- mov rax, 15371395512010654233
- mul r13
- shr rdx, 31
- add r13, rdx
- ; COND_R r5, be(r5, -999219370)
- xor ecx, ecx
- cmp r13d, -999219370
- setbe cl
- add r13, rcx
- ; ISTORE L2[r0], r2
- mov eax, r8d
+ mov qword ptr [rsi+rax], r13
+ ; ISTORE L2[r1], r2
+ mov eax, r9d
and eax, 262136
mov qword ptr [rsi+rax], r10
- ; FSUB_R f3, a3
- subpd xmm3, xmm11
- ; IROR_R r7, r6
- mov ecx, r14d
- ror r15, cl
- ; COND_R r6, ab(r4, 1309137534)
- xor ecx, ecx
- cmp r12d, 1309137534
- seta cl
- add r14, rcx
- ; FMUL_R e3, a0
- mulpd xmm7, xmm8
- ; COND_M r3, no(L2[r5], 483660199)
- xor ecx, ecx
- mov eax, r13d
+ ; ISUB_M r1, L2[r4]
+ mov eax, r12d
and eax, 262136
- cmp dword ptr [rsi+rax], 483660199
- setno cl
- add r11, rcx
- ; IMUL_R r1, r6
- imul r9, r14
- ; IADD_RC r7, r2, -1340630490
- lea r15, [r15+r10-1340630490]
- ; IADD_M r0, L3[1554088]
- add r8, qword ptr [rsi+1554088]
- ; FMUL_R e2, a3
- mulpd xmm6, xmm11
- ; IDIV_C r0, 1566192452
- mov rax, 12646619898641986559
- mul r8
- shr rdx, 30
- add r8, rdx
- ; FADD_R f0, a1
- addpd xmm0, xmm9
- ; ISWAP_R r6, r0
- xchg r14, r8
- ; IMUL_9C r4, 1340891034
- lea r12, [r12+r12*8+1340891034]
- ; IROR_R r7, r2
- mov ecx, r10d
- ror r15, cl
- ; FSQRT_R e2
- sqrtpd xmm6, xmm6
- ; FADD_R f2, a1
- addpd xmm2, xmm9
- ; IMUL_R r4, r3
- imul r12, r11
- ; IADD_RC r6, r3, -1584624397
- lea r14, [r14+r11-1584624397]
- ; IROR_R r1, r7
- mov ecx, r15d
- ror r9, cl
- ; IXOR_R r4, r7
- xor r12, r15
- ; FSWAP_R f0
- shufpd xmm0, xmm0, 1
- ; FSWAP_R f3
- shufpd xmm3, xmm3, 1
- ; IROR_R r5, 3
- ror r13, 3
- ; FADD_R f3, a0
- addpd xmm3, xmm8
- ; FMUL_R e0, a0
- mulpd xmm4, xmm8
- ; IADD_R r4, r1
- add r12, r9
- ; COND_M r4, ge(L1[r6], -1612023931)
- xor ecx, ecx
+ sub r9, qword ptr [rsi+rax]
+ ; IADD_M r7, L1[r6]
mov eax, r14d
and eax, 16376
- cmp dword ptr [rsi+rax], -1612023931
- setge cl
- add r12, rcx
- ; FSWAP_R e2
- shufpd xmm6, xmm6, 1
- ; IADD_R r3, r7
- add r11, r15
- ; COND_R r5, be(r2, -1083018923)
- xor ecx, ecx
- cmp r10d, -1083018923
- setbe cl
- add r13, rcx
- ; IADD_R r3, r7
- add r11, r15
- ; ISTORE L2[r6], r0
+ add r15, qword ptr [rsi+rax]
+ ; IADD_RC r2, r0, -1705364403
+ lea r10, [r10+r8-1705364403]
+ ; ISTORE L1[r6], r5
mov eax, r14d
- and eax, 262136
- mov qword ptr [rsi+rax], r8
- ; IXOR_R r2, r3
- xor r10, r11
- ; FMUL_R e2, a3
- mulpd xmm6, xmm11
- ; FMUL_R e3, a3
- mulpd xmm7, xmm11
- ; FADD_R f0, a2
- addpd xmm0, xmm10
- ; ISTORE L1[r5], r1
+ and eax, 16376
+ mov qword ptr [rsi+rax], r13
+ ; FSUB_M f0, L1[r5]
mov eax, r13d
and eax, 16376
- mov qword ptr [rsi+rax], r9
- ; FMUL_R e3, a3
- mulpd xmm7, xmm11
- ; ISWAP_R r1, r2
- xchg r9, r10
- ; FSWAP_R e0
- shufpd xmm4, xmm4, 1
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ subpd xmm0, xmm12
+ ; IXOR_R r1, r3
+ xor r9, r11
+ ; FADD_R f2, a0
+ addpd xmm2, xmm8
+ ; FSCAL_R f2
+ xorps xmm2, xmm15
+ ; ISUB_R r6, -789651909
+ sub r14, -789651909
+ ; COND_R r4, sg(r1, -1404926795)
+ xor ecx, ecx
+ cmp r9d, -1404926795
+ sets cl
+ add r12, rcx
+ ; FSCAL_R f2
+ xorps xmm2, xmm15
+ ; ISUB_R r6, r7
+ sub r14, r15
+ ; IXOR_R r5, r2
+ xor r13, r10
+ ; IROR_R r6, r5
+ mov ecx, r13d
+ ror r14, cl
; FSUB_R f1, a2
subpd xmm1, xmm10
- ; FSUB_R f0, a0
- subpd xmm0, xmm8
- ; IROR_R r7, r0
- mov ecx, r8d
- ror r15, cl
- ; IADD_RC r5, r4, 283260945
- lea r13, [r13+r12+283260945]
- ; ISDIV_C r6, -340125851
- mov rax, -3639652898025032137
- imul r14
- xor eax, eax
- sar rdx, 26
- sets al
- add rdx, rax
- add r14, rdx
- ; ISTORE L2[r2], r3
- mov eax, r10d
- and eax, 262136
- mov qword ptr [rsi+rax], r11
- ; IADD_RC r6, r6, -935765909
- lea r14, [r14+r14-935765909]
- ; ISDIV_C r3, -701703430
- mov rax, -7056770631919985199
- imul r11
- xor eax, eax
- sar rdx, 28
- sets al
- add rdx, rax
- add r11, rdx
- ; IXOR_M r3, L2[r1]
- mov eax, r9d
- and eax, 262136
- xor r11, qword ptr [rsi+rax]
- ; FADD_R f2, a1
- addpd xmm2, xmm9
- ; ISTORE L1[r5], r7
+ ; IMUL_M r4, L1[r5]
mov eax, r13d
and eax, 16376
- mov qword ptr [rsi+rax], r15
- ; FSUB_R f2, a0
- subpd xmm2, xmm8
- ; FMUL_R e3, a2
- mulpd xmm7, xmm10
- ; IADD_R r2, r5
- add r10, r13
- ; IADD_RC r2, r5, -1056770544
- lea r10, [r10+r13-1056770544]
- ; ISTORE L2[r2], r3
+ imul r12, qword ptr [rsi+rax]
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; FSWAP_R e1
+ shufpd xmm5, xmm5, 1
+ ; IADD_RC r6, r5, 1744830258
+ lea r14, [r14+r13+1744830258]
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; ISUB_R r7, r0
+ sub r15, r8
+ ; FSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; IMUL_9C r4, 241775739
+ lea r12, [r12+r12*8+241775739]
+ ; FADD_R f0, a0
+ addpd xmm0, xmm8
+ ; IMUL_R r4, r3
+ imul r12, r11
+ ; IMUL_RCP r4, 2389176791
+ mov rax, 16580640414036304271
+ imul r12, rax
+ ; FSCAL_R f1
+ xorps xmm1, xmm15
+ ; FSUB_R f2, a1
+ subpd xmm2, xmm9
+ ; ISTORE L2[r2], r0
mov eax, r10d
and eax, 262136
- mov qword ptr [rsi+rax], r11
- ; ISMULH_R r7, r1
- mov rax, r15
- imul r9
- mov r15, rdx
+ mov qword ptr [rsi+rax], r8
+ ; IXOR_M r5, L1[r7]
+ mov eax, r15d
+ and eax, 16376
+ xor r13, qword ptr [rsi+rax]
+ ; IMULH_M r4, L1[r1]
+ mov ecx, r9d
+ and ecx, 16376
+ mov rax, r12
+ mul qword ptr [rsi+rcx]
+ mov r12, rdx
+ ; FMUL_R e2, a1
+ mulpd xmm6, xmm9
; IXOR_R r0, r5
xor r8, r13
- ; ISTORE L1[r4], r0
- mov eax, r12d
- and eax, 16376
- mov qword ptr [rsi+rax], r8
- ; INEG_R r5
- neg r13
- ; FSUB_R f0, a1
- subpd xmm0, xmm9
- ; IMUL_R r6, -244261682
- imul r14, -244261682
- ; IMUL_R r1, r0
- imul r9, r8
- ; IMUL_9C r3, -985744277
- lea r11, [r11+r11*8-985744277]
+ ; IROR_R r0, r7
+ mov ecx, r15d
+ ror r8, cl
+ ; IADD_RC r6, r5, 472588845
+ lea r14, [r14+r13+472588845]
+ ; FADD_R f0, a0
+ addpd xmm0, xmm8
+ ; FSCAL_R f0
+ xorps xmm0, xmm15
; IROR_R r2, r1
mov ecx, r9d
ror r10, cl
- ; ISUB_R r4, -1079131550
- sub r12, -1079131550
- ; FSCAL_R f3
- xorps xmm3, xmm15
- ; COND_R r4, ns(r5, -362284631)
- xor ecx, ecx
- cmp r13d, -362284631
- setns cl
- add r12, rcx
- ; FSUB_R f2, a0
- subpd xmm2, xmm8
- ; IXOR_R r4, r5
- xor r12, r13
- ; FSCAL_R f1
- xorps xmm1, xmm15
- ; FADD_R f0, a0
- addpd xmm0, xmm8
- ; IADD_RC r3, r3, -173615832
- lea r11, [r11+r11-173615832]
- ; IMUL_R r0, 928402279
- imul r8, 928402279
- ; ISUB_R r2, r0
- sub r10, r8
- ; IXOR_R r6, r3
- xor r14, r11
- ; ISUB_R r2, 2106401471
- sub r10, 2106401471
- ; FADD_R f0, a2
- addpd xmm0, xmm10
- ; IMUL_R r4, r6
- imul r12, r14
- ; IADD_RC r4, r0, -373491513
- lea r12, [r12+r8-373491513]
- ; ISDIV_C r0, -1739042721
- mov rax, 7057121271817449967
- imul r8
- xor eax, eax
- sub rdx, r8
- sar rdx, 30
- sets al
- add rdx, rax
- add r8, rdx
- ; IADD_R r3, r1
- add r11, r9
- ; ISUB_M r7, L1[r5]
- mov eax, r13d
- and eax, 16376
- sub r15, qword ptr [rsi+rax]
- ; IMUL_R r1, r2
- imul r9, r10
- ; ISUB_R r0, 722465116
- sub r8, 722465116
- ; IADD_RC r0, r0, -1919541169
- lea r8, [r8+r8-1919541169]
- ; ISUB_M r2, L1[r3]
- mov eax, r11d
- and eax, 16376
- sub r10, qword ptr [rsi+rax]
- ; IADD_R r7, -1183581468
- add r15, -1183581468
- ; FMUL_R e1, a3
- mulpd xmm5, xmm11
+ ; IADD_RC r2, r1, 1968510355
+ lea r10, [r10+r9+1968510355]
+ ; FMUL_R e0, a0
+ mulpd xmm4, xmm8
+ ; ISUB_R r7, r1
+ sub r15, r9
+ ; IADD_RC r4, r7, 1111936914
+ lea r12, [r12+r15+1111936914]
+ ; IADD_RC r7, r3, 373642756
+ lea r15, [r15+r11+373642756]
; FSUB_R f0, a0
subpd xmm0, xmm8
+ ; IMUL_RCP r6, 3388328460
+ mov rax, 11691334451422153092
+ imul r14, rax
+ ; FSWAP_R e1
+ shufpd xmm5, xmm5, 1
+ ; IADD_RC r7, r5, -644292398
+ lea r15, [r15+r13-644292398]
+ ; IMUL_9C r7, -1398596563
+ lea r15, [r15+r15*8-1398596563]
; FADD_R f0, a3
addpd xmm0, xmm11
- ; IMUL_9C r6, 1241113238
- lea r14, [r14+r14*8+1241113238]
- ; FSUB_R f3, a3
- subpd xmm3, xmm11
- ; IADD_M r0, L1[r3]
- mov eax, r11d
- and eax, 16376
- add r8, qword ptr [rsi+rax]
- ; IROR_R r3, r7
- mov ecx, r15d
- ror r11, cl
- ; FADD_R f2, a1
- addpd xmm2, xmm9
- ; IMUL_M r3, L1[r2]
- mov eax, r10d
- and eax, 16376
- imul r11, qword ptr [rsi+rax]
- ; IMUL_9C r7, -2080412544
- lea r15, [r15+r15*8-2080412544]
- ; IMUL_R r0, r3
- imul r8, r11
- ; FADD_R f1, a1
- addpd xmm1, xmm9
- ; IROR_R r6, 21
- ror r14, 21
- ; FDIV_M e3, L1[r1]
- mov eax, r9d
+ ; FDIV_M e1, L1[r5]
+ mov eax, r13d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
andps xmm12, xmm14
- divpd xmm7, xmm12
- maxpd xmm7, xmm13
- ; FSUB_R f0, a1
- subpd xmm0, xmm9
- ; FSWAP_R e1
- shufpd xmm5, xmm5, 1
- ; COND_M r0, no(L1[r5], -1627153829)
- xor ecx, ecx
+ divpd xmm5, xmm12
+ maxpd xmm5, xmm13
+ ; IXOR_M r2, L1[r5]
mov eax, r13d
and eax, 16376
- cmp dword ptr [rsi+rax], -1627153829
- setno cl
- add r8, rcx
- ; FADD_R f2, a3
- addpd xmm2, xmm11
- ; FSUB_R f1, a2
- subpd xmm1, xmm10
- ; FSUB_M f1, L1[r4]
- mov eax, r12d
- and eax, 16376
- cvtdq2pd xmm12, qword ptr [rsi+rax]
- subpd xmm1, xmm12
- ; ISTORE L1[r5], r1
- mov eax, r13d
+ xor r10, qword ptr [rsi+rax]
+ ; IADD_R r5, r6
+ add r13, r14
+ ; IROR_R r4, r0
+ mov ecx, r8d
+ ror r12, cl
+ ; IXOR_R r0, r6
+ xor r8, r14
+ ; IMUL_RCP r1, 1035942442
+ mov rax, 9559913671615977868
+ imul r9, rax
+ ; IMUL_9C r1, 105267179
+ lea r9, [r9+r9*8+105267179]
+ ; IMUL_M r1, L1[r2]
+ mov eax, r10d
and eax, 16376
- mov qword ptr [rsi+rax], r9
- ; ISUB_M r2, L2[r7]
+ imul r9, qword ptr [rsi+rax]
+ ; COND_R r6, be(r7, 1344676209)
+ xor ecx, ecx
+ cmp r15d, 1344676209
+ setbe cl
+ add r14, rcx
+ ; IADD_R r6, r1
+ add r14, r9
+ ; IROR_R r5, r1
+ mov ecx, r9d
+ ror r13, cl
+ ; ISMULH_R r0, r6
+ mov rax, r8
+ imul r14
+ mov r8, rdx
+ ; IXOR_R r6, r7
+ xor r14, r15
+ ; FSUB_R f1, a3
+ subpd xmm1, xmm11
+ ; IMUL_9C r1, 1991866007
+ lea r9, [r9+r9*8+1991866007]
+ ; IMUL_RCP r2, 4139294400
+ mov rax, 9570249764581173254
+ imul r10, rax
+ ; FSWAP_R f0
+ shufpd xmm0, xmm0, 1
+ ; ISUB_R r5, r2
+ sub r13, r10
+ ; COND_R r6, lt(r1, -834783176)
+ xor ecx, ecx
+ cmp r9d, -834783176
+ setl cl
+ add r14, rcx
+ ; ISTORE L2[r7], r3
mov eax, r15d
and eax, 262136
- sub r10, qword ptr [rsi+rax]
- ; ISTORE L1[r2], r3
- mov eax, r10d
- and eax, 16376
mov qword ptr [rsi+rax], r11
- ; FADD_R f0, a3
- addpd xmm0, xmm11
- ; ISUB_M r1, L1[r7]
+ ; FADD_R f2, a2
+ addpd xmm2, xmm10
+ ; FSCAL_R f1
+ xorps xmm1, xmm15
+ ; IMUL_R r7, r4
+ imul r15, r12
+ ; IMUL_RCP r4, 3027698566
+ mov rax, 13083892069700893994
+ imul r12, rax
+ ; IMULH_M r2, L1[r3]
+ mov ecx, r11d
+ and ecx, 16376
+ mov rax, r10
+ mul qword ptr [rsi+rcx]
+ mov r10, rdx
+ ; IADD_M r6, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ add r14, qword ptr [rsi+rax]
+ ; IMUL_M r3, L1[r1]
+ mov eax, r9d
+ and eax, 16376
+ imul r11, qword ptr [rsi+rax]
+ ; ISTORE L1[r7], r5
mov eax, r15d
and eax, 16376
- sub r9, qword ptr [rsi+rax]
- ; IDIV_C r5, 624165039
- mov rax, 15866829597104432181
- mul r13
- shr rdx, 29
- add r13, rdx
- ; FMUL_R e3, a0
- mulpd xmm7, xmm8
- ; IMUL_R r5, r4
- imul r13, r12
+ mov qword ptr [rsi+rax], r13
+ ; IADD_RC r3, r1, -183791073
+ lea r11, [r11+r9-183791073]
+ ; IMUL_9C r6, 1353963989
+ lea r14, [r14+r14*8+1353963989]
+ ; ISUB_R r2, r3
+ sub r10, r11
+ ; IMUL_R r2, r1
+ imul r10, r9
+ ; IMULH_R r6, r4
+ mov rax, r14
+ mul r12
+ mov r14, rdx
+ ; ISMULH_R r6, r4
+ mov rax, r14
+ imul r12
+ mov r14, rdx
+ ; IADD_R r7, r4
+ add r15, r12
; FMUL_R e3, a1
mulpd xmm7, xmm9
- ; FMUL_R e3, a3
- mulpd xmm7, xmm11
- ; IXOR_R r0, -2064879200
- xor r8, -2064879200
- ; FADD_R f1, a3
- addpd xmm1, xmm11
- ; IADD_M r0, L1[r3]
- mov eax, r11d
+ ; FADD_R f1, a2
+ addpd xmm1, xmm10
+ ; IADD_R r5, r6
+ add r13, r14
+ ; IADD_RC r4, r0, -1810659257
+ lea r12, [r12+r8-1810659257]
+ ; IROR_R r2, r5
+ mov ecx, r13d
+ ror r10, cl
+ ; FADD_R f2, a2
+ addpd xmm2, xmm10
+ ; FSWAP_R e2
+ shufpd xmm6, xmm6, 1
+ ; FADD_M f0, L1[r2]
+ mov eax, r10d
and eax, 16376
- add r8, qword ptr [rsi+rax]
- ; ISMULH_R r7, r3
- mov rax, r15
- imul r11
- mov r15, rdx
- ; IMUL_R r5, -1645503310
- imul r13, -1645503310
- ; IMUL_R r7, r3
- imul r15, r11
- ; FMUL_R e2, a2
- mulpd xmm6, xmm10
- ; IADD_R r6, 1769041191
- add r14, 1769041191
- ; FSUB_M f1, L1[r4]
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm0, xmm12
+ ; IADD_R r0, 52817665
+ add r8, 52817665
+ ; IMUL_RCP r6, 3388141601
+ mov rax, 11691979238837063231
+ imul r14, rax
+ ; IMUL_RCP r3, 1356467790
+ mov rax, 14601924774465956466
+ imul r11, rax
+ ; IADD_RC r7, r4, -2056421852
+ lea r15, [r15+r12-2056421852]
+ ; FSUB_M f1, L2[r4]
mov eax, r12d
- and eax, 16376
+ and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm1, xmm12
- ; ISTORE L2[r1], r0
- mov eax, r9d
+ ; ISWAP_R r1, r5
+ xchg r9, r13
+ ; ISTORE L2[r3], r5
+ mov eax, r11d
and eax, 262136
- mov qword ptr [rsi+rax], r8
- ; FSCAL_R f0
- xorps xmm0, xmm15
+ mov qword ptr [rsi+rax], r13
; FMUL_R e0, a3
mulpd xmm4, xmm11
- ; IMUL_R r2, r7
- imul r10, r15
- ; IADD_R r5, r1
- add r13, r9
- ; IROR_R r3, r6
- mov ecx, r14d
- ror r11, cl
- ; FADD_R f0, a0
- addpd xmm0, xmm8
- ; FMUL_R e1, a2
- mulpd xmm5, xmm10
+ ; IADD_RC r1, r4, -129008866
+ lea r9, [r9+r12-129008866]
+ ; COND_R r6, no(r4, 311828213)
+ xor ecx, ecx
+ cmp r12d, 311828213
+ setno cl
+ add r14, rcx
+ ; FSWAP_R e2
+ shufpd xmm6, xmm6, 1
+ ; IADD_RC r2, r2, 498744396
+ lea r10, [r10+r10+498744396]
+ ; IADD_RC r2, r3, 1515945097
+ lea r10, [r10+r11+1515945097]
+ ; FMUL_R e0, a2
+ mulpd xmm4, xmm10
+ ; ISTORE L2[r5], r7
+ mov eax, r13d
+ and eax, 262136
+ mov qword ptr [rsi+rax], r15
+ ; IMUL_M r7, L2[r0]
+ mov eax, r8d
+ and eax, 262136
+ imul r15, qword ptr [rsi+rax]
+ ; IADD_R r0, r2
+ add r8, r10
+ ; IADD_RC r7, r3, 1081450346
+ lea r15, [r15+r11+1081450346]
+ ; FADD_R f1, a3
+ addpd xmm1, xmm11
; FSCAL_R f3
xorps xmm3, xmm15
- ; FADD_R f1, a1
- addpd xmm1, xmm9
- ; IMULH_R r2, r5
- mov rax, r10
- mul r13
- mov r10, rdx
- ; ISTORE L1[r4], r0
- mov eax, r12d
- and eax, 16376
- mov qword ptr [rsi+rax], r8
- ; ISWAP_R r7, r0
- xchg r15, r8
- ; FSWAP_R f0
- shufpd xmm0, xmm0, 1
- ; ISUB_R r2, r0
- sub r10, r8
- ; FSUB_R f1, a3
- subpd xmm1, xmm11
- ; ISUB_M r5, L1[r3]
- mov eax, r11d
- and eax, 16376
- sub r13, qword ptr [rsi+rax]
- ; IXOR_R r7, r0
- xor r15, r8
- ; IMUL_R r4, r1
- imul r12, r9
- ; IADD_RC r0, r2, -1102648763
- lea r8, [r8+r10-1102648763]
- ; FMUL_R e3, a3
- mulpd xmm7, xmm11
- ; IXOR_R r4, r1
- xor r12, r9
- ; IXOR_R r6, r0
- xor r14, r8
- ; FSQRT_R e1
- sqrtpd xmm5, xmm5
- ; IMUL_M r6, L2[r1]
- mov eax, r9d
+ ; FADD_M f3, L2[r7]
+ mov eax, r15d
and eax, 262136
- imul r14, qword ptr [rsi+rax]
- ; ISMULH_M r5, L3[353552]
- mov rax, r13
- imul qword ptr [rsi+353552]
- mov r13, rdx
- ; ISUB_M r1, L1[r6]
- mov eax, r14d
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ addpd xmm3, xmm12
+ ; FSUB_R f3, a0
+ subpd xmm3, xmm8
+ ; COND_M r2, of(L1[r5], -255033167)
+ xor ecx, ecx
+ mov eax, r13d
and eax, 16376
- sub r9, qword ptr [rsi+rax]
+ cmp dword ptr [rsi+rax], -255033167
+ seto cl
+ add r10, rcx
+ ; FSUB_R f1, a1
+ subpd xmm1, xmm9
+ ; IADD_R r2, r5
+ add r10, r13
+ ; FSQRT_R e2
+ sqrtpd xmm6, xmm6
+ ; IMUL_9C r2, 1521722302
+ lea r10, [r10+r10*8+1521722302]
; FADD_R f0, a3
addpd xmm0, xmm11
- ; FMUL_R e3, a3
- mulpd xmm7, xmm11
- ; FSUB_M f3, L2[r7]
- mov eax, r15d
+ ; ISUB_R r0, r5
+ sub r8, r13
+ ; FADD_R f2, a0
+ addpd xmm2, xmm8
+ ; ISWAP_R r6, r0
+ xchg r14, r8
+ ; IADD_RC r1, r4, -693164762
+ lea r9, [r9+r12-693164762]
+ ; FDIV_M e0, L2[r2]
+ mov eax, r10d
+ and eax, 262136
+ cvtdq2pd xmm12, qword ptr [rsi+rax]
+ andps xmm12, xmm14
+ divpd xmm4, xmm12
+ maxpd xmm4, xmm13
+ ; IMUL_9C r4, -1849458799
+ lea r12, [r12+r12*8-1849458799]
+ ; IADD_RC r1, r4, -651820510
+ lea r9, [r9+r12-651820510]
+ ; IMULH_R r6, r6
+ mov rax, r14
+ mul r14
+ mov r14, rdx
+ ; FSUB_M f3, L2[r0]
+ mov eax, r8d
and eax, 262136
cvtdq2pd xmm12, qword ptr [rsi+rax]
subpd xmm3, xmm12
- ; IMUL_R r0, r2
- imul r8, r10
- ; FMUL_R e1, a0
- mulpd xmm5, xmm8
- ; COND_R r5, sg(r3, -1392293091)
- xor ecx, ecx
- cmp r11d, -1392293091
- sets cl
- add r13, rcx
- ; FSWAP_R e3
- shufpd xmm7, xmm7, 1
- ; IMUL_R r7, r4
- imul r15, r12
- ; IXOR_R r7, r5
- xor r15, r13
- ; FMUL_R e3, a3
- mulpd xmm7, xmm11
- ; IMUL_R r4, r3
- imul r12, r11
- ; FADD_M f1, L1[r1]
- mov eax, r9d
+ ; FSUB_R f0, a2
+ subpd xmm0, xmm10
+ ; FDIV_M e3, L1[r0]
+ mov eax, r8d
and eax, 16376
cvtdq2pd xmm12, qword ptr [rsi+rax]
- addpd xmm1, xmm12
- ; IMUL_R r5, r0
- imul r13, r8
- ; ISUB_R r7, r0
- sub r15, r8
- ; IADD_M r5, L1[r4]
- mov eax, r12d
- and eax, 16376
- add r13, qword ptr [rsi+rax]
- ; IADD_R r6, r2
- add r14, r10
- ; FMUL_R e1, a1
- mulpd xmm5, xmm9
- ; IADD_M r2, L3[1073640]
- add r10, qword ptr [rsi+1073640]
- ; IMUL_R r3, r2
- imul r11, r10
- ; IXOR_R r1, r0
- xor r9, r8
- ; IROR_R r7, r4
- mov ecx, r12d
- ror r15, cl
- ; FSUB_R f1, a1
- subpd xmm1, xmm9
- ; IMUL_R r7, r5
- imul r15, r13
- ; ISUB_R r1, 866191482
- sub r9, 866191482
- ; IMUL_M r7, L1[r4]
- mov eax, r12d
+ andps xmm12, xmm14
+ divpd xmm7, xmm12
+ maxpd xmm7, xmm13
+ ; IADD_M r3, L1[r7]
+ mov eax, r15d
and eax, 16376
- imul r15, qword ptr [rsi+rax]
- ; FADD_R f2, a0
- addpd xmm2, xmm8
- ; IADD_R r2, r1
- add r10, r9
+ add r11, qword ptr [rsi+rax]
+ ; IXOR_M r2, L2[r6]
+ mov eax, r14d
+ and eax, 262136
+ xor r10, qword ptr [rsi+rax]
diff --git a/src/reciprocal.c b/src/reciprocal.c
new file mode 100644
index 00000000..c414702e
--- /dev/null
+++ b/src/reciprocal.c
@@ -0,0 +1,60 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "reciprocal.h"
+
+
+/*
+	Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64,
+	i.e. floor(2**(63 + n) / divisor) truncated to 64 bits, n = bit length of divisor.
+	A power-of-two divisor yields exactly 2**64, which wraps to 0 in uint64_t.
+	A zero divisor has no reciprocal: 0 is returned instead of dividing by zero.
+
+	Equivalent x86 assembly (divisor in rcx); div faults for zero or power-of-two divisors:
+	mov edx, 1
+	mov r8, rcx
+	xor eax, eax
+	bsr rcx, rcx
+	shl rdx, cl
+	div r8
+	ret
+*/
+uint64_t reciprocal(uint64_t divisor) {
+	if (divisor == 0)
+		return 0; //the divisions below would be undefined
+	const uint64_t p2exp63 = 1ULL << 63;
+	uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor;
+
+	unsigned bsr = 0; //bit length of divisor (index of its highest set bit, plus one)
+	for (uint64_t bit = divisor; bit > 0; bit >>= 1)
+		bsr++;
+
+	//long division: each iteration doubles the dividend, taking 2**63 up to 2**(63 + bsr)
+	for (unsigned shift = 0; shift < bsr; shift++) {
+		if (remainder >= divisor - remainder) { //2 * remainder >= divisor, tested without overflow
+			quotient = quotient * 2 + 1;
+			remainder = remainder * 2 - divisor; //may wrap mod 2**64; the true result is < divisor
+		}
+		else {
+			quotient = quotient * 2;
+			remainder = remainder * 2;
+		}
+	}
+	return quotient;
+}
\ No newline at end of file
diff --git a/src/reciprocal.h b/src/reciprocal.h
new file mode 100644
index 00000000..0d133940
--- /dev/null
+++ b/src/reciprocal.h
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2019 tevador
+
+This file is part of RandomX.
+
+RandomX is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+RandomX is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with RandomX. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+	/* Returns 2**x / divisor for the highest integer x such that the result is
+	   below 2**64 (implemented in reciprocal.c). */
+	uint64_t reciprocal(uint64_t);
+#if defined(__cplusplus)
+}
+#endif