diff --git a/doc/isa-ops.md b/doc/isa-ops.md
index d403bda9..d98d5f01 100644
--- a/doc/isa-ops.md
+++ b/doc/isa-ops.md
@@ -40,6 +40,8 @@ For floating point instructions, the destination can be a group F or group E reg
Memory operands are loaded as 8-byte values from the address indicated by `src`. The 8 byte value is interpreted as two 32-bit signed integers and implicitly converted to floating point format. The lower and upper memory operands are marked as `[src][0]` and `[src][1]`.
+Memory operands for group E registers are loaded as described above, then their sign bit is cleared and their exponent value is set to `0x30F` (corresponds to 2<sup>-240</sup>).
+
|frequency|instruction|dst|src|operation|
|-|-|-|-|-|
|8/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`|
@@ -58,8 +60,7 @@ This instruction negates the number and multiplies it by 2x "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
- ; xmm13 -> DBL_MIN
- ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff
- ; xmm15 -> sign mask 0x80000000000000008000000000000000
+ ; xmm13 -> mantissa mask = 0x000fffffffffffff000fffffffffffff
+ ; xmm14 -> exponent 2**-240 = 0x30f000000000000030f0000000000000
+ ; xmm15 -> scale mask = 0x81f000000000000081f0000000000000
*/
@@ -165,7 +165,7 @@ namespace RandomX {
static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
- static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0f, 0x54, 0xe6 };
+ static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
@@ -556,8 +556,6 @@ namespace RandomX {
emit(REX_ANDPS_XMM12);
emit(REX_DIVPD);
emitByte(0xe4 + 8 * instr.dst);
- emit(REX_MAXPD);
- emitByte(0xe5 + 8 * instr.dst);
}
void JitCompilerX86::h_FSQRT_R(Instruction& instr) {
diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc
index 76b8f3de..6ef67ece 100644
--- a/src/asm/program_loop_load.inc
+++ b/src/asm/program_loop_load.inc
@@ -22,7 +22,11 @@
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
- andps xmm4, xmm14
- andps xmm5, xmm14
- andps xmm6, xmm14
- andps xmm7, xmm14
+ andps xmm4, xmm13
+ andps xmm5, xmm13
+ andps xmm6, xmm13
+ andps xmm7, xmm13
+ orps xmm4, xmm14
+ orps xmm5, xmm14
+ orps xmm6, xmm14
+ orps xmm7, xmm14
diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc
index a0acebca..1ba1635c 100644
--- a/src/asm/program_loop_store.inc
+++ b/src/asm/program_loop_store.inc
@@ -8,10 +8,10 @@
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
pop rcx
- mulpd xmm0, xmm4
- mulpd xmm1, xmm5
- mulpd xmm2, xmm6
- mulpd xmm3, xmm7
+ xorpd xmm0, xmm4
+ xorpd xmm1, xmm5
+ xorpd xmm2, xmm6
+ xorpd xmm3, xmm7
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc
index e487c583..c798ce70 100644
--- a/src/asm/program_prologue_linux.inc
+++ b/src/asm/program_prologue_linux.inc
@@ -32,8 +32,8 @@
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]
- movapd xmm13, xmmword ptr minDbl[rip]
- movapd xmm14, xmmword ptr absMask[rip]
- movapd xmm15, xmmword ptr signMask[rip]
+ movapd xmm13, xmmword ptr mantissaMask[rip]
+ movapd xmm14, xmmword ptr exp240[rip]
+ movapd xmm15, xmmword ptr scaleMask[rip]
jmp DECL(randomx_program_loop_begin)
\ No newline at end of file
diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc
index f91cca2c..5a666a37 100644
--- a/src/asm/program_prologue_win64.inc
+++ b/src/asm/program_prologue_win64.inc
@@ -45,8 +45,8 @@
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]
- movapd xmm13, xmmword ptr [minDbl]
- movapd xmm14, xmmword ptr [absMask]
- movapd xmm15, xmmword ptr [signMask]
+ movapd xmm13, xmmword ptr [mantissaMask]
+ movapd xmm14, xmmword ptr [exp240]
+ movapd xmm15, xmmword ptr [scaleMask]
jmp randomx_program_loop_begin
\ No newline at end of file
diff --git a/src/asm/program_xmm_constants.inc b/src/asm/program_xmm_constants.inc
index 79d05a40..5c2600b6 100644
--- a/src/asm/program_xmm_constants.inc
+++ b/src/asm/program_xmm_constants.inc
@@ -1,6 +1,6 @@
-minDbl:
- db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
-absMask:
- db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
-signMask:
+mantissaMask:
+ db 255, 255, 255, 255, 255, 255, 15, 0, 255, 255, 255, 255, 255, 255, 15, 0
+exp240:
+ db 0, 0, 0, 0, 0, 0, 240, 48, 0, 0, 0, 0, 0, 0, 240, 48
+scaleMask:
db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129
\ No newline at end of file
diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm
index d7d6f871..37392cdb 100644
--- a/src/executeProgram-win64.asm
+++ b/src/executeProgram-win64.asm
@@ -52,9 +52,9 @@ executeProgram PROC
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
- ; xmm13 -> DBL_MIN
- ; xmm14 -> absolute value mask
- ; xmm15 -> sign mask
+ ; xmm13 -> mantissa mask = 0x000fffffffffffff000fffffffffffff
+ ; xmm14 -> exponent 2**-240 = 0x30f000000000000030f0000000000000
+ ; xmm15 -> scale mask = 0x81f000000000000081f0000000000000
; store callee-saved registers
push rbx
@@ -103,18 +103,18 @@ executeProgram PROC
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]
- movapd xmm13, xmmword ptr [minDbl]
- movapd xmm14, xmmword ptr [absMask]
- movapd xmm15, xmmword ptr [signMask]
+ movapd xmm13, xmmword ptr [mantissaMask]
+ movapd xmm14, xmmword ptr [exp240]
+ movapd xmm15, xmmword ptr [scaleMask]
jmp program_begin
ALIGN 64
-minDbl:
- db 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 16, 0
-absMask:
- db 255, 255, 255, 255, 255, 255, 255, 127, 255, 255, 255, 255, 255, 255, 255, 127
-signMask:
+mantissaMask:
+ db 255, 255, 255, 255, 255, 255, 15, 0, 255, 255, 255, 255, 255, 255, 15, 0
+exp240:
+ db 0, 0, 0, 0, 0, 0, 240, 48, 0, 0, 0, 0, 0, 0, 240, 48
+scaleMask:
db 0, 0, 0, 0, 0, 0, 240, 129, 0, 0, 0, 0, 0, 0, 240, 129
ALIGN 64
@@ -145,10 +145,14 @@ program_begin:
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
- andps xmm4, xmm14
- andps xmm5, xmm14
- andps xmm6, xmm14
- andps xmm7, xmm14
+ andps xmm4, xmm13
+ andps xmm5, xmm13
+ andps xmm6, xmm13
+ andps xmm7, xmm13
+ orps xmm4, xmm14
+ orps xmm5, xmm14
+ orps xmm6, xmm14
+ orps xmm7, xmm14
;# 256 instructions
include program.inc
@@ -181,10 +185,10 @@ IF 1
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
pop rcx
- mulpd xmm0, xmm4
- mulpd xmm1, xmm5
- mulpd xmm2, xmm6
- mulpd xmm3, xmm7
+ xorpd xmm0, xmm4
+ xorpd xmm1, xmm5
+ xorpd xmm2, xmm6
+ xorpd xmm3, xmm7
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
diff --git a/src/main.cpp b/src/main.cpp
index 0b6a0fa9..ac63dce8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -341,7 +341,7 @@ int main(int argc, char** argv) {
std::cout << "Calculated result: ";
result.print(std::cout);
if(programCount == 1000)
- std::cout << "Reference result: d3ae5a9365196ed48bb98ebfc3316498e29443ea7f056ecbd272f749c6af7730" << std::endl;
+ std::cout << "Reference result: e1b4144293ff9ab5aa4c98f2389bb18950d8c3fd874891ac64628e028a286006" << std::endl;
if (!miningMode) {
std::cout << "Performance: " << 1000 * elapsed / programCount << " ms per hash" << std::endl;
}