diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..31540db
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*~
+*.pyc
+rcracki.precalc.*
+*.rt
diff --git a/CL/crackalack.cl b/CL/crackalack.cl
new file mode 100644
index 0000000..e66d5ad
--- /dev/null
+++ b/CL/crackalack.cl
@@ -0,0 +1,51 @@
+#include "rt.cl"
+#include "string.cl"
+
+/*
+ * Generic chain kernel. Each work-item reads the start index stored in its
+ * own g_indices slot, hands it to generate_rainbow_chain() together with the
+ * table parameters, and overwrites that slot with the returned value. The
+ * scalar arguments are global pointers that are dereferenced once each;
+ * g_charset is copied into a private array with g_strncpy().
+ */
+__kernel void crackalack(
+    __global unsigned int *g_hash_type, __global char *g_charset,
+    __global unsigned int *g_plaintext_len_min, __global unsigned int *g_plaintext_len_max,
+    __global unsigned int *g_reduction_offset, __global unsigned int *g_chain_len,
+    __global unsigned long *g_indices, __global unsigned int *g_pos_start) {
+
+    const size_t gid = get_global_id(0);
+    unsigned long chain_start = g_indices[gid];
+
+    /* Scalar parameters, each read once from global memory. */
+    unsigned int type = *g_hash_type;
+    unsigned int min_len = *g_plaintext_len_min;
+    unsigned int max_len = *g_plaintext_len_max;
+    unsigned int table_offset = *g_reduction_offset;
+    unsigned int links = *g_chain_len;
+    unsigned int first_pos = *g_pos_start;
+
+    /* Private copy of the charset; g_strncpy()'s return value serves as its length. */
+    char charset[MAX_CHARSET_LEN];
+    unsigned int charset_len = g_strncpy(charset, g_charset, sizeof(charset));
+
+    unsigned long space_up_to_index[MAX_PLAINTEXT_LEN];
+    unsigned long space_total = fill_plaintext_space_table(charset_len, min_len, max_len, space_up_to_index);
+
+    /* Scratch buffers handed to generate_rainbow_chain(). */
+    unsigned char plaintext[MAX_PLAINTEXT_LEN];
+    unsigned int plaintext_len = 0;
+    unsigned char hash[MAX_HASH_OUTPUT_LEN];
+    unsigned int hash_len;
+
+    /* Compute the chain and store the result over this work-item's start
+     * index in g_indices. */
+    g_indices[gid] = generate_rainbow_chain(
+        type, charset, charset_len,
+        min_len, max_len,
+        table_offset, links,
+        chain_start, first_pos,
+        space_up_to_index, space_total,
+        plaintext, &plaintext_len,
+        hash, &hash_len);
+}
diff --git a/CL/crackalack_ntlm8.cl b/CL/crackalack_ntlm8.cl
new file mode 100644
index 0000000..11ae2fb
--- /dev/null
+++ b/CL/crackalack_ntlm8.cl
@@ -0,0 +1,25 @@
+#include "ntlm8_functions.cl"
+
+/* TODO: specify array length in definition...somehow? */
+/*
+ * NTLM 8-character kernel: only g_indices is used, the other arguments are
+ * ignored. Runs 421999 index -> plaintext -> hash -> index steps on this
+ * work-item's slot and stores the final index back into it.
+ * NOTE(review): `charset` is not declared here; presumably it comes from
+ * ntlm8_functions.cl -- confirm.
+ */
+__kernel void crackalack_ntlm8(
+    __global unsigned int *unused1, __global char *unused2,
+    __global unsigned int *unused3, __global unsigned int *unused4,
+    __global unsigned int *unused5, __global unsigned int *unused6,
+    __global unsigned long *g_indices, __global unsigned int *unused7) {
+    const size_t gid = get_global_id(0);
+    unsigned long index = g_indices[gid];
+    unsigned char plaintext[8];
+
+    for (unsigned int pos = 0; pos < 421999; pos++) {
+        index_to_plaintext_ntlm8(index, charset, plaintext);
+        index = hash_to_index_ntlm8(hash_ntlm8(plaintext), pos);
+    }
+    g_indices[gid] = index;
+}
diff --git a/CL/crackalack_ntlm9.cl b/CL/crackalack_ntlm9.cl
new file mode 100644
index 0000000..a3f6a69
--- /dev/null
+++ b/CL/crackalack_ntlm9.cl
@@ -0,0 +1,25 @@
+#include "ntlm9_functions.cl"
+
+
+/* TODO: specify array length in definition...somehow?
*/
+/*
+ * NTLM 9-character kernel: only g_indices is used, the other arguments are
+ * ignored. Runs 1349999 index -> plaintext -> hash -> index steps on this
+ * work-item's slot and stores the result back. NOTE(review): `charset` is not
+ * declared here; presumably it comes from ntlm9_functions.cl -- confirm.
+ */
+__kernel void crackalack_ntlm9(
+    __global unsigned int *unused1, __global char *unused2,
+    __global unsigned int *unused3, __global unsigned int *unused4,
+    __global unsigned int *unused5, __global unsigned int *unused6,
+    __global unsigned long *g_indices, __global unsigned int *unused7) {
+    const size_t gid = get_global_id(0);
+    unsigned long index = g_indices[gid];
+    unsigned char plaintext[9];
+
+    for (unsigned int pos = 0; pos < 1349999; pos++) {
+        index_to_plaintext_ntlm9(index, charset, plaintext);
+        index = hash_to_index_ntlm9(hash_ntlm9(plaintext), pos);
+    }
+    g_indices[gid] = index;
+}
diff --git a/CL/des.cl b/CL/des.cl
new file mode 100644
index 0000000..3db5a77
--- /dev/null
+++ b/CL/des.cl
@@ -0,0 +1,399 @@
+/*
+ * This software is Copyright (c) 2018 magnum,
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * This file is BASED on code from mbed TLS (https://tls.mbed.org):
+ *
+ * FIPS-46-3 compliant Triple-DES implementation
+ *
+ * Copyright (C) 2006-2015, ARM Limited, All Rights Reserved
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef OPENCL_DES_H +#define OPENCL_DES_H + + +#define __const_a8 __constant +#define DES_KEY_SIZE 8 + +typedef uint uint32_t; + +/* +typedef struct { + uint32_t sk[32]; +} des_context; +*/ + +/* + * 32-bit integer manipulation macros (big endian) + */ +#ifndef GET_UINT32_BE +#define GET_UINT32_BE(n,b,i) \ + do { \ + (n) = ((uint32_t) (b)[(i) ] << 24) \ + | ((uint32_t) (b)[(i) + 1] << 16) \ + | ((uint32_t) (b)[(i) + 2] << 8) \ + | ((uint32_t) (b)[(i) + 3] ); \ + } while (0) +#endif + +#ifndef PUT_UINT32_BE +#define PUT_UINT32_BE(n,b,i) \ + do { \ + (b)[(i) ] = (uchar) ((n) >> 24); \ + (b)[(i) + 1] = (uchar) ((n) >> 16); \ + (b)[(i) + 2] = (uchar) ((n) >> 8); \ + (b)[(i) + 3] = (uchar) ((n) ); \ + } while (0) +#endif + +/* + * Expanded DES S-boxes + */ +__const_a8 uint32_t SB1[64] = { + 0x01010400, 0x00000000, 0x00010000, 0x01010404, + 0x01010004, 0x00010404, 0x00000004, 0x00010000, + 0x00000400, 0x01010400, 0x01010404, 0x00000400, + 0x01000404, 0x01010004, 0x01000000, 0x00000004, + 0x00000404, 0x01000400, 0x01000400, 0x00010400, + 0x00010400, 0x01010000, 0x01010000, 0x01000404, + 0x00010004, 0x01000004, 0x01000004, 0x00010004, + 0x00000000, 0x00000404, 0x00010404, 0x01000000, + 0x00010000, 0x01010404, 0x00000004, 0x01010000, + 0x01010400, 0x01000000, 0x01000000, 0x00000400, + 0x01010004, 0x00010000, 0x00010400, 0x01000004, + 0x00000400, 0x00000004, 0x01000404, 0x00010404, + 0x01010404, 0x00010004, 0x01010000, 0x01000404, + 0x01000004, 0x00000404, 0x00010404, 0x01010400, + 0x00000404, 0x01000400, 0x01000400, 0x00000000, + 0x00010004, 0x00010400, 0x00000000, 0x01010004 +}; + +__const_a8 uint32_t SB2[64] = { + 0x80108020, 0x80008000, 0x00008000, 0x00108020, + 0x00100000, 0x00000020, 0x80100020, 0x80008020, + 0x80000020, 0x80108020, 0x80108000, 0x80000000, + 0x80008000, 0x00100000, 0x00000020, 0x80100020, + 0x00108000, 0x00100020, 0x80008020, 0x00000000, + 0x80000000, 0x00008000, 0x00108020, 0x80100000, + 0x00100020, 0x80000020, 0x00000000, 0x00108000, + 
0x00008020, 0x80108000, 0x80100000, 0x00008020, + 0x00000000, 0x00108020, 0x80100020, 0x00100000, + 0x80008020, 0x80100000, 0x80108000, 0x00008000, + 0x80100000, 0x80008000, 0x00000020, 0x80108020, + 0x00108020, 0x00000020, 0x00008000, 0x80000000, + 0x00008020, 0x80108000, 0x00100000, 0x80000020, + 0x00100020, 0x80008020, 0x80000020, 0x00100020, + 0x00108000, 0x00000000, 0x80008000, 0x00008020, + 0x80000000, 0x80100020, 0x80108020, 0x00108000 +}; + +__const_a8 uint32_t SB3[64] = { + 0x00000208, 0x08020200, 0x00000000, 0x08020008, + 0x08000200, 0x00000000, 0x00020208, 0x08000200, + 0x00020008, 0x08000008, 0x08000008, 0x00020000, + 0x08020208, 0x00020008, 0x08020000, 0x00000208, + 0x08000000, 0x00000008, 0x08020200, 0x00000200, + 0x00020200, 0x08020000, 0x08020008, 0x00020208, + 0x08000208, 0x00020200, 0x00020000, 0x08000208, + 0x00000008, 0x08020208, 0x00000200, 0x08000000, + 0x08020200, 0x08000000, 0x00020008, 0x00000208, + 0x00020000, 0x08020200, 0x08000200, 0x00000000, + 0x00000200, 0x00020008, 0x08020208, 0x08000200, + 0x08000008, 0x00000200, 0x00000000, 0x08020008, + 0x08000208, 0x00020000, 0x08000000, 0x08020208, + 0x00000008, 0x00020208, 0x00020200, 0x08000008, + 0x08020000, 0x08000208, 0x00000208, 0x08020000, + 0x00020208, 0x00000008, 0x08020008, 0x00020200 +}; + +__const_a8 uint32_t SB4[64] = { + 0x00802001, 0x00002081, 0x00002081, 0x00000080, + 0x00802080, 0x00800081, 0x00800001, 0x00002001, + 0x00000000, 0x00802000, 0x00802000, 0x00802081, + 0x00000081, 0x00000000, 0x00800080, 0x00800001, + 0x00000001, 0x00002000, 0x00800000, 0x00802001, + 0x00000080, 0x00800000, 0x00002001, 0x00002080, + 0x00800081, 0x00000001, 0x00002080, 0x00800080, + 0x00002000, 0x00802080, 0x00802081, 0x00000081, + 0x00800080, 0x00800001, 0x00802000, 0x00802081, + 0x00000081, 0x00000000, 0x00000000, 0x00802000, + 0x00002080, 0x00800080, 0x00800081, 0x00000001, + 0x00802001, 0x00002081, 0x00002081, 0x00000080, + 0x00802081, 0x00000081, 0x00000001, 0x00002000, + 0x00800001, 0x00002001, 
0x00802080, 0x00800081, + 0x00002001, 0x00002080, 0x00800000, 0x00802001, + 0x00000080, 0x00800000, 0x00002000, 0x00802080 +}; + +__const_a8 uint32_t SB5[64] = { + 0x00000100, 0x02080100, 0x02080000, 0x42000100, + 0x00080000, 0x00000100, 0x40000000, 0x02080000, + 0x40080100, 0x00080000, 0x02000100, 0x40080100, + 0x42000100, 0x42080000, 0x00080100, 0x40000000, + 0x02000000, 0x40080000, 0x40080000, 0x00000000, + 0x40000100, 0x42080100, 0x42080100, 0x02000100, + 0x42080000, 0x40000100, 0x00000000, 0x42000000, + 0x02080100, 0x02000000, 0x42000000, 0x00080100, + 0x00080000, 0x42000100, 0x00000100, 0x02000000, + 0x40000000, 0x02080000, 0x42000100, 0x40080100, + 0x02000100, 0x40000000, 0x42080000, 0x02080100, + 0x40080100, 0x00000100, 0x02000000, 0x42080000, + 0x42080100, 0x00080100, 0x42000000, 0x42080100, + 0x02080000, 0x00000000, 0x40080000, 0x42000000, + 0x00080100, 0x02000100, 0x40000100, 0x00080000, + 0x00000000, 0x40080000, 0x02080100, 0x40000100 +}; + +__const_a8 uint32_t SB6[64] = { + 0x20000010, 0x20400000, 0x00004000, 0x20404010, + 0x20400000, 0x00000010, 0x20404010, 0x00400000, + 0x20004000, 0x00404010, 0x00400000, 0x20000010, + 0x00400010, 0x20004000, 0x20000000, 0x00004010, + 0x00000000, 0x00400010, 0x20004010, 0x00004000, + 0x00404000, 0x20004010, 0x00000010, 0x20400010, + 0x20400010, 0x00000000, 0x00404010, 0x20404000, + 0x00004010, 0x00404000, 0x20404000, 0x20000000, + 0x20004000, 0x00000010, 0x20400010, 0x00404000, + 0x20404010, 0x00400000, 0x00004010, 0x20000010, + 0x00400000, 0x20004000, 0x20000000, 0x00004010, + 0x20000010, 0x20404010, 0x00404000, 0x20400000, + 0x00404010, 0x20404000, 0x00000000, 0x20400010, + 0x00000010, 0x00004000, 0x20400000, 0x00404010, + 0x00004000, 0x00400010, 0x20004010, 0x00000000, + 0x20404000, 0x20000000, 0x00400010, 0x20004010 +}; + +__const_a8 uint32_t SB7[64] = { + 0x00200000, 0x04200002, 0x04000802, 0x00000000, + 0x00000800, 0x04000802, 0x00200802, 0x04200800, + 0x04200802, 0x00200000, 0x00000000, 0x04000002, + 
0x00000002, 0x04000000, 0x04200002, 0x00000802, + 0x04000800, 0x00200802, 0x00200002, 0x04000800, + 0x04000002, 0x04200000, 0x04200800, 0x00200002, + 0x04200000, 0x00000800, 0x00000802, 0x04200802, + 0x00200800, 0x00000002, 0x04000000, 0x00200800, + 0x04000000, 0x00200800, 0x00200000, 0x04000802, + 0x04000802, 0x04200002, 0x04200002, 0x00000002, + 0x00200002, 0x04000000, 0x04000800, 0x00200000, + 0x04200800, 0x00000802, 0x00200802, 0x04200800, + 0x00000802, 0x04000002, 0x04200802, 0x04200000, + 0x00200800, 0x00000000, 0x00000002, 0x04200802, + 0x00000000, 0x00200802, 0x04200000, 0x00000800, + 0x04000002, 0x04000800, 0x00000800, 0x00200002 +}; + +__const_a8 uint32_t SB8[64] = { + 0x10001040, 0x00001000, 0x00040000, 0x10041040, + 0x10000000, 0x10001040, 0x00000040, 0x10000000, + 0x00040040, 0x10040000, 0x10041040, 0x00041000, + 0x10041000, 0x00041040, 0x00001000, 0x00000040, + 0x10040000, 0x10000040, 0x10001000, 0x00001040, + 0x00041000, 0x00040040, 0x10040040, 0x10041000, + 0x00001040, 0x00000000, 0x00000000, 0x10040040, + 0x10000040, 0x10001000, 0x00041040, 0x00040000, + 0x00041040, 0x00040000, 0x10041000, 0x00001000, + 0x00000040, 0x10040040, 0x00001000, 0x00041040, + 0x10001000, 0x00000040, 0x10000040, 0x10040000, + 0x10040040, 0x10000000, 0x00040000, 0x10001040, + 0x00000000, 0x10041040, 0x00040040, 0x10000040, + 0x10040000, 0x10001000, 0x10001040, 0x00000000, + 0x10041040, 0x00041000, 0x00041000, 0x00001040, + 0x00001040, 0x00040040, 0x10000000, 0x10041000 +}; + +/* + * PC1: left and right halves bit-swap + */ +__const_a8 uint32_t LHs[16] = { + 0x00000000, 0x00000001, 0x00000100, 0x00000101, + 0x00010000, 0x00010001, 0x00010100, 0x00010101, + 0x01000000, 0x01000001, 0x01000100, 0x01000101, + 0x01010000, 0x01010001, 0x01010100, 0x01010101 +}; + +__const_a8 uint32_t RHs[16] = { + 0x00000000, 0x01000000, 0x00010000, 0x01010000, + 0x00000100, 0x01000100, 0x00010100, 0x01010100, + 0x00000001, 0x01000001, 0x00010001, 0x01010001, + 0x00000101, 0x01000101, 0x00010101, 
0x01010101,
+};
+
+/*
+ * Initial permutation of the halves X and Y; assigns to a variable T that must exist in the caller's scope.
+ */
+#define DES_IP(X,Y) \
+    do { \
+        T = ((X >> 4) ^ Y) & 0x0F0F0F0F; Y ^= T; X ^= (T << 4); \
+        T = ((X >> 16) ^ Y) & 0x0000FFFF; Y ^= T; X ^= (T << 16); \
+        T = ((Y >> 2) ^ X) & 0x33333333; X ^= T; Y ^= (T << 2); \
+        T = ((Y >> 8) ^ X) & 0x00FF00FF; X ^= T; Y ^= (T << 8); \
+        Y = ((Y << 1) | (Y >> 31)) & 0xFFFFFFFF; \
+        T = (X ^ Y) & 0xAAAAAAAA; Y ^= T; X ^= T; \
+        X = ((X << 1) | (X >> 31)) & 0xFFFFFFFF; \
+    } while (0)
+
+/*
+ * Final permutation: the DES_IP steps in reverse order, with right rotations in place of left ones. Also uses T.
+ */
+#define DES_FP(X,Y) \
+    do { \
+        X = ((X << 31) | (X >> 1)) & 0xFFFFFFFF; \
+        T = (X ^ Y) & 0xAAAAAAAA; X ^= T; Y ^= T; \
+        Y = ((Y << 31) | (Y >> 1)) & 0xFFFFFFFF; \
+        T = ((Y >> 8) ^ X) & 0x00FF00FF; X ^= T; Y ^= (T << 8); \
+        T = ((Y >> 2) ^ X) & 0x33333333; X ^= T; Y ^= (T << 2); \
+        T = ((X >> 16) ^ Y) & 0x0000FFFF; Y ^= T; X ^= (T << 16); \
+        T = ((X >> 4) ^ Y) & 0x0F0F0F0F; Y ^= T; X ^= (T << 4); \
+    } while (0)
+
+/*
+ * One DES round: XORs eight S-box lookups, indexed by X mixed with the next two words at SK, into Y. Advances SK by two; uses T.
+ */
+#define DES_ROUND(X,Y) \
+    do { \
+        T = *SK++ ^ X; \
+        Y ^= SB8[ (T      ) & 0x3F ] ^ \
+             SB6[ (T >>  8) & 0x3F ] ^ \
+             SB4[ (T >> 16) & 0x3F ] ^ \
+             SB2[ (T >> 24) & 0x3F ]; \
+        \
+        T = *SK++ ^ ((X << 28) | (X >> 4)); \
+        Y ^= SB7[ (T      ) & 0x3F ] ^ \
+             SB5[ (T >>  8) & 0x3F ] ^ \
+             SB3[ (T >> 16) & 0x3F ] ^ \
+             SB1[ (T >> 24) & 0x3F ]; \
+    } while (0)
+
+#define SWAP(a,b) do { uint32_t t = a; a = b; b = t; } while (0)
+
+/*
+ * Expands the 8-byte DES key into 32 subkey words, two per round for 16
+ * rounds, stored consecutively through SK.
+ */
+inline void des_setkey(uint32_t SK[32], const uchar key[DES_KEY_SIZE])
+{
+    uint32_t X, Y, T;
+
+    GET_UINT32_BE(X, key, 0);
+    GET_UINT32_BE(Y, key, 4);
+
+    /*
+     * Permuted Choice 1
+     */
+    T = ((Y >> 4) ^ X) & 0x0F0F0F0F;
+    X ^= T;
+    Y ^= (T << 4);
+    T = ((Y) ^ X) & 0x10101010;
+    X ^= T;
+    Y ^= (T);
+
+    X = (LHs[(X) & 0xF] << 3) | (LHs[(X >> 8) & 0xF] << 2)
+        | (LHs[(X >> 16) & 0xF] << 1) | (LHs[(X >> 24) & 0xF])
+        | (LHs[(X >> 5) & 0xF] << 7) | (LHs[(X >> 13) & 0xF] << 6)
+        | (LHs[(X >> 21) & 0xF] << 5) | (LHs[(X >> 29) & 0xF] << 4);
+
+    Y = (RHs[(Y >> 1) & 0xF] << 3) | (RHs[(Y >> 9) & 0xF] << 2)
+        | (RHs[(Y >> 17) & 0xF] << 1) | (RHs[(Y >> 25) & 0xF])
+        | (RHs[(Y >> 4) & 0xF] << 7) | (RHs[(Y >> 12) & 0xF] << 6)
+        | (RHs[(Y >> 20) & 0xF] << 5) | (RHs[(Y >> 28) & 0xF] << 4);
+
+    X &= 0x0FFFFFFF;
+    Y &= 0x0FFFFFFF;
+
+    /*
+     * Subkeys: rotate both 28-bit halves left (1 bit in rounds 0, 1, 8 and 15; 2 bits otherwise), then select bits.
+     */
+    for (int n = 0; n < 16; n++) {
+        const int rot = (n < 2 || n == 8 || n == 15) ? 1 : 2;
+
+        X = ((X << rot) | (X >> (28 - rot))) & 0x0FFFFFFF;
+        Y = ((Y << rot) | (Y >> (28 - rot))) & 0x0FFFFFFF;
+
+        *SK++ = ((X << 4) & 0x24000000) | ((X << 28) & 0x10000000)
+            | ((X << 14) & 0x08000000) | ((X << 18) & 0x02080000)
+            | ((X << 6) & 0x01000000) | ((X << 9) & 0x00200000)
+            | ((X >> 1) & 0x00100000) | ((X << 10) & 0x00040000)
+            | ((X << 2) & 0x00020000) | ((X >> 10) & 0x00010000)
+            | ((Y >> 13) & 0x00002000) | ((Y >> 4) & 0x00001000)
+            | ((Y << 6) & 0x00000800) | ((Y >> 1) & 0x00000400)
+            | ((Y >> 14) & 0x00000200) | ((Y) & 0x00000100)
+            | ((Y >> 5) & 0x00000020) | ((Y >> 10) & 0x00000010)
+            | ((Y >> 3) & 0x00000008) | ((Y >> 18) & 0x00000004)
+            | ((Y >> 26) & 0x00000002) | ((Y >> 24) & 0x00000001);
+
+        *SK++ = ((X << 15) & 0x20000000) | ((X << 17) & 0x10000000)
+            | ((X << 10) & 0x08000000) | ((X << 22) & 0x04000000)
+            | ((X >> 2) & 0x02000000) | ((X << 1) & 0x01000000)
+            | ((X << 16) & 0x00200000) | ((X << 11) & 0x00100000)
+            | ((X << 3) & 0x00080000) | ((X >> 6) & 0x00040000)
+            | ((X << 15) & 0x00020000) | ((X >> 4) & 0x00010000)
+            | ((Y >> 2) & 0x00002000) | ((Y << 8) & 0x00001000)
+            | ((Y >> 14) & 0x00000808) | ((Y >> 9) & 0x00000400)
+            | ((Y) & 0x00000200) | ((Y << 7) & 0x00000100)
+            | ((Y >> 7) & 0x00000020) | ((Y >> 3) & 0x00000011)
+            | ((Y << 2) & 0x00000004) | ((Y >> 21) & 0x00000002);
+    }
+}
+
+/*
+ * Expands a 7-byte key to the 8 bytes taken by des_setkey(): byte 0 is copied,
+ * byte i (1..6) is (_key[i-1] << (8 - i)) | (_key[i] >> i), byte 7 is _key[6] << 1.
+ */
+inline void des_setkey_56(uint32_t SK[32], unsigned char _key[DES_KEY_SIZE - 1]) {
+    uchar key[DES_KEY_SIZE];
+
+    key[0] = _key[0];
+    for (int i = 1; i < DES_KEY_SIZE - 1; i++) {
+        key[i] = (_key[i - 1] << (8 - i)) | (_key[i] >> i);
+    }
+    key[DES_KEY_SIZE - 1] = (_key[DES_KEY_SIZE - 2] << 1);
+    des_setkey(SK, key);
+}
+
+/*
+ * Encrypts the fixed 8-byte block "KGS!@#$%" with DES, using the 7 bytes at
+ * `plaintext` as the key.
+ *
+ * SK       receives the key schedule produced by des_setkey_56().
+ * output   receives the 8 result bytes.
+ *
+ * X and Y start at the state DES_IP yields for that block (bytes 4b 47 53 21
+ * 40 23 24 25 loaded big-endian), so the initial permutation is not run here.
+ */
+inline void des_encrypt(uint32_t SK[32], unsigned char *plaintext, unsigned char *output /*, __global unsigned int *g_debug*/) {
+    des_setkey_56(SK, plaintext);
+
+    uint32_t X = 0x2e09855e;
+    uint32_t Y = 0x01d0024e;
+    uint32_t T; /* scratch variable assigned by the DES_* macros */
+
+    /* Sixteen rounds, two per iteration; each DES_ROUND advances SK by two words. */
+    for (int pair = 0; pair < 8; pair++) {
+        DES_ROUND(Y, X);
+        DES_ROUND(X, Y);
+    }
+
+    /* Final permutation, then store the halves big-endian. */
+    DES_FP(Y, X);
+
+    PUT_UINT32_BE(Y, output, 0);
+    PUT_UINT32_BE(X, output, 4);
+}
+
+#endif /* OPENCL_DES_H */
diff --git a/CL/des_bs.cl b/CL/des_bs.cl
new file mode 100644
index 0000000..140927d
--- /dev/null
+++ b/CL/des_bs.cl
@@ -0,0 +1,705 @@
+#ifndef _DES_CL
+#define _DES_CL
+
+#define __GPU__
+
+/* From JohnTheRipper, lm_kernel_b.cl */
+/* DEVICE_INFO obtained from opencl_common.c:1892 */
+#define SM_MAJOR 5 /* Need this from host. */
+#define SM_MINOR 0 /* Need this from host. */
+#define DEVICE_INFO 262162 /* Need this from host. */
+#define SIZEOF_SIZE_T 8 /* Need this from host. */
+#define DEV_VER_MAJOR 390 /* Need this from host. */
+#define DEV_VER_MINOR 77 /* Need this from host. */
+#define OFFSET_TABLE_SIZE 10
+#define HASH_TABLE_SIZE 3
+#define SELECT_CMP_STEPS 2
+#define BITMAP_SIZE_BITS_LESS_ONE 262143
+#define REQ_BITMAP_BITS 18
+
+/*
+ * This software is Copyright (c) 2015 Sayantan Datta
+ * and it is hereby released to the general public under the following terms:
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ * Based on Solar Designer implementation of DES_bs_b.c in jtr-v1.7.9 + */ + +//#include "opencl_lm_finalize_keys.h" +//#include "opencl_mask.h" +#include "opencl_lm_kernel_params.h" + + +#define y(p, q) vxorf(B[p], lm_keys[q]) + + +#define H1()\ + s1(y(31, 0), y(0, 1), y(1, 2),\ + y(2, 3), y(3, 4), y(4, 5),\ + B, 40, 48, 54, 62);\ + s2(y(3, 6), y(4, 7), y(5, 8),\ + y(6, 9), y(7, 10), y(8, 11),\ + B, 44, 59, 33, 49);\ + s3(y(7, 12), y(8, 13), y(9, 14),\ + y(10, 15), y(11, 16), y(12, 17),\ + B, 55, 47, 61, 37);\ + s4(y(11, 18), y(12, 19), y(13, 20),\ + y(14, 21), y(15, 22), y(16, 23),\ + B, 57, 51, 41, 32);\ + s5(y(15, 24), y(16, 25), y(17, 26),\ + y(18, 27), y(19, 28), y(20, 29),\ + B, 39, 45, 56, 34);\ + s6(y(19, 30), y(20, 31), y(21, 32),\ + y(22, 33), y(23, 34), y(24, 35),\ + B, 35, 60, 42, 50);\ + s7(y(23, 36), y(24, 37), y(25, 38),\ + y(26, 39), y(27, 40), y(28, 41),\ + B, 63, 43, 53, 38);\ + s8(y(27, 42), y(28, 43), y(29, 44),\ + y(30, 45), y(31, 46), y(0, 47),\ + B, 36, 58, 46, 52); + +#define H2()\ + s1(y(63, 48), y(32, 49), y(33, 50),\ + y(34, 51), y(35, 52), y(36, 53),\ + B, 8, 16, 22, 30);\ + s2(y(35, 54), y(36, 55), y(37, 56),\ + y(38, 57), y(39, 58), y(40, 59),\ + B, 12, 27, 1, 17);\ + s3(y(39, 60), y(40, 61), y(41, 62),\ + y(42, 63), y(43, 64), y(44, 65),\ + B, 23, 15, 29, 5);\ + s4(y(43, 66), y(44, 67), y(45, 68),\ + y(46, 69), y(47, 70), y(48, 71),\ + B, 25, 19, 9, 0);\ + s5(y(47, 72), y(48, 73), y(49, 74),\ + y(50, 75), y(51, 76), y(52, 77),\ + B, 7, 13, 24, 2);\ + s6(y(51, 78), y(52, 79), y(53, 80),\ + y(54, 81), y(55, 82), y(56, 83),\ + B, 3, 28, 10, 18);\ + s7(y(55, 84), y(56, 85), y(57, 86),\ + y(58, 87), y(59, 88), y(60, 89),\ + B, 31, 11, 21, 6);\ + s8(y(59, 90), y(60, 91), y(61, 92),\ + y(62, 93), y(63, 94), y(32, 95),\ + B, 4, 26, 14, 20); + + +/* I'm using the code below, not above. 
*/ + +#define H1_k0()\ + s1(y(31, 15), y(0, 43), y(1, 26),\ + y(2, 51), y(3, 45), y(4, 9),\ + B, 40, 48, 54, 62);\ + s2(y(3, 27), y(4, 54), y(5, 6),\ + y(6, 0), y(7, 23), y(8, 35),\ + B, 44, 59, 33, 49);\ + s3(y(7, 5), y(8, 25), y(9, 17),\ + y(10, 18), y(11, 33), y(12, 53),\ + B, 55, 47, 61, 37);\ + s4(y(11, 52), y(12, 7), y(13, 24),\ + y(14, 16), y(15, 8), y(16, 36),\ + B, 57, 51, 41, 32);\ + s5(y(15, 20), y(16, 31), y(17, 37),\ + y(18, 40), y(19, 39), y(20, 4),\ + B, 39, 45, 56, 34);\ + s6(y(19, 46), y(20, 29), y(21, 3),\ + y(22, 41), y(23, 19), y(24, 30),\ + B, 35, 60, 42, 50);\ + s7(y(23, 50), y(24, 21), y(25, 38),\ + y(26, 48), y(27, 10), y(28, 22),\ + B, 63, 43, 53, 38);\ + s8(y(27, 32), y(28, 11), y(29, 12),\ + y(30, 49), y(31, 55), y(0, 28),\ + B, 36, 58, 46, 52); + +#define H2_k0()\ + s1(y(63, 6), y(32, 34), y(33, 17),\ + y(34, 42), y(35, 36), y(36, 0),\ + B, 8, 16, 22, 30);\ + s2(y(35, 18), y(36, 45), y(37, 52),\ + y(38, 7), y(39, 14), y(40, 26),\ + B, 12, 27, 1, 17);\ + s3(y(39, 51), y(40, 16), y(41, 8),\ + y(42, 9), y(43, 24), y(44, 44),\ + B, 23, 15, 29, 5);\ + s4(y(43, 43), y(44, 53), y(45, 54),\ + y(46, 23), y(47, 15), y(48, 27),\ + B, 25, 19, 9, 0);\ + s5(y(47, 11), y(48, 22), y(49, 28),\ + y(50, 47), y(51, 30), y(52, 48),\ + B, 7, 13, 24, 2);\ + s6(y(51, 37), y(52, 20), y(53, 31),\ + y(54, 32), y(55, 10), y(56, 21),\ + B, 3, 28, 10, 18);\ + s7(y(55, 41), y(56, 12), y(57, 29),\ + y(58, 55), y(59, 1), y(60, 13),\ + B, 31, 11, 21, 6);\ + s8(y(59, 39), y(60, 2), y(61, 3),\ + y(62, 40), y(63, 46), y(32, 19),\ + B, 4, 26, 14, 20); + +#define H1_k1()\ + s1(y(31, 43), y(0, 16), y(1, 15),\ + y(2, 24), y(3, 18), y(4, 53),\ + B, 40, 48, 54, 62);\ + s2(y(3, 0), y(4, 27), y(5, 34),\ + y(6, 44), y(7, 51), y(8, 8),\ + B, 44, 59, 33, 49);\ + s3(y(7, 33), y(8, 14), y(9, 6),\ + y(10, 7), y(11, 45), y(12, 26),\ + B, 55, 47, 61, 37);\ + s4(y(11, 25), y(12, 35), y(13, 36),\ + y(14, 5), y(15, 52), y(16, 9),\ + B, 57, 51, 41, 32);\ + s5(y(15, 50), y(16, 4), y(17, 10),\ + 
y(18, 29), y(19, 12), y(20, 46),\ + B, 39, 45, 56, 34);\ + s6(y(19, 19), y(20, 2), y(21, 13),\ + y(22, 30), y(23, 49), y(24, 3),\ + B, 35, 60, 42, 50);\ + s7(y(23, 39), y(24, 31), y(25, 11),\ + y(26, 37), y(27, 40), y(28, 48),\ + B, 63, 43, 53, 38);\ + s8(y(27, 21), y(28, 41), y(29, 22),\ + y(30, 38), y(31, 28), y(0, 1),\ + B, 36, 58, 46, 52); + +#define H2_k1()\ + s1(y(63, 25), y(32, 14), y(33, 52),\ + y(34, 45), y(35, 0), y(36, 35),\ + B, 8, 16, 22, 30);\ + s2(y(35, 53), y(36, 9), y(37, 16),\ + y(38, 26), y(39, 33), y(40, 6),\ + B, 12, 27, 1, 17);\ + s3(y(39, 54), y(40, 51), y(41, 43),\ + y(42, 44), y(43, 27), y(44, 8),\ + B, 23, 15, 29, 5);\ + s4(y(43, 23), y(44, 17), y(45, 18),\ + y(46, 42), y(47, 34), y(48, 7),\ + B, 25, 19, 9, 0);\ + s5(y(47, 32), y(48, 55), y(49, 49),\ + y(50, 11), y(51, 31), y(52, 28),\ + B, 7, 13, 24, 2);\ + s6(y(51, 1), y(52, 41), y(53, 48),\ + y(54, 12), y(55, 47), y(56, 22),\ + B, 3, 28, 10, 18);\ + s7(y(55, 21), y(56, 13), y(57, 50),\ + y(58, 19), y(59, 38), y(60, 46),\ + B, 31, 11, 21, 6);\ + s8(y(59, 3), y(60, 39), y(61, 4),\ + y(62, 20), y(63, 10), y(32, 40),\ + B, 4, 26, 14, 20); + +#define H1_k2()\ + s1(y(31, 23), y(0, 51), y(1, 34),\ + y(2, 27), y(3, 53), y(4, 17),\ + B, 40, 48, 54, 62);\ + s2(y(3, 35), y(4, 7), y(5, 14),\ + y(6, 8), y(7, 54), y(8, 43),\ + B, 44, 59, 33, 49);\ + s3(y(7, 36), y(8, 33), y(9, 25),\ + y(10, 26), y(11, 9), y(12, 6),\ + B, 55, 47, 61, 37);\ + s4(y(11, 5), y(12, 15), y(13, 0),\ + y(14, 24), y(15, 16), y(16, 44),\ + B, 57, 51, 41, 32);\ + s5(y(15, 30), y(16, 37), y(17, 47),\ + y(18, 50), y(19, 13), y(20, 10),\ + B, 39, 45, 56, 34);\ + s6(y(19, 40), y(20, 39), y(21, 46),\ + y(22, 31), y(23, 29), y(24, 4),\ + B, 35, 60, 42, 50);\ + s7(y(23, 3), y(24, 48), y(25, 32),\ + y(26, 1), y(27, 20), y(28, 28),\ + B, 63, 43, 53, 38);\ + s8(y(27, 22), y(28, 21), y(29, 55),\ + y(30, 2), y(31, 49), y(0, 38),\ + B, 36, 58, 46, 52); + +#define H2_k2()\ + s1(y(63, 5), y(32, 33), y(33, 16),\ + y(34, 9), y(35, 35), y(36, 
15),\ + B, 8, 16, 22, 30);\ + s2(y(35, 17), y(36, 44), y(37, 51),\ + y(38, 6), y(39, 36), y(40, 25),\ + B, 12, 27, 1, 17);\ + s3(y(39, 18), y(40, 54), y(41, 23),\ + y(42, 8), y(43, 7), y(44, 43),\ + B, 23, 15, 29, 5);\ + s4(y(43, 42), y(44, 52), y(45, 53),\ + y(46, 45), y(47, 14), y(48, 26),\ + B, 25, 19, 9, 0);\ + s5(y(47, 12), y(48, 19), y(49, 29),\ + y(50, 32), y(51, 48), y(52, 49),\ + B, 7, 13, 24, 2);\ + s6(y(51, 38), y(52, 21), y(53, 28),\ + y(54, 13), y(55, 11), y(56, 55),\ + B, 3, 28, 10, 18);\ + s7(y(55, 22), y(56, 46), y(57, 30),\ + y(58, 40), y(59, 2), y(60, 10),\ + B, 31, 11, 21, 6);\ + s8(y(59, 4), y(60, 3), y(61, 37),\ + y(62, 41), y(63, 47), y(32, 20),\ + B, 4, 26, 14, 20); + +#define H1_k3()\ + s1(y(31, 42), y(0, 54), y(1, 14),\ + y(2, 7), y(3, 17), y(4, 52),\ + B, 40, 48, 54, 62);\ + s2(y(3, 15), y(4, 26), y(5, 33),\ + y(6, 43), y(7, 18), y(8, 23),\ + B, 44, 59, 33, 49);\ + s3(y(7, 0), y(8, 36), y(9, 5),\ + y(10, 6), y(11, 44), y(12, 25),\ + B, 55, 47, 61, 37);\ + s4(y(11, 24), y(12, 34), y(13, 35),\ + y(14, 27), y(15, 51), y(16, 8),\ + B, 57, 51, 41, 32);\ + s5(y(15, 31), y(16, 1), y(17, 11),\ + y(18, 30), y(19, 46), y(20, 47),\ + B, 39, 45, 56, 34);\ + s6(y(19, 20), y(20, 3), y(21, 10),\ + y(22, 48), y(23, 50), y(24, 37),\ + B, 35, 60, 42, 50);\ + s7(y(23, 4), y(24, 28), y(25, 12),\ + y(26, 38), y(27, 41), y(28, 49),\ + B, 63, 43, 53, 38);\ + s8(y(27, 55), y(28, 22), y(29, 19),\ + y(30, 39), y(31, 29), y(0, 2),\ + B, 36, 58, 46, 52); + +#define H2_k3()\ + s1(y(63, 24), y(32, 36), y(33, 51),\ + y(34, 44), y(35, 15), y(36, 34),\ + B, 8, 16, 22, 30);\ + s2(y(35, 52), y(36, 8), y(37, 54),\ + y(38, 25), y(39, 0), y(40, 5),\ + B, 12, 27, 1, 17);\ + s3(y(39, 53), y(40, 18), y(41, 42),\ + y(42, 43), y(43, 26), y(44, 23),\ + B, 23, 15, 29, 5);\ + s4(y(43, 45), y(44, 16), y(45, 17),\ + y(46, 9), y(47, 33), y(48, 6),\ + B, 25, 19, 9, 0);\ + s5(y(47, 13), y(48, 40), y(49, 50),\ + y(50, 12), y(51, 28), y(52, 29),\ + B, 7, 13, 24, 2);\ + s6(y(51, 2), y(52, 
22), y(53, 49),\ + y(54, 46), y(55, 32), y(56, 19),\ + B, 3, 28, 10, 18);\ + s7(y(55, 55), y(56, 10), y(57, 31),\ + y(58, 20), y(59, 39), y(60, 47),\ + B, 31, 11, 21, 6);\ + s8(y(59, 37), y(60, 4), y(61, 1),\ + y(62, 21), y(63, 11), y(32, 41),\ + B, 4, 26, 14, 20); + +#define H1_k4()\ + s1(y(31, 54), y(0, 27), y(1, 42),\ + y(2, 35), y(3, 6), y(4, 25),\ + B, 40, 48, 54, 62);\ + s2(y(3, 43), y(4, 15), y(5, 45),\ + y(6, 16), y(7, 7), y(8, 51),\ + B, 44, 59, 33, 49);\ + s3(y(7, 44), y(8, 9), y(9, 33),\ + y(10, 34), y(11, 17), y(12, 14),\ + B, 55, 47, 61, 37);\ + s4(y(11, 36), y(12, 23), y(13, 8),\ + y(14, 0), y(15, 24), y(16, 52),\ + B, 57, 51, 41, 32);\ + s5(y(15, 4), y(16, 47), y(17, 41),\ + y(18, 3), y(19, 19), y(20, 20),\ + B, 39, 45, 56, 34);\ + s6(y(19, 50), y(20, 13), y(21, 40),\ + y(22, 37), y(23, 39), y(24, 10),\ + B, 35, 60, 42, 50);\ + s7(y(23, 46), y(24, 1), y(25, 22),\ + y(26, 11), y(27, 30), y(28, 38),\ + B, 63, 43, 53, 38);\ + s8(y(27, 28), y(28, 48), y(29, 49),\ + y(30, 12), y(31, 2), y(0, 32),\ + B, 36, 58, 46, 52); + +#define H2_k4()\ + s1(y(63, 36), y(32, 9), y(33, 24),\ + y(34, 17), y(35, 43), y(36, 23),\ + B, 8, 16, 22, 30);\ + s2(y(35, 25), y(36, 52), y(37, 27),\ + y(38, 14), y(39, 44), y(40, 33),\ + B, 12, 27, 1, 17);\ + s3(y(39, 26), y(40, 7), y(41, 54),\ + y(42, 16), y(43, 15), y(44, 51),\ + B, 23, 15, 29, 5);\ + s4(y(43, 18), y(44, 5), y(45, 6),\ + y(46, 53), y(47, 45), y(48, 34),\ + B, 25, 19, 9, 0);\ + s5(y(47, 55), y(48, 29), y(49, 39),\ + y(50, 22), y(51, 1), y(52, 2),\ + B, 7, 13, 24, 2);\ + s6(y(51, 32), y(52, 48), y(53, 38),\ + y(54, 19), y(55, 21), y(56, 49),\ + B, 3, 28, 10, 18);\ + s7(y(55, 28), y(56, 40), y(57, 4),\ + y(58, 50), y(59, 12), y(60, 20),\ + B, 31, 11, 21, 6);\ + s8(y(59, 10), y(60, 46), y(61, 47),\ + y(62, 31), y(63, 41), y(32, 30),\ + B, 4, 26, 14, 20); + +#define H1_k5()\ + s1(y(31, 18), y(0, 7), y(1, 45),\ + y(2, 15), y(3, 25), y(4, 5),\ + B, 40, 48, 54, 62);\ + s2(y(3, 23), y(4, 34), y(5, 9),\ + y(6, 51), y(7, 26), 
y(8, 54),\ + B, 44, 59, 33, 49);\ + s3(y(7, 8), y(8, 44), y(9, 36),\ + y(10, 14), y(11, 52), y(12, 33),\ + B, 55, 47, 61, 37);\ + s4(y(11, 0), y(12, 42), y(13, 43),\ + y(14, 35), y(15, 27), y(16, 16),\ + B, 57, 51, 41, 32);\ + s5(y(15, 37), y(16, 11), y(17, 21),\ + y(18, 4), y(19, 40), y(20, 41),\ + B, 39, 45, 56, 34);\ + s6(y(19, 30), y(20, 46), y(21, 20),\ + y(22, 1), y(23, 3), y(24, 47),\ + B, 35, 60, 42, 50);\ + s7(y(23, 10), y(24, 38), y(25, 55),\ + y(26, 32), y(27, 31), y(28, 2),\ + B, 63, 43, 53, 38);\ + s8(y(27, 49), y(28, 28), y(29, 29),\ + y(30, 13), y(31, 39), y(0, 12),\ + B, 36, 58, 46, 52); + +#define H2_k5()\ + s1(y(63, 0), y(32, 44), y(33, 27),\ + y(34, 52), y(35, 23), y(36, 42),\ + B, 8, 16, 22, 30);\ + s2(y(35, 5), y(36, 16), y(37, 7),\ + y(38, 33), y(39, 8), y(40, 36),\ + B, 12, 27, 1, 17);\ + s3(y(39, 6), y(40, 26), y(41, 18),\ + y(42, 51), y(43, 34), y(44, 54),\ + B, 23, 15, 29, 5);\ + s4(y(43, 53), y(44, 24), y(45, 25),\ + y(46, 17), y(47, 9), y(48, 14),\ + B, 25, 19, 9, 0);\ + s5(y(47, 19), y(48, 50), y(49, 3),\ + y(50, 55), y(51, 38), y(52, 39),\ + B, 7, 13, 24, 2);\ + s6(y(51, 12), y(52, 28), y(53, 2),\ + y(54, 40), y(55, 22), y(56, 29),\ + B, 3, 28, 10, 18);\ + s7(y(55, 49), y(56, 20), y(57, 37),\ + y(58, 30), y(59, 13), y(60, 41),\ + B, 31, 11, 21, 6);\ + s8(y(59, 47), y(60, 10), y(61, 11),\ + y(62, 48), y(63, 21), y(32, 31),\ + B, 4, 26, 14, 20); + +#define H1_k6()\ + s1(y(31, 53), y(0, 26), y(1, 9),\ + y(2, 34), y(3, 5), y(4, 24),\ + B, 40, 48, 54, 62);\ + s2(y(3, 42), y(4, 14), y(5, 44),\ + y(6, 54), y(7, 6), y(8, 18),\ + B, 44, 59, 33, 49);\ + s3(y(7, 43), y(8, 8), y(9, 0),\ + y(10, 33), y(11, 16), y(12, 36),\ + B, 55, 47, 61, 37);\ + s4(y(11, 35), y(12, 45), y(13, 23),\ + y(14, 15), y(15, 7), y(16, 51),\ + B, 57, 51, 41, 32);\ + s5(y(15, 1), y(16, 32), y(17, 22),\ + y(18, 37), y(19, 20), y(20, 21),\ + B, 39, 45, 56, 34);\ + s6(y(19, 31), y(20, 10), y(21, 41),\ + y(22, 38), y(23, 4), y(24, 11),\ + B, 35, 60, 42, 50);\ + s7(y(23, 47), 
y(24, 2), y(25, 19),\ + y(26, 12), y(27, 48), y(28, 39),\ + B, 63, 43, 53, 38);\ + s8(y(27, 29), y(28, 49), y(29, 50),\ + y(30, 46), y(31, 3), y(0, 13),\ + B, 36, 58, 46, 52); + +#define H2_k6()\ + s1(y(63, 35), y(32, 8), y(33, 7),\ + y(34, 16), y(35, 42), y(36, 45),\ + B, 8, 16, 22, 30);\ + s2(y(35, 24), y(36, 51), y(37, 26),\ + y(38, 36), y(39, 43), y(40, 0),\ + B, 12, 27, 1, 17);\ + s3(y(39, 25), y(40, 6), y(41, 53),\ + y(42, 54), y(43, 14), y(44, 18),\ + B, 23, 15, 29, 5);\ + s4(y(43, 17), y(44, 27), y(45, 5),\ + y(46, 52), y(47, 44), y(48, 33),\ + B, 25, 19, 9, 0);\ + s5(y(47, 40), y(48, 30), y(49, 4),\ + y(50, 19), y(51, 2), y(52, 3),\ + B, 7, 13, 24, 2);\ + s6(y(51, 13), y(52, 49), y(53, 39),\ + y(54, 20), y(55, 55), y(56, 50),\ + B, 3, 28, 10, 18);\ + s7(y(55, 29), y(56, 41), y(57, 1),\ + y(58, 31), y(59, 46), y(60, 21),\ + B, 31, 11, 21, 6);\ + s8(y(59, 11), y(60, 47), y(61, 32),\ + y(62, 28), y(63, 22), y(32, 48),\ + B, 4, 26, 14, 20); + +#define H1_k7()\ + s1(y(31, 17), y(0, 6), y(1, 44),\ + y(2, 14), y(3, 24), y(4, 27),\ + B, 40, 48, 54, 62);\ + s2(y(3, 45), y(4, 33), y(5, 8),\ + y(6, 18), y(7, 25), y(8, 53),\ + B, 44, 59, 33, 49);\ + s3(y(7, 23), y(8, 43), y(9, 35),\ + y(10, 36), y(11, 51), y(12, 0),\ + B, 55, 47, 61, 37);\ + s4(y(11, 15), y(12, 9), y(13, 42),\ + y(14, 34), y(15, 26), y(16, 54),\ + B, 57, 51, 41, 32);\ + s5(y(15, 38), y(16, 12), y(17, 55),\ + y(18, 1), y(19, 41), y(20, 22),\ + B, 39, 45, 56, 34);\ + s6(y(19, 48), y(20, 47), y(21, 21),\ + y(22, 2), y(23, 37), y(24, 32),\ + B, 35, 60, 42, 50);\ + s7(y(23, 11), y(24, 39), y(25, 40),\ + y(26, 13), y(27, 28), y(28, 3),\ + B, 63, 43, 53, 38);\ + s8(y(27, 50), y(28, 29), y(29, 30),\ + y(30, 10), y(31, 4), y(0, 46),\ + B, 36, 58, 46, 52); + +#define H2_k7()\ + s1(y(63, 8), y(32, 52), y(33, 35),\ + y(34, 5), y(35, 54), y(36, 18),\ + B, 8, 16, 22, 30);\ + s2(y(35, 36), y(36, 24), y(37, 15),\ + y(38, 9), y(39, 16), y(40, 44),\ + B, 12, 27, 1, 17);\ + s3(y(39, 14), y(40, 34), y(41, 26),\ + y(42, 
27), y(43, 42), y(44, 7),\ + B, 23, 15, 29, 5);\ + s4(y(43, 6), y(44, 0), y(45,33),\ + y(46, 25), y(47, 17), y(48, 45),\ + B, 25, 19, 9, 0);\ + s5(y(47, 29), y(48, 3), y(49, 46),\ + y(50, 49), y(51, 32), y(52, 13),\ + B, 7, 13, 24, 2);\ + s6(y(51, 55), y(52, 38), y(53, 12),\ + y(54, 50), y(55, 28), y(56, 39),\ + B, 3, 28, 10, 18);\ + s7(y(55, 2), y(56, 30), y(57, 47),\ + y(58, 4), y(59, 19), y(60, 31),\ + B, 31, 11, 21, 6);\ + s8(y(59, 41), y(60, 20), y(61, 21),\ + y(62, 1), y(63, 48), y(32, 37),\ + B, 4, 26, 14, 20); + + +#define lm_set_block_8(b, i, v0, v1, v2, v3, v4, v5, v6, v7) \ + { \ + b[i] = v0; \ + b[i + 1] = v1; \ + b[i + 2] = v2; \ + b[i + 3] = v3; \ + b[i + 4] = v4; \ + b[i + 5] = v5; \ + b[i + 6] = v6; \ + b[i + 7] = v7; \ + } + +#define vzero 0 + +#define vones (~(vtype)0) + + +constant unsigned char DES_LM_reverse[16] = { + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 +}; + +constant char itoa16[16] = "0123456789abcdef"; + +constant unsigned char DES_IP[64] = { + 57, 49, 41, 33, 25, 17, 9, 1, + 59, 51, 43, 35, 27, 19, 11, 3, + 61, 53, 45, 37, 29, 21, 13, 5, + 63, 55, 47, 39, 31, 23, 15, 7, + 56, 48, 40, 32, 24, 16, 8, 0, + 58, 50, 42, 34, 26, 18, 10, 2, + 60, 52, 44, 36, 28, 20, 12, 4, + 62, 54, 46, 38, 30, 22, 14, 6 +}; + +#define ARCH_WORD int + +/* From JohnTheRipper, DES_std.c */ +inline void DES_do_FP(ARCH_WORD out[2], ARCH_WORD in[2]) +{ + int src, dst; + + out[0] = out[1] = 0; + for (src = 0; src < 64; src++) { + dst = DES_IP[src ^ 0x20]; + + if (in[src >> 5] & ((unsigned ARCH_WORD)1 << (src & 0x1F))) + out[dst >> 5] |= (unsigned ARCH_WORD)1 << (dst & 0x1F); + } +} + +/* From JohnTheRipper, DES_bs.c. 
*/ +inline void DES_bs_get_source_LM(unsigned char *out, uint32_t *raw) +{ +// char *p; + ARCH_WORD swapped[2], block[2], value; + int l, h; + int index; +/* + swapped[0] = raw[1]; + swapped[1] = raw[0]; +*/ + DES_do_FP(block, raw); //swapped); + + + /*p = out;*/ + for (index = 0; index < 16; index += 2) { + value = (block[index >> 3] >> ((index << 2) & 0x18)) & 0xff; + l = DES_LM_reverse[value & 0xf]; + h = DES_LM_reverse[value >> 4]; + out[index >> 1] = (l << 4) | h; + /**p++ = itoa16[l]; + *p++ = itoa16[h];*/ + } + /**p = 0;*/ + + return; +} + + + +inline void des_encrypt(char *plaintext, unsigned char *out) { + vtype B[64]; + lm_vector lm_keys[56]; + + // For each plaintext character, put each bit into its own location in the lm_keys + // array. + for (int i = 0; i < 7; i++) { /* Plaintext characters */ + for (int j = 0; j < 8; j++) { /* Character bits */ + lm_keys[(i * 8) + j] = ((plaintext[i] >> j) & 1);// & 0xfffffffe; + } + } + + // This is the "KGS!@#$%" string, after the initial permutation is applied. + vtype z = vzero, o = vones; + lm_set_block_8(B, 0, z, z, z, z, z, z, z, z); + lm_set_block_8(B, 8, o, o, o, z, o, z, z, z); + lm_set_block_8(B, 16, z, z, z, z, z, z, z, o); + lm_set_block_8(B, 24, z, z, o, z, z, o, o, o); + lm_set_block_8(B, 32, z, z, z, o, z, o, o, o); + lm_set_block_8(B, 40, z, z, z, z, z, o, z, z); + lm_set_block_8(B, 48, o, o, z, z, z, z, o, z); + lm_set_block_8(B, 56, o, z, o, z, o, o, o, o); + + // Perform DES using the plaintext as the key on the fixed bytes (above). + H1_k0(); + H2_k0(); + H1_k1(); + H2_k1(); + H1_k2(); + H2_k2(); + H1_k3(); + H2_k3(); + H1_k4(); + H2_k4(); + H1_k5(); + H2_k5(); + H1_k6(); + H2_k6(); + H1_k7(); + H2_k7(); + + // Extract the result from the bitslice array. 
+ unsigned int block[2]; + unsigned int ret; + + ret = (B[63] & 1); ret <<= 1; + ret |= (B[62] & 1); ret <<= 1; + ret |= (B[61] & 1); ret <<= 1; + ret |= (B[60] & 1); ret <<= 1; + ret |= (B[59] & 1); ret <<= 1; + ret |= (B[58] & 1); ret <<= 1; + ret |= (B[57] & 1); ret <<= 1; + ret |= (B[56] & 1); ret <<= 1; + ret |= (B[55] & 1); ret <<= 1; + ret |= (B[54] & 1); ret <<= 1; + ret |= (B[53] & 1); ret <<= 1; + ret |= (B[52] & 1); ret <<= 1; + ret |= (B[51] & 1); ret <<= 1; + ret |= (B[50] & 1); ret <<= 1; + ret |= (B[49] & 1); ret <<= 1; + ret |= (B[48] & 1); ret <<= 1; + ret |= (B[47] & 1); ret <<= 1; + ret |= (B[46] & 1); ret <<= 1; + ret |= (B[45] & 1); ret <<= 1; + ret |= (B[44] & 1); ret <<= 1; + ret |= (B[43] & 1); ret <<= 1; + ret |= (B[42] & 1); ret <<= 1; + ret |= (B[41] & 1); ret <<= 1; + ret |= (B[40] & 1); ret <<= 1; + ret |= (B[39] & 1); ret <<= 1; + ret |= (B[38] & 1); ret <<= 1; + ret |= (B[37] & 1); ret <<= 1; + ret |= (B[36] & 1); ret <<= 1; + ret |= (B[35] & 1); ret <<= 1; + ret |= (B[34] & 1); ret <<= 1; + ret |= (B[33] & 1); ret <<= 1; + ret |= (B[32] & 1); + block[0] = ret; + + ret = (B[31] & 1); ret <<= 1; + ret |= (B[30] & 1); ret <<= 1; + ret |= (B[29] & 1); ret <<= 1; + ret |= (B[28] & 1); ret <<= 1; + ret |= (B[27] & 1); ret <<= 1; + ret |= (B[26] & 1); ret <<= 1; + ret |= (B[25] & 1); ret <<= 1; + ret |= (B[24] & 1); ret <<= 1; + ret |= (B[23] & 1); ret <<= 1; + ret |= (B[22] & 1); ret <<= 1; + ret |= (B[21] & 1); ret <<= 1; + ret |= (B[20] & 1); ret <<= 1; + ret |= (B[19] & 1); ret <<= 1; + ret |= (B[18] & 1); ret <<= 1; + ret |= (B[17] & 1); ret <<= 1; + ret |= (B[16] & 1); ret <<= 1; + ret |= (B[15] & 1); ret <<= 1; + ret |= (B[14] & 1); ret <<= 1; + ret |= (B[13] & 1); ret <<= 1; + ret |= (B[12] & 1); ret <<= 1; + ret |= (B[11] & 1); ret <<= 1; + ret |= (B[10] & 1); ret <<= 1; + ret |= (B[9] & 1); ret <<= 1; + ret |= (B[8] & 1); ret <<= 1; + ret |= (B[7] & 1); ret <<= 1; + ret |= (B[6] & 1); ret <<= 1; + ret |= (B[5] & 1); ret <<= 1; + 
ret |= (B[4] & 1); ret <<= 1; + ret |= (B[3] & 1); ret <<= 1; + ret |= (B[2] & 1); ret <<= 1; + ret |= (B[1] & 1); ret <<= 1; + ret |= (B[0] & 1); + block[1] = ret; + + // Perform the final permutation. + DES_bs_get_source_LM(out, block); +} + +#endif /* _DES_CL */ diff --git a/CL/false_alarm_check.cl b/CL/false_alarm_check.cl new file mode 100644 index 0000000..cc8fd30 --- /dev/null +++ b/CL/false_alarm_check.cl @@ -0,0 +1,58 @@ +#include "shared.h" +#include "rt.cl" + +__kernel void false_alarm_check( + __global unsigned int *g_hash_type, + __global char *g_charset, + __global unsigned int *g_plaintext_len_min, + __global unsigned int *g_plaintext_len_max, + __global unsigned int *g_reduction_offset, + __global unsigned long *g_plaintext_space_total, + __global unsigned long *g_plaintext_space_up_to_index, + __global unsigned int *g_device_num, + __global unsigned int *g_total_devices, + __global unsigned int *g_num_start_indices, + __global unsigned long *g_start_indices, + __global unsigned int *g_start_index_positions, + __global unsigned long *g_hash_base_indices, + __global unsigned int *g_exec_block_scaler, + __global unsigned long *g_plaintext_indices) { + + int index_pos = (*g_num_start_indices - *g_device_num) - ((get_global_id(0) + *g_exec_block_scaler) * *g_total_devices) - 1; + if (index_pos < 0) + return; + + char charset[MAX_CHARSET_LEN]; + unsigned char plaintext[MAX_PLAINTEXT_LEN]; + unsigned char hash[MAX_HASH_OUTPUT_LEN]; + unsigned int plaintext_len; + unsigned int hash_len; + + unsigned int charset_len = g_strncpy(charset, g_charset, sizeof(charset)); + unsigned int hash_type = *g_hash_type; + unsigned int plaintext_len_min = *g_plaintext_len_min; + unsigned int plaintext_len_max = *g_plaintext_len_max; + unsigned int reduction_offset = *g_reduction_offset; + unsigned long plaintext_space_total = *g_plaintext_space_total; + unsigned long plaintext_space_up_to_index[MAX_PLAINTEXT_LEN]; + + 
copy_plaintext_space_up_to_index(plaintext_space_up_to_index, g_plaintext_space_up_to_index); + + unsigned long index = g_start_indices[index_pos], previous_index = 0; + unsigned long hash_base_index = g_hash_base_indices[index_pos]; + unsigned int endpoint = g_start_index_positions[index_pos]; + + + for (unsigned int pos = 0; pos < endpoint + 1; pos++) { + index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index, plaintext, &plaintext_len); + do_hash(hash_type, plaintext, plaintext_len, hash, &hash_len); + + previous_index = index; + index = hash_to_index(hash, hash_len, reduction_offset, plaintext_space_total, pos); + + if (index == ((hash_base_index + pos) % plaintext_space_total)) { + g_plaintext_indices[index_pos] = previous_index; + return; + } + } +} diff --git a/CL/false_alarm_check_ntlm8.cl b/CL/false_alarm_check_ntlm8.cl new file mode 100644 index 0000000..e244b50 --- /dev/null +++ b/CL/false_alarm_check_ntlm8.cl @@ -0,0 +1,41 @@ +#include "ntlm8_functions.cl" + + +__kernel void false_alarm_check_ntlm8( + __global unsigned int *unused1, + __global char *unused2, + __global unsigned int *unused3, + __global unsigned int *unused4, + __global unsigned int *unused5, + __global unsigned long *unused6, + __global unsigned long *unused7, + __global unsigned int *g_device_num, + __global unsigned int *g_total_devices, + __global unsigned int *g_num_start_indices, + __global unsigned long *g_start_indices, + __global unsigned int *g_start_index_positions, + __global unsigned long *g_hash_base_indices, + __global unsigned int *g_exec_block_scaler, + __global unsigned long *g_plaintext_indices) { + + int index_pos = (*g_num_start_indices - *g_device_num) - ((get_global_id(0) + *g_exec_block_scaler) * *g_total_devices) - 1; + if (index_pos < 0) + return; + + unsigned char plaintext[8]; + unsigned long index = g_start_indices[index_pos], previous_index = 0; + unsigned long hash_base_index = 
g_hash_base_indices[index_pos]; + unsigned int endpoint = g_start_index_positions[index_pos]; + + for (unsigned int pos = 0; pos < endpoint + 1; pos++) { + index_to_plaintext_ntlm8(index, charset, plaintext); + + previous_index = index; + index = hash_to_index_ntlm8(hash_ntlm8(plaintext), pos); + + if (index == ((hash_base_index + pos) % 6634204312890625UL)) { + g_plaintext_indices[index_pos] = previous_index; + return; + } + } +} diff --git a/CL/ntlm.cl b/CL/ntlm.cl new file mode 100644 index 0000000..46b23ed --- /dev/null +++ b/CL/ntlm.cl @@ -0,0 +1,177 @@ +/* + * MD4 OpenCL kernel based on Solar Designer's MD4 algorithm implementation at: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md4 + * This code is in public domain. + * + * This software is Copyright (c) 2010, Dhiru Kholia + * and Copyright (c) 2012, magnum + * and Copyright (c) 2015, Sayantan Datta + * and it is hereby released to the general public under the following terms: + * Redistribution and use in source and binary forms, with or without modification, + * are permitted. + * + * Useful References: + * 1 nt_opencl_kernel.c (written by Alain Espinosa ) + * 2. http://tools.ietf.org/html/rfc1320 + * 3. http://en.wikipedia.org/wiki/MD4 + */ + +#include "string.cl" + +#undef MD4_LUT3 /* No good for this format, just here for reference */ + +/* The basic MD4 functions */ +#if MD4_LUT3 +#define F(x, y, z) lut3(x, y, z, 0xca) +#elif USE_BITSELECT +#define F(x, y, z) bitselect((z), (y), (x)) +#elif HAVE_ANDNOT +#define F(x, y, z) ((x & y) ^ ((~x) & z)) +#else +#define F(x, y, z) (z ^ (x & (y ^ z))) +#endif + +#if MD4_LUT3 +#define G(x, y, z) lut3(x, y, z, 0xe8) +#else +#define G(x, y, z) (((x) & ((y) | (z))) | ((y) & (z))) +#endif + +#if MD4_LUT3 +#define H(x, y, z) lut3(x, y, z, 0x96) +#define H2 H +#else +#define H(x, y, z) (((x) ^ (y)) ^ (z)) +#define H2(x, y, z) ((x) ^ ((y) ^ (z))) +#endif + +/* The MD4 transformation for all three rounds. 
*/ +#define STEP(f, a, b, c, d, x, s) \ + (a) += f((b), (c), (d)) + (x); \ + (a) = rotate((a), (uint)(s)) //(a) = ((a << s) | (a >> (32 - s))) + +inline void md4_encrypt(__private uint *hash, __private uint *W) +{ + hash[0] = 0x67452301; + hash[1] = 0xefcdab89; + hash[2] = 0x98badcfe; + hash[3] = 0x10325476; + + /* Round 1 */ + STEP(F, hash[0], hash[1], hash[2], hash[3], W[0], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[1], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[2], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[3], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[4], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[5], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[6], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[7], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[8], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[9], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[10], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[11], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[12], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[13], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[14], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[15], 19); + + /* Round 2 */ + STEP(G, hash[0], hash[1], hash[2], hash[3], W[0] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[4] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[8] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[12] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[1] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[5] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[9] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[13] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[2] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[6] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], 
W[10] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[14] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[3] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[7] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[11] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[15] + 0x5a827999, 13); + + /* Round 3 */ + STEP(H, hash[0], hash[1], hash[2], hash[3], W[0] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[8] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[4] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[12] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[2] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[10] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[6] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[14] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[1] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[9] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[5] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[13] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[3] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[11] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[7] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[15] + 0x6ed9eba1, 15); + + hash[0] = hash[0] + 0x67452301; + hash[1] = hash[1] + 0xefcdab89; + hash[2] = hash[2] + 0x98badcfe; + hash[3] = hash[3] + 0x10325476; +} + +inline void ntlm_hash(unsigned char *plaintext, unsigned int plaintext_len, unsigned char *hash /*, __global unsigned char *g_debug*/) { + unsigned int key[16] = {0}; + unsigned int output[4]; + + +#ifdef AMD_ROCM + /* Below is a workaround for an EXTREMELY confusing bug only observed while using the + * AMD ROCm driver (I'm going to 
go ahead and blame the driver...). plaintext_len + * is set correctly, yet at the same time, it isn't. The code below basically + * sets plaintext_len to the same value that it was; without this, the wrong + * hash is somehow calculated. */ + plaintext[plaintext_len] = 0; + + int my_len = 0; + for (; my_len < MAX_PLAINTEXT_LEN; my_len++) + if (plaintext[my_len] == 0) + break; + + plaintext_len = my_len; + /***********************************************************************************/ +#endif + + if (plaintext_len > 27) { + plaintext[27] = 0; + plaintext_len = 27; + } + + int i = 0; + for (; i < (plaintext_len / 2); i++) + key[i] = plaintext[i * 2] | (plaintext[(i * 2) + 1] << 16); + + if ((plaintext_len % 2) == 1) + key[i] = plaintext[plaintext_len - 1] | 0x800000; + else + key[i] = 0x80; + + key[14] = plaintext_len << 4; + + md4_encrypt(output, key); + + i = 0; + hash[i++] = ((output[0] >> 0) & 0xff); + hash[i++] = ((output[0] >> 8) & 0xff); + hash[i++] = ((output[0] >> 16) & 0xff); + hash[i++] = ((output[0] >> 24) & 0xff); + hash[i++] = ((output[1] >> 0) & 0xff); + hash[i++] = ((output[1] >> 8) & 0xff); + hash[i++] = ((output[1] >> 16) & 0xff); + hash[i++] = ((output[1] >> 24) & 0xff); + hash[i++] = ((output[2] >> 0) & 0xff); + hash[i++] = ((output[2] >> 8) & 0xff); + hash[i++] = ((output[2] >> 16) & 0xff); + hash[i++] = ((output[2] >> 24) & 0xff); + hash[i++] = ((output[3] >> 0) & 0xff); + hash[i++] = ((output[3] >> 8) & 0xff); + hash[i++] = ((output[3] >> 16) & 0xff); + hash[i++] = ((output[3] >> 24) & 0xff); +} diff --git a/CL/ntlm8_functions.cl b/CL/ntlm8_functions.cl new file mode 100644 index 0000000..6247fe2 --- /dev/null +++ b/CL/ntlm8_functions.cl @@ -0,0 +1,172 @@ +__constant char charset[] = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; + + +inline void index_to_plaintext_ntlm8(unsigned long index, __constant char *charset, unsigned char *plaintext) { + + for (int i = 7; i >= 0; i--) { + 
plaintext[i] = charset[index % 95]; + index = index / 95; + } + + return; +} + + +/* + * MD4 OpenCL kernel based on Solar Designer's MD4 algorithm implementation at: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md4 + * This code is in public domain. + * + * This software is Copyright (c) 2010, Dhiru Kholia + * and Copyright (c) 2012, magnum + * and Copyright (c) 2015, Sayantan Datta + * and it is hereby released to the general public under the following terms: + * Redistribution and use in source and binary forms, with or without modification, + * are permitted. + * + * Useful References: + * 1 nt_opencl_kernel.c (written by Alain Espinosa ) + * 2. http://tools.ietf.org/html/rfc1320 + * 3. http://en.wikipedia.org/wiki/MD4 + */ + +#undef MD4_LUT3 /* No good for this format, just here for reference */ + +/* The basic MD4 functions */ +#if MD4_LUT3 +#define F(x, y, z) lut3(x, y, z, 0xca) +#elif USE_BITSELECT +#define F(x, y, z) bitselect((z), (y), (x)) +#elif HAVE_ANDNOT +#define F(x, y, z) ((x & y) ^ ((~x) & z)) +#else +#define F(x, y, z) (z ^ (x & (y ^ z))) +#endif + +#if MD4_LUT3 +#define G(x, y, z) lut3(x, y, z, 0xe8) +#else +#define G(x, y, z) (((x) & ((y) | (z))) | ((y) & (z))) +#endif + +#if MD4_LUT3 +#define H(x, y, z) lut3(x, y, z, 0x96) +#define H2 H +#else +#define H(x, y, z) (((x) ^ (y)) ^ (z)) +#define H2(x, y, z) ((x) ^ ((y) ^ (z))) +#endif + +/* The MD4 transformation for all three rounds. 
*/ +#define STEP(f, a, b, c, d, x, s) \ + (a) += f((b), (c), (d)) + (x); \ + (a) = rotate((a), (uint)(s)) //(a) = ((a << s) | (a >> (32 - s))) + +inline void md4_encrypt(__private uint *hash, __private uint *W) +{ + hash[0] = 0x67452301; + hash[1] = 0xefcdab89; + hash[2] = 0x98badcfe; + hash[3] = 0x10325476; + + /* Round 1 */ + STEP(F, hash[0], hash[1], hash[2], hash[3], W[0], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[1], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[2], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[3], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[4], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[5], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[6], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[7], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[8], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[9], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[10], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[11], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[12], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[13], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[14], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[15], 19); + + /* Round 2 */ + STEP(G, hash[0], hash[1], hash[2], hash[3], W[0] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[4] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[8] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[12] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[1] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[5] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[9] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[13] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[2] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[6] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], 
W[10] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[14] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[3] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[7] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[11] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[15] + 0x5a827999, 13); + + /* Round 3 */ + STEP(H, hash[0], hash[1], hash[2], hash[3], W[0] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[8] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[4] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[12] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[2] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[10] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[6] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[14] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[1] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[9] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[5] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[13] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[3] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[11] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[7] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[15] + 0x6ed9eba1, 15); + + hash[0] = hash[0] + 0x67452301; + hash[1] = hash[1] + 0xefcdab89; + hash[2] = hash[2] + 0x98badcfe; + hash[3] = hash[3] + 0x10325476; +} + + +inline unsigned long hash_ntlm8(unsigned char *plaintext) { + unsigned int key[16] = {0}; + unsigned int output[4]; + + for (int i = 0; i < 4; i++) + key[i] = plaintext[i * 2] | (plaintext[(i * 2) + 1] << 16); + + key[4] = 0x80; + key[14] = 0x80; + + md4_encrypt(output, key); + + return ((unsigned long)output[1]) << 32 | (unsigned 
long)output[0]; +} + + +inline unsigned long hash_to_index_ntlm8(unsigned long hash, unsigned int pos) { + return (hash + pos) % 6634204312890625UL; +} + + +inline unsigned long hash_char_to_index_ntlm8(__global unsigned char *hash_value, unsigned int pos) { + unsigned long ret = hash_value[7]; + ret <<= 8; + ret |= hash_value[6]; + ret <<= 8; + ret |= hash_value[5]; + ret <<= 8; + ret |= hash_value[4]; + ret <<= 8; + ret |= hash_value[3]; + ret <<= 8; + ret |= hash_value[2]; + ret <<= 8; + ret |= hash_value[1]; + ret <<= 8; + ret |= hash_value[0]; + + return (ret + pos) % 6634204312890625UL; +} diff --git a/CL/ntlm9_functions.cl b/CL/ntlm9_functions.cl new file mode 100644 index 0000000..da4d8b8 --- /dev/null +++ b/CL/ntlm9_functions.cl @@ -0,0 +1,179 @@ +__constant char charset[] = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; + + +/* Since nobody else has made 9-character rainbow tables, we're free to take some of + * our own artistic liberties... + * + * We have a 64-bit number that we need to map to a 9-character plaintext. This + * means if the character set is of length 128 or less, we can break the number into + * nine 7-bit fragments, and use them to index into the character set. This ends up + * being 2.4x faster than the standard division method (below)! */ +inline void index_to_plaintext_ntlm9(unsigned long index, __constant char *charset, unsigned char *plaintext) { + + for (int i = 0; i < 9; i++) { + plaintext[i] = charset[ (index & 0xff) % 95 ]; // TODO: is the 0xff necessary? + index >>= 7; + } + + return; +} + + +/* + * MD4 OpenCL kernel based on Solar Designer's MD4 algorithm implementation at: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md4 + * This code is in public domain. 
+ * + * This software is Copyright (c) 2010, Dhiru Kholia + * and Copyright (c) 2012, magnum + * and Copyright (c) 2015, Sayantan Datta + * and it is hereby released to the general public under the following terms: + * Redistribution and use in source and binary forms, with or without modification, + * are permitted. + * + * Useful References: + * 1 nt_opencl_kernel.c (written by Alain Espinosa ) + * 2. http://tools.ietf.org/html/rfc1320 + * 3. http://en.wikipedia.org/wiki/MD4 + */ + +#undef MD4_LUT3 /* No good for this format, just here for reference */ + +/* The basic MD4 functions */ +#if MD4_LUT3 +#define F(x, y, z) lut3(x, y, z, 0xca) +#elif USE_BITSELECT +#define F(x, y, z) bitselect((z), (y), (x)) +#elif HAVE_ANDNOT +#define F(x, y, z) ((x & y) ^ ((~x) & z)) +#else +#define F(x, y, z) (z ^ (x & (y ^ z))) +#endif + +#if MD4_LUT3 +#define G(x, y, z) lut3(x, y, z, 0xe8) +#else +#define G(x, y, z) (((x) & ((y) | (z))) | ((y) & (z))) +#endif + +#if MD4_LUT3 +#define H(x, y, z) lut3(x, y, z, 0x96) +#define H2 H +#else +#define H(x, y, z) (((x) ^ (y)) ^ (z)) +#define H2(x, y, z) ((x) ^ ((y) ^ (z))) +#endif + +/* The MD4 transformation for all three rounds. 
*/ +#define STEP(f, a, b, c, d, x, s) \ + (a) += f((b), (c), (d)) + (x); \ + (a) = rotate((a), (uint)(s)) //(a) = ((a << s) | (a >> (32 - s))) + +inline void md4_encrypt(__private uint *hash, __private uint *W) +{ + hash[0] = 0x67452301; + hash[1] = 0xefcdab89; + hash[2] = 0x98badcfe; + hash[3] = 0x10325476; + + /* Round 1 */ + STEP(F, hash[0], hash[1], hash[2], hash[3], W[0], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[1], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[2], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[3], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[4], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[5], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[6], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[7], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[8], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[9], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[10], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[11], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[12], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[13], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[14], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[15], 19); + + /* Round 2 */ + STEP(G, hash[0], hash[1], hash[2], hash[3], W[0] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[4] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[8] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[12] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[1] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[5] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[9] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[13] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[2] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[6] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], 
W[10] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[14] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[3] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[7] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[11] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[15] + 0x5a827999, 13); + + /* Round 3 */ + STEP(H, hash[0], hash[1], hash[2], hash[3], W[0] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[8] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[4] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[12] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[2] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[10] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[6] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[14] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[1] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[9] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[5] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[13] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[3] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[11] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[7] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[15] + 0x6ed9eba1, 15); + + hash[0] = hash[0] + 0x67452301; + hash[1] = hash[1] + 0xefcdab89; + hash[2] = hash[2] + 0x98badcfe; + hash[3] = hash[3] + 0x10325476; +} + + +inline unsigned long hash_ntlm9(unsigned char *plaintext) { + unsigned int key[16] = {0}; + unsigned int output[4]; + + for (int i = 0; i < 4; i++) + key[i] = plaintext[i * 2] | (plaintext[(i * 2) + 1] << 16); + + key[4] = plaintext[8] | 0x800000; + key[14] = 0x90; + + md4_encrypt(output, key); + + return ((unsigned long)output[1]) 
<< 32 | (unsigned long)output[0]; +} + + +inline unsigned long hash_to_index_ntlm9(unsigned long hash, unsigned int pos) { + return (hash + pos) % 6634204312890625UL; +} + + +inline unsigned long hash_char_to_index_ntlm9(__global unsigned char *hash_value, unsigned int pos) { + unsigned long ret = hash_value[7]; + ret <<= 8; + ret |= hash_value[6]; + ret <<= 8; + ret |= hash_value[5]; + ret <<= 8; + ret |= hash_value[4]; + ret <<= 8; + ret |= hash_value[3]; + ret <<= 8; + ret |= hash_value[2]; + ret <<= 8; + ret |= hash_value[1]; + ret <<= 8; + ret |= hash_value[0]; + + return (ret + pos) % 6634204312890625UL; +} diff --git a/CL/precompute.cl b/CL/precompute.cl new file mode 100644 index 0000000..7123e03 --- /dev/null +++ b/CL/precompute.cl @@ -0,0 +1,52 @@ +#include "string.cl" +#include "rt.cl" + +__kernel void precompute( + __global unsigned int *g_hash_type, + __global unsigned char *g_hash, + __global unsigned int *g_hash_len, + __global char *g_charset, + __global unsigned int *g_plaintext_len_min, + __global unsigned int *g_plaintext_len_max, + __global unsigned int *g_table_index, + __global unsigned long *g_chain_len, + __global unsigned int *g_device_num, + __global unsigned int *g_total_devices, + __global unsigned int *g_exec_block_scaler, + __global unsigned long *g_output) { + + long target_chain_len = (*g_chain_len - *g_device_num) - ((get_global_id(0) + *g_exec_block_scaler) * *g_total_devices) - 1; + + if (target_chain_len < 1) { + g_output[get_global_id(0)] = 0; + return; + } + + char charset[MAX_CHARSET_LEN]; + unsigned long plaintext_space_up_to_index[MAX_PLAINTEXT_LEN]; + unsigned char hash[MAX_HASH_OUTPUT_LEN]; + unsigned char plaintext[MAX_PLAINTEXT_LEN]; + unsigned int plaintext_len = 0; + unsigned long index; + + unsigned int hash_type = *g_hash_type; + unsigned int hash_len = *g_hash_len; + unsigned int charset_len = g_strncpy(charset, g_charset, sizeof(charset)); + unsigned int plaintext_len_min = *g_plaintext_len_min; + unsigned int 
plaintext_len_max = *g_plaintext_len_max; + unsigned int reduction_offset = TABLE_INDEX_TO_REDUCTION_OFFSET(*g_table_index); + unsigned int chain_len = *g_chain_len; + unsigned long plaintext_space_total = fill_plaintext_space_table(charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + + g_memcpy(hash, g_hash, *g_hash_len); + index = hash_to_index(hash, hash_len, reduction_offset, plaintext_space_total, target_chain_len - 1); + + for(unsigned int i = target_chain_len; i < chain_len - 1; i++) { + index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index, plaintext, &plaintext_len); + do_hash(hash_type, plaintext, plaintext_len, hash, &hash_len); + index = hash_to_index(hash, hash_len, reduction_offset, plaintext_space_total, i); + } + + g_output[get_global_id(0)] = index; +} diff --git a/CL/precompute_ntlm8.cl b/CL/precompute_ntlm8.cl new file mode 100644 index 0000000..572bb66 --- /dev/null +++ b/CL/precompute_ntlm8.cl @@ -0,0 +1,34 @@ +#include "ntlm8_functions.cl" + + +__kernel void precompute_ntlm8( + __global unsigned int *unused1, + __global unsigned char *g_hash, + __global unsigned int *unused2, + __global char *unused3, + __global unsigned int *unused4, + __global unsigned int *unused5, + __global unsigned int *unused6, + __global unsigned long *unused7, + __global unsigned int *g_device_num, + __global unsigned int *g_total_devices, + __global unsigned int *g_exec_block_scaler, + __global unsigned long *g_output) { + + long target_chain_len = (422000 - *g_device_num) - ((get_global_id(0) + *g_exec_block_scaler) * *g_total_devices) - 1; + + if (target_chain_len < 1) { + g_output[get_global_id(0)] = 0; + return; + } + + unsigned char plaintext[8]; + unsigned long index = hash_char_to_index_ntlm8(g_hash, target_chain_len - 1); + + for(unsigned int i = target_chain_len; i < 421999; i++) { + index_to_plaintext_ntlm8(index, charset, plaintext); + index = 
hash_to_index_ntlm8(hash_ntlm8(plaintext), i);
+  }
+
+  g_output[get_global_id(0)] = index;
+}
diff --git a/CL/rt.cl b/CL/rt.cl
new file mode 100644
index 0000000..7ae2ebe
--- /dev/null
+++ b/CL/rt.cl
@@ -0,0 +1,143 @@
+#include "shared.h"
+#include "ntlm.cl"
+
+#ifdef USE_DES_BITSLICE
+#include "des_bs.cl"
+#else
+#include "des.cl"
+#endif
+
+/* Maps a plaintext-space index to the plaintext it denotes, writing the
+ * characters into plaintext[] and the character count into *plaintext_len.
+ *
+ * When plaintext_len_min == 9 a shift-and-mask fast path is taken (see the
+ * comments in the body).  Otherwise the length is chosen by scanning
+ * plaintext_space_up_to_index[] from entry plaintext_len_max - 1 downwards
+ * for the first entry that is <= index, and the remainder is written out as
+ * base-charset_len digits, filling plaintext[] from its last character to
+ * its first.
+ *
+ * If the scan finds no matching entry, *plaintext_len keeps whatever value
+ * the caller stored in it before the call. */
+inline void index_to_plaintext(unsigned long index, char *charset, unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned long *plaintext_space_up_to_index, unsigned char *plaintext, unsigned int *plaintext_len) {
+
+
+  /* Since nobody else has made 9-character rainbow tables, we're free to take some of
+   * our own artistic liberties...
+   *
+   * We have a 64-bit number that we need to map to a 9-character plaintext.  This
+   * means if the character set is of length 128 or less, we can break the number into
+   * nine 7-bit fragments, and use them to index into the character set.  This ends up
+   * being 2.4x faster than the standard division method (below)! */
+
+  /* For speed, we only check that the minimum length is 9, and assume that the max is
+   * also 9, and that the character set is 128 characters or less. */
+  if (plaintext_len_min == 9) {
+    *plaintext_len = 9;
+
+    /* NOTE(review): the mask below is 0xff (8 bits) while the shift is 7 bits,
+     * so consecutive fragments overlap by one bit, unlike the "7-bit
+     * fragments" described above.  Presumably existing tables depend on this
+     * exact mapping -- confirm before changing either constant. */
+    for (int i = 0; i < 9; i++) {
+      plaintext[i] = charset[ (index & 0xff) % charset_len ];
+      index >>= 7;
+    }
+
+    return;
+  }
+
+  /* Pick the longest length whose starting offset does not exceed index. */
+  for (int i = plaintext_len_max - 1; i >= plaintext_len_min - 1; i--) {
+    if (index >= plaintext_space_up_to_index[i]) {
+      *plaintext_len = i + 1;
+      break;
+    }
+  }
+
+  /* index_x is the position of the plaintext among those of the chosen length. */
+  unsigned long index_x = index - plaintext_space_up_to_index[*plaintext_len - 1];
+  for (int i = *plaintext_len - 1; i >= 0; i--) {
+    plaintext[i] = charset[index_x % charset_len];
+    index_x = index_x / charset_len;
+  }
+
+  return;
+}
+
+
+/* Hashes plaintext into hash_value and stores the digest size in *hash_len.
+ * The algorithm is selected at compile time through HASH_TYPE; the hash_type
+ * argument is not consulted.
+ *   HASH_LM:   plaintext[plaintext_len..7] is zero-filled in place, the
+ *              buffer is passed to des_encrypt(), and *hash_len is set to 8.
+ *   HASH_NTLM: ntlm_hash() is called and *hash_len is set to 16.
+ * For any other HASH_TYPE neither hash_value nor *hash_len is written. */
+inline void do_hash(unsigned int hash_type, unsigned char *plaintext, unsigned int plaintext_len, unsigned char *hash_value, unsigned int *hash_len /*, __global unsigned char *g_debug*/) {
+#if HASH_TYPE == HASH_LM
+  for (int i = plaintext_len; i < 8; i++)
+    plaintext[i] = 0;
+
+/* For some reason, a very strange compiler error happens when the SK array is moved
+ * into des.cl:des_encrypt().  Shelved for now... */
+#ifdef USE_DES_BITSLICE
+  des_encrypt(plaintext, hash_value);
+#else
+  uint32_t SK[32];
+  des_encrypt(SK, plaintext, hash_value /*, g_debug*/);
+#endif
+
+  *hash_len = 8;
+
+#elif HASH_TYPE == HASH_NTLM
+  ntlm_hash(plaintext, plaintext_len, hash_value /*, g_debug*/);
+  *hash_len = 16;
+#endif
+
+  return;
+}
+
+
+/* Turns a hash into a plaintext-space index: reads hash_value[0..7] as a
+ * little-endian 64-bit integer (hash_value[0] is the least significant byte),
+ * adds reduction_offset and pos, and reduces the sum modulo
+ * plaintext_space_total.  hash_len is not used. */
+inline unsigned long hash_to_index(unsigned char *hash_value, unsigned int hash_len, unsigned int reduction_offset, unsigned long plaintext_space_total, unsigned int pos) {
+  unsigned long ret = hash_value[7];
+  ret <<= 8;
+  ret |= hash_value[6];
+  ret <<= 8;
+  ret |= hash_value[5];
+  ret <<= 8;
+  ret |= hash_value[4];
+  ret <<= 8;
+  ret |= hash_value[3];
+  ret <<= 8;
+  ret |= hash_value[2];
+  ret <<= 8;
+  ret |= hash_value[1];
+  ret <<= 8;
+  ret |= hash_value[0];
+
+  return (ret + reduction_offset + pos) % plaintext_space_total;
+}
+
+
+/* Fills plaintext_space_up_to_index[0..plaintext_len_max] and returns the last
+ * entry.  Entry 0 and every entry below plaintext_len_min are 0; each later
+ * entry i is entry i - 1 plus charset_len raised to the power i.
+ * Index plaintext_len_max itself is written, so the array needs at least
+ * plaintext_len_max + 1 elements. */
+inline unsigned long fill_plaintext_space_table(unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned long *plaintext_space_up_to_index) {
+  unsigned long n = 1;
+
+  plaintext_space_up_to_index[0] = 0;
+  for (int i = 1; i <= plaintext_len_max; i++) {
+    n = n * charset_len;  /* n == charset_len^i */
+    if (i < plaintext_len_min)
+      plaintext_space_up_to_index[i] = 0;
+    else
+      plaintext_space_up_to_index[i] = plaintext_space_up_to_index[i - 1] + n;
+  }
+  return plaintext_space_up_to_index[plaintext_len_max];
+}
+
+
+// Copies the plaintext_space_up_to_index array from global memory to local memory.
+inline void copy_plaintext_space_up_to_index(unsigned long *dest, __global unsigned long *src) {
+  /* Always copies exactly MAX_PLAINTEXT_LEN entries, so both arrays must hold
+   * at least that many elements. */
+  for (int i = 0; i < MAX_PLAINTEXT_LEN; i++)
+    dest[i] = src[i];
+}
+
+
+/* Walks one rainbow chain and returns the index it ends on.
+ *
+ * Beginning with index `start`, every step from the given pos up to
+ * chain_len - 2 converts the current index to a plaintext, hashes it, and
+ * converts the hash back to an index using that step's pos.  If
+ * pos >= chain_len - 1 on entry, no step runs and start is returned.
+ *
+ * plaintext, plaintext_len, hash and hash_len are caller-provided work areas
+ * that are overwritten on every step.
+ *
+ * chain_len is unsigned, so the loop bound chain_len - 1 wraps around when
+ * chain_len is 0. */
+inline unsigned long generate_rainbow_chain(
+    unsigned int hash_type,
+    char *charset,
+    unsigned int charset_len,
+    unsigned int plaintext_len_min,
+    unsigned int plaintext_len_max,
+    unsigned int reduction_offset,
+    unsigned int chain_len,
+    unsigned long start,
+    unsigned int pos,
+    unsigned long *plaintext_space_up_to_index,
+    unsigned long plaintext_space_total,
+    unsigned char *plaintext,
+    unsigned int *plaintext_len,
+    unsigned char *hash,
+    unsigned int *hash_len) {
+
+  unsigned long index = start;
+  for (; pos < chain_len - 1; pos++) {
+    index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index, plaintext, plaintext_len);
+    do_hash(hash_type, plaintext, *plaintext_len, hash, hash_len);
+    index = hash_to_index(hash, *hash_len, reduction_offset, plaintext_space_total, pos);
+  }
+  return index;
+}
diff --git a/CL/string.cl b/CL/string.cl
new file mode 100644
index 0000000..e750e0f
--- /dev/null
+++ b/CL/string.cl
@@ -0,0 +1,36 @@
+#ifndef _STRING_CL
+#define _STRING_CL
+
+/* Copies a NUL-terminated string from a __global source array into the
+ * destination array, reading at most n bytes.  Unlike the traditional
+ * strncpy(), it returns the number of characters that precede the NUL
+ * terminator (i.e. the string length), not a pointer to the destination.
+ * If no NUL is found within the first n bytes, n is returned and dest is
+ * left without a terminator; the unused tail of dest is never zero-padded.
*/ +inline unsigned int g_strncpy(char *dest, __global char *g_src, unsigned int n) { + int i = 0; + for (; i < n; i++) { + dest[i] = g_src[i]; + if (dest[i] == 0) + break; + } + return i; +} + +inline unsigned int strlen(char *s) { + unsigned int i = 0; + for (; *s; i++, s++) + ; + return i; +} + +inline void g_memcpy(unsigned char *dest, __global unsigned char *g_src, unsigned int n) { + unsigned int i = 0; + for (; i < n; i++) + dest[i] = g_src[i]; +} + +inline void bzero(char *s, unsigned int n) { + unsigned int i; + for (i = 0; i < n; i++) + s[i] = 0; +} + +#endif diff --git a/CL/test_chain.cl b/CL/test_chain.cl new file mode 100644 index 0000000..7e92ea0 --- /dev/null +++ b/CL/test_chain.cl @@ -0,0 +1,48 @@ +#include "rt.cl" +#include "string.cl" + +__kernel void test_chain( + __global char *g_charset, + __global unsigned int *g_plaintext_len_min, + __global unsigned int *g_plaintext_len_max, + __global unsigned int *g_table_index, + __global unsigned int *g_chain_len, + __global unsigned long *g_start, + __global unsigned long *g_end, + __global unsigned char *g_debug) { + + char charset[MAX_CHARSET_LEN]; + unsigned int plaintext_len_min = *g_plaintext_len_min; + unsigned int plaintext_len_max = *g_plaintext_len_max; + unsigned int reduction_offset = TABLE_INDEX_TO_REDUCTION_OFFSET(*g_table_index); + unsigned int chain_len = *g_chain_len; + unsigned long start = *g_start; + + unsigned int charset_len = g_strncpy(charset, g_charset, sizeof(charset)); + unsigned long plaintext_space_up_to_index[MAX_PLAINTEXT_LEN]; + unsigned char plaintext[MAX_PLAINTEXT_LEN]; + unsigned int plaintext_len = 0; + unsigned char hash[MAX_HASH_OUTPUT_LEN]; + unsigned int hash_len; + + unsigned long plaintext_space_total = fill_plaintext_space_table(charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + *g_end = generate_rainbow_chain( + HASH_TYPE, + charset, + charset_len, + plaintext_len_min, + plaintext_len_max, + reduction_offset, + chain_len, + 
start, + 0, + plaintext_space_up_to_index, + plaintext_space_total, + plaintext, + &plaintext_len, + hash, + &hash_len); + + return; +} diff --git a/CL/test_des.cl b/CL/test_des.cl new file mode 100644 index 0000000..359c53b --- /dev/null +++ b/CL/test_des.cl @@ -0,0 +1,37 @@ +#include "des.cl" + +__kernel void test_des( + __global unsigned char *input, + __global unsigned char *key, + __global unsigned char *output, + __global unsigned int *debug) { + //uint32_t SK[32]; + unsigned char my_key[8]; + unsigned char my_input[8]; + //unsigned char my_output[8]; + + for (int i = 0; i < 8; i++) { + my_key[i] = key[(get_global_id(0) * 8) + i]; + my_input[i] = input[(get_global_id(0) * 8) + i]; + } +/* + des_setkey(SK, my_key); + des_crypt_ecb(SK, my_input, my_output); +*/ + unsigned int k0, k1; + + //des_prep_key(&k0, &k1, my_key); + + k0 = 0; k1 = 0 ; + + unsigned char out[32]; + //des_encrypt(k0, k1, out); + + for(int i = 0; i < 8; i++) + output[i] = out[i]; +/* + for (int i = 0; i < 8; i++) + output[ (get_global_id(0) * 8) + i ] = (unsigned char)(out >> (i * 8)); //my_output[i]; +*/ + return; +} diff --git a/CL/test_hash.cl b/CL/test_hash.cl new file mode 100644 index 0000000..a7bbdbc --- /dev/null +++ b/CL/test_hash.cl @@ -0,0 +1,29 @@ +#include "rt.cl" +/*#include "string.cl"*/ + +__kernel void test_hash( + __global unsigned int *g_alg, + __global char *g_input, + __global unsigned int *g_input_len, + __global unsigned char *g_output, + __global unsigned int *g_output_len + , __global unsigned char *g_debug) { + + unsigned int alg = *g_alg; + unsigned char input[MAX_PLAINTEXT_LEN]; + unsigned char output[MAX_HASH_OUTPUT_LEN]; + unsigned int input_len = *g_input_len; + unsigned int output_len = 0; + + input[0] = 0; + for (int i = 0; i < input_len; i++) + input[i] = g_input[i]; + + do_hash(alg, input, input_len, output, &output_len /*, g_debug*/); + + *g_output_len = output_len; + for (int i = 0; i < output_len; i++) + g_output[i] = output[i]; + + return; +} diff --git 
a/CL/test_hash_to_index.cl b/CL/test_hash_to_index.cl new file mode 100644 index 0000000..14a08bb --- /dev/null +++ b/CL/test_hash_to_index.cl @@ -0,0 +1,31 @@ +#include "rt.cl" + +__kernel void test_hash_to_index( + __global unsigned char *g_hash, + __global unsigned int *g_hash_len, + __global unsigned int *g_charset_len, + __global unsigned int *g_plaintext_len_min, + __global unsigned int *g_plaintext_len_max, + __global unsigned int *g_table_index, + __global unsigned int *g_pos, + __global unsigned long *g_index, + __global unsigned char *g_debug) { + + unsigned char hash[MAX_HASH_OUTPUT_LEN]; + unsigned int hash_len = *g_hash_len; + unsigned int charset_len = *g_charset_len; + unsigned int plaintext_len_min = *g_plaintext_len_min; + unsigned int plaintext_len_max = *g_plaintext_len_max; + unsigned int reduction_offset = TABLE_INDEX_TO_REDUCTION_OFFSET(*g_table_index); + unsigned int pos = *g_pos; + + unsigned long plaintext_space_up_to_index[MAX_PLAINTEXT_LEN]; + + for (int i = 0; i < hash_len; i++) + hash[i] = g_hash[i]; + + unsigned long plaintext_space_total = fill_plaintext_space_table(charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + *g_index = hash_to_index(hash, hash_len, reduction_offset, plaintext_space_total, pos); + return; +} diff --git a/CL/test_index_to_plaintext.cl b/CL/test_index_to_plaintext.cl new file mode 100644 index 0000000..54a68fa --- /dev/null +++ b/CL/test_index_to_plaintext.cl @@ -0,0 +1,35 @@ +#include "rt.cl" +#include "shared.h" +#include "string.cl" + +__kernel void test_index_to_plaintext( + __global char *g_charset, + __global unsigned int *g_charset_len, + __global unsigned int *g_plaintext_len_min, + __global unsigned int *g_plaintext_len_max, + __global unsigned long *g_index, + __global unsigned char *g_plaintext, + __global unsigned int *g_plaintext_len, + __global unsigned char *g_debug +) { + unsigned long plaintext_space_up_to_index[MAX_PLAINTEXT_LEN]; + + char 
charset[MAX_CHARSET_LEN]; + unsigned int plaintext_len_min = *g_plaintext_len_min; + unsigned int plaintext_len_max = *g_plaintext_len_max; + unsigned long index = *g_index; + unsigned char plaintext[MAX_PLAINTEXT_LEN]; + unsigned int plaintext_len = *g_plaintext_len; + + unsigned int charset_len = g_strncpy(charset, g_charset, sizeof(charset)); + + fill_plaintext_space_table(charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index, plaintext, &plaintext_len); + + *g_plaintext_len = plaintext_len; + for (int i = 0; i < plaintext_len; i++) + g_plaintext[i] = plaintext[i]; + + return; +} diff --git a/FPGA/Makefile b/FPGA/Makefile new file mode 100644 index 0000000..ed7555b --- /dev/null +++ b/FPGA/Makefile @@ -0,0 +1,15 @@ +CPP=g++ +COMPILE_OPTIONS=-Wall -O3 -std=c++11 -DNOT_AWS_FPGA +LINK_OPTIONS=-lOpenCL + +.PHONY: clean + +all: crackalack_fpga_gen + +%.o: %.cpp + $(CPP) $(COMPILE_OPTIONS) -o $@ -c $< + +crackalack_fpga_gen: crackalack_fpga_gen.o + $(CPP) -o crackalack_fpga_gen crackalack_fpga_gen.o $(LINK_OPTIONS) +clean: + rm -f *~ *.o crackalack_fpga_gen diff --git a/FPGA/Makefile.aws b/FPGA/Makefile.aws new file mode 100644 index 0000000..426b3ad --- /dev/null +++ b/FPGA/Makefile.aws @@ -0,0 +1,113 @@ +# Points to Utility Directory +COMMON_REPO = ../../../ +ABS_COMMON_REPO = $(shell readlink -f $(COMMON_REPO)) + +include ./utils.mk +# Run Target: +# hw - Compile for hardware +# sw_emu/hw_emu - Compile for software/hardware emulation +# FPGA Board Platform (Default ~ vcu1525) + +TARGETS := hw +TARGET := $(TARGETS) +DEVICES := xilinx_vcu1525_dynamic +DEVICE := $(DEVICES) +XCLBIN := ./xclbin +DSA := $(call device2sandsa, $(DEVICE)) + +CXX := $(XILINX_SDX)/bin/xcpp +XOCC := $(XILINX_SDX)/bin/xocc + +CXXFLAGS := $(opencl_CXXFLAGS) -Wall -O0 -g -std=c++14 +LDFLAGS := $(opencl_LDFLAGS) + +HOST_SRCS = crackalack_fpga_gen.cpp + +# 
Host compiler global settings +CXXFLAGS = -I $(XILINX_SDX)/runtime/include/1_2/ -I/$(XILINX_SDX)/Vivado_HLS/include/ -O0 -g -Wall -fmessage-length=0 -std=c++14 +LDFLAGS = -lOpenCL -lpthread -lrt -lstdc++ -L$(XILINX_SDX)/runtime/lib/x86_64 + +# Kernel compiler global settings +CLFLAGS = -t $(TARGET) --platform $(DEVICE) --save-temps + + +EXECUTABLE = crackalack_fpga_gen + +EMCONFIG_DIR = $(XCLBIN)/$(DSA) + +BINARY_CONTAINERS += $(XCLBIN)/crackalack_fpga_ntlm8.$(TARGET).$(DSA).xclbin +BINARY_CONTAINER_crackalack_fpga_ntlm8_OBJS += $(XCLBIN)/crackalack_fpga_ntlm8.$(TARGET).$(DSA).xo + +#Include Libraries +include $(ABS_COMMON_REPO)/libs/opencl/opencl.mk +include $(ABS_COMMON_REPO)/libs/xcl2/xcl2.mk +CXXFLAGS += $(xcl2_CXXFLAGS) +LDFLAGS += $(xcl2_LDFLAGS) +HOST_SRCS += $(xcl2_SRCS) + +CP = cp -rf + +.PHONY: all clean cleanall docs emconfig +all: $(EXECUTABLE) $(BINARY_CONTAINERS) emconfig + +.PHONY: exe +exe: $(EXECUTABLE) + +# Building kernel +$(XCLBIN)/crackalack_fpga_ntlm8.$(TARGET).$(DSA).xo: ./crackalack_fpga_ntlm8.cl + mkdir -p $(XCLBIN) + $(XOCC) $(CLFLAGS) -c -k crackalack_fpga_ntlm8 -I'$( DEVICE=" + $(ECHO) " Command to generate the design for specified Target and Device." + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." + $(ECHO) "" + $(ECHO) " make check TARGET= DEVICE=" + $(ECHO) " Command to run application in emulation." + $(ECHO) "" + +docs: README.md + +README.md: description.json + $(ABS_COMMON_REPO)/utility/readme_gen/readme_gen.py description.json + diff --git a/FPGA/README_compile_kernel.txt b/FPGA/README_compile_kernel.txt new file mode 100644 index 0000000..fc07937 --- /dev/null +++ b/FPGA/README_compile_kernel.txt @@ -0,0 +1,89 @@ +Instructions For Compiling FPGA Kernel On Amazon EC2 + + +1.) Start a z1d.2xlarge instance with the FPGA development tools. 
As of the time of this writing (March 2019), the latest image is AMI ID: ami-0da0d9ed98b33a214. + + * Pro-tip: instead of using an on-demand instance, make a spot request for a 70% discount (!). + * Also note: a smaller/cheaper instance can be used to cut down on costs, such as t2.2xlarge or r5.xlarge. According to the FPGA vendor, 32 GB of RAM is required. + + +2.) Install the build tools (as root): + +# yum -y install ocl-icd ocl-icd-devel opencl-headers libstdc++-static kernel-headers kernel-devel gcc-c++ gcc gdb libstdc++-static make opencv python git libjpeg-turbo-devel libpng12-devel libtiff-devel compat-libtiff3 + + +3.) Shrink down the swap space so more is usable during compilation (as root): + +# swapoff /swapfile; rm -f /swapfile; dd if=/dev/zero of=/swapfile bs=512M count=16; chmod 0600 /swapfile; mkswap /swapfile; swapon /swapfile + + +4.) Update the swappiness (tells the kernel to not use swap space as aggressively): + +# sysctl vm.swappiness=10 + + +--- NOTE: all commands below are run as the centos user--NOT as root! --- + + +5.) Set your AWS keys: + +$ aws configure + + +6.) Place the Rainbow Crackalack sources in /home/centos/rainbowcrackalack. + + +7.) Set up the FPGA tools. + +$ cd ~ +$ git clone https://github.com/aws/aws-fpga +$ cd aws-fpga; source sdaccel_setup.sh + + +8.) Place the FPGA code into the example directory to compile: + +$ cp -R ~/rainbowcrackalack/fpga $SDACCEL_DIR/examples/xilinx/getting_started/host/rainbowcrackalack + + +9.) Begin compilation: + +$ screen -S generate +$ cd $SDACCEL_DIR/examples/xilinx/getting_started/host/rainbowcrackalack +$ ./make_kernel.sh + + +10.) Wait about 2 hours (on the z1d.2xlarge instance) for compilation to complete. + + * Monitor the ~/fpga_kernel_compile.txt file for updates. + * After compiling the kernel, the host application will be executed. This *should* fail. Continue on anyway. + + +11.) 
Create an S3 bucket and folders to hold AFI compilation logs: + +$ REGION=us-east-1 [update as necessary] +$ BUCKET=[give a name to your bucket] + +$ aws s3 mb s3://$BUCKET --region $REGION +$ aws s3 mb s3://$BUCKET/dcp +$ aws s3 mb s3://$BUCKET/logs + + +12.) Create the AFI image: + +$ cd ~ +$ cd aws-fpga; source sdaccel_setup.sh +$ cd $SDACCEL_DIR/examples/xilinx/getting_started/host/rainbowcrackalack +$ $SDACCEL_DIR/tools/create_sdaccel_afi.sh -xclbin=xclbin/crackalack_fpga_ntlm8.hw.xilinx_aws-vu9p-f1-04261818_dynamic_5_0.xclbin -o=crackalack_fpga_ntlm8 -s3_bucket=$BUCKET -s3_dcp_key=dcp -s3_logs_key=logs + + +13.) Copy *.awsxclbin file to S3 bucket: + +$ aws s3 cp crackalack_fpga_ntlm8.awsxclbin s3://$BUCKET/ + + +14.) Check on progress of AFI creation: + +$ aws ec2 describe-fpga-images --owners self + + + * After ~15 minutes or so, the "Code" field should transition from "pending" to "available." Once this is done, compilation is complete and the kernel is ready to be run on a real FPGA instance! diff --git a/FPGA/README_run_program.txt b/FPGA/README_run_program.txt new file mode 100644 index 0000000..e6ed8c2 --- /dev/null +++ b/FPGA/README_run_program.txt @@ -0,0 +1,53 @@ +Instructions For Generating Tables on Amazon EC2 FPGA F1 Instance + + +1.) Start an f1.2xlarge instance with the FPGA development tools. As of the time of this writing (March 2019), the latest image is AMI ID: ami-0da0d9ed98b33a214. + + * Pro-tip: instead of using an on-demand instance, make a spot request for a 70% discount (!). + + +2.) Log in as root, and configure AWS API key: + +ssh -i key.pem centos@1.2.3.4 +$ sudo -i +# aws configure + + +3.) Set up development tools: + +# pushd ~; git clone https://github.com/aws/aws-fpga; cd aws-fpga; source sdaccel_setup.sh; source sdaccel_runtime_setup.sh; popd + + +4.) Follow additional instructions given in the error message output above. 
As of the time of this writing, they are (double check that these are up-to-date): + +# curl -s https://s3.amazonaws.com/aws-fpga-developer-ami/1.5.0/Patches/XRT_2018_2_XDF_RC5/xrt_201802.2.1.0_7.5.1804-xrt.rpm -o xrt_201802.2.1.0_7.5.1804-xrt.rpm +# curl -s https://s3.amazonaws.com/aws-fpga-developer-ami/1.5.0/Patches/XRT_2018_2_XDF_RC5/xrt_201802.2.1.0_7.5.1804-aws.rpm -o xrt_201802.2.1.0_7.5.1804-aws.rpm +# yum remove -y xrt-aws xrt +# yum install -y *.rpm + + +5.) Reboot the machine (this may or may not be necessary...). + + +6.) Initialize development tools once more: + +# pushd ~; cd aws-fpga; source sdaccel_setup.sh; source sdaccel_runtime_setup.sh; popd +# ln -s /root/aws-fpga/SDAccel/examples/xilinx_2018.2/libs /root/aws-fpga/SDAccel/examples/xilinx_2018.2/getting_started/libs + + +7.) Place Rainbow Crackalack FPGA directory in correct location: + +# cp -R ~/rainbowcrackalack/fpga ~/aws-fpga/SDAccel/examples/xilinx/getting_started/host/rainbowcrackalack + + +8.) Download *.awsxclbin file: + +# cd ~/aws-fpga/SDAccel/examples/xilinx/getting_started/host/rainbowcrackalack +# BUCKET=[s3 bucket name here] +# aws s3 cp s3://$BUCKET/crackalack_fpga_ntlm8.awsxclbin . + + +9.) Compile & run host application: + +# ./make_hostprog.sh +# ./crackalack_fpga_gen ~ 0 128 diff --git a/FPGA/crackalack_fpga_gen.cpp b/FPGA/crackalack_fpga_gen.cpp new file mode 100644 index 0000000..9115257 --- /dev/null +++ b/FPGA/crackalack_fpga_gen.cpp @@ -0,0 +1,307 @@ +/* + * Rainbow Crackalack: crackalack_fpga_gen.cpp + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * For quick testing, here are the sha256 hashes of the first 16,384 bytes of the + * tables with indices 0 and 659, respectively: + * #0: 9d6d6893d7b107477de7db828472cbe48f2780c42dba918aa6bdea796523a522 + * #659: 62a42e8de712ad84cdfe1ef50908e1f77b92faa18973c9eb65201ad55f618d11 + */ + +#ifdef NOT_AWS_FPGA +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#include +#else +#include "xcl2.hpp" +#endif + +#define FPGA_NTLM8_KERNEL "crackalack_fpga_ntlm8.cl" +#define FPGA_NTLM8_KERNEL_ENTRY "crackalack_fpga_ntlm8" + +#define FCLOSE(_f) { fclose(_f); _f = NULL; } +#define FREE(_ptr) { free(_ptr); _ptr = NULL; } +#define CL_CHECK(_err, _code) _code; if (_err != CL_SUCCESS) { cerr << __FILE__ << ":" << __LINE__ << ": Error calling " << #_code << "; error code: " << _err << endl << flush; exit(-1); } + +#define NUM_CHAINS_TOTAL 67108864 + +// The interval, in seconds, to update the user on the rate and progress. +#define STATUS_UPDATE_INTERVAL 60 + + +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace std; + + +double get_elapsed(struct timespec *start) { + double ret = 0.0; + struct timespec now = {0}; + + clock_gettime(CLOCK_MONOTONIC, &now); + + ret = now.tv_sec - start->tv_sec; + long nsec_diff = now.tv_nsec - start->tv_nsec; + if (nsec_diff < 0) { + ret = ret - 1.0; + ret += ((nsec_diff + 1000000000) / 1000000000.0); + } else + ret += (nsec_diff / 1000000000.0); + + return ret; +} + + +/* Converts number of seconds into human-readable time, such as "X mins, Y secs". 
*/ +string seconds_to_human_time(double seconds) { +#define ONE_MINUTE (60) +#define ONE_HOUR (ONE_MINUTE * 60) +#define ONE_DAY (ONE_HOUR * 24) + + char buf[64] = {0}; + unsigned int buf_size = sizeof(buf); + unsigned int seconds_int = (unsigned int)seconds; + if (seconds < ONE_MINUTE) + snprintf(buf, buf_size - 1, "%.1f seconds", seconds); + else if ((seconds >= ONE_MINUTE) && (seconds < ONE_HOUR)) + snprintf(buf, buf_size - 1, "%u minutes, %u seconds", seconds_int / ONE_MINUTE, seconds_int % ONE_MINUTE); + else if ((seconds >= ONE_HOUR) && (seconds < ONE_DAY)) + snprintf(buf, buf_size - 1, "%u hours, %u minutes", (unsigned int)(seconds_int / ONE_HOUR), (unsigned int)((seconds_int % ONE_HOUR) / ONE_MINUTE)); + else if (seconds >= ONE_DAY) + snprintf(buf, buf_size - 1, "%u days, %u hours", (unsigned int)(seconds_int / ONE_DAY), (unsigned int)((seconds_int % ONE_DAY) / ONE_HOUR)); + + return (string)buf; +} + + +int main(int ac, char **av) { + + if ((ac != 3) && (ac != 4)) { + cerr << "Usage: " << av[0] << " output_dir table_number [gws]" << endl; + return -1; + } + + char *output_dir = av[1]; + unsigned int part_index = atoi(av[2]); + + unsigned int gws = 16384; + if (ac == 4) { + gws = atoi(av[3]); + if (gws == 0) { + cerr << "Error: GWS must be greater than zero." << endl; + return -1; + } + cout << "Using user-supplied GWS of " << gws << "." << endl; + } else + cout << "Using default GWS of " << gws << "." << endl; + + string output_filename = output_dir; + output_filename += "/ntlm_ascii-32-95#8-8_0_422000x67108864_"; + output_filename += to_string(part_index); + output_filename += ".rt"; + FILE *f = fopen(output_filename.c_str(), "w+"); + if (f == NULL) { + cerr << "Failed to open file for writing: " << strerror(errno) << endl; + return -1; + } + + + vector platforms; + cl::Platform::get(&platforms); + if (platforms.size() == 0) { + cerr << "Failed to load platforms!" 
<< endl; + return -1; + } + cout << "Number of platforms: " << platforms.size() << endl; + +#ifdef NOT_AWS_FPGA + + // On non-FPGA builds, the GWS is fixed at 1024 to help with debugging. + gws = 1024; + cout << endl << "Note: GWS is reset to 1024, as this is a non-FPGA build." << endl << endl << flush; + + vector devices; + platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices); + if (devices.size() == 0) { + cerr << "Failed to get devices!" << endl; + return -1; + } + cout << "Number of devices: " << devices.size() << endl; +#else /* On AWS FPGA... */ + std::vector devices = xcl::get_xil_devices(); +#endif + cl_int err = 0; + + cl::Device device = devices[0]; + CL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + CL_CHECK(err, cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + CL_CHECK(err, std::string device_name = device.getInfo(&err)); + cout << "Device name: " << device_name.c_str() << endl; + + +#ifdef NOT_AWS_FPGA + ifstream file; + file.open(FPGA_NTLM8_KERNEL); + if (!file) { + cerr << "Failed to open file containing kernel: " << FPGA_NTLM8_KERNEL << endl; + return -1; + } + + stringstream fileBuffer; + fileBuffer << file.rdbuf(); + file.close(); + + cl::Program::Sources sources; + sources.push_back({fileBuffer.str().c_str(), fileBuffer.str().length()}); + cl::Program program(context, sources); + if (program.build({device}) != CL_SUCCESS) { + cerr << "Failed to build kernel!" << endl << endl << program.getBuildInfo(device) << endl; + return -1; + } + + CL_CHECK(err, cl::Kernel kernel(program, FPGA_NTLM8_KERNEL_ENTRY, &err)); +#else /* On AWS FPGA... 
*/ + + std::string binaryFile = xcl::find_binary_file(device_name,"crackalack_fpga_ntlm8"); + cl::Program::Binaries bins = xcl::import_binary_file(binaryFile); + devices.resize(1); + + vector binStatus; + CL_CHECK(err, cl::Program program(context, devices, bins, &binStatus, &err)); + CL_CHECK(err, cl::Kernel kernel(program, FPGA_NTLM8_KERNEL_ENTRY, &err)); + cout << "Return value from Kernel constructor: " << err << endl << flush; +#endif + + cl_ulong start_indices[gws] = {0}, end_indices[gws] = {0}; + + + CL_CHECK(err, cl::Buffer startIndicesBuffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, sizeof(cl_ulong) * gws, start_indices, &err)); + CL_CHECK(err, cl::Buffer endIndicesBuffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, sizeof(cl_ulong) * gws, end_indices, &err)); + + std::vector readBufferVector, writeBufferVector; + readBufferVector.push_back(startIndicesBuffer); + writeBufferVector.push_back(endIndicesBuffer); + + + cl_ulong *buf = (cl_ulong *)malloc(gws * sizeof(cl_ulong) * 2); + if (buf == NULL) { + cerr << "Error while creating buffer for file output!" << endl << flush; + return -1; + } + + cl_ulong start = (cl_ulong)part_index * (cl_ulong)NUM_CHAINS_TOTAL; + + // Calculate the number of kernel invokations we need to make. + unsigned int num_blocks = NUM_CHAINS_TOTAL / gws; + if ((NUM_CHAINS_TOTAL % gws) != 0) + num_blocks++; + + struct timespec start_time = {0}, last_update_time = {0}; + clock_gettime(CLOCK_MONOTONIC, &start_time); + clock_gettime(CLOCK_MONOTONIC, &last_update_time); + + cout << "Starting chain generation..." 
<< endl << flush; + unsigned int num_chains_generated = 0; + for (unsigned int block = 0; block < num_blocks; block++) { + for (unsigned int i = 0; i < gws; i++) + start_indices[i] = start++; + + + CL_CHECK(err, err = queue.enqueueMigrateMemObjects(readBufferVector, 0)); + + CL_CHECK(err, err = kernel.setArg(0, startIndicesBuffer)); + CL_CHECK(err, err = kernel.setArg(1, endIndicesBuffer)); + + CL_CHECK(err, err = queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(gws))); + + CL_CHECK(err, err = queue.enqueueMigrateMemObjects(writeBufferVector, CL_MIGRATE_MEM_OBJECT_HOST)); + + CL_CHECK(err, err = queue.finish()); + + num_chains_generated += gws; + + + double last_update_diff = get_elapsed(&last_update_time); + if (last_update_diff >= 60.0) { + double total_elapsed_time = get_elapsed(&start_time); + unsigned int rate = (unsigned int)(num_chains_generated / total_elapsed_time); + double estimated_seconds_remaining = (NUM_CHAINS_TOTAL - num_chains_generated) / (double)rate; + cout << "Run time: " << seconds_to_human_time(total_elapsed_time) << "; Chains generated: " << num_chains_generated << "; Rate: " << rate << "/s" << endl << "Estimated time remaining: " << seconds_to_human_time(estimated_seconds_remaining) << endl; + clock_gettime(CLOCK_MONOTONIC, &last_update_time); + } + + // Make a single block of memory that contains the start & end indices, that way + // we can write this all at once. + for (unsigned int i = 0; i < gws; i++) { + buf[(i * 2)] = start_indices[i]; + buf[(i * 2) + 1] = end_indices[i]; + //cout << start_indices[i] << " -> " << end_indices[i] << endl; + } + fwrite(buf, sizeof(cl_ulong), gws * 2, f); + +// On non-FPGA builds, only generate the first 1K chains for testing. +#ifdef NOT_AWS_FPGA + if (num_chains_generated >= 1024) { + if (ftruncate(fileno(f), 1024 * 16) != 0) + cerr << "Failed to truncate. :(" << endl << flush; + + FCLOSE(f); + + cout << "Non-FPGA build terminating after 1K chains generated." 
<< endl << flush; + if (part_index == 0) + cout << endl << "The SHA256 checksum of " << output_filename << " should be:" << endl << " 9d6d6893d7b107477de7db828472cbe48f2780c42dba918aa6bdea796523a522" << endl << flush; + else if (part_index == 652) + cout << endl << "The SHA256 checksum of " << output_filename << " should be:" << endl << " 62a42e8de712ad84cdfe1ef50908e1f77b92faa18973c9eb65201ad55f618d11" << endl << flush; + else + cout << endl << "Note: the SHA256 sums of table parts 0 and 652, respectively, are:" << endl << " 9d6d6893d7b107477de7db828472cbe48f2780c42dba918aa6bdea796523a522" << endl << " 62a42e8de712ad84cdfe1ef50908e1f77b92faa18973c9eb65201ad55f618d11" << endl << endl << "Try re-generating using these part numbers, and using the hashes to verify correctness." << endl << endl << flush; + return 0; + } +#endif + } + + // If the file is larger than it should be, then truncate it. Or, if it is + // shorter than it should be, this is an error. + bool success = true; + long file_pos = ftell(f); + if (file_pos > NUM_CHAINS_TOTAL * 16) { + if (ftruncate(fileno(f), NUM_CHAINS_TOTAL * 16) != 0) { + cerr << "Error while truncating file!: " << strerror(errno) << endl; + success = false; + } else + cout << "File is " << file_pos << " bytes. Truncated to " << NUM_CHAINS_TOTAL * 16 << "." << endl; + } else if (file_pos < NUM_CHAINS_TOTAL * 16) { + cerr << endl << endl << "!! ERROR !!" << endl << endl << "File size is short! Table generation FAILED." << endl << endl << flush; + success = false; + } + + FCLOSE(f); + FREE(buf); + + if (success) { + cout << endl << "Done! Table generation successfully completed." 
<< endl << endl << flush; + return 0; + } else + return -1; +} diff --git a/FPGA/crackalack_fpga_ntlm8.cl b/FPGA/crackalack_fpga_ntlm8.cl new file mode 100644 index 0000000..0b7751b --- /dev/null +++ b/FPGA/crackalack_fpga_ntlm8.cl @@ -0,0 +1,164 @@ +inline void index_to_plaintext(unsigned long index, __constant char *charset, unsigned char *plaintext) { /* Writes the 8-character plaintext for index: the base-95 digits of index, least-significant digit in plaintext[7], each digit mapped through charset. */ + + for (int i = 7; i >= 0; i--) { + plaintext[i] = charset[index % 95]; + index = index / 95; + } + + return; +} + + +/* + * MD4 OpenCL kernel based on Solar Designer's MD4 algorithm implementation at: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md4 + * This code is in public domain. + * + * This software is Copyright (c) 2010, Dhiru Kholia + * and Copyright (c) 2012, magnum + * and Copyright (c) 2015, Sayantan Datta + * and it is hereby released to the general public under the following terms: + * Redistribution and use in source and binary forms, with or without modification, + * are permitted. + * + * Useful References: + * 1 nt_opencl_kernel.c (written by Alain Espinosa ) + * 2. http://tools.ietf.org/html/rfc1320 + * 3. http://en.wikipedia.org/wiki/MD4 + */ + +#undef MD4_LUT3 /* No good for this format, just here for reference */ + +/* The basic MD4 functions */ +#if MD4_LUT3 +#define F(x, y, z) lut3(x, y, z, 0xca) +#elif USE_BITSELECT +#define F(x, y, z) bitselect((z), (y), (x)) +#elif HAVE_ANDNOT +#define F(x, y, z) ((x & y) ^ ((~x) & z)) +#else +#define F(x, y, z) (z ^ (x & (y ^ z))) +#endif + +#if MD4_LUT3 +#define G(x, y, z) lut3(x, y, z, 0xe8) +#else +#define G(x, y, z) (((x) & ((y) | (z))) | ((y) & (z))) +#endif + +#if MD4_LUT3 +#define H(x, y, z) lut3(x, y, z, 0x96) +#define H2 H +#else +#define H(x, y, z) (((x) ^ (y)) ^ (z)) +#define H2(x, y, z) ((x) ^ ((y) ^ (z))) +#endif + +/* The MD4 transformation for all three rounds. Expands to two statements (accumulate into a, then rotate a by s), so it must not be used as the unbraced body of an if or a loop.
*/ +#define STEP(f, a, b, c, d, x, s) \ + (a) += f((b), (c), (d)) + (x); \ + (a) = rotate((a), (uint)(s)) //(a) = ((a << s) | (a >> (32 - s))) +inline void md4_encrypt(__private uint *hash, __private uint *W) /* Processes the single 16-word block W: resets hash[0..3] to the four initial constants below, applies the three rounds of 16 STEPs each, then adds the same constants back. Padding and the length word must already be present in W. */ +{ + hash[0] = 0x67452301; + hash[1] = 0xefcdab89; + hash[2] = 0x98badcfe; + hash[3] = 0x10325476; + + /* Round 1 */ + STEP(F, hash[0], hash[1], hash[2], hash[3], W[0], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[1], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[2], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[3], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[4], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[5], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[6], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[7], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[8], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[9], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[10], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[11], 19); + STEP(F, hash[0], hash[1], hash[2], hash[3], W[12], 3); + STEP(F, hash[3], hash[0], hash[1], hash[2], W[13], 7); + STEP(F, hash[2], hash[3], hash[0], hash[1], W[14], 11); + STEP(F, hash[1], hash[2], hash[3], hash[0], W[15], 19); + + /* Round 2 */ + STEP(G, hash[0], hash[1], hash[2], hash[3], W[0] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[4] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[8] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[12] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[1] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[5] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[9] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[13] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[2] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[6] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1],
W[10] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[14] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[3] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[7] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[11] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[15] + 0x5a827999, 13); + + /* Round 3 */ + STEP(H, hash[0], hash[1], hash[2], hash[3], W[0] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[8] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[4] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[12] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[2] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[10] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[6] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[14] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[1] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[9] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[5] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[13] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[3] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[11] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[7] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[15] + 0x6ed9eba1, 15); + + hash[0] = hash[0] + 0x67452301; + hash[1] = hash[1] + 0xefcdab89; + hash[2] = hash[2] + 0x98badcfe; + hash[3] = hash[3] + 0x10325476; +} + +inline unsigned long ntlm_hash(unsigned char *plaintext, unsigned char *hash, unsigned int pos) { /* Hashes the 8-byte plaintext and reduces the result to a new index. Each plaintext byte is widened to 16 bits and packed two per 32-bit word into key[0..3]; the first two digest words form a 64-bit value, to which pos is added before the modulo. The hash parameter is never read or written here. */ + unsigned int key[16] = {0}; + unsigned int output[4]; + + for (int i = 0; i < 4; i++) + key[i] = plaintext[i * 2] | (plaintext[(i * 2) + 1] << 16); + + key[4] = 0x80; /* MD4 padding byte directly after the 16 message bytes held in key[0..3] */ + key[14] = 0x80; /* MD4 length word: message length in bits, 16 bytes = 128 = 0x80 */ + + md4_encrypt(output, key); + + unsigned long ret
= ((unsigned long)output[1]) << 32 | (unsigned long)output[0]; + return (ret + pos) % 6634204312890625UL; /* 6634204312890625 is 95 to the 8th power: the count of 8-character plaintexts over a 95-character charset */ +} + + +__constant char charset[] = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; /* the 95 printable ASCII characters, space through tilde, in ascending order */ + + +/* TODO: specify array length in definition...somehow? */ +__kernel void crackalack_fpga_ntlm8(__global unsigned long *g_start_indices, __global unsigned long *g_end_indices) { /* Each work-item loads its start index from g_start_indices[get_global_id(0)], runs 421999 iterations of index to plaintext to hash to index (pos is the iteration number), and stores the final index in g_end_indices at the same position. The local hash buffer is only handed to ntlm_hash, which does not use it. */ + + unsigned long index = g_start_indices[get_global_id(0)]; + unsigned char plaintext[8]; + unsigned char hash[8]; + + + for (unsigned int pos = 0; pos < 421999; pos++) { + index_to_plaintext(index, charset, plaintext); + index = ntlm_hash(plaintext, hash, pos); + } + + g_end_indices[get_global_id(0)] = index; + return; +} diff --git a/FPGA/make_hostprog.sh b/FPGA/make_hostprog.sh new file mode 100755 index 0000000..fa87cb9 --- /dev/null +++ b/FPGA/make_hostprog.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +make -f Makefile.aws exe diff --git a/FPGA/make_kernel.sh b/FPGA/make_kernel.sh new file mode 100755 index 0000000..68a4011 --- /dev/null +++ b/FPGA/make_kernel.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +make -f Makefile.aws check TARGETS=hw DEVICES=$AWS_PLATFORM all | tee ~/fpga_kernel_compile.txt diff --git a/FPGA/utils.mk b/FPGA/utils.mk new file mode 100644 index 0000000..a5d413f --- /dev/null +++ b/FPGA/utils.mk @@ -0,0 +1,48 @@ +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values.
These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +REPORT := no +PROFILE := no +DEBUG := no + +#'estimate' for estimate report generation +#'system' for system report generation +ifneq ($(REPORT), no) +CLFLAGS += --report estimate +CLLDFLAGS += --report system +endif + +#Generates profile summary report +ifeq ($(PROFILE), yes) +CLFLAGS += --profile_kernel data:all:all:all +endif + +#Generates debug summary report +ifeq ($(DEBUG), yes) +CLFLAGS += --dk protocol:all:all:all +endif + +#Checks for XILINX_SDX +ifndef XILINX_SDX +$(error XILINX_SDX variable is not set, please set correctly and rerun) +endif + +# sanitize_dsa - create a filesystem friendly name from dsa name +# $(1) - name of dsa +COLON=: +PERIOD=. +UNDERSCORE=_ +sanitize_dsa = $(strip $(subst $(PERIOD),$(UNDERSCORE),$(subst $(COLON),$(UNDERSCORE),$(1)))) + +device2dsa = $(if $(filter $(suffix $(1)),.xpfm),$(shell $(COMMON_REPO)/utility/parsexpmf.py $(1) dsa 2>/dev/null),$(1)) +device2sandsa = $(call sanitize_dsa,$(call device2dsa,$(1))) +device2dep = $(if $(filter $(suffix $(1)),.xpfm),$(dir $(1))/$(shell $(COMMON_REPO)/utility/parsexpmf.py $(1) hw 2>/dev/null) $(1),) + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +ECHO:= @echo + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..13fdf8f --- /dev/null +++ b/LICENSE @@ -0,0 +1,675 @@ + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. 
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8be7354 --- /dev/null +++ b/Makefile @@ -0,0 +1,74 @@ +.PHONY: archive clean tests test all + +CC=gcc +COMPILE_OPTIONS=-Wall -g -O3 +LINK_OPTIONS=-lpthread + +# If we're doing a Windows build... +ifneq ($(WINDOWS_BUILD),) + COMPILE_OPTIONS += -I$(CL_INCLUDE) + LINK_OPTIONS += -static -lbcrypt + + GEN_PROG=crackalack_gen.exe + GETCHAIN_PROG=get_chain.exe + LOOKUP_PROG=crackalack_lookup.exe + #PERFECTIFY_PROG=perfectify.exe + RTC2RT_PROG=crackalack_rtc2rt.exe + UNITTEST_PROG=crackalack_unit_tests.exe + VERIFY_PROG=crackalack_verify.exe +else + LINK_OPTIONS += -ldl + + GEN_PROG=crackalack_gen + GETCHAIN_PROG=get_chain + LOOKUP_PROG=crackalack_lookup + PERFECTIFY_PROG=perfectify + RTC2RT_PROG=crackalack_rtc2rt + UNITTEST_PROG=crackalack_unit_tests + VERIFY_PROG=crackalack_verify +endif + + +all: $(GEN_PROG) $(UNITTEST_PROG) $(LOOKUP_PROG) $(RTC2RT_PROG) $(GETCHAIN_PROG) $(VERIFY_PROG) $(PERFECTIFY_PROG) + + +%.o: %.c + $(CC) $(COMPILE_OPTIONS) -o $@ -c $< + +$(GEN_PROG): charset.o clock.o cpu_rt_functions.o crackalack_gen.o file_lock.o gws.o hash_validate.o misc.o opencl_setup.o rtc_decompress.o verify.o + $(CC) $(COMPILE_OPTIONS) -o $(GEN_PROG) charset.o clock.o cpu_rt_functions.o crackalack_gen.o file_lock.o gws.o hash_validate.o misc.o opencl_setup.o rtc_decompress.o verify.o $(LINK_OPTIONS) + +$(UNITTEST_PROG): charset.o cpu_rt_functions.o crackalack_unit_tests.o hash_validate.o misc.o opencl_setup.o test_chain.o test_hash.o test_hash_to_index.o test_index_to_plaintext.o test_shared.o file_lock.o + $(CC) $(COMPILE_OPTIONS) -o $(UNITTEST_PROG) charset.o cpu_rt_functions.o crackalack_unit_tests.o hash_validate.o misc.o opencl_setup.o test_chain.o test_hash.o test_hash_to_index.o test_index_to_plaintext.o test_shared.o file_lock.o $(LINK_OPTIONS) + +$(GETCHAIN_PROG): get_chain.o + $(CC) $(COMPILE_OPTIONS) -o $(GETCHAIN_PROG) get_chain.o $(LINK_OPTIONS) + +$(VERIFY_PROG): charset.o cpu_rt_functions.o 
crackalack_verify.o file_lock.o hash_validate.o misc.o rtc_decompress.o verify.o + $(CC) $(COMPILE_OPTIONS) -o $(VERIFY_PROG) charset.o cpu_rt_functions.o crackalack_verify.o file_lock.o hash_validate.o misc.o rtc_decompress.o verify.o $(LINK_OPTIONS) + +$(RTC2RT_PROG): rtc_decompress.o crackalack_rtc2rt.o + $(CC) $(COMPILE_OPTIONS) -o $(RTC2RT_PROG) crackalack_rtc2rt.o rtc_decompress.o $(LINK_OPTIONS) + +$(LOOKUP_PROG): clock.o cpu_rt_functions.o charset.o file_lock.o hash_validate.o crackalack_lookup.o misc.o opencl_setup.o rtc_decompress.o test_shared.o verify.o + $(CC) $(COMPILE_OPTIONS) -o $(LOOKUP_PROG) charset.o clock.o cpu_rt_functions.o crackalack_lookup.o file_lock.o hash_validate.o misc.o opencl_setup.o rtc_decompress.o test_shared.o verify.o $(LINK_OPTIONS) + +$(PERFECTIFY_PROG): clock.o perfectify.o + $(CC) $(COMPILE_OPTIONS) -o $(PERFECTIFY_PROG) clock.o perfectify.o + + +clean: + rm -f *~ *.o *.exe *.zip crackalack_gen crackalack_unit_tests get_chain crackalack_verify crackalack_rtc2rt crackalack_lookup perfectify + +archive: clean + ./scripts/archive.sh + +test: $(UNITTEST_PROG) $(LOOKUP_PROG) $(GEN_PROG) + ./crackalack_unit_tests + python3 crackalack_tests.py + +tests: $(UNITTEST_PROG) $(LOOKUP_PROG) $(GEN_PROG) + ./crackalack_unit_tests + python3 crackalack_tests.py + +.PHONY: test tests clean archive diff --git a/README.md b/README.md new file mode 100644 index 0000000..d3a1aa2 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +# Rainbow Crackalack + +Author: [Joe Testa](https://www.positronsecurity.com/company/) ([@therealjoetesta](https://twitter.com/therealjoetesta)) + +## About + +This project produces open-source code to generate rainbow tables as well as use them to look up password hashes. While the current release only supports NTLM, future releases aim to support MD5, SHA-1, SHA-256, and possibly more. Both Linux and Windows are supported! 
+ +For more information, see the project website: [https://www.rainbowcrackalack.com/](https://www.rainbowcrackalack.com/) + +## Bounty Program + +Want to work on this project and get paid for your work? See the [project page](https://www.rainbowcrackalack.com/#bounty) for more details on what bounties are currently being offered! + +## Kickstarter Campaign + +We are currently running a [Kickstarter campaign](https://www.kickstarter.com/projects/jtesta/ntlm-9-character-rainbow-tables) to raise funds for GPU equipment so that we can generate NTLM 9-character tables targeting 50% efficiency. Both individual & corporate sponsorship tiers are available. [Please lend your support soon](https://www.kickstarter.com/projects/jtesta/ntlm-9-character-rainbow-tables), as the campaign window is limited! + +## Volunteering + +The project for generating NTLM 9-character tables is now underway! If you create 5 tables for us, your name will be listed on the [project website](https://www.rainbowcrackalack.com/) as a project supporter. If you create 200 tables, we will mail you a free magnetic hard drive containing NTLM 9-character tables with 50% efficiency. Ships world-wide! + +If you have modern GPU equipment and you'd like to contribute, please [reach out using this form](https://www.rainbowcrackalack.com/?showcontact=true) to coordinate efforts. + +## NTLM Tables + +Currently, NTLM 8-character tables are available for [free download via Bittorrent](https://www.rainbowcrackalack.com/rainbow_crackalack_ntlm_8.torrent). For convenience, they [may also be purchased](https://www.rainbowcrackalack.com/#download) on an SSD with a USB 3.0 external enclosure. + +## Examples + +#### Generating NTLM 9-character tables + +The following command shows how to generate a standard 9-character NTLM table: + + # ./crackalack_gen ntlm ascii-32-95 9 9 0 1350000 67108864 0 + +The arguments are designed to be comparable to those of the original (and now closed-source) rainbow crack tools. 
In order, they mean: + +|Argument |Meaning | +|------------|----------| +|ntlm |The hash algorithm to use. Currently only "ntlm" is supported.| +|ascii-32-95 |The character set to use. This effectively means "all available characters on the US keyboard".| +|9 |The minimum plaintext character length.| +|9 |The maximum plaintext character length.| +|0 |The reduction index. Not used under standard conditions.| +|1350000 |The chain length for a single rainbow chain.| +|67108864 |The number of chains per table (= 64M)| +|0 |The table part index. Keep all other args the same, and increment this field to generate a single set of tables.| + +#### Table lookups against NTLM 8-character hashes + +The following command shows how to look up a file of NTLM hashes (one per line) against the NTLM 8-character tables: + + # ./crackalack_lookup /export/ntlm8_tables/ /home/user/hashes.txt + +## Recommended Hardware + +The NVIDIA GTX & RTX lines of GPU hardware have been well-tested with the Rainbow Crackalack software, and offer an excellent price/performance ratio. Specifically, the GTX 1660 Ti or RTX 2060 are the best choices for building a new cracking machine. [This document](https://docs.google.com/spreadsheets/d/1jigNGvt9SUur_SNH7QDEACapJbrdL_wKYtprM23IDpM/edit?usp=sharing) contains the raw data that backs this recommendation. + +However, other modern equipment can work just fine, so you don't necessarily need to purchase something new. The NVIDIA GTX and AMD Vega product lines are still quite useful for cracking! + +## Change Log + +- v1.0: June 11, 2019: Initial revision.
+ +## Windows Build + +A 64-bit Windows build can be achieved on an Ubuntu host machine by installing the following prerequisites: + + # apt install mingw-w64 opencl-headers + +Then starting the build with: + + # make clean; ./make_windows.sh + diff --git a/charset.c b/charset.c new file mode 100644 index 0000000..864fa05 --- /dev/null +++ b/charset.c @@ -0,0 +1,75 @@ +/* + * Rainbow Crackalack: charset.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "charset.h" +#include +#include + +struct charsets { + char name[32]; + char content[96]; +}; +struct charsets valid_charsets[] = { + {"numeric", CHARSET_NUMERIC}, + {"alpha", CHARSET_ALPHA}, + {"alpha-numeric", CHARSET_ALPHA_NUMERIC}, + {"loweralpha", CHARSET_LOWERALPHA}, + {"loweralpha-numeric", CHARSET_LOWERALPHA_NUMERIC}, + {"mixalpha", CHARSET_MIXALPHA}, + {"mixalpha-numeric", CHARSET_MIXALPHA_NUMERIC}, + {"ascii-32-95", CHARSET_ASCII_32_95}, + {"ascii-32-65-123-4", CHARSET_ASCII_32_65_123_4}, + {"alpha-numeric-symbol32-space", CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE} +}; + + +/* Given the name of a character set (such as "ascii-32-95"), return the + * characters in that set (" !\"#$%&'()*..."). */ +char *validate_charset(char *charset_name) { + char *ret = NULL; + unsigned int i = 0; + + + /* Loop through all the valid charset names and see if any match what the + * user chose. 
*/ + for (i = 0; i < (sizeof(valid_charsets) / sizeof(struct charsets)); i++) { + if (strcmp(charset_name, valid_charsets[i].name) == 0) + ret = valid_charsets[i].content; + } + + return ret; +} + + +/* Get a comma-separated list of valid character set names. */ +void get_valid_charsets(char *buf, unsigned int buf_size) { + unsigned int i = 0; + + + if (buf_size == 0) + return; + + buf[0] = '\0'; + for (i = 0; i < (sizeof(valid_charsets) / sizeof(struct charsets)); i++) { + strncat(buf, valid_charsets[i].name, buf_size - 1); + strncat(buf, ", ", buf_size - 1); + } + if (strlen(buf) >= 2) + buf[strlen(buf) - 2] = '\0'; + + return; +} diff --git a/charset.h b/charset.h new file mode 100644 index 0000000..5ec57b2 --- /dev/null +++ b/charset.h @@ -0,0 +1,38 @@ +#ifndef __CHARSET_H +#define __CHARSET_H + +#define CHARSET_NUMERIC "0123456789" +#define CHARSET_NUMERIC_LEN (sizeof(CHARSET_NUMERIC) - 1) + +#define CHARSET_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +#define CHARSET_ALPHA_LEN (sizeof(CHARSET_ALPHA) - 1) + +#define CHARSET_ALPHA_NUMERIC "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" +#define CHARSET_ALPHA_NUMERIC_LEN (sizeof(CHARSET_ALPHA_NUMERIC) - 1) + +#define CHARSET_LOWERALPHA "abcdefghijklmnopqrstuvwxyz" +#define CHARSET_LOWERALPHA_LEN (sizeof(CHARSET_LOWERALPHA) - 1) + +#define CHARSET_LOWERALPHA_NUMERIC "abcdefghijklmnopqrstuvwxyz0123456789" +#define CHARSET_LOWERALPHA_NUMERIC_LEN (sizeof(CHARSET_LOWERALPHA_NUMERIC) - 1) + +#define CHARSET_MIXALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +#define CHARSET_MIXALPHA_LEN (sizeof(CHARSET_MIXALPHA) - 1) + +#define CHARSET_MIXALPHA_NUMERIC "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" +#define CHARSET_MIXALPHA_NUMERIC_LEN (sizeof(CHARSET_MIXALPHA_NUMERIC) - 1) + +#define CHARSET_ASCII_32_95 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" +#define CHARSET_ASCII_32_95_LEN (sizeof(CHARSET_ASCII_32_95) - 1) + +#define CHARSET_ASCII_32_65_123_4 
" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`{|}~" +#define CHARSET_ASCII_32_65_123_4_LEN (sizeof(CHARSET_ASCII_32_65_123_4) - 1) + +#define CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-_+=~`[]{}|\\:;\"'<>,.?/ " +#define CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE_LEN (sizeof(CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE) - 1) + + +char *validate_charset(char *charset_name); +void get_valid_charsets(char *buf, unsigned int buf_size); + +#endif diff --git a/clock.c b/clock.c new file mode 100644 index 0000000..a187a2c --- /dev/null +++ b/clock.c @@ -0,0 +1,96 @@ +/* + * Rainbow Crackalack: clock.c + * Copyright (C) 2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include "clock.h" + + +/* Windows does not have a clock_gettime() implementation. 
The following + * implementation was taken from winpthreads (src/clock.c) (public domain): + * */ +#ifdef _WIN32 +#include + +#define POW10_9 1000000000 +int clock_gettime(clockid_t clock_id, struct timespec *tp) +{ + LARGE_INTEGER pf, pc; + + if (clock_id == CLOCK_MONOTONIC) + { + if (QueryPerformanceFrequency(&pf) == 0) + return -1; + + if (QueryPerformanceCounter(&pc) == 0) + return -1; + + tp->tv_sec = pc.QuadPart / pf.QuadPart; + tp->tv_nsec = (int) (((pc.QuadPart % pf.QuadPart) * POW10_9 + (pf.QuadPart >> 1)) / pf.QuadPart); + if (tp->tv_nsec >= POW10_9) { + tp->tv_sec ++; + tp->tv_nsec -= POW10_9; + } + + return 0; + } + return -1; +} +#endif + + +/* Gets the elapsed seconds from a timer (see start_timer()). */ +double get_elapsed(struct timespec *start) { + struct timespec end; + + + if (clock_gettime(CLOCK_MONOTONIC, &end)) { + fprintf(stderr, "Error while calling clock_gettime(): %s (%d)\n", strerror(errno), errno); + return 0.0; + } + + return (end.tv_sec + (end.tv_nsec / 1000000000.0)) - (start->tv_sec + (start->tv_nsec / 1000000000.0)); +} + + +/* Converts number of seconds into human-readable time, such as "X mins, Y secs". 
/* Converts number of seconds into human-readable time, such as "X mins, Y secs".
 *
 * buf receives the NUL-terminated text and buf_size is its total capacity in
 * bytes; output that does not fit is truncated.  Nothing is written when buf
 * is NULL or buf_size is 0.
 *
 * Durations under a minute are printed with one decimal place; longer ones use
 * the two largest whole units (mins/secs, hours/mins or days/hours). */
void seconds_to_human_time(char *buf, unsigned int buf_size, double seconds) {
  enum {
    SECS_PER_MINUTE = 60,
    SECS_PER_HOUR = SECS_PER_MINUTE * 60,
    SECS_PER_DAY = SECS_PER_HOUR * 24
  };
  unsigned int whole_secs;


  if ((buf == NULL) || (buf_size == 0))
    return;

  /* Converting a double that is negative, NaN or larger than UINT_MAX to
   * unsigned int is undefined, so clamp before casting. */
  if (!(seconds >= 0.0))
    whole_secs = 0;
  else if (seconds >= (double)~0u)
    whole_secs = ~0u;
  else
    whole_secs = (unsigned int)seconds;

  /* snprintf() already reserves room for the terminator, so it is given the
   * full buffer size. */
  if (whole_secs < SECS_PER_MINUTE)
    snprintf(buf, buf_size, "%.1f secs", seconds);
  else if (whole_secs < SECS_PER_HOUR)
    snprintf(buf, buf_size, "%u mins, %u secs", whole_secs / SECS_PER_MINUTE, whole_secs % SECS_PER_MINUTE);
  else if (whole_secs < SECS_PER_DAY)
    snprintf(buf, buf_size, "%u hours, %u mins", whole_secs / SECS_PER_HOUR, (whole_secs % SECS_PER_HOUR) / SECS_PER_MINUTE);
  else
    snprintf(buf, buf_size, "%u days, %u hours", whole_secs / SECS_PER_DAY, (whole_secs % SECS_PER_DAY) / SECS_PER_HOUR);
}
/* Fills plaintext_space_up_to_index[0 .. plaintext_len_max] with the running
 * count of plaintexts: entry L is the number of plaintexts whose length lies
 * between plaintext_len_min and L (inclusive) for a character set of
 * charset_len symbols.  Entries below plaintext_len_min are 0.  The caller
 * must supply at least plaintext_len_max + 1 elements.
 *
 * Returns the last entry, i.e. the size of the whole plaintext space. */
uint64_t fill_plaintext_space_table(unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, uint64_t *plaintext_space_up_to_index) {
  uint64_t combos = 1;  /* charset_len raised to the current length */
  unsigned int len = 1;


  plaintext_space_up_to_index[0] = 0;
  while (len <= plaintext_len_max) {
    combos *= charset_len;
    plaintext_space_up_to_index[len] = (len < plaintext_len_min)
        ? 0
        : plaintext_space_up_to_index[len - 1] + combos;
    len++;
  }

  return plaintext_space_up_to_index[plaintext_len_max];
}


/* Reduces a hash to an index into the plaintext space: the first eight bytes
 * of hash_value are read as a little-endian 64-bit integer, reduction_offset
 * and the chain position pos are added, and the sum is taken modulo
 * plaintext_space_total.  hash_len is accepted but not used. */
uint64_t hash_to_index(unsigned char *hash_value, unsigned int hash_len, unsigned int reduction_offset, uint64_t plaintext_space_total, unsigned int pos) {
  uint64_t value = 0;
  int byte_no;


  (void)hash_len;

  /* Start at the most significant byte (offset 7) and shift downwards. */
  for (byte_no = 7; byte_no >= 0; byte_no--)
    value = (value << 8) | hash_value[byte_no];

  return (value + reduction_offset + pos) % plaintext_space_total;
}
This + * means if the character set is of length 128 or less, we can break the number into + * nine 7-bit fragments, and use them to index into the character set. This ends up + * being 2.4x faster than the standard division method (below)! */ + + /* For speed, we only check that the minimum length is 9, and assume that the max is + * also 9, and that the character set is 128 characters or less. */ + if (plaintext_len_min == 9) { + *plaintext_len = 9; + + for (i = 0; i < 9; i++) { + plaintext[i] = charset[ (index & 0xff) % charset_len ]; + index >>= 7; + } + + return; + } + + + for (i = plaintext_len_max - 1; i >= plaintext_len_min - 1; i--) { + if (index >= plaintext_space_up_to_index[i]) { + *plaintext_len = i + 1; + if (*plaintext_len >= MAX_PLAINTEXT_LEN) + return; + + plaintext[*plaintext_len] = '\0'; + break; + } + } + + index_x = index - plaintext_space_up_to_index[*plaintext_len - 1]; + for (i = *plaintext_len - 1; i >= 0; i--) { + plaintext[i] = charset[index_x % charset_len]; + index_x = index_x / charset_len; + } + + return; +} + + +uint64_t generate_rainbow_chain( + unsigned int hash_type, + char *charset, + unsigned int charset_len, + unsigned int plaintext_len_min, + unsigned int plaintext_len_max, + unsigned int reduction_offset, + unsigned int chain_len, + uint64_t start, + uint64_t *plaintext_space_up_to_index, + uint64_t plaintext_space_total, + char *plaintext, + unsigned int *plaintext_len, + unsigned char *hash, + unsigned int *hash_len) { + uint64_t index = start; + unsigned int pos = 0; + + + if (hash_type != HASH_NTLM) + fprintf(stderr, "\n\tWARNING: only NTLM hashes are currently supported!\n\n"); + + for (; pos < chain_len - 1; pos++) { + index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index, plaintext, plaintext_len); + ntlm_hash(plaintext, *plaintext_len, hash); + index = hash_to_index(hash, *hash_len, reduction_offset, plaintext_space_total, pos); + } + return index; +} + + +/* 
/* Calculates the NTLM hash on the specified plaintext.  The result is stored in the hash
 * argument, which must be at least 16 bytes in size.
 *
 * Each plaintext byte is widened to a 16-bit little-endian code unit, the
 * message is padded as a single MD4 block, and the four MD4 state words are
 * written to hash in little-endian byte order.
 *
 * Inputs longer than 27 bytes are truncated to 27; in that case plaintext[27]
 * is overwritten with a NUL, so the caller's buffer is modified. */
void ntlm_hash(char *plaintext, unsigned int plaintext_len, unsigned char *hash) {
  /* Read the input as unsigned bytes: where plain char is signed, a byte
   * >= 0x80 would be sign-extended and corrupt the key word, and shifting a
   * negative value left is undefined. */
  const unsigned char *bytes = (const unsigned char *)plaintext;
  unsigned int key[16] = {0};
  unsigned int output[4];
  unsigned int i = 0;
  unsigned int word = 0;


  if (plaintext_len > 27) {
    plaintext[27] = 0;
    plaintext_len = 27;
  }

  /* Pack two characters into each 32-bit word, one per 16-bit half. */
  for (; i < (plaintext_len / 2); i++)
    key[i] = bytes[i * 2] | ((unsigned int)bytes[(i * 2) + 1] << 16);

  /* Append the 0x80 padding byte right after the last character. */
  if ((plaintext_len % 2) == 1)
    key[i] = bytes[plaintext_len - 1] | 0x800000;
  else
    key[i] = 0x80;

  /* Message length in bits: two bytes per character, eight bits per byte. */
  key[14] = plaintext_len << 4;

  md4_encrypt(output, key);

  /* Serialise the four state words, least significant byte first. */
  for (word = 0; word < 4; word++) {
    hash[(word * 4) + 0] = ((output[word] >> 0) & 0xff);
    hash[(word * 4) + 1] = ((output[word] >> 8) & 0xff);
    hash[(word * 4) + 2] = ((output[word] >> 16) & 0xff);
    hash[(word * 4) + 3] = ((output[word] >> 24) & 0xff);
  }
}
/* Bitwise helper functions used by the three MD4 rounds below.
 *
 *   F  - per-bit select: yields the bit of y where x is 1, the bit of z where
 *        x is 0.
 *   G  - per-bit majority of x, y and z.
 *   H  - three-way XOR of x, y and z.
 *   H2 - the same XOR as H, only associated differently.
 *
 * NOTE: F does not parenthesise its arguments, so pass simple expressions
 * only. */
#define F(x, y, z) (z ^ (x & (y ^ z)))
#define G(x, y, z) (((x) & ((y) | (z))) | ((y) & (z)))
#define H(x, y, z) (((x) ^ (y)) ^ (z))
#define H2(x, y, z) ((x) ^ ((y) ^ (z)))

/* The MD4 transformation for all three rounds: adds f(b, c, d) + x to a, then
 * rotates a left by s bits.
 *
 * The rotation is written for a 32-bit unsigned a and 0 < s < 32.  STEP
 * expands to two separate statements and is not wrapped in do { } while (0),
 * so it must not be used as the unbraced body of an if or a loop.  The
 * arguments a and s are evaluated more than once. */
#define STEP(f, a, b, c, d, x, s) \
  (a) += f((b), (c), (d)) + (x); \
  (a) = ((a << s) | (a >> (32 - s)))
  //(a) = rotate((a), (uint)(s)) //(a) = ((a << s) | (a >> (32 - s)))
hash[3], W[1] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[5] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[9] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[13] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[2] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[6] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[10] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[14] + 0x5a827999, 13); + STEP(G, hash[0], hash[1], hash[2], hash[3], W[3] + 0x5a827999, 3); + STEP(G, hash[3], hash[0], hash[1], hash[2], W[7] + 0x5a827999, 5); + STEP(G, hash[2], hash[3], hash[0], hash[1], W[11] + 0x5a827999, 9); + STEP(G, hash[1], hash[2], hash[3], hash[0], W[15] + 0x5a827999, 13); + + /* Round 3 */ + STEP(H, hash[0], hash[1], hash[2], hash[3], W[0] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[8] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[4] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[12] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[2] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[10] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[6] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[14] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[1] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[9] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[5] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[13] + 0x6ed9eba1, 15); + STEP(H, hash[0], hash[1], hash[2], hash[3], W[3] + 0x6ed9eba1, 3); + STEP(H2, hash[3], hash[0], hash[1], hash[2], W[11] + 0x6ed9eba1, 9); + STEP(H, hash[2], hash[3], hash[0], hash[1], W[7] + 0x6ed9eba1, 11); + STEP(H2, hash[1], hash[2], hash[3], hash[0], W[15] + 0x6ed9eba1, 15); + + hash[0] = hash[0] + 0x67452301; + hash[1] = hash[1] 
+ 0xefcdab89; + hash[2] = hash[2] + 0x98badcfe; + hash[3] = hash[3] + 0x10325476; +} diff --git a/cpu_rt_functions.h b/cpu_rt_functions.h new file mode 100644 index 0000000..df82dfa --- /dev/null +++ b/cpu_rt_functions.h @@ -0,0 +1,19 @@ +#ifndef _CPU_RT_FUNCTIONS_H +#define _CPU_RT_FUNCTIONS_H + +#include + + +uint64_t fill_plaintext_space_table(unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, uint64_t *plaintext_space_up_to_index); + +uint64_t hash_to_index(unsigned char *hash_value, unsigned int hash_len, unsigned int reduction_offset, uint64_t plaintext_space_total, unsigned int pos); + +void index_to_plaintext(uint64_t index, char *charset, unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, uint64_t *plaintext_space_up_to_index, char *plaintext, unsigned int *plaintext_len); + +void ntlm_hash(char *plaintext, unsigned int plaintext_len, unsigned char *hash); + +uint64_t generate_rainbow_chain(unsigned int hash_type, char *charset, unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int reduction_offset, unsigned int chain_len, uint64_t start, uint64_t *plaintext_space_up_to_index, uint64_t plaintext_space_total, char *plaintext, unsigned int *plaintext_len, unsigned char *hash, unsigned int *hash_len); + +void md4_encrypt(unsigned int *hash, unsigned int *W); + +#endif diff --git a/crackalack_gen.c b/crackalack_gen.c new file mode 100644 index 0000000..dd92833 --- /dev/null +++ b/crackalack_gen.c @@ -0,0 +1,827 @@ +/* + * Rainbow Crackalack: crackalack_gen.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifdef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "charset.h" +#include "clock.h" +#include "cpu_rt_functions.h" +#include "file_lock.h" +#include "gws.h" +#include "hash_validate.h" +#include "misc.h" +#include "opencl_setup.h" +#include "shared.h" +#include "terminal_color.h" +#include "verify.h" +#include "version.h" + + +#define CRACKALACK_KERNEL_PATH "crackalack.cl" +#define CRACKALACK_NTLM8_KERNEL_PATH "crackalack_ntlm8.cl" +#define CRACKALACK_NTLM9_KERNEL_PATH "crackalack_ntlm9.cl" + +#define VERBOSE 1 + +#define UNDEFINED_INDEX 999 + +/* The initial number of chains each work unit should compute. This scales up and down + * at run-time based on the speed of execution (actually, this was disabled, because + * the Windows drivers don't like it...). */ +#define INITIAL_CHAINS_PER_EXECUTION 1 + +/* The interval, in seconds, that the user should be updated on the generation + * progress. 
*/ +#define UPDATE_INTERVAL (1 * 60) /* 1 minute */ + + +#define LOCK_START_INDEX() \ + if (pthread_mutex_lock(&start_index_mutex)) { perror("Failed to lock mutex"); exit(-1); } + +#define UNLOCK_START_INDEX() \ + if (pthread_mutex_unlock(&start_index_mutex)) { perror("Failed to unlock mutex"); exit(-1); } + +#define ROUND(_x) ((unsigned int)(_x + 0.5)) + +void write_chains(char *filename, unsigned int chains_per_work_unit, cl_ulong *start_indices, unsigned int start_indices_size, cl_ulong *end_indices, unsigned int end_indices_size, unsigned int thread_id); + + +struct hash_names { + char name[8]; + unsigned int type; +}; +struct hash_names valid_hash_names[] = { + {"lm", HASH_LM}, + /*{"ntlm", HASH_NTLM}*/ +}; + + +/* Struct to represent one GPU device. */ +typedef struct { + cl_uint device_number; + cl_device_id device; + cl_context context; + cl_program program; + cl_kernel kernel; + cl_command_queue queue; + cl_uint num_work_units; +} gpu_dev; + +/* Struct to pass arguments to a host thread. */ +typedef struct { + unsigned int benchmark_mode; + + unsigned int hash_type; + char *charset; + unsigned int plaintext_len_min; + unsigned int plaintext_len_max; + unsigned int table_index; + unsigned int reduction_offset; + unsigned int chain_len; + char *filename; + + unsigned int initial_chains_per_execution; + + gpu_dev gpu; +} thread_args; + + +/* Mutex to protect access to the start_index counter between threads. */ +pthread_mutex_t start_index_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Barrier to ensure that kernels on multiple devices are all run at the same time. + * The closed-source AMD driver on Windows effectively blocks other devices while + * one kernel is running; this ensures parallelization in that environment, since + * all kernels will run at once. The open source AMD ROCm driver on Linux may or + * may not get a very slight performance bump with this enabled. 
*/ +pthread_barrier_t barrier = {0}; + +/* The start index to use for the next chain to generate. This is shared among the + * host threads. */ +uint64_t start_index = 0; + +/* The number of chains to generate. This doesn't necessarily equal the number of + * chains entered on the command line, since we may be resuming a + * partially-constructed table. */ +unsigned int num_chains_to_generate = 0; + +/* The first chain that we will generate (this will not be zero if we resume an + * unfinished file or part index > 0). We use this to track how many chains we + * generated so far. */ +uint64_t first_generated_chain = 0; + +/* The time that the threads were started. */ +struct timespec global_start_time = {0}; + +/* The last time that the generation rate was output to stdout. */ +struct timespec last_update_time = {0}; + +/* Set to 1 if AMD GPUs found. */ +unsigned int is_amd_gpu = 0; + +/* The global work size, as over-ridden by the user on the command line. */ +size_t user_provided_gws = 0; + + +void print_usage_and_exit(char *prog_name, int exit_code) { + fprintf(stderr, "Usage: %s hash_algorithm charset_name plaintext_min_length plaintext_max_length table_index chain_length number_of_chains [part_index | -bench] [-gws GWS]\n\nExample: %s ntlm ascii-32-95 9 9 0 1350000 67108864 0\n\n", prog_name, prog_name); + exit(exit_code); +} + + +/* Outputs the number of chains created so far to stdout, along with the rate. + * Optionally, an estimate of how much time is remaining is also given. 
*/ +void output_progress(unsigned int calculate_time_remaining) { + + uint64_t num_chains_generated = 0; + double run_time = 0.0, rate = 0.0; + char time_str[128]; + + memset(time_str, 0, sizeof(time_str)); + + + LOCK_START_INDEX(); + num_chains_generated = start_index - first_generated_chain; + UNLOCK_START_INDEX(); + + start_timer(&last_update_time); + run_time = get_elapsed(&global_start_time); + if (run_time == 0.0) + return; + + rate = num_chains_generated / run_time; + seconds_to_human_time(time_str, sizeof(time_str), run_time); + +#ifdef _WIN32 + printf("Run time: %s; Chains generated: %"PRIu64"; Rate: %s%u/s%s\n", time_str, num_chains_generated, WHITEB, (unsigned int)rate, CLR); +#else + printf("Run time: %s; Chains generated: %'"PRIu64"; Rate: %s%'u/s%s\n", time_str, num_chains_generated, WHITEB, (unsigned int)rate, CLR); +#endif + + if (calculate_time_remaining && (rate > 0.0)) { + seconds_to_human_time(time_str, sizeof(time_str), (num_chains_to_generate - num_chains_generated) / rate); + printf("Estimated time remaining: %s\n", time_str); + } + fflush(stdout); +} + + +/* A host thread which controls each GPU. 
*/ +void *host_thread(void *ptr) { + thread_args *args = (thread_args *)ptr; + gpu_dev *gpu = &(args->gpu); + + char *kernel_path = CRACKALACK_KERNEL_PATH, *kernel_name = "crackalack"; + size_t gws = 0, kernel_work_group_size = 0, kernel_preferred_work_group_size_multiple = 0; + uint64_t *start_indices = NULL, *end_indices = NULL; + unsigned int i = 0, indices_size = 0, thread_complete = 0, num_passes = 0, pass = 0, chain_len = 0; + /*time_t thread_start_time = 0; + double elapsed = 0;*/ + int err = 0; + + cl_context context = NULL; + cl_command_queue queue = NULL; + cl_kernel kernel = NULL; + + cl_mem hash_type_buffer = NULL, charset_buffer = NULL, plaintext_len_min_buffer = NULL, plaintext_len_max_buffer = NULL, reduction_offset_buffer = NULL, chain_len_buffer = NULL, indices_buffer = NULL, pos_start_buffer = NULL; + + cl_uint pos_start = 0; + + + /* If we're generating the standard NTLM 8- or 9-character tables, use the special + * optimized kernel instead! */ + if (is_ntlm8(args->hash_type, args->charset, args->plaintext_len_min, args->plaintext_len_max, args->reduction_offset, args->chain_len)) { + kernel_path = CRACKALACK_NTLM8_KERNEL_PATH; + kernel_name = "crackalack_ntlm8"; + if (args->gpu.device_number == 0) { /* Only the first thread prints this. */ + printf("Note: optimized NTLM8 kernel will be used.\n"); fflush(stdout); + } + } /*else if (is_ntlm9(args->hash_type, args->charset, args->plaintext_len_min, args->plaintext_len_max, args->reduction_offset, args->chain_len)) { + kernel_path = CRACKALACK_NTLM9_KERNEL_PATH; + kernel_name = "crackalack_ntlm9"; + if (args->gpu.device_number == 0) { * Only the first thread prints this. * + printf("Note: optimized NTLM9 kernel will be used.\n"); fflush(stdout); + } + }*/ + + /* Get the number of compute units in this device. */ + get_device_uint(gpu->device, CL_DEVICE_MAX_COMPUTE_UNITS, &(gpu->num_work_units)); + + /* Load the kernel. 
*/ + gpu->context = CLCREATECONTEXT(context_callback, &(gpu->device)); + gpu->queue = CLCREATEQUEUE(gpu->context, gpu->device); + load_kernel(gpu->context, 1, &(gpu->device), kernel_path, kernel_name, &(gpu->program), &(gpu->kernel), args->hash_type); + + context = gpu->context; + queue = gpu->queue; + kernel = gpu->kernel; + + if ((rc_clGetKernelWorkGroupInfo(kernel, gpu->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_work_group_size, NULL) != CL_SUCCESS) || \ + (rc_clGetKernelWorkGroupInfo(kernel, gpu->device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &kernel_preferred_work_group_size_multiple, NULL) != CL_SUCCESS)) { + fprintf(stderr, "Failed to get preferred work group size!\n"); + CLRELEASEKERNEL(gpu->kernel); + CLRELEASEPROGRAM(gpu->program); + CLRELEASEQUEUE(gpu->queue); + CLRELEASECONTEXT(gpu->context); + pthread_exit(NULL); + return NULL; + } + + /* If the user provided a static GWS on the command line, use that. Otherwise, + * use the driver's work group size multiplied by the preferred multiple. */ + if (user_provided_gws > 0) { + gws = user_provided_gws; + printf("GPU #%u is using user-provided GWS value of %"PRIu64"\n", gpu->device_number, gws); + } else if (get_optimal_gws(gpu->device) > 0) { + gws = get_optimal_gws(gpu->device); + printf("GPU #%u is using optimized GWS: %"PRIu64"\n", gpu->device_number, gws); + } else { + gws = kernel_work_group_size * kernel_preferred_work_group_size_multiple; + printf("GPU #%u is using dynamic GWS: %"PRIu64" (work group) x %"PRIu64" (pref. multiple) = %"PRIu64"\n", gpu->device_number, kernel_work_group_size, kernel_preferred_work_group_size_multiple, gws); + } + fflush(stdout); + + /* AMD on Windows will hang if the number of chains to generate is less than the + * GWS. The open-source ROCm driver under Linux works fine, though. */ +#if _WIN32 + if ((is_amd_gpu) && (gws >= num_chains_to_generate)) { + printf("\n\n !! 
WARNING !!\n\nThe GWS (global work size) is greater or equal to the number of chains to generate (%"PRId64" >= %u). The closed-source AMD Windows driver has been observed to hang indefinitely in this case. If this happens, either raise the number of chains to generate, or lower the GWS setting using the '-gws' parameter.\n\n", gws, num_chains_to_generate); fflush(stdout); + } +#endif + + indices_size = gws; + start_indices = calloc(indices_size, sizeof(cl_ulong)); + end_indices = calloc(indices_size, sizeof(cl_ulong)); + if ((start_indices == NULL) || (end_indices == NULL)) { + fprintf(stderr, "Failed to create start/end index buffers.\n"); + exit(-1); + } + + + num_passes = 1; + if (args->chain_len > MAX_CHAIN_LEN) { + num_passes = args->chain_len / MAX_CHAIN_LEN; + if ((args->chain_len % MAX_CHAIN_LEN) > 0) + num_passes++; + } + + while(1) { + LOCK_START_INDEX(); + + /* Check if all chains were already created. If so, release the mutex and + * terminate the thread. */ + if ((start_index - first_generated_chain) >= num_chains_to_generate) { + /*printf("Thread #%u complete!: %lu %u\n", gpu->device_number, start_index, num_chains);*/ + UNLOCK_START_INDEX(); + thread_complete = 1; + } else { + for (i = 0; i < indices_size; i++) { + start_indices[i] = start_index; + start_index++; + } + UNLOCK_START_INDEX(); + } + + /* All chains generated, so terminate the thread. */ + if (thread_complete) { + CLFREEBUFFER(hash_type_buffer); + CLFREEBUFFER(charset_buffer); + CLFREEBUFFER(plaintext_len_min_buffer); + CLFREEBUFFER(plaintext_len_max_buffer); + CLFREEBUFFER(reduction_offset_buffer); + CLFREEBUFFER(chain_len_buffer); + CLFREEBUFFER(indices_buffer); + CLFREEBUFFER(pos_start_buffer); + + CLRELEASEKERNEL(gpu->kernel); + CLRELEASEPROGRAM(gpu->program); + CLRELEASEQUEUE(gpu->queue); + CLRELEASECONTEXT(gpu->context); + + FREE(start_indices); + FREE(end_indices); + pthread_exit(NULL); + return NULL; + } + + /* Most of the parameters need only be set once upon first invokation. 
*/ + if (hash_type_buffer == NULL) { + CLCREATEARG(0, hash_type_buffer, CL_RO, args->hash_type, sizeof(cl_uint)); + CLCREATEARG_ARRAY(1, charset_buffer, CL_RO, args->charset, strlen(args->charset) + 1); + CLCREATEARG(2, plaintext_len_min_buffer, CL_RO, args->plaintext_len_min, sizeof(cl_uint)); + CLCREATEARG(3, plaintext_len_max_buffer, CL_RO, args->plaintext_len_max, sizeof(cl_uint)); + CLCREATEARG(4, reduction_offset_buffer, CL_RO, args->reduction_offset, sizeof(cl_uint)); + } + + /* The start_indices parameter must be set each block. The start indices are loaded into this read/write buffer, and the end indices will be in it when finished. */ + CLCREATEARG_ARRAY(6, indices_buffer, CL_RW, start_indices, indices_size * sizeof(cl_ulong)); + + /* If the chain length is greater than MAX_CHAIN_LEN, then the chains must be computed in multiple passes (otherwise Windows drivers crash). */ + for (pass = 0; pass < num_passes; pass++) { + chain_len = args->chain_len; + + /* If we're doing multiple passes, and aren't handling the last pass, set the chain length to a multiple of MAX_CHAIN_LEN. We add one at the end because the GPU code stops one short of the chain length. */ + if ((num_passes > 1) && (pass != (num_passes - 1))) + chain_len = ((pass + 1) * MAX_CHAIN_LEN) + 1; + + /* Starting at 0, the position start increases by a multiple of MAX_CHAIN_LEN. */ + pos_start = pass * MAX_CHAIN_LEN; + + /*printf("Pass #%u: pos_start: %u; chain_len: %u\n", pass, pos_start, chain_len);*/ + CLCREATEARG(5, chain_len_buffer, CL_RO, chain_len, sizeof(cl_uint)); + CLCREATEARG(7, pos_start_buffer, CL_RO, pos_start, sizeof(cl_uint)); + + /* For AMD GPUs, ensure that all kernels are running concurrently. This is a + * requirement for the closed-source Windows driver, and may or may not be + * very slightly helpful under the open-source ROCm Linux driver. 
*/ +#ifdef _WIN32 + if (is_amd_gpu) { + int barrier_ret = pthread_barrier_wait(&barrier); + if ((barrier_ret != 0) && (barrier_ret != PTHREAD_BARRIER_SERIAL_THREAD)) { + fprintf(stderr, "pthread_barrier_wait() failed!\n"); fflush(stderr); + exit(-1); + } + } +#endif + + /* Run the kernel, wait for it to finish, and calculate its run time. */ + /*thread_start_time = time(NULL);*/ + CLRUNKERNEL(gpu->queue, gpu->kernel, &gws); + CLFLUSH(gpu->queue); + CLWAIT(gpu->queue); + /*elapsed = difftime(time(NULL), thread_start_time);*/ + + CLFREEBUFFER(chain_len_buffer); + CLFREEBUFFER(pos_start_buffer); + } + + /* Get the kernel output. */ + CLREADBUFFER(indices_buffer, indices_size * sizeof(cl_ulong), end_indices); + CLFREEBUFFER(indices_buffer); + + /* If we are in benchmark mode, don't loop again, nor write to the output file. */ + if (args->benchmark_mode) + thread_complete = 1; + else { + + /* Write the chains to the file. */ + write_chains(args->filename, 1, start_indices, indices_size, end_indices, indices_size, gpu->device_number); + + /* Thread #0 outputs the generation progress periodically. */ + if ((args->gpu.device_number == 0) && (get_elapsed(&last_update_time) >= UPDATE_INTERVAL) && (thread_complete == 0)) + output_progress(1); + } + } + + /* Never reached. */ + return NULL; +} + + +/* Writes the chains given by the kernel to the file. */ +void write_chains(char *filename, unsigned int chains_per_work_unit, cl_ulong *start_indices, unsigned int start_indices_size, cl_ulong *end_indices, unsigned int end_indices_size, unsigned int thread_id) { + int i = 0, j = 0; + unsigned int file_size = 0; + cl_ulong start = 0; + rc_file f = rc_fopen(filename, 0), l = NULL; + char log_filename[256] = {0}; + int empty_chains = 0; + + + if (f == NULL) + exit(-1); + + /* Get an exclusive lock on all bytes of the file, including those not yet written + * (i.e.: another thread cannot write past the current end of the file). 
*/ + if (rc_flock(f) != 0) + exit(-1); + + /* Get the filename of the rainbow table log to write to, then open it for appending. + * This is the same filename as the rainbow table, but with ".log" appended. */ + get_rt_log_filename(log_filename, sizeof(log_filename), filename); + l = rc_fopen(log_filename, 1); + if (l == NULL) + exit(-1); + + /* Get a lock on the log. Probably not strictly necessary, since the table is locked + * first, and other threads are blocked at this point... */ + if (rc_flock(l) != 0) + fprintf(stderr, "\nError while locking log file!\n"); + + /* Go to the end of the table file. */ + if (rc_fseek(f, 0, RCSEEK_END) != 0) { + fprintf(stderr, "Error seeking to end of output file.\n"); + exit(-1); + } + + /* If we have results that extend past the end of the file, write zeros as + * placeholders until we get to the point where our data starts. */ + file_size = rc_ftell(f); + + rt_log(l, "Thread #%u: file size at start is %u (%u chains)\n", thread_id, file_size, file_size / CHAIN_SIZE); + + empty_chains = (int)((((start_indices[0] - first_generated_chain) * CHAIN_SIZE) - file_size) / CHAIN_SIZE); + + if (empty_chains > 0) + rt_log(l, "\tWriting %d empty chains (%u bytes)\n", empty_chains, empty_chains * CHAIN_SIZE); + + for (i = 0; i < empty_chains; i++) { + rc_fwrite(&start, sizeof(start), 1, f); + rc_fwrite(&start, sizeof(start), 1, f); + } + + /* Otherwise, if another thread wrote placeholders already, seek to the point at which + * we need to overwrite. */ + rt_log(l, "\tSeeking to position %lu (chain #%lu).\n", (start_indices[0] - first_generated_chain) * CHAIN_SIZE, start_indices[0] - first_generated_chain); + if (rc_fseek(f, (start_indices[0] - first_generated_chain) * CHAIN_SIZE, RCSEEK_SET) != 0) { + perror("Error seeking in file"); + exit(-1); + } + + /* Write the chains. 
*/ + for (i = 0; i < start_indices_size; i++) { + start = start_indices[i]; + for (j = (i * chains_per_work_unit); (j < ((i * chains_per_work_unit) + chains_per_work_unit)) && (j < end_indices_size); j++) { + rc_fwrite(&start, sizeof(cl_ulong), 1, f); + rc_fwrite(&(end_indices[j]), sizeof(cl_ulong), 1, f); + start++; + } + } + + if (start_indices_size > 0) + rt_log(l, "\tWrote chains start indices from %"PRIu64" to %"PRIu64"\n", start_indices[0], start - 1); + + + rc_fclose(l); + rc_fclose(f); +} + + +int main(int ac, char **av) { + cl_platform_id platforms[MAX_NUM_PLATFORMS] = {0}; + cl_device_id devices[MAX_NUM_DEVICES] = {0}; + pthread_t threads[MAX_NUM_DEVICES] = {0}; + char filename[256] = {0}, time_str[128] = {0}; + + FILE *f = NULL; + unsigned int file_size = 0; + thread_args *args = NULL; + char *hash_name = NULL, *charset_name = NULL, *charset = NULL; + unsigned int plaintext_len_min = 0, plaintext_len_max = 0, total_chains_in_table = 0, table_index = 0, benchmark_mode = 0; + unsigned int resuming_table = 0; /* Set when a table gen is being resumed. */ + cl_uint hash_type = 0, chain_len = 0, num_platforms = 0, num_devices = 0; + uint64_t part_index = 0; + int i = 0; + + + ENABLE_CONSOLE_COLOR(); + PRINT_PROJECT_HEADER(); +#ifndef _WIN32 + /* Allows printf() to insert commas in thousandths place. */ + setlocale(LC_NUMERIC, ""); + /*setenv("CUDA_CACHE_DISABLE", "1", 1);*/ /* Disables kernel caching. */ + /*setenv("HSA_ENABLE_SDMA", "0", 1);*/ /* The ROCm driver on AMD Vega 64 doesn't work without this. */ +#endif + + if ((ac != 9) && (ac != 11)) + print_usage_and_exit(av[0], -1); + if ((ac == 11) && (strcmp(av[9], "-gws") != 0)) + print_usage_and_exit(av[0], -1); + + /* Read command-line arguments. 
*/ + hash_name = av[1]; + charset_name = av[2]; + plaintext_len_min = (unsigned int)atoi(av[3]); + plaintext_len_max = (unsigned int)atoi(av[4]); + table_index = (unsigned int)atoi(av[5]); + chain_len = (unsigned int)atoi(av[6]); + total_chains_in_table = (unsigned int)atoi(av[7]); + + /* See if the user wants to run the benchmarks. */ + if (strcmp(av[8], "-bench") == 0) { + benchmark_mode = 1; + printf("Benchmarks have been disabled in this release due to inconsistent results. They may be re-implemented in a future release.\n\nIn the meantime, a rough benchmark can be achieved by generating the following table:\n\n %s ntlm ascii-32-95 8 8 0 422000 1000000 0\n\n", av[0]); + exit(-1); + } else + part_index = (unsigned int)atoi(av[8]); + + /* Manually override the global work size. */ + if (ac == 11) + user_provided_gws = (unsigned int)atoi(av[10]); + + + /* Check that this system has sufficient RAM. */ + CHECK_MEMORY_SIZE(); + + + /* Format the filename based on the user options. */ + snprintf(filename, sizeof(filename) - 1, "%s_%s#%u-%u_%u_%ux%u_%"PRIu64".rt", hash_name, charset_name, plaintext_len_min, plaintext_len_max, table_index, chain_len, total_chains_in_table, part_index); + + + /* If the user provided an invalid hash name, dump the valid options and + * exit. */ + hash_type = hash_str_to_type(hash_name); + if (hash_type == HASH_UNDEFINED) { + fprintf(stderr, "Error: hash \"%s\" not supported. Valid values are:\n", hash_name); + for (i = 0; i < (sizeof(valid_hash_names) / sizeof(struct hash_names)); i++) + fprintf(stderr, "%s\n", valid_hash_names[i].name); + exit(-1); + } + + + charset = validate_charset(charset_name); + if (charset == NULL) { + char buf[256] = {0}; + + get_valid_charsets(buf, sizeof(buf)); + fprintf(stderr, "Error: charset \"%s\" not supported. Valid values are: %s", charset_name, buf); + exit(-1); + } + + + /* Ensure that the plaintext max length is set and is less than 256. Also + * ensure that the max is greater than the min. 
*/ + if ((plaintext_len_max == 0) || (plaintext_len_max > MAX_PLAINTEXT_LEN)) { + fprintf(stderr, "Error: plaintext max length must be greater than 0 and less than 256.\n"); + exit(-1); + } else if (plaintext_len_min > plaintext_len_max) { + fprintf(stderr, "Error: plaintext min length must be less than plaintext max length.\n"); + exit(-1); + } else if (plaintext_len_min < 8) { + printf("\n!! Warning: the minimum plaintext length is less than 8. In present day, it is not very efficient to use rainbow tables to crack passwords of length 1 through 7; GPU brute-forcing is much more effective in those cases. Continuing...\n\n"); + } + + /* Ensure that the chain length and chain counts are set. */ + if ((chain_len == 0) || (total_chains_in_table == 0)) { + fprintf(stderr, "Chain length and chain count must both be greater than 0.\n"); + exit(-1); + } + + /* The original rcrack didn't support chain counts >= 128M, as that would + * result in files greater than 2GB in size. It may work with modern + * rcrack/rcracki_mt, but its untested as of right now... */ + if (total_chains_in_table >= 134217728) + printf("\nWARNING: chain counts >= 134217728 are untested. Generated tables may not work in rcrack/rcracki_mt. Continuing anyway...\n\n"); + + /* Create the output file and test if it can be successfully locked. */ + f = rc_fopen(filename, 1); + if (f == NULL) { + fprintf(stderr, "Failed to create/open file: %s\n", filename); + exit(-1); + } + + if (rc_flock(f) != 0) { + fprintf(stderr, "Error locking file: %s\n", filename); + exit(-1); + } + + file_size = rc_ftell(f); /* File was opened for appending, so this holds the size. */ + rc_fclose(f); + + /* If the file size implies that it is already complete, run the verifier on it. */ + if (file_size == (total_chains_in_table * CHAIN_SIZE)) { + if (verify_rainbowtable_file(filename, VERIFY_TABLE_TYPE_GENERATED, VERIFY_TABLE_IS_COMPLETE, VERIFY_TRUNCATE_ON_ERROR, -1)) { + /* The table is complete, so tell the user and exit. 
*/ + printf("Table in \"%s\" already appears to be complete. Terminating...\n", filename); + exit(0); + } else { /* The table was invalid, and was truncated, so we should continue... */ + struct stat st; + memset(&st, 0, sizeof(st)); + + /* Since the table was truncated above, update the file size. */ + if (stat(filename, &st) != 0) { + perror("Error calling stat()"); + exit(-1); + } + file_size = st.st_size; + } + } + + /* If the file already exists and isn't empty, then verify the file, and update + * the start_index so that we resume generation. */ + if (file_size > 0) { + printf("\n !! WARNING !!\n\nIt appears that the output table is partially generated. An attempt to resume generation will be made, but know that this is experimental and may end up failing after hours of work. A near-future release will further refine this feature.\n\n"); fflush(stdout); + + verify_rainbowtable_file(filename, VERIFY_TABLE_TYPE_GENERATED, VERIFY_TABLE_MAY_BE_INCOMPLETE, VERIFY_TRUNCATE_ON_ERROR, -1); + + /* fopen()'s modes are weird. Its easier to just re-open the file for reading + * at this point, rather than change the code above and re-use the open handle. */ + f = rc_fopen(filename, 0); + if (f == NULL) + exit(-1); + + /* The file size may be different now if the verification function, above, + * truncated it due to errors. Ensure that at least one chain is in the file. */ + rc_fseek(f, 0, RCSEEK_END); + if (rc_ftell(f) >= CHAIN_SIZE) { + + /* Seek to the last starting index in the file and read it. */ + rc_fseek(f, CHAIN_SIZE, RCSEEK_END); + rc_fread(&start_index, sizeof(start_index), 1, f); + + start_index++; /* Increment the index to the next one needed. */ + first_generated_chain = start_index; + + /* The number of chains left to generate would be the total requested by the + * user, minus the number of chains already in the file. 
*/ + rc_fseek(f, 0, RCSEEK_END); + num_chains_to_generate = total_chains_in_table - (rc_ftell(f) / CHAIN_SIZE); + + resuming_table = 1; + } + rc_fclose(f); + } else { /* This is a new table. */ + uint64_t plaintext_space_up_to_index[16] = {0}; + + + start_index = first_generated_chain = total_chains_in_table * part_index; + num_chains_to_generate = total_chains_in_table; + + /* Ensure our plaintext_space_up_to_index array is large enough to call + * fill_plaintext_space_table() with. */ + if (plaintext_len_max > (sizeof(plaintext_space_up_to_index) + 1)) { + fprintf(stderr, "\n !! Warning: plaintext length max is too large (%u > %"PRIu64"). Skipping start index safety check.\n\n", plaintext_len_max, sizeof(plaintext_space_up_to_index) + 1); fflush(stderr); + } else { + uint64_t plaintext_space_total = fill_plaintext_space_table(strlen(charset), plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + /* Ensure that the user didn't specify a part index so great that it + * overflows the plaintext space total. If so, calculate the largest + * part index that can be used with this character set and tell the + * user before terminating. */ + if (start_index + num_chains_to_generate > plaintext_space_total) { + uint64_t highest_part_index = plaintext_space_total / num_chains_to_generate; + if ((plaintext_space_total % num_chains_to_generate) != 0) + highest_part_index--; + + fprintf(stderr, "\n !! Error: start index (%"PRIu64") + number of chains to generate (%u) > plaintext space total (%"PRIu64")! The highest part index that can be generated without causing this overflow is %"PRIu64" (hint: you set the part index too high (%"PRIu64").\n\n", start_index, num_chains_to_generate, plaintext_space_total, highest_part_index, part_index); fflush(stderr); + exit(-1); + } + } + } + + /* Get the number of platforms and devices available. 
*/ + get_platforms_and_devices(MAX_NUM_PLATFORMS, platforms, &num_platforms, MAX_NUM_DEVICES, devices, &num_devices, VERBOSE); + + /* Check the device type and set flags.*/ + if (num_devices > 0) { + char device_vendor[128] = {0}; + + get_device_str(devices[0], CL_DEVICE_VENDOR, device_vendor, sizeof(device_vendor) - 1); + if (strstr(device_vendor, "Advanced Micro Devices") != NULL) + is_amd_gpu = 1; + } + + /* Initialize the barrier. This is used in some cases to ensure kernels across + * multiple devices run concurrently. */ + if (pthread_barrier_init(&barrier, NULL, num_devices) != 0) { + fprintf(stderr, "pthread_barrier_init() failed.\n"); + exit(-1); + } + + args = calloc(num_devices, sizeof(thread_args)); + if (args == NULL) { + fprintf(stderr, "Error while creating thread arg array.\n"); + exit(-1); + } + + /* Print info about how we're generating the table. */ + printf("Output file:\t\t%s\nHash algorithm:\t\t%s\nCharset name:\t\t%s\nCharset:\t\t%s\nCharset length:\t\t%"PRIu64"\nPlaintext length range: %u - %u\nReduction offset:\t0x%x\nChain length:\t\t%u\nNumber of chains:\t%u\nPart index:\t\t%"PRIu64"\n\n", filename, hash_name, charset_name, charset, strlen(charset), plaintext_len_min, plaintext_len_max, TABLE_INDEX_TO_REDUCTION_OFFSET(table_index), chain_len, total_chains_in_table, part_index); + + /* If we found a file to append to, tell the user what's happening. */ + if (resuming_table) + printf("Appending to existing file (%s) at chain #X.\n\n", filename); + + /* Print a time stamp of when the generation begins. */ + start_timer(&global_start_time); + last_update_time = global_start_time; + + { + time_t current_time = time(NULL); + strftime(time_str, sizeof(time_str), "%b. %d, %Y at %I:%M %p", localtime(¤t_time)); + printf("Table generation started on %s...\n\n", time_str); fflush(stdout); + } + + /* Spin up one host thread per GPU. 
*/ + for (i = 0; i < num_devices; i++) { + args[i].benchmark_mode = benchmark_mode; + args[i].hash_type = hash_type; + args[i].charset = charset; + args[i].plaintext_len_min = plaintext_len_min; + args[i].plaintext_len_max = plaintext_len_max; + args[i].table_index = table_index; + args[i].reduction_offset = TABLE_INDEX_TO_REDUCTION_OFFSET(table_index); + args[i].chain_len = chain_len; + args[i].filename = filename; + args[i].initial_chains_per_execution = INITIAL_CHAINS_PER_EXECUTION; + args[i].gpu.device_number = i; + args[i].gpu.device = devices[i]; + + if (benchmark_mode) + args[i].initial_chains_per_execution = total_chains_in_table; + + if (pthread_create(&(threads[i]), NULL, &host_thread, &(args[i]))) { + perror("Failed to create thread"); + exit(-1); + } + } + + /* Wait for all threads to finish. */ + for (i = 0; i < num_devices; i++) { + if (pthread_join(threads[i], NULL) != 0) { + perror("Failed to join with thread"); + exit(-1); + } + } + + /*elapsed = difftime(time(NULL), global_start_time);*/ + if (benchmark_mode) { /* Benchmark... */ + /*unsigned int total_chains_generated = total_chains_in_table * (num_devices * args[0].compute_unit_multiple); + + printf("Generated %u chains on each of %u devices (%u chains total) in %.1f seconds.\nRate: %.1f/s\n", (total_chains_in_table * args[0].compute_unit_multiple), num_devices, total_chains_generated, elapsed, total_chains_generated / elapsed);*/ + } else { /* Normal table generation... */ + struct stat st; + + memset(&st, 0, sizeof(struct stat)); + + /* Output the run time, number of chains generated, and rate. */ + output_progress(0); + printf("\nGeneration complete!\n"); + + if (stat(filename, &st) == 0) { + unsigned int actual_num_chains = st.st_size / CHAIN_SIZE; + + /* If we generated more chains than the user requested, rename the file to + * reflect this. */ + if (actual_num_chains > total_chains_in_table) { + if (VERBOSE) + printf("\nNote %u extra chains created. 
Truncating...\n", actual_num_chains - total_chains_in_table); + if (truncate(filename, total_chains_in_table * CHAIN_SIZE) != 0) { + fprintf(stderr, "Error while truncating file %s: %s (%d)\n", filename, strerror(errno), errno); + } + + /* + char new_filename[sizeof(filename)]; + memset(new_filename, 0, sizeof(new_filename)); + + snprintf(new_filename, sizeof(new_filename) - 1, "%s_%s#%u-%u_%u_%ux%u_%u.rt", hash_name, charset_name, plaintext_len_min, plaintext_len_max, table_index, chain_len, actual_num_chains, part_index); + if (!rename(filename, new_filename)) { + printf("\nNote: because extra chains were generated, the file name was renamed to reflect this (from \"%s\" to \"%s\").\n\n", filename, new_filename); + strncpy(filename, new_filename, sizeof(filename) - 1); + } else + perror("Error while renaming file"); + */ + } + } + + /* Verify that the new table is valid. */ + printf("Now verifying rainbow table... "); + fflush(stdout); + if (!verify_rainbowtable_file(filename, VERIFY_TABLE_TYPE_GENERATED, VERIFY_TABLE_IS_COMPLETE, VERIFY_TRUNCATE_ON_ERROR, -1)) { + char log_filename[256] = {0}; + + get_rt_log_filename(log_filename, sizeof(log_filename), filename); + printf("\n"); + fprintf(stderr, "Error while verifying rainbowtable! It has been truncated to just before the point of error. Please give the following file to the developer: %s\n\n", log_filename); + } else { + /* Delete the rainbow table generation log, since the table was verified to be + * correct. No need to keep this debugging info. 
*/ + delete_rt_log(filename); + printf("done!\n"); + } + } + + for (i = 0; i < num_devices; i++) + rc_clReleaseDevice(devices[i]); + + pthread_barrier_destroy(&barrier); + FREE(args); + return 0; +} diff --git a/crackalack_lookup.c b/crackalack_lookup.c new file mode 100644 index 0000000..970dbc2 --- /dev/null +++ b/crackalack_lookup.c @@ -0,0 +1,1823 @@ +/* + * Rainbow Crackalack: crackalack_lookup.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* + * Performs GPU-accelerated password hash lookups on rainbow tables. + */ + +#ifdef _WIN32 +#include +#else +#include +#define O_BINARY 0 +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "charset.h" +#include "clock.h" +#include "cpu_rt_functions.h" +#include "hash_validate.h" +#include "misc.h" +#include "opencl_setup.h" +#include "rtc_decompress.h" +#include "shared.h" +#include "test_shared.h" /* TODO: move hex_to_bytes() elsewhere. */ +#include "verify.h" +#include "version.h" + +#define VERBOSE 1 +#define PRECOMPUTE_KERNEL_PATH "precompute.cl" +#define PRECOMPUTE_NTLM8_KERNEL_PATH "precompute_ntlm8.cl" +#define FALSE_ALARM_KERNEL_PATH "false_alarm_check.cl" +#define FALSE_ALARM_NTLM8_KERNEL_PATH "false_alarm_check_ntlm8.cl" + + + +/* Struct to form a linked list of precomputed end indices, and potential start indices (which are usually false alarms). 
*/ +struct _precomputed_and_potential_indices { + char *hash; + cl_ulong *precomputed_end_indices; + cl_uint num_precomputed_end_indices; + + cl_ulong *potential_start_indices; + unsigned int num_potential_start_indices; + unsigned int potential_start_indices_size; + unsigned int *potential_start_index_positions; /* Buffer size is always num_potential_start_indices. */ + + char *plaintext; /* Set if hash is cracked. */ + char *index_filename; /* File path containing the ".index" file. */ + struct _precomputed_and_potential_indices *next; +}; +typedef struct _precomputed_and_potential_indices precomputed_and_potential_indices; + + +/* Struct to represent one GPU device. */ +typedef struct { + cl_uint device_number; + cl_device_id device; + cl_context context; + cl_program program; + cl_kernel kernel; + cl_command_queue queue; + cl_uint num_work_units; +} gpu_dev; + + +/* Struct to pass arguments to a host thread. */ +typedef struct { + unsigned int hash_type; + char *hash_name; + char *hash; /* In hex. */ + char *charset; + char *charset_name; + unsigned int plaintext_len_min; + unsigned int plaintext_len_max; + unsigned int table_index; + unsigned int reduction_offset; + unsigned int chain_len; + + unsigned int total_devices; + uint64_t *results; + unsigned int num_results; + + cl_ulong *potential_start_indices; + unsigned int num_potential_start_indices; + + /* Buffer size is always num_potential_start_indices. */ + unsigned int *potential_start_index_positions; + + /* Length is always num_potential_start_indices. */ + cl_ulong *hash_base_indices; + + gpu_dev gpu; +} thread_args; + + +/* Struct to pass to binary search threads. 
*/ +typedef struct { + cl_ulong *rainbow_table; + unsigned int num_chains; + precomputed_and_potential_indices *ppi_head; + unsigned int thread_number; + unsigned int total_threads; +} search_thread_args; + + +unsigned int count_tables(char *dir); +void free_loaded_hashes(char **hashes, unsigned int *num_hashes); +void *host_thread_false_alarm(void *ptr); +cl_ulong *search_precompute_cache(char *index_data, unsigned int *num_indices, char *filename, unsigned int filename_size); +void search_tables(char *dir, precomputed_and_potential_indices *ppi, rt_parameters *rt_params, thread_args *args); +void save_cracked_hash(precomputed_and_potential_indices *ppi, unsigned int hash_type); + + +/* The path of the pot file to store cracked hashes in. This can be overridden by + * a command line arg. */ +char jtr_pot_filename[128] = "rainbowcrackalack_jtr.pot"; +char hashcat_pot_filename[128] = "rainbowcrackalack_hashcat.pot"; + +/* The number of seconds spent on precomputation, file I/O, searching, and false alarm + * checking. */ +double time_precomp = 0, time_io = 0, time_searching = 0, time_falsealarms = 0; + +/* The total number of false alarms, chains processed, respectively. */ +uint64_t num_falsealarms = 0, num_chains_processed = 0; + +/* The total number of hashes cracked in this invokation and number of tables + * processed, respectively. */ +unsigned int num_cracked = 0, num_tables_processed = 0; + +/* Mutex to protect the precomputed_and_potential_indices array. _*/ +pthread_mutex_t ppi_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Barrier to ensure that kernels on multiple devices are all run at the same time. + * The closed-source AMD driver on Windows effectively blocks other devices while + * one kernel is running; this ensures parallelization in that environment, since + * all kernels will run at once. The open source AMD ROCm driver on Linux may or + * may not get a very slight performance bump with this enabled. 
*/ +pthread_barrier_t barrier = {0}; + +/* Set to 1 if AMD GPUs found. */ +unsigned int is_amd_gpu = 0; + +/* The global work size, as over-ridden by the user on the command line. */ +size_t user_provided_gws = 0; + +/* The total number of precomputed indices loaded into memory. Each one of these is + * a cl_ulong (8 bytes). */ +uint64_t total_precomputed_indices_loaded = 0; + +/* Set to 1 if the NTLM8 message was printed. This prevents console spam. */ +unsigned int printed_precompute_ntlm8_message = 0; +unsigned int printed_false_alarm_ntlm8_message = 0; + +/* The total number of tables in all subdirectories of the directory given + * by the user. */ +unsigned int total_tables = 0; + +/* The number of the current table being processed. */ +unsigned int current_table = 0; + + +#define LOCK_PPI() \ + if (pthread_mutex_lock(&ppi_mutex)) { perror("Failed to lock mutex"); exit(-1); } + +#define UNLOCK_PPI() \ + if (pthread_mutex_unlock(&ppi_mutex)) { perror("Failed to unlock mutex"); exit(-1); } + + +/* Adds a potential start index (and position within the chain) to check for false + * alarms. */ +void add_potential_start_index_and_position(precomputed_and_potential_indices *ppi, cl_ulong start, unsigned int position) { + #define POTENTIAL_START_INDICES_INITIAL_SIZE 16 + + LOCK_PPI(); + + /* Initialize the potential_start_indices buffer if it isn't already. */ + if (ppi->potential_start_indices == NULL) { + ppi->potential_start_indices = calloc(POTENTIAL_START_INDICES_INITIAL_SIZE, sizeof(cl_ulong)); + ppi->potential_start_index_positions = calloc(POTENTIAL_START_INDICES_INITIAL_SIZE, sizeof(cl_ulong)); + if ((ppi->potential_start_indices == NULL) || (ppi->potential_start_index_positions == NULL)) { + fprintf(stderr, "Failed to initialize potential_start_indices / potential_start_index_positions buffer.\n"); + exit(-1); + } + ppi->potential_start_indices_size = POTENTIAL_START_INDICES_INITIAL_SIZE; + } + + /* If its time to re-size the array... 
*/ + if (ppi->num_potential_start_indices == ppi->potential_start_indices_size) { + unsigned int new_size_in_ulongs = ppi->potential_start_indices_size * 2; + + /*printf("Resizing array from %u to %u.\n", ppi->potential_start_indices_size, new_size_in_ulongs);*/ + ppi->potential_start_indices = recalloc(ppi->potential_start_indices, new_size_in_ulongs * sizeof(cl_ulong), ppi->potential_start_indices_size * sizeof(cl_ulong)); + ppi->potential_start_index_positions = recalloc(ppi->potential_start_index_positions, new_size_in_ulongs * sizeof(cl_ulong), ppi->potential_start_indices_size * sizeof(cl_ulong)); + if ((ppi->potential_start_indices == NULL) || (ppi->potential_start_index_positions == NULL)) { + fprintf(stderr, "Failed to re-allocate potential_start_indices/potential_start_index_positions buffer to %u.\n", new_size_in_ulongs); + exit(-1); + } + ppi->potential_start_indices_size = new_size_in_ulongs; + } + ppi->potential_start_indices[ppi->num_potential_start_indices] = start; + ppi->potential_start_index_positions[ppi->num_potential_start_indices] = position; + ppi->num_potential_start_indices++; + + UNLOCK_PPI(); +} + + +void check_false_alarms(precomputed_and_potential_indices *ppi, thread_args *args) { + pthread_t threads[MAX_NUM_DEVICES] = {0}; + char time_str[128] = {0}; + struct timespec start_time = {0}; + cl_ulong plaintext_space_up_to_index[MAX_PLAINTEXT_LEN] = {0}; + + unsigned int num_potential_start_indices = 0, i = 0, j = 0; + unsigned int total_devices = args[0].total_devices; + cl_ulong plaintext_space_total = 0; + double time_delta = 0.0; + + precomputed_and_potential_indices *ppi_cur = ppi; + cl_ulong *potential_start_indices = NULL, *hash_base_indices = NULL; + unsigned int *potential_start_index_positions = NULL; + precomputed_and_potential_indices **ppi_refs = NULL; + + + /* First count all the potential start indices. 
*/ + while(ppi_cur) { + num_potential_start_indices += ppi_cur->num_potential_start_indices; + ppi_cur = ppi_cur->next; + } + + /* If no potential matches were found, there's nothing else to do. */ + if (num_potential_start_indices == 0) { + printf("No matches found in table.\n"); + return; + } + printf(" Checking %u potential matches...\n", num_potential_start_indices); fflush(stdout); + num_falsealarms += num_potential_start_indices; + + /* Allocate a buffer to hold them all. */ + potential_start_indices = calloc(num_potential_start_indices, sizeof(cl_ulong)); + potential_start_index_positions = calloc(num_potential_start_indices, sizeof(cl_ulong)); + hash_base_indices = calloc(num_potential_start_indices, sizeof(cl_ulong)); + ppi_refs = calloc(num_potential_start_indices, sizeof(precomputed_and_potential_indices *)); + if ((potential_start_indices == NULL) || (potential_start_index_positions == NULL) || (hash_base_indices == NULL) || (ppi_refs == NULL)) { + fprintf(stderr, "Error while creating buffer for potential start indices/positions/hash indices/ppi refs.\n"); + exit(-1); + } + + plaintext_space_total = fill_plaintext_space_table(strlen(args->charset), args->plaintext_len_min, args->plaintext_len_max, plaintext_space_up_to_index); + + /* Collate all the start indices into one buffer. */ + ppi_cur = ppi; + while(ppi_cur) { + unsigned char hash[MAX_HASH_OUTPUT_LEN] = {0}; + unsigned int hash_len = hex_to_bytes(ppi_cur->hash, sizeof(hash), hash); + cl_ulong hash_base_index = hash_to_index(hash, hash_len, args->reduction_offset, plaintext_space_total, 0); /* We always use position 0 here. When the GPU code is comparing indices, it will add in the current position. 
*/ + + + if (ppi_cur->plaintext == NULL) { + for (i = 0; i < ppi_cur->num_potential_start_indices; i++, j++) { + potential_start_indices[j] = ppi_cur->potential_start_indices[i]; + potential_start_index_positions[j] = ppi_cur->potential_start_index_positions[i]; + hash_base_indices[j] = hash_base_index; + + /* For this index, hold a reference to the ppi struct. This later lets us find + * the ppi, given a result index from the GPU. */ + ppi_refs[j] = ppi_cur; + } + } + + ppi_cur = ppi_cur->next; + } + + /*for (i = 0; i < num_potential_start_indices; i++) + printf("Start point: %lu; Chain position: %u; hash base index: %lu\n", potential_start_indices[i], potential_start_index_positions[i], hash_base_indices[i]);*/ + + /* Start the timer false alarm checking. */ + start_timer(&start_time); + + /* Start one thread to control each GPU. */ + for (i = 0; i < total_devices; i++) { + + /* Each thread gets the same reference to the list of potential start indices. */ + args[i].potential_start_indices = potential_start_indices; + args[i].num_potential_start_indices = num_potential_start_indices; + args[i].potential_start_index_positions = potential_start_index_positions; + args[i].hash_base_indices = hash_base_indices; + + if (pthread_create(&(threads[i]), NULL, &host_thread_false_alarm, &(args[i]))) { + perror("Failed to create thread"); + exit(-1); + } + } + + /* Wait for all threads to finish. */ + for (i = 0; i < total_devices; i++) { + if (pthread_join(threads[i], NULL) != 0) { + perror("Failed to join with thread"); + exit(-1); + } + } + + /* Search for valid results, and update the ppi with the plaintext. 
*/ + for (i = 0; i < total_devices; i++) { + for (j = 0; j < args[i].num_results; j++) { + if (args[i].results[j] != 0) { + char plaintext[MAX_PLAINTEXT_LEN] = {0}; + unsigned int plaintext_len = 0; + + + index_to_plaintext(args[i].results[j], args[i].charset, strlen(args[i].charset), args[i].plaintext_len_min, args[i].plaintext_len_max, plaintext_space_up_to_index, plaintext, &plaintext_len); + + /* Double check NTLM results to weed out super false alarms. */ + if (args[i].hash_type == HASH_NTLM) { + unsigned char hash[16] = {0}; + char hash_hex[(sizeof(hash) * 2) + 1] = {0}; + + + ntlm_hash(plaintext, plaintext_len, hash); + if (!bytes_to_hex(hash, sizeof(hash), hash_hex, sizeof(hash_hex)) || \ + (strcmp(hash_hex, ppi_refs[j]->hash) != 0)) { + /*printf("Found super false positive!: NTLM('%s') != %s\n", plaintext, ppi_refs[j]->hash);*/ + continue; + } + } else + printf("WARNING: CPU code to double-check this cracked hash has not yet been added. There is a 60%% chance this is a false positive! A workaround is to use John The Ripper to validate this result(s).\n"); + + /* Its official: we cracked a hash! */ + + /* Save the plaintext, clear the precomputed end indices list (since its + * no longer useful, save the hash/plaintext combo into the pot file, and + * tell the user. 
*/ + ppi_refs[j]->plaintext = strdup(plaintext); + ppi_refs[j]->num_precomputed_end_indices = 0; + FREE(ppi_refs[j]->precomputed_end_indices); + + save_cracked_hash(ppi_refs[j], args[i].hash_type); + printf("HASH CRACKED => %s:%s\n", ppi_refs[j]->hash, plaintext); fflush(stdout); + } + } + } + time_delta = get_elapsed(&start_time); + + time_falsealarms += time_delta; + seconds_to_human_time(time_str, sizeof(time_str), (unsigned int)time_delta); + printf(" Completed false alarm checks in %s.\n", time_str); fflush(stdout); + + FREE(potential_start_indices); + FREE(potential_start_index_positions); + FREE(hash_base_indices); + FREE(ppi_refs); + FREE(args->results); + args->num_results = 0; +} + + +/* Print a warning to the user if a lot of memory is used by the pre-computed indices. */ +void check_memory_usage() { + uint64_t total_memory = get_total_memory(), num_precompute_bytes = 0; + double percent_memory_used = 0.0; + + + if (total_memory == 0) + return; + + num_precompute_bytes = total_precomputed_indices_loaded * sizeof(cl_ulong); + percent_memory_used = ((double)num_precompute_bytes / (double)total_memory) * 100; + if (percent_memory_used > 65) { + printf("\n\n\n\t!! WARNING !!\n\n\tThe pre-computed indices take up more than 65%% of total RAM! This may result in strange failures from clFinish() and other OpenCL functions. If this happens, either run this lookup with a smaller number of hashes at a time, or do it on a machine with more memory.\n\n\tMemory used by pre-compute indices: %"QUOTE PRIu64"\n\tTotal RAM: %"QUOTE PRIu64"\n\tPercent used: %.1f%%\n\n\n\n", num_precompute_bytes, total_memory, percent_memory_used); + } +} + + +/* Free all the potential start indices. 
*/ +void clear_potential_start_indices(precomputed_and_potential_indices *ppi) { + precomputed_and_potential_indices *ppi_cur = ppi; + + + while(ppi_cur) { + FREE(ppi_cur->potential_start_indices); + FREE(ppi_cur->potential_start_index_positions); + ppi_cur->num_potential_start_indices = 0; + + ppi_cur = ppi_cur->next; + } +} + + +/* Returns the total number of *.rt and *.rtc in all subdirectories of the + * specified directory. */ +unsigned int count_tables(char *dir) { + DIR *d = NULL; + struct dirent *de = NULL; + unsigned int ret = 0, is_file = 0, is_dir = 0; + + + d = opendir(dir); + if (d == NULL) + return 0; + + while ((de = readdir(d)) != NULL) { +#ifdef _WIN32 + struct stat st = {0}; + char path[256] = {0}; + + /* The d_type field of the dirent struct is not a POSIX standard, and Windows + * doesn't support it. So we fall back to using stat(). */ + snprintf(path, sizeof(path) - 1, "%s\\%s", dir, de->d_name); + if (stat(path, &st) < 0) { + fprintf(stderr, "Error: failed to stat() %s: %s. Continuing anyway...\n", path, strerror(errno)); fflush(stderr); + is_file = 0; + is_dir = 0; + } else { + is_file = S_ISREG(st.st_mode); + is_dir = S_ISDIR(st.st_mode); + } +#else + /* Linux has the d_type field, which is much more efficient to use than doing + * another stat(). */ + is_file = (de->d_type == DT_REG); + is_dir = (de->d_type == DT_DIR); +#endif + + if (is_file && (str_ends_with(de->d_name, ".rt") || str_ends_with(de->d_name, ".rtc"))) + ret++; + else if (is_dir && (strcmp(de->d_name, ".") != 0) && (strcmp(de->d_name, "..") != 0)) + ret += count_tables(de->d_name); + } + + closedir(d); + return ret; +} + + +/* Free the hashes we loaded from disk or command line. */ +void free_loaded_hashes(char **hashes, unsigned int *num_hashes) { + if (hashes != NULL) { + unsigned int i = 0; + for (; i < *num_hashes; i++) { + FREE(hashes[i]); + } + FREE(hashes); + *num_hashes = 0; + } +} + + +/* Free the precomputed_hashes linked list. 
*/ +void free_precomputed_and_potential_indices(precomputed_and_potential_indices **ppi_head) { + precomputed_and_potential_indices *ppi = *ppi_head, *ppi_next = NULL; + + + while (ppi) { + ppi_next = ppi->next; + + FREE(ppi->precomputed_end_indices); + FREE(ppi->potential_start_indices); + FREE(ppi->potential_start_index_positions); + FREE(ppi->index_filename); + ppi->num_potential_start_indices = 0; + FREE(ppi->plaintext); + FREE(ppi); + + ppi = ppi_next; + } + *ppi_head = NULL; +} + + +/* Returns the number of CPU cores on this machine. */ +unsigned int get_num_cpu_cores() { +#ifdef _WIN32 + SYSTEM_INFO sysinfo = {0}; + + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; +#else + return get_nprocs(); +#endif +} + + +/* A host thread which controls each GPU for false alarm checks. */ +void *host_thread_false_alarm(void *ptr) { + thread_args *args = (thread_args *)ptr; + gpu_dev *gpu = &(args->gpu); + cl_context context = NULL; + cl_command_queue queue = NULL; + cl_kernel kernel = NULL; + int err = 0; + char *kernel_path = FALSE_ALARM_KERNEL_PATH, *kernel_name = "false_alarm_check"; + + cl_mem hash_type_buffer = NULL, charset_buffer = NULL, plaintext_len_min_buffer = NULL, plaintext_len_max_buffer = NULL, reduction_offset_buffer = NULL, plaintext_space_total_buffer = NULL, plaintext_space_up_to_index_buffer = NULL, device_num_buffer = NULL, total_devices_buffer = NULL, num_start_indices_buffer = NULL, start_indices_buffer = NULL, start_index_positions_buffer = NULL, hash_base_indices_buffer = NULL, output_block_buffer = NULL, exec_block_scaler_buffer = NULL; + /*cl_mem debug_ulong_buffer = NULL;*/ + + cl_ulong *start_indices = NULL, *hash_base_indices = NULL, *plaintext_indices = NULL, *output_block = NULL; + unsigned int *start_index_positions = NULL; + + unsigned int num_start_indices = 0, num_start_index_positions = 0, num_hash_base_indices = 0, num_plaintext_indices = 0, num_exec_blocks = 0, output_block_len = 0, exec_block = 0, 
output_block_index = 0, plaintext_indicies_index = 0; + uint64_t plaintext_space_total = 0; + cl_ulong plaintext_space_up_to_index[MAX_PLAINTEXT_LEN] = {0}; + size_t gws = 0, kernel_work_group_size = 0, kernel_preferred_work_group_size_multiple = 0; + /*cl_ulong debug_ulong[128] = {0};*/ + + + plaintext_space_total = fill_plaintext_space_table(strlen(args->charset), args->plaintext_len_min, args->plaintext_len_max, plaintext_space_up_to_index); + + num_start_indices = num_start_index_positions = num_hash_base_indices = num_plaintext_indices = args->num_potential_start_indices; + + start_indices = args->potential_start_indices; + start_index_positions = args->potential_start_index_positions; + hash_base_indices = args->hash_base_indices; + + plaintext_indices = calloc(num_plaintext_indices, sizeof(cl_ulong)); + if (plaintext_indices == NULL) { + fprintf(stderr, "Error while allocating buffers.\n"); + exit(-1); + } + + /* If we're generating the standard NTLM 8-character tables, use the special + * optimized kernel instead! */ + if (is_ntlm8(args->hash_type, args->charset, args->plaintext_len_min, args->plaintext_len_max, args->reduction_offset, args->chain_len)) { + kernel_path = FALSE_ALARM_NTLM8_KERNEL_PATH; + kernel_name = "false_alarm_check_ntlm8"; + if ((args->gpu.device_number == 0) && (printed_false_alarm_ntlm8_message == 0)) { /* Only the first thread prints this, and only prints it once. */ + printf("\nNote: optimized NTLM8 kernel will be used for false alarm checks.\n\n"); fflush(stdout); + printed_false_alarm_ntlm8_message = 1; + } + } + + /* Load the kernel. */ + gpu->context = CLCREATECONTEXT(context_callback, &(gpu->device)); + gpu->queue = CLCREATEQUEUE(gpu->context, gpu->device); + load_kernel(gpu->context, 1, &(gpu->device), kernel_path, kernel_name, &(gpu->program), &(gpu->kernel), args->hash_type); + + /* These variables are set so the CLCREATEARG* macros work correctly. 
*/ + context = gpu->context; + queue = gpu->queue; + kernel = gpu->kernel; + + if ((rc_clGetKernelWorkGroupInfo(kernel, gpu->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_work_group_size, NULL) != CL_SUCCESS) || \ + (rc_clGetKernelWorkGroupInfo(kernel, gpu->device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &kernel_preferred_work_group_size_multiple, NULL) != CL_SUCCESS)) { + fprintf(stderr, "Failed to get preferred work group size!\n"); + CLRELEASEKERNEL(gpu->kernel); + CLRELEASEPROGRAM(gpu->program); + CLRELEASEQUEUE(gpu->queue); + CLRELEASECONTEXT(gpu->context); + pthread_exit(NULL); + return NULL; + } + + /* If the user provided a static GWS on the command line, use that. Otherwise, + * use the driver's work group size multiplied by the preferred multiple. */ + if (user_provided_gws > 0) { + gws = user_provided_gws; + printf("GPU #%u is using user-provided GWS value of %"PRIu64"\n", gpu->device_number, gws); + } else { + /*gws = kernel_work_group_size * kernel_preferred_work_group_size_multiple;*/ + + /* TODO: fix this so that false alarm checking is done in partitions instead of + * all at once (this can improve speed). Currently, when GWS != num_start_indices, + * lookups don't succeed due to some bug. */ + gws = num_start_indices; + + /* Somehow, on AMD GPUs, the kernel crashes with a message like: + * + * Memory access fault by GPU node-2 (Agent handle: 0x1bb5e00) on address + * 0x7f4c80b27000. Reason: Page not present or supervisor privilege. + * + * A work-around is to set the GWS to the number of start indices and just do it in + * one pass. */ + if (is_amd_gpu) + gws = num_start_indices; + + /*printf("GPU #%u is using dynamic GWS: %"PRIu64" (work group) x %"PRIu64" (pref. multiple) = %"PRIu64"\n", gpu->device_number, kernel_work_group_size, kernel_preferred_work_group_size_multiple, gws);*/ + } + fflush(stdout); + + + /* Count the number of times we need to run the kernel. 
*/ + num_exec_blocks = num_start_indices / gws; + if (num_start_indices % gws != 0) + num_exec_blocks++; + + output_block_len = gws; + output_block = calloc(output_block_len, sizeof(cl_ulong)); + if (output_block == NULL) { + fprintf(stderr, "Error while allocating output buffer(s).\n"); + exit(-1); + } + + CLCREATEARG(0, hash_type_buffer, CL_RO, args->hash_type, sizeof(cl_uint)); + CLCREATEARG_ARRAY(1, charset_buffer, CL_RO, args->charset, strlen(args->charset) + 1); + CLCREATEARG(2, plaintext_len_min_buffer, CL_RO, args->plaintext_len_min, sizeof(cl_uint)); + CLCREATEARG(3, plaintext_len_max_buffer, CL_RO, args->plaintext_len_max, sizeof(cl_uint)); + CLCREATEARG(4, reduction_offset_buffer, CL_RO, args->reduction_offset, sizeof(cl_uint)); + CLCREATEARG(5, plaintext_space_total_buffer, CL_RO, plaintext_space_total, sizeof(cl_ulong)); + CLCREATEARG_ARRAY(6, plaintext_space_up_to_index_buffer, CL_RO, plaintext_space_up_to_index, MAX_PLAINTEXT_LEN * sizeof(cl_ulong)); + CLCREATEARG(7, device_num_buffer, CL_RO, gpu->device_number, sizeof(cl_uint)); + CLCREATEARG(8, total_devices_buffer, CL_RO, args->total_devices, sizeof(cl_uint)); + CLCREATEARG(9, num_start_indices_buffer, CL_RO, num_start_indices, sizeof(cl_uint)); + CLCREATEARG_ARRAY(10, start_indices_buffer, CL_RO, start_indices, num_start_indices * sizeof(cl_ulong)); + CLCREATEARG_ARRAY(11, start_index_positions_buffer, CL_RO, start_index_positions, num_start_index_positions * sizeof(unsigned int)); + CLCREATEARG_ARRAY(12, hash_base_indices_buffer, CL_RO, hash_base_indices, num_hash_base_indices * sizeof(cl_ulong)); + CLCREATEARG_ARRAY(14, output_block_buffer, CL_WO, output_block, output_block_len * sizeof(cl_ulong)); + + for (exec_block = 0; exec_block < num_exec_blocks; exec_block++) { + unsigned int exec_block_scaler = exec_block * gws; + + CLCREATEARG(13, exec_block_scaler_buffer, CL_RO, exec_block_scaler, sizeof(cl_uint)); + + if (is_amd_gpu) { + int barrier_ret = pthread_barrier_wait(&barrier); + if 
((barrier_ret != 0) && (barrier_ret != PTHREAD_BARRIER_SERIAL_THREAD)) { + fprintf(stderr, "pthread_barrier_wait() failed!\n"); fflush(stderr); + exit(-1); + } + } + + /* Run the kernel and wait for it to finish. */ + CLRUNKERNEL(gpu->queue, gpu->kernel, &gws); + CLFLUSH(gpu->queue); + CLWAIT(gpu->queue); + + /* Read the results. */ + CLREADBUFFER(output_block_buffer, output_block_len * sizeof(cl_ulong), output_block); + + output_block_index = 0; + while ((plaintext_indicies_index < num_plaintext_indices) && (output_block_index < output_block_len)) + plaintext_indices[plaintext_indicies_index++] = output_block[output_block_index++]; + + CLFREEBUFFER(exec_block_scaler_buffer); + } + + /* Set the results so the main thread can access them. */ + args->results = plaintext_indices; + args->num_results = num_plaintext_indices; + + /* + { + unsigned int i = 0; + + printf("results: "); + for (i = 0; i < args->num_results; i++) + printf("%lu ", args->results[i]); + + printf("\n"); + } + */ + + FREE(output_block); + + CLFREEBUFFER(hash_type_buffer); + CLFREEBUFFER(charset_buffer); + CLFREEBUFFER(plaintext_len_min_buffer); + CLFREEBUFFER(plaintext_len_max_buffer); + CLFREEBUFFER(reduction_offset_buffer); + CLFREEBUFFER(plaintext_space_total_buffer); + CLFREEBUFFER(plaintext_space_up_to_index_buffer); + CLFREEBUFFER(device_num_buffer); + CLFREEBUFFER(total_devices_buffer); + CLFREEBUFFER(num_start_indices_buffer); + CLFREEBUFFER(start_indices_buffer); + CLFREEBUFFER(start_index_positions_buffer); + CLFREEBUFFER(hash_base_indices_buffer); + CLFREEBUFFER(output_block_buffer); + + CLRELEASEKERNEL(gpu->kernel); + CLRELEASEPROGRAM(gpu->program); + CLRELEASEQUEUE(gpu->queue); + CLRELEASECONTEXT(gpu->context); + + pthread_exit(NULL); + return NULL; +} + + +/* A host thread which controls each GPU for hash pre-computation. 
*/ +void *host_thread_precompute(void *ptr) { + thread_args *args = (thread_args *)ptr; + gpu_dev *gpu = &(args->gpu); + cl_context context = NULL; + cl_command_queue queue = NULL; + cl_kernel kernel = NULL; + int err = 0; + char *kernel_path = PRECOMPUTE_KERNEL_PATH, *kernel_name = "precompute"; + + cl_mem hash_type_buffer = NULL, hash_buffer = NULL, hash_len_buffer = NULL, charset_buffer = NULL, plaintext_len_min_buffer = NULL, plaintext_len_max_buffer = NULL, table_index_buffer = NULL, chain_len_buffer = NULL, device_num_buffer = NULL, total_devices_buffer = NULL, exec_block_scaler_buffer = NULL, output_block_buffer = NULL/*, debug_buffer = NULL*/; + + size_t gws = 0; + cl_ulong *output = NULL, *output_block = NULL; + unsigned int output_len = 0, output_block_len = 0, num_exec_blocks = 0, exec_block = 0, output_index = 0, output_block_index = 0; + /*unsigned int i = 0;*/ + + unsigned char hash_binary[32] = {0}; + cl_uint hash_binary_len = 0; + + + /* Convert the hash from a hex string to bytes.*/ + hash_binary_len = hex_to_bytes(args->hash, sizeof(hash_binary), hash_binary); + + /* The work size is the chain length divided among the total number of GPUs. Round + * up if it doesn't divide evenly; this results in slightly more work being done in + * order to get complete coverage. */ + output_len = args->chain_len / args->total_devices; + if ((args->chain_len % args->total_devices) != 0) + output_len++; + + /* If we're generating the standard NTLM 8-character tables, use the special + * optimized kernel instead! */ + if (is_ntlm8(args->hash_type, args->charset, args->plaintext_len_min, args->plaintext_len_max, args->reduction_offset, args->chain_len)) { + kernel_path = PRECOMPUTE_NTLM8_KERNEL_PATH; + kernel_name = "precompute_ntlm8"; + if ((args->gpu.device_number == 0) && (printed_precompute_ntlm8_message == 0)) { /* Only the first thread prints this, and only prints it once. 
*/ + printf("\nNote: optimized NTLM8 kernel will be used for precomputation.\n\n"); fflush(stdout); + printed_precompute_ntlm8_message = 1; + } + } + + /* Load the kernel. */ + gpu->context = CLCREATECONTEXT(context_callback, &(gpu->device)); + gpu->queue = CLCREATEQUEUE(gpu->context, gpu->device); + load_kernel(gpu->context, 1, &(gpu->device), kernel_path, kernel_name, &(gpu->program), &(gpu->kernel), args->hash_type); + + /* These variables are set so the CLCREATEARG* macros work correctly. */ + context = gpu->context; + queue = gpu->queue; + kernel = gpu->kernel; + + if (rc_clGetKernelWorkGroupInfo(kernel, gpu->device, CL_KERNEL_WORK_GROUP_SIZE /*CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE*/, sizeof(size_t), &gws, NULL) != CL_SUCCESS) { + fprintf(stderr, "Failed to get preferred work group size!\n"); + CLRELEASEKERNEL(gpu->kernel); + CLRELEASEPROGRAM(gpu->program); + CLRELEASEQUEUE(gpu->queue); + CLRELEASECONTEXT(gpu->context); + pthread_exit(NULL); + return NULL; + } + gws = gws * gpu->num_work_units; + + /* In the event that the global work size is larger than the number of outputs we + * need, cap the GWS. */ + if (gws > output_len) gws = output_len; + + /* Count the number of times we need to run the kernel. */ + num_exec_blocks = output_len / gws; + if (output_len % gws != 0) + num_exec_blocks++; + + /*printf("Host thread #%u started; GWS: %zu.\n", gpu->device_number, gws);*/ + + /* This will hold the results from this one GPU. */ + output = calloc(output_len, sizeof(cl_ulong)); + + /* Holds the results from one kernel exec. */ + output_block_len = gws; + output_block = calloc(output_block_len, sizeof(cl_ulong)); + + if ((output == NULL) || (output_block == NULL)) { + fprintf(stderr, "Error while allocating output buffer(s).\n"); + exit(-1); + } + + /* Get the number of compute units in this device. 
*/ + /*get_device_uint(gpu->device, CL_DEVICE_MAX_COMPUTE_UNITS, &(gpu->num_work_units));*/ + + + CLCREATEARG(0, hash_type_buffer, CL_RO, args->hash_type, sizeof(cl_uint)); + CLCREATEARG_ARRAY(1, hash_buffer, CL_RO, hash_binary, hash_binary_len); + CLCREATEARG(2, hash_len_buffer, CL_RO, hash_binary_len, sizeof(cl_uint)); + CLCREATEARG_ARRAY(3, charset_buffer, CL_RO, args->charset, strlen(args->charset) + 1); + CLCREATEARG(4, plaintext_len_min_buffer, CL_RO, args->plaintext_len_min, sizeof(cl_uint)); + CLCREATEARG(5, plaintext_len_max_buffer, CL_RO, args->plaintext_len_max, sizeof(cl_uint)); + CLCREATEARG(6, table_index_buffer, CL_RO, args->table_index, sizeof(cl_uint)); + CLCREATEARG(7, chain_len_buffer, CL_RO, args->chain_len, sizeof(cl_uint)); + CLCREATEARG(8, device_num_buffer, CL_RO, gpu->device_number, sizeof(cl_uint)); + CLCREATEARG(9, total_devices_buffer, CL_RO, args->total_devices, sizeof(cl_uint)); + CLCREATEARG_ARRAY(11, output_block_buffer, CL_WO, output_block, output_block_len * sizeof(cl_ulong)); + /*CLCREATEARG_DEBUG(9, debug_buffer, debug_ptr);*/ + + for (exec_block = 0; exec_block < num_exec_blocks; exec_block++) { + unsigned int exec_block_scaler = exec_block * gws; + + + CLCREATEARG(10, exec_block_scaler_buffer, CL_RO, exec_block_scaler, sizeof(cl_uint)); + + if (is_amd_gpu) { + int barrier_ret = pthread_barrier_wait(&barrier); + if ((barrier_ret != 0) && (barrier_ret != PTHREAD_BARRIER_SERIAL_THREAD)) { + fprintf(stderr, "pthread_barrier_wait() failed!\n"); fflush(stderr); + exit(-1); + } + } + + /* Run the kernel and wait for it to finish. */ + CLRUNKERNEL(gpu->queue, gpu->kernel, &gws); + CLFLUSH(gpu->queue); + CLWAIT(gpu->queue); + + /* Read the results. */ + CLREADBUFFER(output_block_buffer, output_block_len * sizeof(cl_ulong), output_block); + + /* Append this block out output to the total output for this GPU. 
*/ + output_block_index = 0; + while ((output_index < output_len) && (output_block_index < output_block_len)) + output[output_index++] = output_block[output_block_index++]; + + CLFREEBUFFER(exec_block_scaler_buffer); + } + + /* Set the results so the main thread can access them. */ + args->results = output; + args->num_results = output_len; + + /* + printf("GPU %u: ", gpu->device_number); + for (i = 0; i < output_len; i++) { + printf("%"PRIu64" ", output[i]); + } + printf("\n"); + */ + + FREE(output_block); + + CLFREEBUFFER(hash_type_buffer); + CLFREEBUFFER(hash_buffer); + CLFREEBUFFER(hash_len_buffer); + CLFREEBUFFER(charset_buffer); + CLFREEBUFFER(plaintext_len_min_buffer); + CLFREEBUFFER(plaintext_len_max_buffer); + CLFREEBUFFER(table_index_buffer); + CLFREEBUFFER(chain_len_buffer); + CLFREEBUFFER(device_num_buffer); + CLFREEBUFFER(total_devices_buffer); + CLFREEBUFFER(exec_block_scaler_buffer); + CLFREEBUFFER(output_block_buffer); + /*CLFREEBUFFER(debug_buffer);*/ + + CLRELEASEKERNEL(gpu->kernel); + CLRELEASEPROGRAM(gpu->program); + CLRELEASEQUEUE(gpu->queue); + CLRELEASECONTEXT(gpu->context); + + pthread_exit(NULL); + return NULL; +} + + +void precompute_hash(unsigned int num_devices, thread_args *args, precomputed_and_potential_indices **ppi_head) { + pthread_t threads[MAX_NUM_DEVICES] = {0}; + char filename[128] = {0}, time_str[128] = {0}, index_data[256] = {0}; + struct timespec start_time = {0}; + unsigned int i = 0, j = 0, output_index = 0; + int k = 0; + uint64_t *output = NULL; + FILE *f = NULL; + precomputed_and_potential_indices *ppi = NULL; + + + /* Set the index data we're looking for (or will create later). 
*/ + snprintf(index_data, sizeof(index_data) - 1, "%s_%s#%d-%d_%d_%d:%s\n", args->hash_name, args->charset_name, args->plaintext_len_min, args->plaintext_len_max, args->table_index, args->chain_len, args->hash); /*ntlm_loweralpha#8-8_0_100:49e5bfaab1be72a6c5236f15736a3e15*/ + + /* Search through the cache and see if we already precomputed the indices for this + * hash. */ + output = search_precompute_cache(index_data, &output_index, filename, sizeof(filename)); + + /* Cache miss... */ + if (output == NULL) { + + /* Start the timer for this hash. */ + start_timer(&start_time); + + /* Start one thread to control each GPU. */ + for (i = 0; i < num_devices; i++) { + if (pthread_create(&(threads[i]), NULL, &host_thread_precompute, &(args[i]))) { + perror("Failed to create thread"); + exit(-1); + } + } + + /* Wait for all threads to finish. */ + for (i = 0; i < num_devices; i++) { + if (pthread_join(threads[i], NULL) != 0) { + perror("Failed to join with thread"); + exit(-1); + } + } + + seconds_to_human_time(time_str, sizeof(time_str), get_elapsed(&start_time)); + + printf(" Completed in %s.\n", time_str); fflush(stdout); + + /* Create one output array to hold all the results. */ + output = calloc(args[0].num_results * num_devices, sizeof(uint64_t)); + if (output == NULL) { + fprintf(stderr, "Error allocating buffer for GPU results.\n"); + exit(-1); + } + + /* + The results end up spread out like this across many GPUs: + + GPU 0: 100 94 88 82 76 70 64 58 52 46 40 34 28 22 16 10 4 + GPU 1: 99 93 87 81 75 69 63 57 51 45 39 33 27 21 15 9 3 + GPU 2: 98 92 86 80 74 68 62 56 50 44 38 32 26 20 14 8 2 + GPU 3: 97 91 85 79 73 67 61 55 49 43 37 31 25 19 13 7 1 + GPU 4: 96 90 84 78 72 66 60 54 48 42 36 30 24 18 12 6 0 + GPU 5: 95 89 83 77 71 65 59 53 47 41 35 29 23 17 11 5 0 + + Below, we collate the results into a single array containing "100 99 98 [...]". 
+ */ + for (i = 0; i < args[0].num_results; i++) { + for (j = 0; j < num_devices; j++) { + output[output_index] = args[j].results[i]; + output_index++; + } + } + + /* Now that pulled all the GPU results into one array, free them. */ + for (i = 0; i < num_devices; i++) { + FREE(args[i].results); + args[i].num_results = 0; + } + + /* We may have a few extra indices in the array at the end, if the chain length + * is not divisible by the number of GPUs. In that case, we simply truncate the + * end of the array. */ + if (output_index >= args[0].chain_len - 1) + output_index = args[0].chain_len -1; + else { /* Sanity check: this should never happen... */ + fprintf(stderr, "Error: output_index < chain_len - 1!: %u < %u\n", output_index, args[0].chain_len - 1); + exit(-1); + } + + /* Reverse the output buffer. + * TODO: this logic can be merged in, above, to simplify. */ + { + uint64_t *tmp = calloc(output_index, sizeof(uint64_t)); + if (tmp == NULL) { + fprintf(stderr, "Failed to create temp buffer.\n"); + exit(-1); + } + + for (i = 0; i < output_index; i++) + tmp[i] = output[output_index - i - 1]; + + FREE(output); + output = tmp; + } + + /* Search for the first unused filename in the space of rcracki.precalc.[0-1048576]. */ + for (i = 0; i < 1048576; i++) { + int fd = -1; + + snprintf(filename, sizeof(filename) - 1, "rcracki.precalc.%d", i); + + /* Create a file for writing with permissions of 0600. */ + fd = open(filename, O_CREAT | O_EXCL | O_WRONLY | O_BINARY, S_IRUSR | S_IWUSR); + + if (fd != -1) { /* On success, convert to a file pointer. */ + f = fdopen(fd, "wb"); + break; + } + } + + if (f == NULL) { + fprintf(stderr, "Error: could not create any precalc file (rcracki.precalc.[0-1048576])\n"); + exit(-1); + } + + /* Ok, so it turns out that we generated the array backwards. Oh well. We will + * just iterate backwards here to compensate. 
*/ + /*for (k = output_index - 1; k >= 0; k--) + fwrite(&(output[k]), sizeof(cl_ulong), 1, f);*/ + + for (k = 0; k < output_index; k++) + fwrite(&(output[k]), sizeof(cl_ulong), 1, f); + + FCLOSE(f); + + /* Now create the rcracki.precalc.?.index file. */ + strncat(filename, ".index", sizeof(filename) - 1); + f = fopen(filename, "wb"); + if (f == NULL) { + fprintf(stderr, "Error while creating file: %s\n", filename); + exit(-1); + } else { + fwrite(index_data, sizeof(char), strlen(index_data), f); + FCLOSE(f); + } + + } else { + printf("Using cached pre-computed indices for hash %s.\n", args->hash); fflush(stdout); + } + + total_precomputed_indices_loaded += output_index; + + /* + printf("output_index: %u\nFinal array: ", output_index); + + for (i = 0; i < output_index; i++) + printf("%"PRIu64" ", output[i]); + printf("\n"); + + printf("\nFinal array hex: "); + + for (i = 0; i < output_index; i++) + printf("%08"PRIx64" ", output[i]); + printf("\n"); + */ + + /* Time to store the precomputed indices. If no head exists in the linked list... */ + if (*ppi_head == NULL) { + *ppi_head = calloc(1, sizeof(precomputed_and_potential_indices)); + if (*ppi_head == NULL) { + fprintf(stderr, "Error allocating buffer for precomputed indices.\n"); + exit(-1); + } + ppi = *ppi_head; + } else { + ppi = *ppi_head; + while (ppi->next != NULL) + ppi = ppi->next; + ppi->next = calloc(1, sizeof(precomputed_and_potential_indices)); + if (ppi->next == NULL) { + fprintf(stderr, "Error allocating buffer for precomputed indices.\n"); + exit(-1); + } + ppi = ppi->next; + } + + ppi->hash = args->hash; + ppi->num_precomputed_end_indices = output_index; + + ppi->precomputed_end_indices = calloc(ppi->num_precomputed_end_indices, sizeof(cl_ulong)); + if (ppi->precomputed_end_indices == NULL) { + fprintf(stderr, "Error allocating index buffer for precomputed indices.\n"); + exit(-1); + } + + /* Store the precomputed indices into the array. 
*/ + for (i = 0; i < ppi->num_precomputed_end_indices; i++) + ppi->precomputed_end_indices[i] = output[i]; + + /* Set the filename, so it can be deleted if the hash is cracked later. */ + ppi->index_filename = strdup(filename); + + FREE(output); +} + + +void print_usage_and_exit(char *prog_name, int exit_code) { +#ifdef _WIN32 + fprintf(stderr, "Usage: %s rainbow_table_directory (single_hash | filename_with_many_hashes.txt)\n\nExample:\n %s D:\\rt_ntlm\\ 64f12cddaa88057e06a81b54e73b949b\n %s D:\\rt_ntlm\\ C:\\Users\\jsmith\\Desktop\\hashes.txt [-gws GWS]\n\n", prog_name, prog_name, prog_name); +#else + fprintf(stderr, "Usage: %s rainbow_table_directory (single_hash | filename_with_many_hashes.txt)\n\nExample:\n %s /export/rt_ntlm/ 64f12cddaa88057e06a81b54e73b949b\n %s /export/rt_ntlm/ /home/user/hashes.txt [-gws GWS]\n\n", prog_name, prog_name, prog_name); +#endif + exit(exit_code); +} + + +/* Helper function for rt_binary_search(). */ +unsigned int _rt_binary_search(cl_ulong *rainbow_table, unsigned int low, unsigned int high, cl_ulong search_index, cl_ulong *start) { + unsigned int chain = 0; + + + /*printf("_rt_binary_search(%u, %u, %lu)\n", low, high, search_index);*/ + if (high - low <= 8) { + for (chain = low; chain < high; chain++) { + if (search_index == rainbow_table[(chain * 2) + 1]) { + *start = rainbow_table[chain * 2]; + /*printf("\nbinary search: found %lu at %u (between %u and %u)\n", *start, chain, low, high);*/ + return 1; + } + } + } else { + chain = ((high - low) / 2) + low; + if (search_index >= rainbow_table[(chain * 2) + 1]) + return _rt_binary_search(rainbow_table, chain, high, search_index, start); + else + return _rt_binary_search(rainbow_table, low, chain, search_index, start); + } + + return 0; +} + + +void *rt_binary_search_thread(void *ptr) { + search_thread_args *args = (search_thread_args *)ptr; + precomputed_and_potential_indices *ppi_cur = args->ppi_head; + unsigned int i = 0; + cl_ulong start = 0; + + + while (ppi_cur != NULL) { + 
if (ppi_cur->plaintext == NULL) { /* If this hash isn't cracked yet... */ + for (i = 0 + args->thread_number; i < ppi_cur->num_precomputed_end_indices; i += args->total_threads) { + if (_rt_binary_search(args->rainbow_table, 0, args->num_chains, ppi_cur->precomputed_end_indices[i], &start)) { + add_potential_start_index_and_position(ppi_cur, start, i); + } + } + } + ppi_cur = ppi_cur->next; + } + + pthread_exit(NULL); + return NULL; +} + + +/* Rainbow table binary search. Searches a table's end indices for any matches with + * precomputed end indices. If/when matches are found, the corresponding start indices + * are added to the precomputed_and_potential_indices's potential_start_indices + * array. */ +void rt_binary_search(cl_ulong *rainbow_table, unsigned int num_chains, precomputed_and_potential_indices *ppi_head) { + struct timespec start_time_searching = {0}; + char time_searching_str[64] = {0}; + unsigned int num_threads = get_num_cpu_cores(); + pthread_t *threads = NULL; + search_thread_args *args = NULL; + unsigned int i = 0; + double s_time = 0; + + + start_timer(&start_time_searching); + args = calloc(num_threads, sizeof(search_thread_args)); + threads = calloc(num_threads, sizeof(pthread_t)); + if ((args == NULL) || (threads == NULL)) { + fprintf(stderr, "Failed to create thread/args for searching.\n"); + exit(-1); + } + + printf(" Searching table for matching endpoints...\n"); fflush(stdout); + + for (i = 0; i < num_threads; i++) { + args[i].thread_number = i; + args[i].total_threads = num_threads; + args[i].rainbow_table = rainbow_table; + args[i].num_chains = num_chains; + args[i].ppi_head = ppi_head; + + if (pthread_create(&(threads[i]), NULL, &rt_binary_search_thread, &(args[i]))) { + perror("Failed to create thread"); + exit(-1); + } + } + + /* Wait for all threads to finish. 
*/ + for (i = 0; i < num_threads; i++) { + if (pthread_join(threads[i], NULL) != 0) { + perror("Failed to join with thread"); + exit(-1); + } + } + + s_time = get_elapsed(&start_time_searching); + seconds_to_human_time(time_searching_str, sizeof(time_searching_str), s_time); + printf(" Table searched in %s.\n", time_searching_str); fflush(stdout); + + time_searching += s_time; + FREE(args); + FREE(threads); +} + + +void save_cracked_hash(precomputed_and_potential_indices *ppi, unsigned int hash_type) { + FILE *jtr_file = fopen(jtr_pot_filename, "ab"), *hashcat_file = fopen(hashcat_pot_filename, "ab"); + unsigned int hash_len = strlen(ppi->hash), plaintext_len = strlen(ppi->plaintext); + char *dot_pos = strrchr(ppi->index_filename, '.'); + + + if (jtr_file == NULL) { + fprintf(stderr, "Error: could not open pot file for writing: %s: %s\n", jtr_pot_filename, strerror(errno)); + exit(-1); + } else if (hashcat_file == NULL) { + fprintf(stderr, "Error: could not open pot file for writing: %s: %s\n", hashcat_pot_filename, strerror(errno)); + exit(-1); + } + + /* The JTR pot file format requires NTLM hashes to be prepended with "$NT$". 
*/ + if ((hash_type == HASH_NTLM) && (fwrite("$NT$", sizeof(char), 4, jtr_file) != 4)) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } + + if (fwrite(ppi->hash, sizeof(char), hash_len, jtr_file) != hash_len) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } else if (fwrite(ppi->hash, sizeof(char), hash_len, hashcat_file) != hash_len) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } + + if (fwrite(":", sizeof(char), 1, jtr_file) != 1) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } else if (fwrite(":", sizeof(char), 1, hashcat_file) != 1) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } + + if (fwrite(ppi->plaintext, sizeof(char), plaintext_len, jtr_file) != plaintext_len) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } else if (fwrite(ppi->plaintext, sizeof(char), plaintext_len, hashcat_file) != plaintext_len) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } + + if (fwrite("\n", sizeof(char), 1, jtr_file) != 1) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } else if (fwrite("\n", sizeof(char), 1, hashcat_file) != 1) { + fprintf(stderr, "Error while writing to pot file: %s\n", strerror(errno)); + exit(-1); + } + + FCLOSE(jtr_file); + FCLOSE(hashcat_file); + + /* Delete the index file containing information about the precomputed indices. Since + * this hash was cracked, this is no longer needed. */ + if (unlink(ppi->index_filename) != 0) { + fprintf(stderr, "Error while deleting precompute index file: %s: %s\n", ppi->index_filename, strerror(errno)); + /*exit(-1);*/ + } + + /* Truncate the ".index" off the end of the filename; this forms the precomputation + * filename. 
*/ + *dot_pos = '\0'; + if (unlink(ppi->index_filename) != 0) { + fprintf(stderr, "Error while deleting precompute file: %s: %s\n", ppi->index_filename, strerror(errno)); + /*exit(-1);*/ + } + + num_cracked++; + num_falsealarms--; +} + + +/* Searches the precompute cache for matching index data. If found, an array of + * indices is returned, num_indices set to the array size, and the filename buffer + * is set to the *.index cache file. */ +cl_ulong *search_precompute_cache(char *index_data, unsigned int *num_indices, char *filename, unsigned int filename_size) { + char buf[256] = {0}; + int file_size = 0; + DIR *d = NULL; + struct dirent *de = NULL; + FILE *f = NULL; + cl_ulong *ret = NULL; + + + *num_indices = 0; + memset(filename, 0, filename_size); + + + /* Go through all *.index files in the current directory and find any that match + * the hash passed to us. If found, we already pre-computed the values. */ + d = opendir("."); + if (d == NULL) { + fprintf(stderr, "Can't open current directory.\n"); + exit(-1); + } + while ((de = readdir(d)) != NULL) { + if (str_ends_with(de->d_name, ".index")) { + /*printf("Looking at %s\n", de->d_name);*/ + + /* Open this *.index file. */ + f = fopen(de->d_name, "rb"); + if (f == NULL) { + fprintf(stderr, "Failed to open %s for reading.\n", de->d_name); + exit(-1); + } + + file_size = get_file_size(f); + + /* Read the index data.*/ + if ((file_size >= sizeof(buf)) || (fread(buf, sizeof(char), file_size, f) != file_size)) { + fprintf(stderr, "Failed to read index data: %s\n", strerror(errno)); + exit(-1); + } + + FCLOSE(f); + + /* We found an index file that matches all our parameters. Open its related + * file containing precomputed indices. */ + if (strcmp(index_data, buf) == 0) { + + /* Set the filename to the *.index file for the caller. 
*/ + strncpy(filename, de->d_name, filename_size - 1); + de->d_name[strlen(de->d_name) - 6] = '\0'; + + f = fopen(de->d_name, "rb"); + if (f == NULL) { + fprintf(stderr, "Failed to open precomputed index file: %s\n", de->d_name); + exit(-1); + } + + file_size = get_file_size(f); + + if (file_size % sizeof(cl_ulong) != 0) { + fprintf(stderr, "Precomputed indices file is not a multiple of %"PRIu64": %u\n", sizeof(cl_ulong), file_size); + exit(-1); + } + + *num_indices = file_size / sizeof(cl_ulong); + + ret = calloc(*num_indices, sizeof(cl_ulong)); + if (ret == NULL) { + fprintf(stderr, "Failed to create indices buffer.\n"); + exit(-1); + } + + if (fread(ret, sizeof(cl_ulong), *num_indices, f) != *num_indices) { + fprintf(stderr, "Failed to read indices file.\n"); + exit(-1); + } + FCLOSE(f); + + break; + } + } + } + closedir(d); d = NULL; + return ret; +} + + +void search_tables(char *dir_name, precomputed_and_potential_indices *ppi, rt_parameters *rt_params, thread_args *args) { + char filepath[512] = {0}; + DIR *dir = NULL; + struct dirent *de = NULL; + struct stat st; + + + dir = opendir(dir_name); + if (dir == NULL) /* This directory may not allow the current process permission. */ + return; + + while ((de = readdir(dir)) != NULL) { + + /* Create an absolute path to this entity. */ + filepath_join(filepath, sizeof(filepath), dir_name, de->d_name); + + /* If this is a directory, recurse into it. */ + if ((strcmp(de->d_name, ".") != 0) && (strcmp(de->d_name, "..") != 0) && (stat(filepath, &st) == 0) && S_ISDIR(st.st_mode)) { + search_tables(filepath, ppi, rt_params, args); + + /* If we're searching for rainbowtable parameters, and successfully parsed them + * in the recursive call, we're done. */ + if ((rt_params != NULL) && rt_params->parsed) + return; + + /* If this is a compressed or uncompressed rainbow table, process it! 
*/ + } else if (str_ends_with(de->d_name, ".rt") || str_ends_with(de->d_name, ".rtc")) { + + /* If the caller is only interested in rainbow table parameters... */ + if (rt_params != NULL) { + + /* Try to parse them from this file name. On success, return immediately + * (no further processing needed), otherwise continue searching until the + * first valid set of parameters is found. */ + parse_rt_params(rt_params, de->d_name); + if (rt_params->parsed) + return; + + } else { + long file_size = 0; + FILE *f = NULL; + cl_ulong *rainbow_table = NULL; + unsigned int num_chains = 0, num_uncracked = 0; + struct timespec start_time_table = {0}, start_time_io = {0}; + double time_io_this_table = 0.0; + precomputed_and_potential_indices *ppi_cur = ppi; + unsigned int is_uncompressed_table = 0; + + + /* Count the number of uncracked hashes we have left. */ + while (ppi_cur != NULL) { + if (ppi_cur->plaintext == NULL) + num_uncracked++; + + ppi_cur = ppi_cur->next; + } + + /* If all the hashes were cracked, there's no need to continue processing + * tables. */ + if (num_uncracked == 0) { + printf("All hashes cracked. Skipping rest of tables.\n"); + break; + } + + if (str_ends_with(de->d_name, ".rtc")) { + int ret = 0; + + current_table++; + printf("[%u of %u] Processing compressed table: %s...\n", current_table, total_tables, filepath); fflush(stdout); + + start_timer(&start_time_table); /* For all table processes. */ + start_timer(&start_time_io); /* For loading the table only. */ + if ((ret = rtc_decompress(filepath, &rainbow_table, &num_chains)) != 0) { + fprintf(stderr, "Error while decompressing RTC table: %d\n", ret); + exit(-1); + } + time_io_this_table = get_elapsed(&start_time_io); + time_io += time_io_this_table; + } else { + current_table++; + printf("[%u of %u] Processing uncompressed table: %s...\n", current_table, total_tables, filepath); fflush(stdout); + is_uncompressed_table = 1; + + start_timer(&start_time_table); /* For all table processes. 
*/ + start_timer(&start_time_io); /* For loading the table only. */ + f = fopen(filepath, "rb"); + if (f != NULL) { + file_size = get_file_size(f); + + if ((file_size % (sizeof(cl_ulong) * 2) == 0) && (file_size > 0)) { + unsigned int num_longs = file_size / sizeof(cl_ulong); + + rainbow_table = calloc(num_longs, sizeof(cl_ulong)); + if (rainbow_table == NULL) { + fprintf(stderr, "Failed to allocate %"PRIu64" bytes for rainbow table!: %s\n", num_longs * sizeof(cl_ulong), filepath); + exit(-1); + } + + if (fread(rainbow_table, sizeof(cl_ulong), num_longs, f) != num_longs) { + fprintf(stderr, "Error while reading rainbow table: %s\n", strerror(errno)); + exit(-1); + } + time_io_this_table = get_elapsed(&start_time_io); + time_io += time_io_this_table; + num_chains = num_longs / 2; + } else + fprintf(stderr, "Rainbow table size is not a multiple of %"PRIu64": %ld\n", sizeof(cl_ulong) * 2, file_size); + } else + fprintf(stderr, "Could not open file for reading: %s", strerror(errno)); + + FCLOSE(f); + } + + if (rainbow_table != NULL) { + char time_io_str[64] = {0}; + unsigned int skip_table = 0; + + seconds_to_human_time(time_io_str, sizeof(time_io_str), time_io_this_table); + printf(" Table loaded in %s.\n", time_io_str); fflush(stdout); + + /* If the table is uncompressed (*.rt), then there's a possibility its unsorted on accident. We will + * verify them first to make sure. */ + if (is_uncompressed_table == 1) { + struct timespec start_time_verify = {0}; + + printf("Verifying uncompressed table is suitable for lookups..."); fflush(stdout); + start_timer(&start_time_verify); + if (!verify_rainbowtable(rainbow_table, num_chains, VERIFY_TABLE_TYPE_LOOKUP, 0, 0, NULL)) { + fprintf(stderr, "\nError: %s is not a valid table suitable for lookups! (Hint: it may not be sorted.) Skipping...\n\n", filepath); fflush(stderr); + FREE(rainbow_table); + skip_table = 1; /* Skip further processing on this table only. 
*/ + } + printf("done in %.1f seconds.\n", get_elapsed(&start_time_verify)); fflush(stdout); + } + if (skip_table == 0) { + rt_binary_search(rainbow_table, num_chains, ppi); + num_chains_processed += num_chains; + num_tables_processed++; + FREE(rainbow_table); + + /* Check endpoint matches. */ + check_false_alarms(ppi, args); + + printf(" Table fully processed in %.1f seconds.\n\n", get_elapsed(&start_time_table)); fflush(stdout); + + /* We checked the potential matches above, so there's nothing else to with + * them. */ + clear_potential_start_indices(ppi); + } + } + } + } + } + + closedir(dir); dir = NULL; +} + + +int main(int ac, char **av) { + char *rt_dir = NULL, *single_hash = NULL, *filename = NULL, *file_data = NULL, **hashes = NULL, *line = NULL, *pot_file_data = NULL; + unsigned int i = 0, j = 0, num_hashes = 0; + FILE *f = NULL; + struct stat st = {0}; + thread_args *args = NULL; + struct timespec start_time = {0}; + char time_precomp_str[64] = {0}, time_io_str[64] = {0}, time_searching_str[64] = {0}, time_falsealarms_str[64] = {0}, time_total_str[64] = {0}, time_per_table_str[64] = {0}; + + rt_parameters rt_params = {0}; + + cl_platform_id platforms[MAX_NUM_PLATFORMS] = {0}; + cl_device_id devices[MAX_NUM_DEVICES] = {0}; + + cl_uint num_platforms = 0, num_devices = 0; + + precomputed_and_potential_indices *ppi_head = NULL, *ppi_cur = NULL; + + + ENABLE_CONSOLE_COLOR(); + PRINT_PROJECT_HEADER(); + setlocale(LC_NUMERIC, ""); + if ((ac < 3) || (ac > 5)) + print_usage_and_exit(av[0], -1); + else if ((ac == 5) && (strcmp(av[3], "-gws") != 0)) + print_usage_and_exit(av[0], -1); + + + /* Initialize the devices. 
*/ + get_platforms_and_devices(MAX_NUM_PLATFORMS, platforms, &num_platforms, MAX_NUM_DEVICES, devices, &num_devices, VERBOSE); + + /* Check the device type and set flags.*/ + if (num_devices > 0) { + char device_vendor[128] = {0}; + + get_device_str(devices[0], CL_DEVICE_VENDOR, device_vendor, sizeof(device_vendor) - 1); + if (strstr(device_vendor, "Advanced Micro Devices") != NULL) + is_amd_gpu = 1; + } + + /* Print a warning on Windows 7 systems, as they are observed to be highly + * unstable for performing lookups on. */ + PRINT_WIN7_LOOKUP_WARNING(); + + /* Check that this system has sufficient RAM. */ + CHECK_MEMORY_SIZE(); + + /* Initialize the barrier. This is used in some cases to ensure kernels across + * multiple devices run concurrently. */ + if (pthread_barrier_init(&barrier, NULL, num_devices) != 0) { + fprintf(stderr, "pthread_barrier_init() failed.\n"); + exit(-1); + } + + printf("Binary searching will be done with %u threads.\n", get_num_cpu_cores()); + + /* First arg is the directory (and/or sub-directories) containing rainbow tables. */ + rt_dir = av[1]; + + /* The default rainbowcrackalack.pot file can be overridden with a third argument. + * This is undocumented since its probably only useful for automated testing. */ + if (ac == 4) { + strncpy(jtr_pot_filename, av[3], sizeof(jtr_pot_filename)); + strncpy(hashcat_pot_filename, av[3], sizeof(hashcat_pot_filename)); + strncat(hashcat_pot_filename, ".hashcat", sizeof(hashcat_pot_filename)); + } else if (ac == 5) + user_provided_gws = (unsigned int)atoi(av[4]); + + + /* Open the JTR pot file for reading. We will check the hash(es) to see if any are + * already cracked. 
*/ + f = fopen(jtr_pot_filename, "rb"); + if (f) { + unsigned long file_size = get_file_size(f); + + pot_file_data = calloc(file_size, sizeof(char)); + if (pot_file_data == NULL) { + fprintf(stderr, "Failed to allocate buffer for pot file.\n"); + exit(-1); + } + + if (fread(pot_file_data, sizeof(char), file_size, f) != file_size) { + fprintf(stderr, "Error reading pot file: %s\n", strerror(errno)); + exit(-1); + } + } else { + /* Allocate an empty string. */ + pot_file_data = calloc(1, sizeof(char)); + if (pot_file_data == NULL) { + fprintf(stderr, "Failed to allocate buffer for pot file.\n"); + exit(-1); + } + } + + FCLOSE(f); + + /* Check if the second arg is a hash or a file containing hashes. */ + if (stat(av[2], &st) == 0) + filename = av[2]; + else { + single_hash = av[2]; + + /* If this hash is already in the pot file, then there's nothing else to do. */ + if (pot_file_data && strstr(pot_file_data, single_hash)) { + printf("Specified hash has already been cracked! Check %s.\n", jtr_pot_filename); + exit(0); + } + } + + if (filename) { + FILE *f = fopen(filename, "rb"); + unsigned int previously_cracked = 0; + + + if (f == NULL) { + fprintf(stderr, "Error while opening file for reading: %s\n", filename); + goto err; + } + + file_data = calloc(st.st_size + 1, sizeof(char)); + if (file_data == NULL) { + fprintf(stderr, "Error while allocating buffer for hash file.\n"); + goto err; + } + + if (fread(file_data, sizeof(char), st.st_size, f) != st.st_size) { + fprintf(stderr, "Error while reading hash file.\n"); + goto err; + } + + FCLOSE(f); + + /* Count the number of newlines in the file so we know how large to make the + * hash array. */ + for (i = 0; i < st.st_size; i++) { + if (file_data[i] == '\n') + num_hashes++; + } + num_hashes++; /* In case the last line doesn't end with an LF. 
*/ + + hashes = calloc(num_hashes, sizeof(char *)); + if (hashes == NULL) { + fprintf(stderr, "Error while allocating buffer for hashes.\n"); + goto err; + } + + /* Tokenize the hash file by line. Store each hash in the array. */ + num_hashes = 0; + line = strtok(file_data, "\n"); + while (line) { + + /* Skip empty lines. */ + if (strlen(line) > 0) { + + /* Skip previously-cracked hashes. */ + if (strstr(pot_file_data, line) != NULL) + previously_cracked++; + else { + hashes[num_hashes] = strdup(line); + if (hashes[num_hashes] == NULL) { + fprintf(stderr, "Error while allocating buffer for hashes.\n"); + goto err; + } + num_hashes++; + } + + line = strtok(NULL, "\n"); + } + } + + FREE(file_data); + + if (num_hashes == 0) { + printf("All hashes have already been cracked! Check %s.\n", jtr_pot_filename); + exit(0); + } else { + printf("Loaded %u of %u uncracked hashes from %s.\n", num_hashes, num_hashes + previously_cracked, filename); fflush(stdout); + } + + } else { /* A single hash was provided. */ + hashes = calloc(1, sizeof(char *)); + if (hashes == NULL) { + fprintf(stderr, "Error while allocating buffer for hashes.\n"); + goto err; + } + hashes[0] = strdup(single_hash); + num_hashes = 1; + } + + /* We're done checking the pot file for previously-cracked hashes. */ + FREE(pot_file_data); + + /* Look through the supplied rainbow table directory, and infer the parameters via + * the filenames. */ + search_tables(rt_dir, NULL, &rt_params, NULL); + if (!rt_params.parsed) { + fprintf(stderr, "Failed to infer rainbow table parameters from files in directory. Ensure that valid rainbow table files are in %s (and/or its sub-directories).\n", rt_dir); + exit(-1); + } + + /* At this time, only NTLM hashes are supported. */ + if (rt_params.hash_type != HASH_NTLM) { + fprintf(stderr, "Unfortunately, only NTLM hashes are supported at this time. Terminating.\n"); + exit(-1); + } + + /* Ensure that valid hashes were provided. 
*/ + if (rt_params.hash_type == HASH_NTLM) { + for (i = 0; i < num_hashes; i++) { + if (strlen(hashes[i]) != 32) { + fprintf(stderr, "Error: invalid NTLM hash (length is not 32!): %s\n", hashes[i]); + exit(-1); + } + } + } + + args = calloc(num_devices, sizeof(thread_args)); + if (args == NULL) { + fprintf(stderr, "Error while creating thread arg array.\n"); + goto err; + } + + /* We set most of the args once, since all GPUs & hashes need all the same + * parameters. */ + for (i = 0; i < num_devices; i++) { + args[i].hash_type = rt_params.hash_type; + args[i].hash_name = rt_params.hash_name; + args[i].hash = NULL; /* Filled in below. */ + args[i].charset = validate_charset(rt_params.charset_name); + args[i].charset_name = rt_params.charset_name; + args[i].plaintext_len_min = rt_params.plaintext_len_min; + args[i].plaintext_len_max = rt_params.plaintext_len_max; + args[i].table_index = rt_params.table_index; + args[i].reduction_offset = rt_params.reduction_offset; + args[i].chain_len = rt_params.chain_len; + args[i].total_devices = num_devices; + args[i].gpu.device_number = i; + args[i].gpu.device = devices[i]; + get_device_uint(args[i].gpu.device, CL_DEVICE_MAX_COMPUTE_UNITS, &(args[i].gpu.num_work_units)); + } + + start_timer(&start_time); + for (i = 0; i < num_hashes; i++) { + printf("Pre-computing hash #%u: %s...\n", i + 1, hashes[i]); fflush(stdout); + + for (j = 0; j < num_devices; j++) + args[j].hash = hashes[i]; + + precompute_hash(num_devices, args, &ppi_head); + } + time_precomp = get_elapsed(&start_time); + seconds_to_human_time(time_precomp_str, sizeof(time_precomp_str), time_precomp); + printf("\nPre-computation finished in %s.\n\n", time_precomp_str); fflush(stdout); + + /* If too much memory is taken up by the pre-computed indices, print a warning to the + * user. Strange crashes in the OpenCL functions can occur when memory is exhausted, + * and its not obvious that this is the culprit. 
*/ + check_memory_usage(); + + /* Using the pre-computed end indices, perform a binary search on all rainbow tables + * in the target directory. Any matching indices will trigger false alarm checks. */ + total_tables = count_tables(rt_dir); + search_tables(rt_dir, ppi_head, NULL, args); + + seconds_to_human_time(time_precomp_str, sizeof(time_precomp_str), time_precomp); + seconds_to_human_time(time_io_str, sizeof(time_io_str), time_io); + seconds_to_human_time(time_searching_str, sizeof(time_searching_str), time_searching); + seconds_to_human_time(time_falsealarms_str, sizeof(time_falsealarms_str), time_falsealarms); + seconds_to_human_time(time_total_str, sizeof(time_total_str), time_precomp + time_io + time_searching + time_falsealarms); + seconds_to_human_time(time_per_table_str, sizeof(time_per_table_str), (double)(time_precomp + time_io + time_searching + time_falsealarms) / (double)num_tables_processed); + + printf("\n\n RAINBOW CRACKALACK LOOKUP REPORT\n\n"); + + if (num_cracked == 0) + printf("\nNo hashes were cracked. 
:(\n\n\n"); + else { + printf(" * Crack Summary *\n\n"); + printf(" Of the %u hashes loaded, %u were cracked, or %.2f%%.\n\n", num_hashes, num_cracked, ((double)num_cracked / (double)num_hashes) * 100); + + printf(" Results\n -------\n"); + ppi_cur = ppi_head; + while(ppi_cur != NULL) { + if (ppi_cur->plaintext != NULL) + printf(" %s %s\n", ppi_cur->hash, ppi_cur->plaintext); + + ppi_cur = ppi_cur->next; + } + printf(" -------\n\n"); + printf(" Results have been written in JTR format to: %s\n", jtr_pot_filename); + printf(" Results have been written in hashcat format to: %s\n\n\n", hashcat_pot_filename); + } + + printf(" * Time Summary *\n\n Precomputation: %s\n I/O: %s\n Searching: %s\n False alarm checks: %s\n\n Total: %s\n\n\n", time_precomp_str, time_io_str, time_searching_str, time_falsealarms_str, time_total_str); + + printf(" * Statistics *\n\n Number of tables processed: %u\n Number of false alarms: %" QUOTE PRIu64"\n Number of chains processed: %" QUOTE PRIu64"\n\n Time spent per table: %s\n False alarms checked per second: %" QUOTE ".1f\n\n False alarms per no. 
chains: %.5f%%\n Successful cracks per false alarms: %.5f%%\n Successful cracks per total chains: %.8f%%\n\n\n", num_tables_processed, num_falsealarms, num_chains_processed, time_per_table_str, (double)num_falsealarms / time_falsealarms, ((double)num_falsealarms / (double)num_chains_processed) * 100.0, ((double)num_cracked / (double)num_falsealarms) * 100.0, ((double)num_cracked / (double)num_chains_processed) * 100.0); + + free_precomputed_and_potential_indices(&ppi_head); + free_loaded_hashes(hashes, &num_hashes); + FREE(args); + pthread_barrier_destroy(&barrier); + return 0; + + err: + FCLOSE(f); + FREE(file_data); + free_precomputed_and_potential_indices(&ppi_head); + free_loaded_hashes(hashes, &num_hashes); + FREE(args); + pthread_barrier_destroy(&barrier); + return -1; +} diff --git a/crackalack_rtc2rt.c b/crackalack_rtc2rt.c new file mode 100644 index 0000000..b07d7f8 --- /dev/null +++ b/crackalack_rtc2rt.c @@ -0,0 +1,66 @@ +/* + * Rainbow Crackalack: crackalack_rtc2rt.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
/*
 * Converts a compressed RTC rainbow table into an uncompressed RT file.
 *
 * Usage: <program> [rtc file input] [rt file output]
 *
 * The input file is decompressed with rtc_decompress(), which hands back a
 * table of (start index, end index) pairs, i.e. two uint64_t values per chain.
 * The pairs are then written to the output file verbatim, in table order.
 *
 * Returns 0 on success, or -1 if the arguments are wrong, decompression
 * fails, the output file cannot be opened, or the output cannot be written
 * completely.
 */
int main(int ac, char **av) {
  uint64_t *uncompressed_table = NULL;
  unsigned int num_chains = 0;
  char *rtc_filename_input = NULL, *rt_filename_output = NULL;
  size_t num_values = 0;
  int ret = 0;
  FILE *f = NULL;


  ENABLE_CONSOLE_COLOR();
  PRINT_PROJECT_HEADER();
  if (ac != 3) {
    fprintf(stderr, "Usage: %s [rtc file input] [rt file output]\n", av[0]);
    return -1;
  }

  rtc_filename_input = av[1];
  rt_filename_output = av[2];

  ret = rtc_decompress(rtc_filename_input, &uncompressed_table, &num_chains);
  if (ret != 0) {
    fprintf(stderr, "Error while uncompressing RTC file: %s; error code: %d\n", rtc_filename_input, ret);
    return -1;
  }

  f = fopen(rt_filename_output, "wb");
  if (!f) {
    fprintf(stderr, "Error: could not open %s for writing.\n", rt_filename_output);
    return -1;
  }

  /* Each chain is a pair of 64-bit values.  Widen before multiplying so the
   * element count cannot wrap in unsigned int arithmetic. */
  num_values = (size_t)num_chains * 2;

  /* Write the whole table in one call and verify that every value made it to
   * the stream; a short write (e.g. disk full) must not be reported as
   * success. */
  if ((num_values > 0) && (fwrite(uncompressed_table, sizeof(uint64_t), num_values, f) != num_values)) {
    fprintf(stderr, "Error: could not write all chains to %s.\n", rt_filename_output);
    fclose(f);
    return -1;
  }

  /* fclose() flushes buffered data, so it can fail for a write stream too. */
  if (fclose(f) != 0) {
    fprintf(stderr, "Error: could not finish writing %s.\n", rt_filename_output);
    return -1;
  }

  printf("Successfully uncompressed %u chains in RTC file \"%s\" to RT file \"%s\".\n", num_chains, rtc_filename_input, rt_filename_output);
  return 0;
}
import shutil, sys, time
if sys.version_info.major != 3:
    print('Error: you must invoke this script with python3, not python.')
    # sys.exit() rather than the bare exit() helper: the latter is injected by
    # the 'site' module and is not guaranteed to exist.
    sys.exit(-1)

import glob, hashlib, os, platform, subprocess, tempfile

# ANSI escape sequences used to colorize console output.
CLR = "\033[0m"
WHITEB = "\033[1;97m"  # White + bold
ITALICIZE = "\033[3m"
GREEN = "\033[0;32m"
RED = "\033[0;31m"
GREENB = "\033[1;32m"  # Green + bold
REDB = "\033[1;31m"  # Red + bold

# File names of the programs under test (resolved to absolute paths later).
GEN_PROG_NAME = 'crackalack_gen'
LOOKUP_PROG_NAME = 'crackalack_lookup'

# True when running under Cygwin, where paths handed to the native programs
# must first be translated.
CYGWIN = platform.system().startswith('CYGWIN')

# When True, output of the programs under test is not suppressed.
VERBOSE = False


# Generation tests to run.  The crackalack_gen program is given the specified
# arguments, and the output table must match the specified sha256 hash.
# Python doesn't allow lists to be dictionary keys, so the hashes weirdly come first.
GEN_TESTS = {

# The LM tests below are valid, but LM chain generation is useless these days
# (LM hashes are mostly better off being brute-forced).
    #'8d95bf8d6e342f0b035b8ccb9c8b21e907f7ad7aa82390727ff131284f883116':
    #    ['lm', 'alpha-numeric-symbol32-space', '1', '7', '0', '15200', '20480', '0'],
    #'90a66f17ab7ecc7592d2294f9bbe7cf93024853ef5f758fd62d71628b41e13c9':
    #    ['lm', 'alpha-numeric-symbol32-space', '1', '7', '0', '15200', '20480', '3'],
    #'d0d8d33c48a47430dd80ca2d105136a3af84030d060ea9b032d7cbf132359489':
    #    ['lm', 'alpha-numeric-symbol32-space', '1', '7', '3', '15200', '20480', '0'],
    #'b1bc9339f64a54585deea18190c3908a19a57d9f97f20354a4ba09c4e38db208':
    #    ['lm', 'alpha', '1', '7', '5', '15200', '20480', '5'],
    #'c7b78031861588d6bad6011e6f6ebdf8e261c6d93fb18ef34e79eace22bed7fa':
    #    ['lm', 'mixalpha-numeric', '1', '7', '8', '15200', '20480', '8'],

# The test below is valid, but we have too many as it is, and it's better to have
# coverage on the real-world parameters with 8 and 9 character tables.
#    'e8f8b8dca8f8b8ea31a8faa0ad6160edcb9a0413e45c3acf2d0bd6a4e5780dbd':
#        ['ntlm', 'ascii-32-65-123-4', '1', '8', '0', '1111', '8192', '0'],

    # Generic NTLM 8-character generation.
    'cad901a98addf66cd05460134d4baf6240c00fa0e17929a33a56c61c46b567bc':
        ['ntlm', 'ascii-32-95', '8', '8', '0', '10240', '10240', '333'],

    # Optimized NTLM8 kernel test.
    '3203f5596449254dc8f0639a8bbf54555c911aebc69d018b1e45cf0e8be27086':
        ['ntlm', 'ascii-32-95', '8', '8', '0', '422000', '10240', '0'],

    'd2cc84717ad29e7018ba4ef41f36d8f98b1b55bc8cf1613797dd0ca0143708f7':
        ['ntlm', 'ascii-32-95', '9', '9', '0', '10240', '10240', '666'],

    # Optimized NTLM9 kernel test.
#    'd80a48c8216ed71d50d35da735a51d8c32c1c779296142fc82582f8ff643d153':
#        ['ntlm', 'ascii-32-95', '9', '9', '0', '1000000', '10240', '0'],
}

# The 'table' key is the filename to use to make a fake rainbow table (as the
# precomputation parameters will be inferred from it).  The 'precalc_hash' key
# is the sha256 hash of the rcracki.precalc.0 file.  The 'index_hash' key is the
# sha256 hash of the rcracki.precalc.0.index file.
PRECOMP_TESTS = {
    0: {'table': 'ntlm_loweralpha#8-8_0_100x1024_0.rt', 'password_hash': '49e5bfaab1be72a6c5236f15736a3e15', 'precalc_hash': '19d665d6181415aa70f8c5487585a778526b4ca39ccb5dcfb04f7a0bc508593b', 'index_hash': '0a4a8f162529d5e41f8df5e0a5438ff3890ca7f7dbbec2599eac8d15ca0c2e03'},
    1: {'table': 'ntlm_ascii-32-95#8-8_16_100x1024_0.rt', 'password_hash': '49e5bfaab1be72a6c5236f15736a3e15', 'precalc_hash': '79b4cf5ccae26544d35aacd424fd97c39e024c9e5430c6c546e99f6dfd2dccf0', 'index_hash': '183c7687853f9344e8c047bf3d5f30b627988a233e2a6cafb71ee00151aa536f'},
    2: {'table': 'ntlm_ascii-32-95#9-9_1024_100x1024_0.rt', 'password_hash': '49e5bfaab1be72a6c5236f15736a3e15', 'precalc_hash': '8c201e0686f2fe32b78a9c207b6f6a753c40fda689df88e5259a2e854bc68c12', 'index_hash': 'ea7081e000872ec06a115e068643048dcdecda047fe665ef6ca74ee68dd7e384'},
}

# Name of the pot file that the lookup tests create inside the test directory.
pot_filename = 'temp.pot'


# Ensures that the precalc files match their expected values.  If the expected values
# are both None, then the precalc files must not exist.  Returns True when the
# expected values match, otherwise False.
#
#   temp_dir:  directory holding the rcracki.precalc.* files.
#   index:     numeric suffix of the precalc file pair to check.
#   precalc_index_file_hash_expected:  sha256 of rcracki.precalc.<index>.index.
#   precalc_file_hash_expected:        sha256 of rcracki.precalc.<index>.
#   delete:    when True, remove both files after a successful check.
def check_precalc_files(temp_dir, index, precalc_index_file_hash_expected, precalc_file_hash_expected, delete=False):
    precalc_file = os.path.join(temp_dir, "rcracki.precalc.%u" % index)
    precalc_index_file = os.path.join(temp_dir, "rcracki.precalc.%u.index" % index)

    # If the expected hashes are both None, then the files are expected to not exist.
    if precalc_index_file_hash_expected is None and precalc_file_hash_expected is None:
        precalc_file_exists = os.path.exists(precalc_file)
        precalc_index_file_exists = os.path.exists(precalc_index_file)
        if precalc_file_exists or precalc_index_file_exists:
            print("FAILED: precalc and/or index file exists: %s: %r; %s: %r" % (precalc_file, precalc_file_exists, precalc_index_file, precalc_index_file_exists))
            return False
        return True

    # Check that the precalc and precalc index files are properly created.
    precalc_file_hash_actual = get_hash(precalc_file)
    if precalc_file_hash_actual != precalc_file_hash_expected:
        print("FAILED: precalc file hash mismatch!: %s\n\tExpected: %s\n\tActual: %s" % (precalc_file, precalc_file_hash_expected, precalc_file_hash_actual))
        return False

    precalc_index_file_hash_actual = get_hash(precalc_index_file)
    if precalc_index_file_hash_actual != precalc_index_file_hash_expected:
        print("FAILED: precalc index file hash mismatch!: %s\n\tExpected: %s\n\tActual: %s" % (precalc_index_file, precalc_index_file_hash_expected, precalc_index_file_hash_actual))
        return False

    # Delete the precalc files if the caller prefers.
    if delete:
        os.unlink(precalc_file)
        os.unlink(precalc_index_file)

    return True


# Checks that the pot file contains all the specified plaintexts (if None or empty,
# the pot file is expected to not exist).  Every plaintext must be matched by a
# distinct line, and no unmatched lines may remain.  Returns True if the expected
# case is found, otherwise False.
def check_pot_file(pot_filename, plaintexts):

    # If no hash was specified, then the pot file must not exist.
    if (plaintexts is None) or (len(plaintexts) == 0):
        if os.path.exists(pot_filename):
            print("FAILED: pot file exists when it shouldn't!: %s" % pot_filename)
            return False
        return True

    with open(get_real_path(pot_filename), 'r') as f:
        pot_lines = list(f)

    # Work on a copy, so the full file contents remain available for error output.
    unmatched_lines = list(pot_lines)

    for plaintext in plaintexts:
        # Only lines that have not been claimed by an earlier plaintext are
        # candidates.  Searching the full list instead would let two plaintexts
        # claim the same line, and the second removal would raise ValueError.
        found = False
        for pot_line in unmatched_lines:
            # Strip the line terminator only; a plaintext may legitimately end
            # with a space.
            if pot_line.rstrip('\r\n').endswith(plaintext):
                found = True

                # Since this line is matched, remove it from the unmatched list.
                unmatched_lines.remove(pot_line)
                break

        if not found:
            print("FAILED: pot file does not have the following plaintext: %s\npot file: [%s]" % (plaintext, "\n".join(pot_lines)))
            return False

    # Ensure that extra lines aren't in the pot file.
    if len(unmatched_lines) > 0:
        print("FAILED: extra entries found in pot file!: %s" % ''.join(unmatched_lines))
        return False

    return True


# Creates a sorted rainbow table that is either completely fake, or mostly fake with
# a few valid chains sprinkled in.
#
#   rt_dir, filename:  where to write the table.
#   num_chains:        number of chains written to the file.
#   real_chains:       optional sequence of (start index, end index) integer pairs
#                      to include; the remainder is filled with random chains.
#
# Each chain is written as two 8-byte little-endian values (start, then end), with
# chains ordered by end index.  Returns the path of the file written.
def create_rt_table(rt_dir, filename, num_chains, real_chains=()):
    path = os.path.join(rt_dir, filename)

    start_time = time.time()

    # Add the user-supplied authentic chains.  Convert the start index to bytes, but
    # leave the end index as an integer (so they can be sorted below).
    chains = [(chain[0].to_bytes(8, byteorder='little'), chain[1]) for chain in real_chains]

    # Generate random start & end indices for the rest of the table.  One bulk
    # request for random bytes is much cheaper than two calls per chain.
    num_fake_chains = max(0, num_chains - len(chains))
    random_bytes = os.urandom(16 * num_fake_chains)
    for offset in range(0, len(random_bytes), 16):
        start_bytes = random_bytes[offset:offset + 8]
        end_index = int.from_bytes(random_bytes[offset + 8:offset + 16], byteorder='little')
        chains.append((start_bytes, end_index))

    # Sort the table by the end indices.
    chains.sort(key=lambda chain: chain[1])

    # Write the chains to the specified file path.
    with open(path, 'wb') as f:
        for start_bytes, end_index in chains[:num_chains]:
            f.write(start_bytes)
            f.write(end_index.to_bytes(8, byteorder='little'))

    # If generating the table took too long, warn the user.
    total_time = int(time.time() - start_time)
    if total_time > 3:
        print("\nWARNING: creation of fake table %s with %u chains took %u seconds.\n" % (path, num_chains, total_time))

    return path


# Returns the SHA-256 hash of a file, or None if the file does not exist.
def get_hash(filename):
    try:
        with open(filename, 'rb') as f:
            return hashlib.sha256(f.read()).hexdigest()
    except FileNotFoundError:
        return None


# Do the precomputation tests.  For every entry in PRECOMP_TESTS, a fake table is
# created, the lookup program is run against it, and the resulting precalc files
# are compared with their expected hashes.  Returns True if all tests passed.
def do_precomp_tests(temp_dir):
    all_passed = True
    for test_data in PRECOMP_TESTS.values():
        # Only the rainbow table directory is needed here; the pot file path is not.
        _, rt_dir = begin_lookup_test(temp_dir)

        table_filename = test_data['table']
        password_hash = test_data['password_hash']
        precalc_hash_expected = test_data['precalc_hash']
        index_hash_expected = test_data['index_hash']

        # Create a fake rainbow table and run the lookup program against it.  The
        # precomputed files will be generated using parameters inferred from the
        # table's filename.  Also, the lookup will fail, leaving the precompute files
        # behind for us to examine.
        fake_table = create_rt_table(rt_dir, table_filename, 1024)

        print('Precomputing hash %s against %s... ' % (password_hash, table_filename), end='', flush=True)
        run_lookup(rt_dir, password_hash)
        os.unlink(fake_table)

        if not check_precalc_files(temp_dir, 0, index_hash_expected, precalc_hash_expected):
            all_passed = False
        else:
            print(" %spassed.%s" % (GREEN, CLR))

    return all_passed


# Returns the path in the form the native programs expect: on Cygwin it is
# translated with 'cygpath -w'; elsewhere it is returned unchanged.
def get_real_path(path):
    if not CYGWIN:
        return path

    converted = subprocess.run(['cygpath', '-w', path], stdout=subprocess.PIPE)
    return converted.stdout.decode('ascii').strip()


# Run the lookup program with the specified rainbow table directory and password hash
# (or file path to password hashes).  If pot_filepath is given, it is passed as a
# third argument.  The program's exit status is not examined; callers judge the
# outcome from the files it leaves behind.
def run_lookup(rt_dir, password_hash, pot_filepath=None):

    # If the password hash is actually a file on disk, translate it to the real path (on Cygwin).
    if os.path.exists(password_hash):
        password_hash = get_real_path(password_hash)

    args = [lookup_prog_path, get_real_path(rt_dir), password_hash]

    if pot_filepath is not None:
        args.append(get_real_path(pot_filepath))

    # Discard the program's output unless verbose mode is on, in which case it
    # is inherited (None) and shows up on our stdout and stderr.
    output = None if VERBOSE else subprocess.DEVNULL

    subprocess.run(args, stdout=output, stderr=output)


# Do the lookup tests.  Every test is run even if an earlier one fails.  Returns
# True if all of them passed.
def do_lookup_tests(temp_dir):
    lookup_tests = (
        do_lookup_test_1,
        do_lookup_test_2,
        do_lookup_test_3,
        do_lookup_test_4,
        do_lookup_test_5,
    )

    all_passed = True
    for test_number, lookup_test in enumerate(lookup_tests, start=1):
        if lookup_test(temp_dir):
            print("\t* Lookup test #%u %spassed.%s" % (test_number, GREEN, CLR))
        else:
            print("%sFailed%s lookup test #%u" % (RED, CLR, test_number))
            all_passed = False

    return all_passed


# Run lookup on single hash (via command line) against bogus table, then run it against
# a table with the solution.
def do_lookup_test_1(temp_dir):
    pot_filepath, rt_dir = begin_lookup_test(temp_dir)
    table_filename = 'ntlm_ascii-32-95#8-8_128_100x1024_0.rt'
    password_hash = 'a1ce652747dc7ad8f1a1579f2e5552f9'

    # Create an entirely bogus rainbow table.
    bogus_table_path = create_rt_table(rt_dir, table_filename, 1024)

    # The lookup process will find nothing.
    run_lookup(rt_dir, password_hash, pot_filepath)
    os.unlink(bogus_table_path)

    # Ensure that the pot file does not exist.
    if not check_pot_file(pot_filepath, None):
        return False

    # Check the hashes of the precalc files.
    if not check_precalc_files(temp_dir, 0, 'f8d0743b62efb72fb4e3fc4ecf933974e8904269560be671da57dc4a887390a0', '838d5c9d2b91e46291644e9d7d8fbd726f4572ae724de7f9bf5ad25718bd13a4'):
        return False

    # Create a table with the correct chain.
    real_table = create_rt_table(rt_dir, table_filename, 1024, [(666, 814103150699223)])

    # This time, the lookup will succeed.
    run_lookup(rt_dir, password_hash, pot_filepath)
    os.unlink(real_table)

    # The precalc and precalc index files should not exist, as they should have been
    # deleted upon successful lookup.
    if not check_precalc_files(temp_dir, 0, None, None):
        return False

    # Ensure that the pot file exists and has the correct contents.
    if not check_pot_file(pot_filepath, ['FYpzudMN']):
        return False

    shutil.rmtree(rt_dir)
    return True


# Perform lookup on hash from command line on one table with the solution.
def do_lookup_test_2(temp_dir):
    pot_filepath, rt_dir = begin_lookup_test(temp_dir)

    # Create a table with the correct chain.
    real_table = create_rt_table(rt_dir, 'ntlm_ascii-32-95#8-8_64_100x1024_0.rt', 1024, [(985, 433833498526988)])
    run_lookup(rt_dir, '1ae8e2c70bd95334f716edb522653a44', pot_filepath)
    os.unlink(real_table)

    # The pot file must contain exactly the cracked plaintext.
    return check_pot_file(pot_filepath, ['MEH*^~7F'])
+ hashes_file = os.path.join(temp_dir, "hashes.txt") + with open(hashes_file, 'w') as f: + f.write("cbd0ab7936e84a60cf94ce55ab9c1448\n2627ce94b7adcc0b5be394ec6e2293dc\n76f1948b006c026b606886b39653f812\n4ecc2ad7428a2c641500a58bfd02009f") + + # Create a fake table, and attempt a lookup. + fake_table = create_rt_table(rt_dir, 'ntlm_ascii-32-95#8-8_32_100x1024_0.rt', 16384) + run_lookup(rt_dir, hashes_file, pot_filepath) + os.unlink(fake_table) + + if not check_precalc_files(temp_dir, 0, '77028afda2ec9749dfe5f921267a0cc8d9cb4d36a75b4e1d821f5a67702fba1d', '2ee034e24967c8be383d0ebccc370ebfabe13e75ed1f663eb04925166630f71c'): + return False + if not check_precalc_files(temp_dir, 1, '7afb3c6210f231514ac1b2af0e840d4a687a3bc85dc324c2bb4b50e8bdc743d0', 'f0e08c079910983a3f57513a1f837b4497da9a72ba2bcd939a6435abd8e5e988'): + return False + if not check_precalc_files(temp_dir, 2, '2bab1297cbee537c16a909869f677469fbb08d8df77ec5da05de3126693c6de4', '49d1cf85db647faf87469dc42b8d67ddb781311f73cca81aa79c7c23c17d05e3'): + return False + if not check_precalc_files(temp_dir, 3, '3d64b323a1732f5bb1aa957fe786b13f5fa90efbcc479ac0a40e69029adee307', '45f6259dc0d404e8c1a6afb0012faca5d0edcb4baf13d92a845bab0229e0be3f'): + return False + + # Ensure the pot file is still non-existent. + if not check_pot_file(pot_filepath, None): + return False + + + # Solve one hash out of the four provided. 
+ real_table = create_rt_table(rt_dir, 'ntlm_ascii-32-95#8-8_32_100x1024_0.rt', 16384, [(955, 467938381128153)]) + run_lookup(rt_dir, hashes_file, pot_filepath) + os.unlink(real_table) + + if not check_precalc_files(temp_dir, 0, None, None): + return False + if not check_precalc_files(temp_dir, 1, '7afb3c6210f231514ac1b2af0e840d4a687a3bc85dc324c2bb4b50e8bdc743d0', 'f0e08c079910983a3f57513a1f837b4497da9a72ba2bcd939a6435abd8e5e988'): + return False + if not check_precalc_files(temp_dir, 2, '2bab1297cbee537c16a909869f677469fbb08d8df77ec5da05de3126693c6de4', '49d1cf85db647faf87469dc42b8d67ddb781311f73cca81aa79c7c23c17d05e3'): + return False + if not check_precalc_files(temp_dir, 3, '3d64b323a1732f5bb1aa957fe786b13f5fa90efbcc479ac0a40e69029adee307', '45f6259dc0d404e8c1a6afb0012faca5d0edcb4baf13d92a845bab0229e0be3f'): + return False + + # Ensure the pot file is still non-existent. + if not check_pot_file(pot_filepath, ['v&Uf*Ml\\']): + return False + + + # Solve two hashes out of the four provided. + real_table = create_rt_table(rt_dir, 'ntlm_ascii-32-95#8-8_32_100x1024_1.rt', 16384, [(1655, 478778248563219), (1047, 4236649556986690)]) + run_lookup(rt_dir, hashes_file, pot_filepath) + os.unlink(real_table) + + if not check_precalc_files(temp_dir, 0, None, None): + return False + if not check_precalc_files(temp_dir, 1, None, None): + return False + if not check_precalc_files(temp_dir, 2, None, None): + return False + if not check_precalc_files(temp_dir, 3, '3d64b323a1732f5bb1aa957fe786b13f5fa90efbcc479ac0a40e69029adee307', '45f6259dc0d404e8c1a6afb0012faca5d0edcb4baf13d92a845bab0229e0be3f'): + return False + + # Ensure the pot file is updated. 
+ if not check_pot_file(pot_filepath, ['v&Uf*Ml\\', '\x8a\xd6V\x1c\x13\xbb\x00.\x94,\xb9+j\xb6\x00o\xa0\xb06\xa8\x15\xc1\x00\xf4\xa4\x84\xb7\xab2\xb0\x00z\xb1\xf9m\xc7\x9a\xa5\x00\xa7A*dwx\x9d\x00\x0e\xafM\x16\xe8\n\xb6\x00+\xe8\xd3\xc1\xbd\x8c\xd4\x00\xfco\xf2\xc2\xc1\x06\xd0\x00\xcd\xcd\x87-\xbf\n\xd8\x00\xb4\x93zO\xf0\xef\xc7\x00\xb0\xb19v\xe2\x95\xb6\x00\xdf4\xb3`\n\xdf\xa2\x00\xac\xc5\xc5\x1f}$\x91\x00\x929p\x91\xcc\x95\x8f\x00\xc6HCy\xb1\xf0\xa1\x00%\x17*x-E\x99\x00\x9b\xc0Z\xe8A\xe2\x8c\x00\xb9E\xa0A\x1cV\x85\x00\x15$\xe07\xbd9\x8c\x00\xfb\xec\xc4\xbdXC\xb2\x007\xfa[\xce\xb0=\xd1\x003\xdb\x89\xd62\x8e\xd6\x00Z\xfc\xa0\xcc\xa7s\xbf\x00\xe6\xde\xab\x93\x94\xc4\xc3\x00\xba\x14g?)\xe9\xb6\x00\xb6X\xe8E\x05\x12\xa4\x00\x86\x87Ub\xe7\x8e\x8d\x00\xadO\x93kE\x98\x06\x01L\xf0ke\xd6\x8c\xf9\x00\x8chd\xb3/|\xed\x00\xd1\xe2a*\xa6k\xf7\x00u\xb7,\x7f\x06&\xea\x00\xfe\x9e\xd2\xa7\x97J\xec\x00\x85\xfcON\'\xc9\xfb\x00#C.\xebT\x0f\xf4\x00Xa\xe2\x17\xb6\x95\xf8\x00=\xa4k*\xa7C\xef\x00\xc7\x05\xab^\x02\x12\xfc\x00\xc04\xf9$0\x1a\xef\x00\xc3\n,\x10v\x1e\xe4\x00\x1e\xbc\x80\xc0\xfa\xe9\xe5\x00\xea`\xf1\x93\x08\xc9\xda\x00R\x90\xf2\x1b\x04\xa1\xcc\x00\x07\xe2\x02\xd2\xc4\xb4\xda\x00\x198M\x93\x7f2\xe3\x00\x83E\xc3\xce\x97l\xe6\x00\x0f\xc3{\xaf\x12\xb6\xd4\x00\xb8\x1bs\xbe\xd0\x92\xea\x00\xc5s>oV\x05\xd9\x00W\x91do_\xdb\xed\x00&\x86\xf76\x03Y\xe2\x00\t\xc8&\x9c\xf0\x0b\xcc\x00\x01|\x88\xc6\xa2\x12\xc7\x00\xfduR,\x02G\xbb\x00\xa8x[\xd3#\xad\xbf\x00\x13/\xaa\x04\xbb\xc9\xb6\x00\x1a{\xfa\xc0\x10\x96\xb4\x00A\xd7\xc2\xf5\x0f\x9a\xa3\x00\x17\x90\xed:\xb2\xe0\x96\x00\xd4\xf0\xa6\xb9\x9f\x8d\x90\x00\x88\xa8\xc2B\xaf\\\x84\x00\xd6w\xe6z-\xe7\x9c\x00\xd05J\xc0\xb7\x12\x8e\x00\x8di\xc5r\xcaj\x95\x00\xde\xdftlR\xa4\xa0\x00 
$\xd8c\xc0\x93\x97\x00\xe7KR\xb4\xb8\x84\x96\x00\xa9V\xf0\xe8`;\xb6\x00\x7f\xe2b\xf2\x9bj\xaa\x00\x14\xd5\x0c\x1e|l\xe2\x00b\x08\x8d\xfd6\xf6\xfd\x00*\xe3\xb7\xd9\xf0U\xe8\x001\x84\xf1\x13\xdf\x9c\xd4\x00\x1f\xb8\xf6\xa8\xfb\xfb\xcb\x00q\x0c\xda\x95\x0f\xd5\xc4\x00/\xa4p\x17$K\xb2\x00\xf9x7M\xc0\xf4\xa6\x00v\xc9W\xea)~\x91\x00ro\xe9\xa8\xd2\x92z\x00\xf7L9\xb5\xaaQ\x81\x00\x00\xc2t\xfc\xf5Jj\x00"Z\x8d\xa0?`x\x00\xb5\xe2\xf98\x8d\x91g\x00\xce5\xb6\x02\x9b\xa8a\x00\xd9>\x02\xba\xb4TX\x00[\xaah\x14\x95/Y\x00\xf8\x01\xe0\xc5\xd8NE\x00\xcb\xcc\xb1\x18\xa0E=\x00D\xf4\xbeC<\xcc@\x00k\xb8\x14\x80Xm4\x00\x8a\x9b\xe2\x1d9\\3\x00d\xef\xa0a\x16A\x1d\x00\x90!\x1c\t\xbc\xccF\x00h\x16\x892:Y=\x00\xc8\x8f\xfc\xd1\xd5\xf6)\x00\xa1\xa0,6\xd3F\x13\x00O4\x85\xde\xca-\x00\x00\x0b\x07a\x97}\xb6\x19\x00\xdd~\xd1~\xd7\xb2\x18\x00\xf0f\x0c\xb4\xa1\x7f$\x00;\xc3\x1b\xbf,\xd3\\\x00\xdc\xd7\xcf\xb3*yN\x00\x11\xfch\xc7\xb0|:\x00\x04g@\nD\xf2\'\x00\x96\xe2#\x8f2/\x1d\x00\xbc\xa4N\x1a\xba\x03\x08\x00m\x00\x00\x00\x00\x00\x00\x00U>E\n\xbd\x1d"\x00S\xee\xd4\x10(h\x1e\x00\xe3d\xd3\x9dj%\x13\x00\xf3-\xbc\xd2X\x03i\x00\xc9\xca\xf4r\xf9\x8e\x88\x00i\x07\xea6\xab\x10\x99\x00\xca\xff\x1eQ$\xe3\xa3\x00\xc2p\xf5\x9e`*\x9a\x00\xbf\xa6\xcdo\x80$\xb6\x00a\xea\xd0D\xdc\xae\xee\x00\x1dAj\xd8\'\x91\xe3\x00\xe0\xdc\x1c"\x140\xd8\x00\xe5\x1c\x8b\xda\xae\x01\xd0\x00\x9c\xe8~\x02N\xe8\xce\x00\xb2Z;\x15\x996\xcb\x00\xa4\x8f\x05C\x91\xbf\xb5\x00l\\\x08`\r\x88\xa0\x00\x02\xbd\'\xff\xf0p\xb6\x00\x98\xbd\x81]\xb6}\xcb\x00') + + run_lookup(rt_dir, hashes_file, pot_filepath) + os.unlink(rtc_table_filename) + + # Ensure no precalc files exist. + if not check_precalc_files(temp_dir, 0, None, None): + return False + if not check_precalc_files(temp_dir, 1, None, None): + return False + if not check_precalc_files(temp_dir, 2, None, None): + return False + if not check_precalc_files(temp_dir, 3, None, None): + return False + + # Ensure the pot file is updated. 
# Deletes the pot file if it exists, along with all precompute files.  Creates the
# rainbow table directory.  Returns paths to the pot file and rainbow table directory.
#
#   path:  the test directory; the pot file, the precompute files and the new
#          rainbow table directory all live directly inside it.
def begin_lookup_test(path):

    # Delete the pot file if it exists.
    pot_filepath = os.path.join(path, pot_filename)
    if os.path.exists(pot_filepath):
        os.unlink(pot_filepath)

    # Delete all the precompute files, if any.
    for f in os.listdir(path):
        if f.startswith('rcracki.precalc.'):
            os.unlink(os.path.join(path, f))

    # Create a fresh, uniquely-named rainbow table directory inside the directory
    # the caller passed in (rather than relying on a module-level variable).
    rt_dir = tempfile.mkdtemp(prefix='lookup_rt_', dir=path)

    return pot_filepath, rt_dir


if __name__ == '__main__':
    tests_to_run = 'all'

    # NOTE(review): the usage text documents a single argument, while the
    # original check for a test-group name demanded exactly two.  Both forms are
    # accepted here -- confirm which one was intended.
    if (len(sys.argv) == 2) and (sys.argv[1] == '--verbose'):
        VERBOSE = True
    elif (len(sys.argv) in (2, 3)) and (sys.argv[1] in ['precomp', 'lookup', 'generate']):
        tests_to_run = sys.argv[1]
        VERBOSE = True
    elif (len(sys.argv) == 2) and (sys.argv[1] == '--help'):
        print("\nUsage: %s [precomp | lookup | generate | --verbose]\n\nWith no args, all tests are run. Otherwise, the 'precomp', 'lookup', or 'generate' arguments will run those respective tests only (with verbose mode on).\n" % sys.argv[0])
        sys.exit(0)

    # Stale cached kernels must be deleted so we're sure the tests run the actual
    # kernels.  NVIDIA caches them in ~/.nv/ComputeCache, and AMD ROCm in
    # ~/.AMD/CLCache_rocm.  The components are joined as relative paths under the
    # expanded home directory; an absolute component would discard the home
    # directory and point at the filesystem root instead.
    home_dir = os.path.expanduser('~')
    for cache_parts in (('.nv', 'ComputeCache'), ('.AMD', 'CLCache_rocm')):
        compute_cache_dir = os.path.join(home_dir, *cache_parts)
        if os.path.isdir(compute_cache_dir):
            shutil.rmtree(compute_cache_dir)

    # Since we will be changing the current working directory, get the absolute path
    # to the crackalack_gen and crackalack_lookup programs.
    gen_prog_path = os.path.abspath(GEN_PROG_NAME)
    lookup_prog_path = os.path.abspath(LOOKUP_PROG_NAME)

    # Make a temporary directory for us to generate tables in.
    temp_dir = tempfile.mkdtemp(prefix='crackalack_tests')

    # Copy the OpenCL code to the temp dir so the programs can find them.
    cl_dir = os.path.join(temp_dir, "CL")
    shutil.copytree('CL', cl_dir)
    shutil.copy('shared.h', cl_dir)

    # Change the working directory to the temporary directory.  This way crackalack_gen
    # creates files in here.
    os.chdir(temp_dir)

    all_passed = True

    # Each group is run first and its result folded in afterwards, so that a
    # failure in one group does not silently skip the next one.
    if tests_to_run in ('all', 'precomp'):
        print("Performing pre-computation tests...\n")
        precomp_passed = do_precomp_tests(temp_dir)
        all_passed = all_passed and precomp_passed

    if tests_to_run in ('all', 'lookup'):
        print("\n\nPerforming lookup tests...")
        lookup_passed = do_lookup_tests(temp_dir)
        all_passed = all_passed and lookup_passed

    if tests_to_run in ('all', 'generate'):
        print("\n\nPerforming generation tests...\n")
        for expected_hash, prog_args in GEN_TESTS.items():
            print('Generating table with args: %s... ' % ' '.join(prog_args), end='', flush=True)
            subprocess.run([gen_prog_path] + prog_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            test_passed = False
            # Go through all the files in the temp directory.  Calculate the sha256 hash
            # on the *.rt file, and compare it with the expected value.
            for filename in os.listdir(temp_dir):
                if not filename.endswith('.rt'):
                    continue

                with open(filename, 'rb') as f:
                    actual_hash = hashlib.sha256(f.read()).hexdigest()

                if actual_hash != expected_hash:
                    # Stop right away and leave the test directory in place so the
                    # mismatching table can be inspected.
                    print("%sFAILED!%s\n Expected: %s\n Actual: %s\n" % (RED, CLR, expected_hash, actual_hash))
                    print("Test directory: %s" % temp_dir)
                    sys.exit(-1)

                test_passed = True
                print('%spassed.%s' % (GREEN, CLR))
                os.unlink(filename)

            all_passed = all_passed and test_passed


    # Delete the temporary directory and any files within it.
    shutil.rmtree(temp_dir)

    if all_passed:
        print("\n\t%sALL TESTS PASS!%s\n" % (GREENB, CLR))
    else:
        print("\n\n\tSOME TESTS %sFAILED!!%s\n" % (REDB, CLR))

    sys.exit(0 if all_passed else -1)
+ */ + +#ifdef _WIN32 +#include +#endif +#include +#include +#include + +#include "opencl_setup.h" +#include "shared.h" +#include "test_chain.h" +#include "test_des.h" +#include "test_hash.h" +#include "test_hash_to_index.h" +#include "test_index_to_plaintext.h" +#include "version.h" + + +#define PRINT_PASSED() printf("%spassed.%s\n", GREEN, CLR); +#define PRINT_FAILED() printf("%sFAILED!%s\n", RED, CLR); + + +int main(int ac, char **av) { + cl_platform_id platforms[MAX_NUM_PLATFORMS]; + cl_device_id devices[MAX_NUM_DEVICES]; + cl_context context; + cl_program program; + cl_kernel kernel; + int err = 0; + cl_uint num_platforms = 0, num_devices = 0; + + int ret = 0; + unsigned int hash_type = HASH_UNDEFINED, all_tests_passed = 1; + + + ENABLE_CONSOLE_COLOR(); + PRINT_PROJECT_HEADER(); +#ifndef _WIN32 + setenv("CUDA_CACHE_DISABLE", "1", 1); /* Disables kernel caching. */ + setenv("HSA_ENABLE_SDMA", "0", 1); /* The ROCm driver on AMD Vega 64 doesn't work without this. */ +#endif + get_platforms_and_devices(MAX_NUM_PLATFORMS, platforms, &num_platforms, MAX_NUM_DEVICES, devices, &num_devices, 1); + + context = rc_clCreateContext(NULL, num_devices, devices, context_callback, NULL, &err); + if (err < 0) { + fprintf(stderr, "Failed to create context.\n"); + exit(-1); + } + + + /* PRNG test */ + /* + load_kernel(context, num_devices, devices, "test_prng.cl", "test_prng", &program, &kernel); + + printf("Running PRNG test... "); + if (!test_prng(devices[0], context, kernel)) { + ret = -1; + all_tests_passed = 0; + printf("FAILED!\n"); + } else + printf("passed.\n"); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + */ + + /* DES test */ + /* + load_kernel(context, num_devices, devices, "test_des.cl", "test_des", &program, &kernel); + + printf("Running DES tests... 
"); + if (!test_des(devices[0], context, kernel)) { + ret = -1; + all_tests_passed = 0; + printf("FAILED!\n"); + } else + printf("passed.\n"); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + */ + + /* index_to_plaintext() tests. */ + load_kernel(context, num_devices, devices, "test_index_to_plaintext.cl", "test_index_to_plaintext", &program, &kernel, hash_type); + printf("Running index_to_plaintext() tests... "); fflush(stdout); + if (!test_index_to_plaintext(devices[0], context, kernel)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + + /* Hash tests. */ + printf("Running LM hash tests... "); fflush(stdout); + hash_type = HASH_LM; + load_kernel(context, num_devices, devices, "test_hash.cl", "test_hash", &program, &kernel, hash_type); + if (!test_hash(devices[0], context, kernel, hash_type)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + + printf("Running NTLM hash tests... "); fflush(stdout); + hash_type = HASH_NTLM; + load_kernel(context, num_devices, devices, "test_hash.cl", "test_hash", &program, &kernel, hash_type); + if (!test_hash(devices[0], context, kernel, hash_type)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + + /* hash_to_index() tests. */ + printf("Running LM hash_to_index() tests... "); fflush(stdout); + hash_type = HASH_LM; + load_kernel(context, num_devices, devices, "test_hash_to_index.cl", "test_hash_to_index", &program, &kernel, hash_type); + if (!test_h2i(devices[0], context, kernel, hash_type)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + + printf("Running NTLM hash_to_index() tests... 
"); fflush(stdout); + hash_type = HASH_NTLM; + load_kernel(context, num_devices, devices, "test_hash_to_index.cl", "test_hash_to_index", &program, &kernel, hash_type); + if (!test_h2i(devices[0], context, kernel, hash_type)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + + /* Chain tests. */ + printf("Running LM chain tests... "); fflush(stdout); + hash_type = HASH_LM; + load_kernel(context, num_devices, devices, "test_chain.cl", "test_chain", &program, &kernel, hash_type); + if (!test_chain(devices[0], context, kernel, hash_type)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + printf("Running NTLM chain tests... "); fflush(stdout); + hash_type = HASH_NTLM; + load_kernel(context, num_devices, devices, "test_chain.cl", "test_chain", &program, &kernel, hash_type); + if (!test_chain(devices[0], context, kernel, hash_type)) { + ret = -1; + all_tests_passed = 0; + PRINT_FAILED(); + } else + PRINT_PASSED(); + + CLRELEASEKERNEL(kernel); + CLRELEASEPROGRAM(program); + + + if (all_tests_passed) + printf("\n\t%sALL UNIT TESTS PASS!%s\n\n", GREENB, CLR); + else + printf("\n\t%sSome unit tests failed!%s :(\n\n", REDB, CLR); + + CLRELEASECONTEXT(context); + return ret; +} diff --git a/crackalack_verify.c b/crackalack_verify.c new file mode 100644 index 0000000..128438f --- /dev/null +++ b/crackalack_verify.c @@ -0,0 +1,104 @@ +/* + * Rainbow Crackalack: crackalack_verify.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifdef _WIN32 +#include +#endif + +#include +#include +#include +#include + +#include "terminal_color.h" +#include "verify.h" +#include "version.h" + + +static int raw_table = 0, sorted_table = 0, truncate = VERIFY_DONT_TRUNCATE; +static struct option long_options[] = { + {"raw", no_argument, &raw_table, 1}, + {"sorted", no_argument, &sorted_table, 1}, + {"truncate", no_argument, &truncate, VERIFY_TRUNCATE_ON_ERROR}, + {"num_chains", required_argument, 0, 'n'}, + {0, 0, 0, 0} +}; + + +void print_usage(char *prog_name) { + fprintf(stderr, "This program verifies rainbow tables.\n\n\n %s --raw [--truncate] [--num_chains X] table.rt\n\nThe above command will verify a newly-generated rainbow table. This ensures that the table 1.) has sequential start points, and 2.) has non-zero ending points. Optionally, it can truncate the file to just before the first error found, if any.\n\n\n %s --sorted [--num_chains X] table.rtc\n\nThe above command will verify a sorted rainbow table (i.e.: that it is suitable for lookups). It ensures that the end indices are sorted in ascending order. The table may be compressed or uncompressed.\n\n\nIn either case, num_chains sets the number of random chains to verify using CPU code (hence, providing a large number here will have a dramatic effect on the speed of verification). 
Unless overridden, this defaults to 100.\n\n\n", prog_name, prog_name); +} + + +int main(int ac, char **av) { + char *filename = NULL; + unsigned int table_type = 0; + int num_chains_to_verify = -1, c = 0, option_index = 0; + + + ENABLE_CONSOLE_COLOR(); + PRINT_PROJECT_HEADER(); + while ((c = getopt_long(ac, av, "", long_options, &option_index)) != -1) { + switch(c) { + case 0: + break; + case 'n': + num_chains_to_verify = atoi(optarg); + break; + default: + print_usage(av[0]); + exit(-1); + } + } + + /* If nether --raw nor --sorted were specified, or if both were specified, this is an error. */ + if ((raw_table ^ sorted_table) == 0) { + fprintf(stderr, "\nError: either --raw or --sorted must be specified!\n\n"); + print_usage(av[0]); + exit(-1); + } + + /* Sorted tables cannot be truncated. */ + if (sorted_table && truncate) { + fprintf(stderr, "\nError: sorted tables cannot be truncated.\n\n"); + exit(-1); + } + + /* Ensure that one argument remains (i.e.: the filename). */ + if (optind != ac - 1) { + fprintf(stderr, "\nError: RT/RTC file must be specified!\n\n"); + print_usage(av[0]); + exit(-1); + } + filename = av[optind]; + + if (raw_table) + table_type = VERIFY_TABLE_TYPE_GENERATED; + else if (sorted_table) + table_type = VERIFY_TABLE_TYPE_LOOKUP; + + if (!verify_rainbowtable_file(filename, table_type, VERIFY_TABLE_IS_COMPLETE, truncate, num_chains_to_verify)) { + fprintf(stderr, "\n%sRainbow table verification FAILED.%s", REDB, CLR); + if (truncate == VERIFY_TRUNCATE_ON_ERROR) + fprintf(stderr, " File truncated."); + fprintf(stderr, "\n\n"); + return -1; + } + + printf("%sRainbow table successfully verified!%s\n", GREENB, CLR); + return 0; +} diff --git a/file_lock.c b/file_lock.c new file mode 100644 index 0000000..5768655 --- /dev/null +++ b/file_lock.c @@ -0,0 +1,177 @@ +/* + * Rainbow Crackalack: file_lock.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 
of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +/* This contains file handling abstraction functions for when file locking is needed. + * Windows does not have fcntl(), flock(), or lockf(), so we need to call different + * functions to accomplish file locking. */ + +#include "file_lock.h" +#include "misc.h" + + +/* If append is set to 1, then the file is opened in append mode (similar to mode 'a') + * in the standard fopen(). */ +rc_file rc_fopen(char *filename, int append) { + rc_file f = NULL; + +#ifdef _WIN32 + f = CreateFile(filename, GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (f == INVALID_HANDLE_VALUE) { + f = NULL; + windows_print_error("CreateFile"); + } else if (append) + rc_fseek(f, 0, RCSEEK_END); +#else + if (append) + f = fopen(filename, "a"); + else + f = fopen(filename, "r+"); + + if (f == NULL) + perror("fopen"); +#endif + + return f; +} + + +/* Returns zero on success. 
*/ +int rc_flock(rc_file f) { + int ret = -1; + +#ifdef _WIN32 + OVERLAPPED overlapped; + + memset(&overlapped, 0, sizeof(OVERLAPPED)); + if (LockFileEx(f, LOCKFILE_EXCLUSIVE_LOCK, 0, 0xffffffff, 0xffffffff, &overlapped)) + ret = 0; + else + windows_print_error("LockFileEx"); +#else + ret = flock(fileno(f), LOCK_EX); + if (ret != 0) + perror("flock"); +#endif + + return ret; +} + + +size_t rc_fread(void *ptr, size_t size, size_t nmemb, rc_file f) { + size_t ret = -1; + +#ifdef _WIN32 + DWORD bytes_read = 0; + if (!ReadFile(f, ptr, size * nmemb, &bytes_read, NULL)) + windows_print_error("ReadFile"); + else + ret = bytes_read / size; +#else + ret = fread(ptr, size, nmemb, f); +#endif + + return ret; +} + + +size_t rc_fwrite(const void *ptr, size_t size, size_t nmemb, rc_file f) { + size_t ret = -1; + +#ifdef _WIN32 + DWORD bytes_written = 0; + if (!WriteFile(f, ptr, size * nmemb, &bytes_written, NULL)) + windows_print_error("WriteFile"); + else + ret = bytes_written / size; +#else + ret = fwrite(ptr, size, nmemb, f); +#endif + + return ret; +} + + +/* For 'whence' arg, use RCSEEK_* flags. Returns 0 on success. */ +int rc_fseek(rc_file f, long offset, int whence) { + int ret = -1; + +#ifdef _WIN32 + if (SetFilePointer(f, offset, 0, whence) == INVALID_SET_FILE_POINTER) + windows_print_error("SetFilePointer"); + else + ret = 0; +#else + ret = fseek(f, offset, whence); + if (ret != 0) + perror("fseek"); +#endif + + return ret; +} + + +long rc_ftell(rc_file f) { + +#ifdef _WIN32 + return SetFilePointer(f, 0, 0, RCSEEK_CUR); +#else + return ftell(f); +#endif + +} + + +/* Returns 0 on success. 
*/ +int rc_ftruncate(rc_file f, unsigned long length) { + +#ifdef _WIN32 + int ret = -1; + + if (rc_fseek(f, length, RCSEEK_SET) != 0) + fprintf(stderr, "Failed to truncate file because seek failed.\n"); + else { + if (!SetEndOfFile(f)) + windows_print_error("SetEndOfFile"); + else + ret = 0; + } + + return ret; +#else + return ftruncate(fileno(f), length); +#endif + +} + + +void rc_fclose(rc_file f) { +#ifdef _WIN32 + OVERLAPPED overlapped; + + memset(&overlapped, 0, sizeof(OVERLAPPED)); + /* Explicitly unlock the file before closing. According to the docs, this speeds + * up any waiting lock requests. */ + if (!UnlockFileEx(f, 0, 0xffffffff, 0xffffffff, &overlapped)) + windows_print_error("UnlockFileEx"); + + if (!CloseHandle(f)) + windows_print_error("CloseHandle"); +#else + fclose(f); +#endif +} diff --git a/file_lock.h b/file_lock.h new file mode 100644 index 0000000..723f14f --- /dev/null +++ b/file_lock.h @@ -0,0 +1,33 @@ +#ifndef _FILE_LOCK +#define _FILE_LOCK + +#include + + +#ifdef _WIN32 +#include +typedef HANDLE rc_file; +#define RCSEEK_SET FILE_BEGIN +#define RCSEEK_CUR FILE_CURRENT +#define RCSEEK_END FILE_END +#else +#include +#include +typedef FILE * rc_file; +#define RCSEEK_SET SEEK_SET +#define RCSEEK_CUR SEEK_CUR +#define RCSEEK_END SEEK_END +#endif + + +rc_file rc_fopen(char *filename, int append); +int rc_flock(rc_file f); +size_t rc_fread(void *ptr, size_t size, size_t nmemb, rc_file f); +size_t rc_fwrite(const void *ptr, size_t size, size_t nmemb, rc_file f); +int rc_fseek(rc_file f, long offset, int whence); +long rc_ftell(rc_file f); +int rc_ftruncate(rc_file f, unsigned long length); +void rc_fclose(rc_file f); + + +#endif diff --git a/get_chain.c b/get_chain.c new file mode 100644 index 0000000..d0aefa5 --- /dev/null +++ b/get_chain.c @@ -0,0 +1,84 @@ +/* + * Rainbow Crackalack: get_chain.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of 
the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* This tool extracts a specific chain from an uncompressed rainbow table. */ + +#ifdef _WIN32 +#include +#endif +#include +#include +#include +#include +#include + +#include "terminal_color.h" + +/* The size of one chain entry (16 bytes). */ +#define CHAIN_SIZE (unsigned int)(sizeof(uint64_t) * 2) + + +int main(int ac, char **av) { + uint64_t start = 0, end = 0; + char *filename = NULL; + unsigned int chain_num = 0, file_pos = 0, file_size = 0; + FILE *f = NULL; + + + ENABLE_CONSOLE_COLOR(); + if (ac != 3) { + printf("\nThis tool outputs a specific chain in an uncompressed rainbow table.\n\nUsage: %s uncompressed_table.rt chain_num\n\n", av[0]); + exit(0); + } + + filename = av[1]; + chain_num = (unsigned int)atoi(av[2]); + file_pos = chain_num * CHAIN_SIZE; + + f = fopen(filename, "rb"); + + /* Get the file size. */ + fseek(f, 0, SEEK_END); + file_size = ftell(f); + + /* Ensure that the file size is aligned to 16 bytes. Otherwise, this + * rainbow table is invalid or compressed. */ + if ((file_size % CHAIN_SIZE) != 0) { + fprintf(stderr, "Error: file size is not aligned to %u bytes: %u\n", CHAIN_SIZE, file_size); + exit(-1); + } + + /* Ensure that the requested chain number is in the file. */ + if (((file_size == CHAIN_SIZE) && (chain_num > 0)) || + (file_pos > (file_size - CHAIN_SIZE))) { + fprintf(stderr, "Error: requested chain number would extend past end of file. 
Max chain number is %u.\n", (file_size / CHAIN_SIZE) - 1); + exit(-1); + } + + fseek(f, file_pos, SEEK_SET); + if ((fread(&start, sizeof(start), 1, f) != 1) || \ + (fread(&end, sizeof(end), 1, f) != 1)) { + fprintf(stderr, "Error while reading start and end indices: %s (%d)\n", strerror(errno), errno); + fclose(f); + return -1; + } + + fclose(f); + + printf("Start index: %" PRIu64 "\nEnd index: %" PRIu64 "\n", start, end); + return 0; +} diff --git a/gws.c b/gws.c new file mode 100644 index 0000000..25eed34 --- /dev/null +++ b/gws.c @@ -0,0 +1,70 @@ +#include +#include + +#include "gws.h" +#include "opencl_setup.h" + + +/* Given a GPU device, returns the optimal GWS setting (found through manual experimentation). Returns 0 if the optimal setting on the device is unknown. */ +unsigned int get_optimal_gws(cl_device_id device) { + char vendor[128] = {0}, name[64] = {0}; + + + get_device_str(device, CL_DEVICE_VENDOR, vendor, sizeof(vendor) - 1); + get_device_str(device, CL_DEVICE_NAME, name, sizeof(name) - 1); + + if (strcmp(vendor, "NVIDIA Corporation") == 0) { + if (strcmp(name, "GeForce GTX 1070 Ti") == 0) + return 19 * 768; /* NTLM 8-char: ?/s */ + + else if (strcmp(name, "GeForce GTX 1070") == 0) + return 15 * 768; /* NTLM 8-char: 3,028/s */ + + else if (strcmp(name, "GeForce GTX 1660 Ti") == 0) + return 24 * 1536; /* NTLM 8-char: 8,070/s */ + + else if (strcmp(name, "GeForce RTX 2060") == 0) +#ifdef _WIN32 + return 30 * 256; /* This is a guess based on the RTX 2070's Windows performance. The RTX 2070's optimal performance is at 36 compute units x 256 = 9216, so maybe the RTX 2060's is 30 compute units x 256 = 7680? */ +#else + return 30 * 512; /* NTLM 8-char: 5287/s */ +#endif + + else if (strcmp(name, "GeForce RTX 2070") == 0) +#ifdef _WIN32 + return 36 * 256; /* NTLM 8-char: 4,683/s */ +#else + return 36 * 512; /* NTLM 8-char: 6,345/s */ +#endif + + /* The RTX 2080 numbers are an educated guess based on how the RTX 2070 and 2060's numbers. 
Their compute units times 512 is optimal for Linux; their compute units times 256 is optimal for Windows. */ + else if (strcmp(name, "GeForce RTX 2080") == 0) +#ifdef _WIN32 + return 46 * 256; +#else + return 46 * 512; +#endif + + /* The RTX 2080 Ti numbers are an educated guess based on how the RTX 2070 and 2060's numbers. Their compute units times 512 is optimal for Linux; their compute units times 256 is optimal for Windows. */ + else if (strcmp(name, "GeForce RTX 2080 Ti") == 0) +#ifdef _WIN32 + return 68 * 256; +#else + return 68 * 512; +#endif + + else if (strcmp(name, "Tesla V100-SXM2-16GB") == 0) /* Amazon EC2 p3.2xlarge instance */ + return 80 * 512; + } + + if (strcmp(vendor, "Advanced Micro Devices, Inc.") == 0) { + if (strcmp(name, "gfx900") == 0) /* AMD Vega 64 */ +#ifdef _WIN32 + return 64 * 256; /* NTLM 8-char: 2,560/s */ +#else + return 64 * 768; /* NTLM 8-char: 5,671/s */ +#endif + } + + return 0; +} diff --git a/gws.h b/gws.h new file mode 100644 index 0000000..dae1475 --- /dev/null +++ b/gws.h @@ -0,0 +1,6 @@ +#ifndef _GWS_H +#define _GWS_H + +unsigned int get_optimal_gws(cl_device_id device); + +#endif diff --git a/hash_validate.c b/hash_validate.c new file mode 100644 index 0000000..ad21eb5 --- /dev/null +++ b/hash_validate.c @@ -0,0 +1,33 @@ +/* + * Rainbow Crackalack: hash_validate.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include "hash_validate.h" +#include "shared.h" + + +cl_uint hash_str_to_type(char *hash_str) { + unsigned int ret = HASH_UNDEFINED; + + + if (strcmp(hash_str, "lm") == 0) + ret = HASH_LM; + else if (strcmp(hash_str, "ntlm") == 0) + ret = HASH_NTLM; + + return ret; +} diff --git a/hash_validate.h b/hash_validate.h new file mode 100644 index 0000000..54e83b0 --- /dev/null +++ b/hash_validate.h @@ -0,0 +1,8 @@ +#ifndef _HASH_VALIDATE_H +#define _HASH_VALIDATE_H + +#include + +cl_uint hash_str_to_type(char *hash_str); + +#endif diff --git a/make_windows.sh b/make_windows.sh new file mode 100755 index 0000000..cc69f11 --- /dev/null +++ b/make_windows.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +CP=/bin/cp +MAKE=/usr/bin/make +MKTEMP=/bin/mktemp +RM=/bin/rm + +TEMPDIR=`$MKTEMP -d -t rcrackalack_XXXXXXXXXXX` +$CP -R /usr/include/CL $TEMPDIR +$MAKE CC=x86_64-w64-mingw32-gcc WINDOWS_BUILD=1 CL_INCLUDE=$TEMPDIR -j 8 +x86_64-w64-mingw32-strip *.exe +$RM -rf $TEMPDIR diff --git a/manual_tests/ntlm8_lookup_101/README.txt b/manual_tests/ntlm8_lookup_101/README.txt new file mode 100644 index 0000000..5d0ddf1 --- /dev/null +++ b/manual_tests/ntlm8_lookup_101/README.txt @@ -0,0 +1 @@ +The 'random_ntlm_hashes_8_chars_101.txt' file contains 101 random 8-character NTLM hashes. Cracking them against the NTLM 8 disk of tables will yield the included 'rainbowcrackalack_jtr.pot' file (pre-sorted). Sort the 'rainbowcrackalack_jtr.pot' file you produce and compare it with this one; the test is successful if there are no differences between the two. 
diff --git a/manual_tests/ntlm8_lookup_101/rainbowcrackalack_jtr.pot b/manual_tests/ntlm8_lookup_101/rainbowcrackalack_jtr.pot new file mode 100644 index 0000000..d2efb3f --- /dev/null +++ b/manual_tests/ntlm8_lookup_101/rainbowcrackalack_jtr.pot @@ -0,0 +1,92 @@ +$NT$03d5353a16d89732e540b38126c35bd9:2}MOJGy5 +$NT$07fc6f210b9f6473eb36f5d10f82df37:7fI#:'Mh +$NT$08380598fa040c12b0c089762c708daa:q+\PT%WW +$NT$0b2700902ebb81db659d3db28eabcba9:U(BQGY"e +$NT$0f26c226e5d72e4c6e035efa539b546e:A,7umA(; +$NT$0f93959b74887b2f5fc74a0bcce18827:"aN(L78, +$NT$15013c793d7eb0b8ad10240157212500:1E1>~9`w +$NT$156df07663cd7a29f30c0c956526d14a:qL@#!Y"` +$NT$196728cac88a25327fc6e012450d3307:*'N?62\_ +$NT$1af176dd57fbff32798ed1994d89d2b6:@UR%;:HU +$NT$25ad4924bf56b6c0dd1f8804ffb33963:7}MAjqqq +$NT$28767904eef6793d6d1efdb8015f6a90:Dw>N$02R +$NT$2cf12d04fd6e660bf4182a49bbf48553:{,Iv?FA* +$NT$3385b89adb9836627f8813b754ceeadc:0HQ_qfE\ +$NT$39b00f4aaebaf614da751a26ae92f209:G|r=sM.r +$NT$39db43e10f94e450cb190d8f1c473543:iYU[Q d~ +$NT$39db957fb0da88f8376313632ab5f3cc:QAK"RgnQ +$NT$3c4887cc0b4d3be2d43ca5e72c2a86fb:!pRoA3X> +$NT$4175d41bb49cd4fd42069435228f0135:`A7&tlSo +$NT$42ac006f02e6a75c386a02ca256e247c:g?%#1"%? +$NT$42c6e79ec760cd4090186b6e5fd332a7:E9LE5Ir +$NT$43056139989ae5229dbb34498275881e:y%pxYA/8 +$NT$4420e91385de7cac3ecb9eed09838aa5:S:)-@j!_ +$NT$4591f8e93ac153282059b5629607aceb: fdrp_,*M,:( +$NT$6dbdef85f2ef86d14f30b9f8ebd18ce0:^^/=+0a@ +$NT$7003b87dcb5281b822c87b580f9facc6:^9#!Ht>- +$NT$735af0747972be245cb3854ac697fdf3:K9*a3Mpg +$NT$769589dfa6fb9537e4a735f21c496973:ILDC:)"f +$NT$7791e22860220ab589e2d100ada11856:1J^r+/+w +$NT$7d665a33697edb2879e15a2555bd76ad:>f]_.z7: +$NT$7e6c8e5a1865e2dd7f4eda728747914b:$^13{-I' +$NT$81d23c9bded8bff9555d3527a230988d:Zv$gW11% +$NT$89947eae166b9271ff74e4085db0666b:_{MPkCJ; +$NT$8b10c2f74444f8ac0b594b718f03754a:QZa +$NT$c898e7526c288dec7a04bf0818df665a:UMH#a#Z! 
+$NT$c8d670b7e34bc275269c831102fc692f:20IH)[e> +$NT$cb44e21dc843d8d1463bddf1e364d147:d?$!vu?\ +$NT$d4e323ec73f7475571e2b82600e86b73:c0w[U'a: +$NT$d56df0ebcefc88d4ba4b6e3b68567c1c:kswWtgwD +$NT$d82f7d7211fe4c80ddd10f5658eff5fc:^UI,C@U0 +$NT$db0f41b99e289d825b7eb21ec6e2b3bf:6O|I)}Ii +$NT$dd33a6f5f2990a256ee1ba298c43a889:?A-%-HaT +$NT$ddc15854cf4a767bc35ef1f3fc26dcfc:iB98cSnW +$NT$dea1499336eabe0dcddfc9b9cedf4ae4:Hsu}S,,t +$NT$defc96c4f1290e15ac71e35a78625246:E/6""r}] +$NT$dfaac64b8f564bf3515f4a599597d1f1:y:L6J1}? +$NT$e60afab18bc6182b7cd01f7df9311af5:4)PDMaR4 +$NT$e7e33039ddc6c3213bf588ff5623981b:TMm?W>(e +$NT$ea9c5d481b81adb5a6384debcbe75ab3:U\+@<)TA +$NT$ec5d32bee607b335258f052c1338b9d9:zDpL7C4m +$NT$efc7bad96c8ca79f0862612d293cf214:"((a"CUG +$NT$f302c2ad36c67f6416144595b2c94d38:oFN[=*LQ +$NT$f417de201da2836457d3d893281e6b0f:2M*CD'HD +$NT$fa024075a777c1d15ac152b77b108f1f:/'m)OLly +$NT$fd65d097d214891be354f10c857b7ea1:k74PGFv8 +$NT$fdf44fceea99979b18f74ffceae5c6a8:)?e 5M-9 +$NT$fe9ae66f67d5de3db7b0bac145008c1c:Y?-IrrF diff --git a/manual_tests/ntlm8_lookup_101/random_ntlm_hashes_8_chars_101.txt b/manual_tests/ntlm8_lookup_101/random_ntlm_hashes_8_chars_101.txt new file mode 100644 index 0000000..34d0de9 --- /dev/null +++ b/manual_tests/ntlm8_lookup_101/random_ntlm_hashes_8_chars_101.txt @@ -0,0 +1,101 @@ +a7c002406a080278885e47da3909187f +bb53c0cb3a8cf0f71f0d6b170ff6b622 +8e7c7397e513cee2cb50c2fc174ca7b3 +c260b99e3b87dfd7538acae873f99291 +6055c3fc48f8260ece230fe8b599375e +bdabd5d2332764e0f15a8dc7ee0c1d3a +fa024075a777c1d15ac152b77b108f1f +5f40b1dc5fbb903ca54582f2db414b1b +defc96c4f1290e15ac71e35a78625246 +42c6e79ec760cd4090186b6e5fd332a7 +47e815a24e164a101854a7a9a8c23a11 +39b00f4aaebaf614da751a26ae92f209 +db0f41b99e289d825b7eb21ec6e2b3bf +51a5bdd570c8c6227883c973616563ca +bc1b0fff073728a32ffb773595da3b15 +4cb6004dffba6f4f12c4ab7ce3fc3472 +fe9ae66f67d5de3db7b0bac145008c1c +fd65d097d214891be354f10c857b7ea1 +6c1124cc1e901c78a702616f3bde736a 
+42ac006f02e6a75c386a02ca256e247c +5662715575507f51a09dd74f7dc0492a +aa527818dd5811cd6644bd9b37895920 +c08e33d529d5442c5fa00e1b3a30c667 +531a02ea1f34253034b0deadd37bc2b3 +15013c793d7eb0b8ad10240157212500 +4591f8e93ac153282059b5629607aceb +2cf12d04fd6e660bf4182a49bbf48553 +8b10c2f74444f8ac0b594b718f03754a +950544335fd7bf79cb3f8ece4adbfc98 +ea9c5d481b81adb5a6384debcbe75ab3 +ec5d32bee607b335258f052c1338b9d9 +03d5353a16d89732e540b38126c35bd9 +cb44e21dc843d8d1463bddf1e364d147 +28767904eef6793d6d1efdb8015f6a90 +9caf23959ef9fa3ee2c83df0c4f28499 +196728cac88a25327fc6e012450d3307 +39db43e10f94e450cb190d8f1c473543 +c898e7526c288dec7a04bf0818df665a +7d665a33697edb2879e15a2555bd76ad +0b2700902ebb81db659d3db28eabcba9 +0f26c226e5d72e4c6e035efa539b546e +efc7bad96c8ca79f0862612d293cf214 +39db957fb0da88f8376313632ab5f3cc +e31ed0823c7c3c3e24705fd589ad6b39 +1af176dd57fbff32798ed1994d89d2b6 +9befc2b8b85cdb4057597acb8ef619a8 +156df07663cd7a29f30c0c956526d14a +43056139989ae5229dbb34498275881e +735af0747972be245cb3854ac697fdf3 +fdf44fceea99979b18f74ffceae5c6a8 +03862b9a1156e01c6009c14caab6355d +a5ca6fbb7d665800c06fa4d2acca42d9 +d82f7d7211fe4c80ddd10f5658eff5fc +dd33a6f5f2990a256ee1ba298c43a889 +a073a0b1c653c5ea772dbaf66fc2ca7c +3c4887cc0b4d3be2d43ca5e72c2a86fb +769589dfa6fb9537e4a735f21c496973 +7003b87dcb5281b822c87b580f9facc6 +64afc72c7aed55acb4fd4df7de31f4b5 +7a65091fbd0c6c6c6a6ed1ef4b14e607 +d4e323ec73f7475571e2b82600e86b73 +c8d670b7e34bc275269c831102fc692f +6dbdef85f2ef86d14f30b9f8ebd18ce0 +62edc1e25f9d216a69e1fedf5d382d5c +07fc6f210b9f6473eb36f5d10f82df37 +d56df0ebcefc88d4ba4b6e3b68567c1c +cc6fa018c1f3fe4f66f9db23770ed422 +8a2d0e30ba2e55c4199e5eaa43344335 +7e6c8e5a1865e2dd7f4eda728747914b +4420e91385de7cac3ecb9eed09838aa5 +e60afab18bc6182b7cd01f7df9311af5 +89947eae166b9271ff74e4085db0666b +dea1499336eabe0dcddfc9b9cedf4ae4 +f302c2ad36c67f6416144595b2c94d38 +53e86a6c7fb553ea9826edbe1abf8561 +5952b532c700380482c326a5abda129b +0f93959b74887b2f5fc74a0bcce18827 
+81d23c9bded8bff9555d3527a230988d +25ad4924bf56b6c0dd1f8804ffb33963 +510e7fc4cca7950e35d81a823bf91198 +08380598fa040c12b0c089762c708daa +dfaac64b8f564bf3515f4a599597d1f1 +52cc18d2dc77f80601532e41cb6738a9 +425dd0b87cf964e3db966fe9b8b41bb8 +8b4e9658d35b8713cbcf6309483e675e +c8167e98d727a43ee37161fc68b3b482 +5c30eba31a41c57dc73f9a4a73751732 +7791e22860220ab589e2d100ada11856 +5cb7c670b17f959002d769f29e365b83 +9530dba282e574578a0977cf425a44ae +5732d8f7ce74942e5ce95a07761c3a70 +ba6ed526f5e131b0966a2c7b3f7087bf +af2345dc62326ff2286f0f92851b1bef +ddc15854cf4a767bc35ef1f3fc26dcfc +3385b89adb9836627f8813b754ceeadc +e7e33039ddc6c3213bf588ff5623981b +f417de201da2836457d3d893281e6b0f +93bbef5ea5b703bd2b9a698aef26bf59 +94a973336315e6c91028355dc573755e +65c3f0d5a3876530ce9cbb24007c685b +4175d41bb49cd4fd42069435228f0135 diff --git a/manual_tests/ntlm9_lookup_3/README.txt b/manual_tests/ntlm9_lookup_3/README.txt new file mode 100644 index 0000000..060cb67 --- /dev/null +++ b/manual_tests/ntlm9_lookup_3/README.txt @@ -0,0 +1,11 @@ +This is a very short test that checks NTLM9 lookups succeed on a very basic level. + +Start by generating a standard NTLM9 index 0 table: + + ./crackalack_gen ntlm ascii-32-95 9 9 0 1350000 67108864 0 + +Then sort this table. The SHA256 hash of the sorted table should be: + + 413d0dbe9c4630fde5c5a1c72051941a8316fe28083bb31a7c7dc3dc9d7d8a2f + +Perform a lookup on the three hashes in 'three_ntlm9_hashes.txt' against this table. All three hashes should be cracked. 
diff --git a/manual_tests/ntlm9_lookup_3/three_ntlm9_hashes.txt b/manual_tests/ntlm9_lookup_3/three_ntlm9_hashes.txt new file mode 100644 index 0000000..1f97f7e --- /dev/null +++ b/manual_tests/ntlm9_lookup_3/three_ntlm9_hashes.txt @@ -0,0 +1,3 @@ +aad56f4bbab89d9281167dd63ae0f1e5 +b54e4c690966fc49788c2280603ee7ab +908668b077c3bff98d028dc338a626f7 diff --git a/misc.c b/misc.c new file mode 100644 index 0000000..a29dde8 --- /dev/null +++ b/misc.c @@ -0,0 +1,337 @@ +/* + * Rainbow Crackalack: misc.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifdef _WIN32 +#include +#define STATUS_SUCCESS 0 +#else +#include +#include +#endif + +#include +#include +#include + +#include "charset.h" +#include "misc.h" +#include "shared.h" + + +/* Given a rainbow table filename, delete its associated log, if any exists. */ +void delete_rt_log(char *rt_filename) { + char log_filename[256] = {0}; + + get_rt_log_filename(log_filename, sizeof(log_filename), rt_filename); + unlink(log_filename); +} + + +/* Joins two file paths together in a platform-independent way. 
/* Writes "<path1><separator><path2>" into filepath_result, where the separator
 * is a backslash on Windows and a forward slash everywhere else.
 *
 * filepath_result_size is the full size of the destination buffer, in bytes.
 * The result is always NUL-terminated; if it does not fit, it is truncated.
 * Nothing is written when the buffer is NULL or zero-sized. */
void filepath_join(char *filepath_result, unsigned int filepath_result_size, const char *path1, const char *path2) {
#ifdef _WIN32
  const char *separator = "\\";
#else
  const char *separator = "/";
#endif

  if ((filepath_result == NULL) || (filepath_result_size == 0))
    return;

  /* snprintf() bounds the whole write by the buffer size and always
   * terminates the string. */
  snprintf(filepath_result, filepath_result_size, "%s%s%s", path1, separator, path2);
}


/* Returns an open file's size.  The file position is restored before
 * returning.  Exits the program if seeking fails. */
long get_file_size(FILE *f) {
  long ret = 0;
  long original_pos = ftell(f);  /* Save the file pointer's current position. */


  /* Seek to the end of the file. */
  if (fseek(f, 0, SEEK_END) < 0) {
    fprintf(stderr, "Failed to seeking in file.\n");
    exit(-1);
  }

  /* The offset of the end of the file is its size. */
  ret = ftell(f);

  /* Restore the file pointer to its original position. */
  if (fseek(f, original_pos, SEEK_SET) < 0) {
    fprintf(stderr, "Failed to seeking in file.\n");
    exit(-1);
  }

  return ret;
}


/* Returns the amount of system RAM, in bytes.  Returns zero on error. */
uint64_t get_total_memory() {
  uint64_t total_memory = 0;
#ifdef _WIN32
  MEMORYSTATUSEX ms = {0};

  ms.dwLength = sizeof(MEMORYSTATUSEX);
  if (!GlobalMemoryStatusEx(&ms)) {
    windows_print_error("GlobalMemoryStatusEx");
    return 0;
  }
  total_memory = ms.ullTotalPhys;
#else
  struct sysinfo si = {0};

  if (sysinfo(&si) < 0) {
    fprintf(stderr, "\nFailed to call sysinfo(): %s (%d)\n", strerror(errno), errno);
    return 0;
  }

  /* sysinfo() reports totalram in units of mem_unit bytes, so scale it to get
   * bytes.  A zero mem_unit is treated as 1. */
  total_memory = (uint64_t)si.totalram * (si.mem_unit ? si.mem_unit : 1);
#endif
  return total_memory;
}


/* Returns a random number between 0 and max - 1.  Returns 0 when max is 0.
 * Exits the program if the system random number source cannot be used.
 *
 * Random bytes are accumulated one at a time (at most 8) until the accumulated
 * value exceeds max; the result is that value modulo max. */
uint64_t get_random(uint64_t max) {
  uint64_t ret = 0;
  unsigned int i = 0;
  unsigned char random_byte = 0;
#ifdef _WIN32
  BCRYPT_ALG_HANDLE hAlgorithm = NULL;
#else
  FILE *urandom = NULL;
#endif


  /* An empty range has no valid result, and "ret % 0" below would be a
   * division by zero. */
  if (max == 0)
    return 0;

#ifdef _WIN32
  /* Get a handle to the random number generator. */
  if (BCryptOpenAlgorithmProvider(&hAlgorithm, BCRYPT_RNG_ALGORITHM, NULL, 0) != STATUS_SUCCESS) {
    fprintf(stderr, "Error: failed to obtain handle to random number generator!\n");
    exit(-1);
  }

  for (i = 0; i < 8; i++) {
    /* Get a single random byte. */
    if (BCryptGenRandom(hAlgorithm, &random_byte, sizeof(unsigned char), 0) != STATUS_SUCCESS) {
      fprintf(stderr, "Error: failed to obtain random bytes from random number generator!\n");
      exit(-1);
    }

    /* Shift our return value up by 8 bits and OR in the new random byte. */
    ret <<= 8;
    ret |= random_byte;

    /* If we exceeded the max value wanted by the caller, we're done reading random bytes. */
    if (ret > max)
      break;
  }

  /* Close the RNG handle. */
  if (BCryptCloseAlgorithmProvider(hAlgorithm, 0) != STATUS_SUCCESS)
    fprintf(stderr, "Warning: failed to close handle to random number generator.\n");
#else
  urandom = fopen("/dev/urandom", "r");

  /* Ensure that we opened a handle to /dev/urandom. */
  if (urandom == NULL) {
    fprintf(stderr, "Error: failed to open /dev/urandom!\n");
    exit(-1);
  }

  for (i = 0; i < 8; i++) {

    /* Get a single random byte. */
    if (fread(&random_byte, sizeof(unsigned char), 1, urandom) != 1) {
      fprintf(stderr, "Error: failed to obtain random bytes from random number generator!\n");
      exit(-1);
    }

    /* Shift our return value up by 8 bits and OR in the new random byte. */
    ret <<= 8;
    ret |= random_byte;

    /* If we exceeded the max value wanted by the caller, we're done reading random bytes. */
    if (ret > max)
      break;
  }

  fclose(urandom);
#endif

  return ret % max;
}


/* Given a rainbow table filename, get its associated log filename (the table's
 * filename with ".log" appended).  log_filename_size is the full size of the
 * log_filename buffer; the result is NUL-terminated and truncated if it does
 * not fit.  Nothing is written when the buffer is NULL or zero-sized. */
void get_rt_log_filename(char *log_filename, size_t log_filename_size, char *rt_filename) {
  /* A zero size must not reach snprintf() as "size - 1", which would wrap
   * around to a huge bound. */
  if ((log_filename == NULL) || (log_filename_size == 0))
    return;

  snprintf(log_filename, log_filename_size, "%s.log", rt_filename);
}


/* Returns 1 if the parameters form the standard NTLM 8 set, otherwise 0. */
*/
+unsigned int is_ntlm8(unsigned int hash_type, char *charset, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int reduction_offset, unsigned int chain_len) {
+  if ((hash_type == HASH_NTLM) && \
+      (strcmp(charset, CHARSET_ASCII_32_95) == 0) && \
+      (plaintext_len_min == 8) && \
+      (plaintext_len_max == 8) && \
+      (reduction_offset == 0) && \
+      (chain_len == 422000))
+    return 1;
+  else
+    return 0;
+}
+
+
+/* Returns 1 if the parameters form the standard NTLM 9 set, otherwise 0. */
+unsigned int is_ntlm9(unsigned int hash_type, char *charset, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int reduction_offset, unsigned int chain_len) {
+  if ((hash_type == HASH_NTLM) && \
+      (strcmp(charset, CHARSET_ASCII_32_95) == 0) && \
+      (plaintext_len_min == 9) && \
+      (plaintext_len_max == 9) && \
+      (reduction_offset == 0) && \
+      (chain_len == 1350000))
+    return 1;
+  else
+    return 0;
+}
+
+
+/* Given a filename for a rainbow table, parse its parameters.  On success the
+ * rt_parameters' parsed flag is set to 1, otherwise it is zero.  Any leading
+ * directory path is ignored.  A filename whose hash name or character set name
+ * does not fit in the rt_parameters buffers is rejected.  Fields other than
+ * the parsed flag may have been partially filled in when parsing fails. */
+void parse_rt_params(rt_parameters *rt_params, char *rt_filename_orig) {
+  /* Filename is in the following format: "%s_%s#%u-%u_%u_%ux%u_%u" */
+  char *hpos = NULL;
+  char rt_filename[512] = {0};
+
+
+  if (rt_params == NULL)
+    return;
+
+  rt_params->parsed = 0;
+  if (rt_filename_orig == NULL)
+    return;
+
+  /* Skip the directory path, if this filename is absolute. */
+#ifdef _WIN32
+  hpos = strrchr(rt_filename_orig, '\\');
+#else
+  hpos = strrchr(rt_filename_orig, '/');
+#endif
+
+  /* Copy at most sizeof - 1 bytes so that the zero-initialized last byte
+   * always terminates the string; strncpy() does not terminate a copy that
+   * fills the whole buffer. */
+  if (hpos != NULL)
+    strncpy(rt_filename, hpos + 1, sizeof(rt_filename) - 1);
+  else
+    strncpy(rt_filename, rt_filename_orig, sizeof(rt_filename) - 1);
+
+  /* Ensure that the filename ends in .rt or .rtc. */
+  if (!str_ends_with(rt_filename, ".rt") && !str_ends_with(rt_filename, ".rtc"))
+    return;
+
+  /* Manually pick out the strings from the filename.  sscanf() can't be used because
+   * a buffer overflow can occur (note that the MinGW system doesn't support the
+   * "m" format modifier, which would have been a good and portable solution...). */
+  hpos = strchr(rt_filename, '#');
+  if (hpos) {
+    char *suffix = hpos + 1;
+    char *upos = NULL;
+
+
+    *hpos = '\0';
+    upos = strchr(rt_filename, '_');
+    if (upos) {
+      char *hash_name_ptr = rt_filename;
+      char *charset_name_ptr = upos + 1;
+
+
+      *upos = '\0';
+
+      /* Reject names that would not fit along with their terminator.  A
+       * silently truncated (and unterminated) name would be read past the
+       * end of its buffer by the validation calls below. */
+      if ((strlen(hash_name_ptr) >= sizeof(rt_params->hash_name)) || \
+          (strlen(charset_name_ptr) >= sizeof(rt_params->charset_name)))
+        return;
+
+      strncpy(rt_params->hash_name, hash_name_ptr, sizeof(rt_params->hash_name));
+      strncpy(rt_params->charset_name, charset_name_ptr, sizeof(rt_params->charset_name));
+
+      /* Now parse the unsigned integers. */
+      if (sscanf(suffix, "%u-%u_%u_%ux%u_%u", &rt_params->plaintext_len_min, &rt_params->plaintext_len_max, &rt_params->table_index, &rt_params->chain_len, &rt_params->num_chains, &rt_params->table_part) == 6) {
+
+
+        /* Calculate the reduction offset from the table index. */
+        rt_params->reduction_offset = TABLE_INDEX_TO_REDUCTION_OFFSET(rt_params->table_index);
+        /* Look up the hash type; the character set name is validated below. */
+        rt_params->hash_type = hash_str_to_type(rt_params->hash_name);
+
+        /* Ensure that the hash type and character set is valid, the plaintext
+         * length min & max are set properly, and the chain length is set. */
+        if ((rt_params->hash_type != HASH_UNDEFINED) && \
+            (validate_charset(rt_params->charset_name) != NULL) && \
+            (rt_params->plaintext_len_min > 0) && \
+            (rt_params->plaintext_len_min <= rt_params->plaintext_len_max) && \
+            (rt_params->plaintext_len_max < MAX_PLAINTEXT_LEN) && \
+            (rt_params->chain_len > 0) && \
+            (rt_params->num_chains > 0))
+          rt_params->parsed = 1;
+      }
+    }
+  }
+}
+
+
+/* Combines realloc() with calloc(). 
*/
+/* Resizes ptr from old_size to new_size bytes and zeroes any newly added
+ * bytes.  Returns the (possibly moved) buffer.  Exits the program if
+ * realloc() returns NULL. */
+void *recalloc(void *ptr, size_t new_size, size_t old_size) {
+  ptr = realloc(ptr, new_size);
+  if (ptr == NULL) {
+    fprintf(stderr, "Failed to realloc buffer.\n");
+    exit(-1);
+  }
+
+  /* Only a buffer that grew has new bytes to clear.  Without this check a
+   * shrinking buffer made "new_size - old_size" wrap around to a huge length.
+   * The cast is needed because arithmetic on void pointers is not standard C. */
+  if (new_size > old_size)
+    memset((unsigned char *)ptr + old_size, 0, new_size - old_size);
+
+  return ptr;
+}
+
+
+/* Logs a message to the rainbow table log.  The formatted message is limited
+ * to the size of a 256-byte buffer and is truncated if longer.  Returns the
+ * result of rc_fwrite(), or 0 if there was nothing to write or formatting
+ * failed. */
+size_t rt_log(rc_file f, const char *fmt, ...) {
+  char buf[256] = {0};
+  int formatted_len = 0;
+  size_t len = 0;
+  va_list args;
+
+
+  va_start(args, fmt);
+  formatted_len = vsnprintf(buf, sizeof(buf), fmt, args);
+  va_end(args);
+
+  /* vsnprintf() returns a negative value on error; stored in a size_t that
+   * became a huge length. */
+  if (formatted_len <= 0)
+    return 0;
+
+  /* On truncation vsnprintf() returns the length the full message would have
+   * had, which is more than what is actually in the buffer. */
+  len = (size_t)formatted_len;
+  if (len > sizeof(buf) - 1)
+    len = sizeof(buf) - 1;
+
+  return rc_fwrite(buf, len, 1, f);
+}
+
+
+/* Returns 1 if the string ends with the specified suffix, otherwise 0.
+ * Returns 0 if either argument is NULL. */
+int str_ends_with(const char *str, const char *suffix) {
+  size_t str_len;
+  size_t suffix_len;
+
+
+  if ((str == NULL) || (suffix == NULL))
+    return 0;
+
+  str_len = strlen(str);
+  suffix_len = strlen(suffix);
+  if (suffix_len > str_len)
+    return 0;
+
+  return strncmp(str + str_len - suffix_len, suffix, suffix_len) == 0;
+}
+
+
+/* On Windows, prints the last error. */
+#ifdef _WIN32
+void windows_print_error(char *func_name) {
+  DWORD err_code = GetLastError();
+  LPVOID err_str = NULL;
+
+  /* FormatMessage() returns zero on failure and allocates nothing in that
+   * case, so make sure a NULL pointer is never handed to "%s". */
+  if (FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR) &err_str, 0, NULL) == 0)
+    err_str = NULL;
+
+  fprintf(stderr, "\n%s failed with error %lu: %s\n\n", func_name, err_code, (err_str != NULL) ? (char *)err_str : "(no description available)");
+  fflush(stderr);
+  if (err_str != NULL)
+    LocalFree(err_str);
+}
+#endif
diff --git a/misc.h b/misc.h
new file mode 100644
index 0000000..76b9524
--- /dev/null
+++ b/misc.h
@@ -0,0 +1,81 @@
+#ifndef _MISC_H
+#define _MISC_H
+
+#include
+
+/* The quote format specifier (which on UNIX prints numbers with commas as the thousands separator, i.e.: "%'u") can cause crashes in Windows. */
+#ifdef _WIN32
+#define QUOTE ""
+#else
+#define QUOTE "'"
+#endif
+
+/* This is the longest chain length that a single kernel invocation can produce. 
Beyond this, it must be split up into parts.  Linux drivers don't seem to have a problem with these larger chains, but Windows drivers end up getting killed by the watchdog timer. */
+#define MAX_CHAIN_LEN 450000
+
+#define CHAIN_SIZE (unsigned int)(sizeof(uint64_t) * 2) /* Size in bytes of two uint64_t values (presumably a chain's start and end index -- confirm against callers). */
+/* Frees _ptr and resets it to NULL.  NOTE(review): this expands to a bare { } block rather than do { } while (0), so "if (c) FREE(p); else ..." does not compile. */
+#define FREE(_ptr) \
+  { free(_ptr); _ptr = NULL; }
+/* Closes _f if it is not NULL, then resets it to NULL.  Same bare-block caveat as FREE(). */
+#define FCLOSE(_f) \
+  { if (_f != NULL) { fclose(_f); _f = NULL; } }
+
+#include "file_lock.h"
+
+#ifdef _WIN32
+#include
+
+#define CHECK_MEMORY_SIZE() \
+  /* Our code + the OpenCL library does NOT like to run on Windows systems with 4GB \
+   * of RAM.  It tends to throw strange errors at strange times, so let's warn the \
+   * user ahead of time...  Note that get_total_memory() returns zero on error, \
+   * in which case the warning is printed as well. */ \
+  if (get_total_memory() <= 4294967296) { /* 4GB (2^32 bytes) or less... */ \
+    fprintf(stderr, "\n\n\n\t!! WARNING !!\n\n\nThis system has 4GB of RAM or less. On Windows systems, this tends to result in strange errors from the OpenCL library. While it is safe to continue anyway, this would be the prime suspect if any problems occur. In that case, either run on a system with more memory, or boot this machine in Linux (which has been seen to be much more forgiving in low-memory conditions).\n\n\n\n"); \
+    fflush(stderr); \
+  }
+#define PRINT_WIN7_LOOKUP_WARNING() /* Warns only when the OS is Windows 7 or later but earlier than Windows 8. */ \
+  if (IsWindows7OrGreater() && !IsWindows8OrGreater()) { fprintf(stderr, "\n\n\n\t!! WARNING !!\n\n\nPerforming lookups on Windows 7 is known to be very unstable. Crashes, screen flickering, and/or strange error messages may be observed. If this happens, unfortunately, there is no solution. However, a work-around would be to boot the machine into Linux, which does not show these problems. Lookups on Windows 10 systems work without issue as well.\n\n\n\n"); fflush(stderr); }
+#else
+#define CHECK_MEMORY_SIZE() /* Do nothing: Linux systems don't seem to have memory issues */
+#define PRINT_WIN7_LOOKUP_WARNING() /* Do nothing: Linux systems don't have lookup problems. 
*/
+#endif
+
+
+/* Struct to track parameters for rainbow tables found in a target directory.
+ * parse_rt_params() fills it in from a table filename of the form
+ * "<hash_name>_<charset_name>#<min>-<max>_<table_index>_<chain_len>x<num_chains>_<table_part>". */
+struct _rt_parameters {
+  char hash_name[16]; /* Text before the first '_' in the filename. */
+  unsigned int hash_type; /* Result of hash_str_to_type(hash_name). */
+  char charset_name[32]; /* Text between the first '_' and the '#'. */
+  unsigned int plaintext_len_min;
+  unsigned int plaintext_len_max;
+  unsigned int table_index;
+  unsigned int reduction_offset; /* Derived from table_index with TABLE_INDEX_TO_REDUCTION_OFFSET(). */
+  unsigned int chain_len;
+  unsigned int num_chains;
+  unsigned int table_part;
+
+  unsigned int parsed;  /* Set to 1 if parameters successfully parsed, otherwise 0. */
+};
+typedef struct _rt_parameters rt_parameters;
+
+
+void delete_rt_log(char *rt_filename);
+void filepath_join(char *filepath_result, unsigned int filepath_result_size, const char *path1, const char *path2);
+long get_file_size(FILE *f);
+uint64_t get_random(uint64_t max);
+void get_rt_log_filename(char *log_filename, size_t log_filename_size, char *rt_filename);
+uint64_t get_total_memory();
+unsigned int hash_str_to_type(char *hash_str); /* NOTE(review): not defined in misc.c as shown here -- presumably implemented in another file. */
+unsigned int is_ntlm8(unsigned int hash_type, char *charset, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int reduction_offset, unsigned int chain_len);
+unsigned int is_ntlm9(unsigned int hash_type, char *charset, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int reduction_offset, unsigned int chain_len);
+void parse_rt_params(rt_parameters *rt_params, char *rt_filename);
+void *recalloc(void *ptr, size_t new_size, size_t old_size);
+size_t rt_log(rc_file f, const char *fmt, ...);
+int str_ends_with(const char *str, const char *suffix);
+
+#ifdef _WIN32
+void windows_print_error(char *func_name);
+#endif
+
+#endif
diff --git a/opencl_device_info.h b/opencl_device_info.h
new file mode 100644
index 0000000..375ee7a
--- /dev/null
+++ b/opencl_device_info.h
@@ -0,0 +1,65 @@
+/*
+ * Developed by Claudio André in 2012
+ *
+ * Copyright (c) 2012-2015 Claudio André
+ * This program comes with ABSOLUTELY NO WARRANTY; express or implied. 
+ *
+ * This is free software, and you are welcome to redistribute it
+ * under certain conditions; as expressed here
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ */
+
+#ifndef OPENCL_DEVICE_INFO_H
+#define OPENCL_DEVICE_INFO_H
+
+//Copied from opencl_common.h.  Each non-zero value below is a distinct bit; the predicate macros further down test them with '&'.
+#define DEV_UNKNOWN 0 //0
+#define DEV_CPU (1 << 0) //1
+#define DEV_GPU (1 << 1) //2
+#define DEV_ACCELERATOR (1 << 2) //4
+#define DEV_AMD (1 << 3) //8
+#define DEV_NVIDIA (1 << 4) //16
+#define DEV_INTEL (1 << 5) //32
+#define PLATFORM_APPLE (1 << 6) //64
+#define DEV_AMD_GCN_10 (1 << 7) //128
+#define DEV_AMD_GCN_11 (1 << 8) //256
+#define DEV_AMD_GCN_12 (1 << 9) //512
+#define DEV_AMD_VLIW4 (1 << 12) //4096
+#define DEV_AMD_VLIW5 (1 << 13) //8192
+#define DEV_NV_C2X (1 << 14) //16384
+#define DEV_NV_C30 (1 << 15) //32768
+#define DEV_NV_C32 (1 << 16) //65536
+#define DEV_NV_C35 (1 << 17) //131072
+#define DEV_NV_MAXWELL (1 << 18) //262144
+#define DEV_NV_PASCAL (1 << 19) //524288
+#define DEV_NV_VOLTA (1 << 20) //1M
+#define DEV_USE_LOCAL (1 << 21) //2M
+#define DEV_NO_BYTE_ADDRESSABLE (1 << 22) //4M
+#define DEV_MESA (1 << 23) //8M
+/* Predicates over a mask n of the DEV_* bits above.  The argument is not parenthesized in the expansions, so pass a simple expression. */
+#define cpu(n) ((n & DEV_CPU) == (DEV_CPU))
+#define gpu(n) ((n & DEV_GPU) == (DEV_GPU))
+#define gpu_amd(n) ((n & DEV_AMD) && gpu(n))
+#define gpu_nvidia(n) ((n & DEV_NVIDIA) && gpu(n))
+#define gpu_intel(n) ((n & DEV_INTEL) && gpu(n))
+#define cpu_amd(n) ((n & DEV_AMD) && cpu(n))
+#define cpu_intel(n) ((n & DEV_INTEL) && cpu(n))
+#define amd_gcn_10(n) ((n & DEV_AMD_GCN_10) && gpu_amd(n))
+#define amd_gcn_11(n) ((n & DEV_AMD_GCN_11) && gpu_amd(n))
+#define amd_gcn_12(n) ((n & DEV_AMD_GCN_12) && gpu_amd(n))
+#define amd_gcn(n) (amd_gcn_10(n) || (amd_gcn_11(n)) || amd_gcn_12(n))
+#define amd_vliw4(n) ((n & DEV_AMD_VLIW4) && gpu_amd(n))
+#define amd_vliw5(n) ((n & DEV_AMD_VLIW5) && gpu_amd(n))
+#define nvidia_sm_2x(n) ((n & DEV_NV_C2X) && gpu_nvidia(n))
+#define nvidia_sm_3x(n) (((n & DEV_NV_C30) || (n & DEV_NV_C32) || (n & DEV_NV_C35)) && gpu_nvidia(n))
+#define nvidia_sm_5x(n) ((n & DEV_NV_MAXWELL) && gpu_nvidia(n))
+#define nvidia_sm_6x(n) ((n & DEV_NV_PASCAL) && gpu_nvidia(n))
+#define no_byte_addressable(n) ((n & DEV_NO_BYTE_ADDRESSABLE))
+#define use_local(n) ((n & DEV_USE_LOCAL))
+
+/* Only usable in host code */
+/*#if !_OPENCL_COMPILER
+#define platform_apple(p) (get_platform_vendor_id(p) == PLATFORM_APPLE)
+#endif*/
+
+#endif /* OPENCL_DEVICE_INFO_H */
diff --git a/opencl_lm_hst_dev_shared.h b/opencl_lm_hst_dev_shared.h
new file mode 100644
index 0000000..f8ab052
--- /dev/null
+++ b/opencl_lm_hst_dev_shared.h
@@ -0,0 +1,14 @@
+#ifndef _OPENCL_LM_HST_DEV_SHARED_H
+#define _OPENCL_LM_HST_DEV_SHARED_H
+
+#define WORD int
+#define lm_vector WORD
+
+typedef struct{
+	union {
+		unsigned char c[8][8][sizeof(lm_vector)];
+		lm_vector v[8][8];
+	} xkeys;
+} opencl_lm_transfer; /* An 8x8 array of lm_vector values that can also be accessed byte-wise through xkeys.c. */
+
+#endif /* _OPENCL_LM_HST_DEV_SHARED_H */
diff --git a/opencl_lm_kernel_params.h b/opencl_lm_kernel_params.h
new file mode 100644
index 0000000..45005e4
--- /dev/null
+++ b/opencl_lm_kernel_params.h
@@ -0,0 +1,170 @@
+#include "opencl_lm_hst_dev_shared.h"
+#include "opencl_device_info.h"
+
+typedef unsigned WORD vtype;
+
+/*
+ * Some devices/drivers have problems with the optimized 'goto' program flow.
+ * Some AMD driver versions can't build the "fast goto" version but those that
+ * can run faster. Hawaii on 14.9 fails, Tahiti on 14.9 does not (!?).
+ *
+ * Nvidia can build either kernel but GTX980 is significantly faster with the
+ * "safe goto" version (7% faster for one salt, 16% for many salts).
+ *
+ * OSX' Intel HD4000 driver [1.2(Sep25 2014 22:26:04)] fails building the
+ * "fast goto" version. 
+ */
+#if nvidia_sm_5x(DEVICE_INFO) || gpu_intel(DEVICE_INFO) || \
+	(gpu_amd(DEVICE_INFO) && DEV_VER_MAJOR >= 1573 && !defined(__Tahiti__)) || \
+	(gpu_amd(DEVICE_INFO) && DEV_VER_MAJOR >= 1702)
+//#warning Using 'safe goto' kernel
+#define SAFE_GOTO
+#else
+//#warning Using 'fast goto' kernel
+#endif
+
+#if no_byte_addressable(DEVICE_INFO)
+#define RV7xx
+#endif
+#if gpu_nvidia(DEVICE_INFO)
+#define _NV
+#endif
+/* Bitwise helpers.  vxorf() is an expression; the other v*() macros assign their result to dst. */
+#define vxorf(a, b) \
+	((a) ^ (b))
+#define vnot(dst, a) \
+	(dst) = ~(a)
+#define vand(dst, a, b) \
+	(dst) = (a) & (b)
+#define vor(dst, a, b) \
+	(dst) = (a) | (b)
+#define vandn(dst, a, b) \
+	(dst) = (a) & ~(b)
+#define vxor(dst, a, b) \
+	(dst) = vxorf((a), (b))
+#define vshl(dst, src, shift) \
+	(dst) = (src) << (shift)
+#define vshr(dst, src, shift) \
+	(dst) = (src) >> (shift)
+#define vshl1(dst, src) \
+	vshl((dst), (src), 1)
+/* vsel(dst, a, b, c): per-bit select, dst takes the bit of b where c is 1 and the bit of a where c is 0 (see the explicit formula in the second variant). */
+#if HAVE_LUT3
+#define vsel(dst, a, b, c) (dst) = lut3(a, b, c, 0xd8)
+#elif defined(_NV) || __CPU__
+#define vsel(dst, a, b, c) \
+	(dst) = (((a) & ~(c)) ^ ((b) & (c)))
+#else
+#define vsel(dst, a, b, c) \
+	(dst) = bitselect((a),(b),(c))
+#endif
+
+#if defined(_NV) || __CPU__
+#include "opencl_sboxes.h"
+#else
+#include "opencl_sboxes-s.h"
+#endif
+/* Stores src into the ofs-th lm_vector slot counted from the address of dst (private memory). */
+#define vst_private(dst, ofs, src) \
+	*((__private vtype *)((__private lm_vector *)&(dst) + (ofs))) = (src)
+/* Stores 'zero' into the eight slots B[j]..B[j + 7].  Expects 'B' and 'zero' to be in scope at the point of use. */
+#define lm_clear_block_8(j) \
+	vst_private(B[j] , 0, zero); \
+	vst_private(B[j] , 1, zero); \
+	vst_private(B[j] , 2, zero); \
+	vst_private(B[j] , 3, zero); \
+	vst_private(B[j] , 4, zero); \
+	vst_private(B[j] , 5, zero); \
+	vst_private(B[j] , 6, zero); \
+	vst_private(B[j] , 7, zero);
+/* Clears all 64 slots B[0]..B[63], eight at a time. */
+#define lm_clear_block \
+	lm_clear_block_8(0); \
+	lm_clear_block_8(8); \
+	lm_clear_block_8(16); \
+	lm_clear_block_8(24); \
+	lm_clear_block_8(32); \
+	lm_clear_block_8(40); \
+	lm_clear_block_8(48); \
+	lm_clear_block_8(56);
+
+#if BITMAP_SIZE_BITS_LESS_ONE < 0xffffffff
+#define BITMAP_SIZE_BITS (BITMAP_SIZE_BITS_LESS_ONE + 1)
+#else
+/*undefined, cause error.*/ 
+#endif
+/* GET_HASH_0 / GET_HASH_1: for bit positions [bits, k), OR bit number x of B[bit] (GET_HASH_0) or of B[32 + bit] (GET_HASH_1) into position 'bit' of hash.  Both expect 'B' and a loop variable 'bit' to be in scope. */
+#define GET_HASH_0(hash, x, k, bits) \
+	for (bit = bits; bit < k; bit++) \
+		hash |= ((((uint)B[bit]) >> x) & 1) << bit;
+
+#define GET_HASH_1(hash, x, k, bits) \
+	for (bit = bits; bit < k; bit++) \
+		hash |= ((((uint)B[32 + bit]) >> x) & 1) << bit;
+/* Completes the two 32-bit halves of the candidate in binary[] for bit
+ * position 'depth' of B, then looks it up: the 64-bit value
+ * binary[1]:binary[0] plus offset_table[value % OFFSET_TABLE_SIZE] selects a
+ * slot (modulo HASH_TABLE_SIZE) in hash_table, whose low half is stored at
+ * [slot] and high half at [slot + HASH_TABLE_SIZE].  If both halves match and
+ * the slot's bit in bitmap_dupe was not already set (tested and set with
+ * atomic_or), a three-word record {section * 32 + depth, iter, slot} is
+ * appended to output, using output[0] as an atomically incremented count.
+ * Bits below REQ_BITMAP_BITS of binary[1] (and of binary[0] when
+ * SELECT_CMP_STEPS > 1) are not recomputed here; cmp() supplies them. */
+inline void cmp_final(__private unsigned lm_vector *B,
+		__private unsigned int *binary,
+		__global unsigned int *offset_table,
+		__global unsigned int *hash_table,
+		volatile __global uint *output,
+		volatile __global uint *bitmap_dupe,
+		unsigned int depth,
+		unsigned int section,
+		unsigned int iter)
+{
+	unsigned long hash;
+	unsigned int hash_table_index, t, bit;
+
+#if SELECT_CMP_STEPS > 1
+	GET_HASH_0(binary[0], depth, 32, REQ_BITMAP_BITS);
+	GET_HASH_1(binary[1], depth, 32, REQ_BITMAP_BITS);
+#else
+	binary[0] = 0;
+	GET_HASH_0(binary[0], depth, 32, 0);
+	GET_HASH_1(binary[1], depth, 32, REQ_BITMAP_BITS);
+#endif
+
+	hash = ((unsigned long)binary[1] << 32) | (unsigned long)binary[0];
+	hash += (unsigned long)offset_table[hash % OFFSET_TABLE_SIZE];
+	hash_table_index = hash % HASH_TABLE_SIZE;
+
+	if (hash_table[hash_table_index + HASH_TABLE_SIZE] == binary[1])
+	if (hash_table[hash_table_index] == binary[0])
+	if (!(atomic_or(&bitmap_dupe[hash_table_index/32], (1U << (hash_table_index % 32))) & (1U << (hash_table_index % 32)))) {
+		t = atomic_inc(&output[0]);
+		output[1 + 3 * t] = (section * 32) + depth;
+		output[2 + 3 * t] = iter;
+		output[3 + 3 * t] = hash_table_index;
+	}
+}
+/* For each of the 32 bit positions i of B, gathers the low REQ_BITMAP_BITS
+ * bits of the candidate and tests them against the bitmap(s) in 'bitmaps'
+ * (two bitmaps, both of which must have the bit set, when
+ * SELECT_CMP_STEPS > 1; otherwise one).  Candidates that pass are handed to
+ * cmp_final() for the full hash table lookup. */
+inline void cmp( __private unsigned lm_vector *B,
+		__global unsigned int *offset_table,
+		__global unsigned int *hash_table,
+		__global unsigned int *bitmaps,
+		volatile __global uint *output,
+		volatile __global uint *bitmap_dupe,
+		int section, unsigned int iter) {
+
+	unsigned int value[2] , i, bit, bitmap_index;
+
+	for (i = 0; i < 32; i++){
+#if SELECT_CMP_STEPS > 1
+	value[0] = 0;
+	value[1] = 0;
+	GET_HASH_0(value[0], i, REQ_BITMAP_BITS, 0);
+	GET_HASH_1(value[1], i, REQ_BITMAP_BITS, 0);
+	bitmap_index = value[1] & (BITMAP_SIZE_BITS - 1);
+	bit = (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
+	bitmap_index = value[0] & (BITMAP_SIZE_BITS - 1);
+	bit &= (bitmaps[(BITMAP_SIZE_BITS >> 5) + (bitmap_index >> 5)] >> (bitmap_index & 31)) & 1U;
+#else
+	value[1] = 0;
+	GET_HASH_1(value[1], i, REQ_BITMAP_BITS, 0);
+	bitmap_index = value[1] & BITMAP_SIZE_BITS_LESS_ONE;
+	bit = (bitmaps[bitmap_index >> 5] >> (bitmap_index & 31)) & 1U;
+#endif
+	if (bit)
+		cmp_final(B, value, offset_table, hash_table, output, bitmap_dupe, i, section, iter);
+	}
+}
diff --git a/opencl_misc.h b/opencl_misc.h
new file mode 100644
index 0000000..e87b967
--- /dev/null
+++ b/opencl_misc.h
@@ -0,0 +1,798 @@
+/*
+ * OpenCL common macros
+ *
+ * Copyright (c) 2014-2015, magnum
+ * This software is hereby released to the general public under
+ * the following terms: Redistribution and use in source and binary
+ * forms, with or without modification, are permitted.
+ *
+ * NOTICE: After changes in headers, with nvidia driver you probably
+ * need to drop cached kernels to ensure the changes take effect:
+ *
+ * rm -fr ~/.nv/ComputeCache
+ *
+ */
+
+#ifndef _OPENCL_MISC_H
+#define _OPENCL_MISC_H
+
+#include "opencl_device_info.h"
+
+/* Note: long is *always* 64-bit in OpenCL */
+typedef uchar uint8_t;
+typedef char int8_t;
+typedef ushort uint16_t;
+typedef short int16_t;
+typedef uint uint32_t;
+typedef int int32_t;
+typedef ulong uint64_t;
+typedef long int64_t;
+
+/* Nvidia bug workaround nicked from hashcat. 
These are for __constant arrays */
+#if gpu_nvidia(DEVICE_INFO)
+#define __const_a8 __constant __attribute__ ((aligned (8)))
+#else
+#define __const_a8 __constant
+#endif
+/* host_size_t: an integer as wide as the host's size_t, chosen from the SIZEOF_SIZE_T build define. */
+#if SIZEOF_SIZE_T == 8
+typedef uint64_t host_size_t;
+#else
+typedef uint32_t host_size_t;
+#endif
+
+/*
+ * "Copy" of the one in dyna_salt.h (we only need it to be right size,
+ * bitfields are not allowed in OpenCL)
+ */
+typedef struct dyna_salt_t {
+	host_size_t salt_cmp_size;
+	host_size_t bitfield_and_offset;
+} dyna_salt;
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+/*
+ * Host code may pass -DV_WIDTH=2 or some other width.  When V_WIDTH is not
+ * greater than 1 the scalar types are used and SCALAR is defined.
+ */
+#if V_WIDTH > 1
+#define MAYBE_VECTOR_UINT VECTOR(uint, V_WIDTH)
+#define MAYBE_VECTOR_ULONG VECTOR(ulong, V_WIDTH)
+#else
+#define MAYBE_VECTOR_UINT uint
+#define MAYBE_VECTOR_ULONG ulong
+#define SCALAR 1
+#endif
+
+#if SCALAR && 0 /* Used for testing.  The "&& 0" keeps this software lut3() disabled.  For every bit position it uses the x, y and z bits as a 3-bit index into the truth table m. */
+#define HAVE_LUT3 1
+inline uint lut3(uint x, uint y, uint z, uchar m)
+{
+	uint i;
+	uint r = 0;
+	for (i = 0; i < sizeof(uint) * 8; i++)
+		r |= (uint)((m >> ( (((x >> i) & 1) << 2) |
+		                    (((y >> i) & 1) << 1) |
+		                     ((z >> i) & 1) )) & 1) << i;
+	return r;
+}
+#endif
+
+/*
+ * Apparently nvidias can optimize stuff better (ending up in *better* LUT
+ * use) with the basic formulas instead of bitselect ones. Most formats
+ * show no difference but pwsafe does. 
+ */
+#if !gpu_nvidia(DEVICE_INFO)
+#define USE_BITSELECT 1
+#endif
+
+#if SM_MAJOR == 1
+#define OLD_NVIDIA 1
+#endif
+
+#if cpu(DEVICE_INFO)
+#define HAVE_ANDNOT 1
+#endif
+/* Hardware lut3(): emits a single PTX lop3.b32 instruction with imm as its immediate operand. */
+#if SCALAR && SM_MAJOR >= 5 && (DEV_VER_MAJOR > 352 || (DEV_VER_MAJOR == 352 && DEV_VER_MINOR >= 21))
+#define HAVE_LUT3 1
+inline uint lut3(uint a, uint b, uint c, uint imm)
+{
+	uint r;
+	asm("lop3.b32 %0, %1, %2, %3, %4;"
+	    : "=r" (r)
+	    : "r" (a), "r" (b), "r" (c), "i" (imm));
+	return r;
+}
+
+#if 0 /* This does no good */
+#define HAVE_LUT3_64 1
+inline ulong lut3_64(ulong a, ulong b, ulong c, uint imm)
+{
+	ulong t, r;
+
+	asm("lop3.b32 %0, %1, %2, %3, %4;"
+	    : "=r" (t)
+	    : "r" ((uint)a), "r" ((uint)b), "r" ((uint)c), "i" (imm));
+	r = t;
+	asm("lop3.b32 %0, %1, %2, %3, %4;"
+	    : "=r" (t)
+	    : "r" ((uint)(a >> 32)), "r" ((uint)(b >> 32)), "r" ((uint)(c >> 32)), "i" (imm));
+	return r + (t << 32);
+}
+#endif
+#endif
+/* BITALIGN(hi, lo, s): see the generic fallback below for the formula.  NOTE(review): in the #elif, && binds tighter than ||, so SCALAR only guards the "SM_MAJOR > 3" alternative -- confirm that is intended. */
+#if defined cl_amd_media_ops
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#define BITALIGN(hi, lo, s) amd_bitalign((hi), (lo), (s))
+#elif SCALAR && SM_MAJOR > 3 || (SM_MAJOR == 3 && SM_MINOR >= 2)
+inline uint funnel_shift_right(uint hi, uint lo, uint s)
+{
+	uint r;
+	asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+	    : "=r" (r)
+	    : "r" (lo), "r" (hi), "r" (s));
+	return r;
+}
+
+inline uint funnel_shift_right_imm(uint hi, uint lo, uint s)
+{
+	uint r;
+	asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+	    : "=r" (r)
+	    : "r" (lo), "r" (hi), "i" (s));
+	return r;
+}
+#define BITALIGN(hi, lo, s) funnel_shift_right(hi, lo, s)
+#define BITALIGN_IMM(hi, lo, s) funnel_shift_right_imm(hi, lo, s)
+#else
+#define BITALIGN(hi, lo, s) (((hi) << (32 - (s))) | ((lo) >> (s)))
+#endif
+
+#ifndef BITALIGN_IMM
+#define BITALIGN_IMM(hi, lo, s) BITALIGN(hi, lo, s)
+#endif
+/* VECTOR(x, y) pastes a type name and a width together after expanding both (e.g. with y defined as 4, VECTOR(uint, y) gives uint4). */
+#define CONCAT(TYPE,WIDTH) TYPE ## WIDTH
+#define VECTOR(x, y) CONCAT(x, y)
+
+/* Workaround for problem seen with 9600GT */
+#ifndef MAYBE_CONSTANT
+#if OLD_NVIDIA
+#define MAYBE_CONSTANT __global const
+#else
+#define MAYBE_CONSTANT __constant
+#endif
+#endif
+/* SWAP32 / SWAP64 reverse the byte order of a 32-bit / 64-bit value; two implementations are provided, chosen by USE_BITSELECT. */
+#if USE_BITSELECT
+inline uint SWAP32(uint x)
+{
+	return bitselect(rotate(x, 24U), rotate(x, 8U), 0x00FF00FFU);
+}
+
+#define SWAP64(n)	bitselect( \
+		bitselect(rotate(n, 24UL), \
+		          rotate(n, 8UL), 0x000000FF000000FFUL), \
+		bitselect(rotate(n, 56UL), \
+		          rotate(n, 40UL), 0x00FF000000FF0000UL), \
+		0xFFFF0000FFFF0000UL)
+#else
+inline uint SWAP32(uint x)
+{
+	x = rotate(x, 16U);
+	return ((x & 0x00FF00FF) << 8) + ((x >> 8) & 0x00FF00FF);
+}
+
+// You would not believe how many driver bugs variants of this macro reveal
+#define SWAP64(n) \
+	(((n) << 56) | (((n) & 0xff00) << 40) | \
+	 (((n) & 0xff0000) << 24) | (((n) & 0xff000000) << 8) | \
+	 (((n) >> 8) & 0xff000000) | (((n) >> 24) & 0xff0000) | \
+	 (((n) >> 40) & 0xff00) | ((n) >> 56))
+#endif
+
+#if SCALAR
+#define VSWAP32 SWAP32
+#else
+/* Vector-capable swap32() */
+inline MAYBE_VECTOR_UINT VSWAP32(MAYBE_VECTOR_UINT x)
+{
+	x = rotate(x, 16U);
+	return ((x & 0x00FF00FF) << 8) + ((x >> 8) & 0x00FF00FF);
+}
+#endif
+
+/*
+ * These macros must not require alignment of (b). 
+ * They read or write the value one byte at a time, starting at byte index i
+ * of b.  The plain variants put the least significant byte first, the BE
+ * variants the most significant byte first.
+ */
+#define GET_UINT32(n, b, i) \
+	{ \
+		(n) = ((uint) (b)[(i)] ) \
+			| ((uint) (b)[(i) + 1] << 8) \
+			| ((uint) (b)[(i) + 2] << 16) \
+			| ((uint) (b)[(i) + 3] << 24); \
+	}
+
+#define PUT_UINT32(n, b, i) \
+	{ \
+		(b)[(i) ] = (uchar) ((n) ); \
+		(b)[(i) + 1] = (uchar) ((n) >> 8); \
+		(b)[(i) + 2] = (uchar) ((n) >> 16); \
+		(b)[(i) + 3] = (uchar) ((n) >> 24); \
+	}
+
+#define GET_UINT32BE(n, b, i) \
+	{ \
+		(n) = ((uint) (b)[(i)] << 24) \
+			| ((uint) (b)[(i) + 1] << 16) \
+			| ((uint) (b)[(i) + 2] << 8) \
+			| ((uint) (b)[(i) + 3] ); \
+	}
+
+#define PUT_UINT32BE(n, b, i) \
+	{ \
+		(b)[(i) ] = (uchar) ((n) >> 24); \
+		(b)[(i) + 1] = (uchar) ((n) >> 16); \
+		(b)[(i) + 2] = (uchar) ((n) >> 8); \
+		(b)[(i) + 3] = (uchar) ((n) ); \
+	}
+
+#define PUT_UINT64(n, b, i) \
+	{ \
+		(b)[(i) ] = (uchar) ((n) ); \
+		(b)[(i) + 1] = (uchar) ((ulong)(n) >> 8); \
+		(b)[(i) + 2] = (uchar) ((ulong)(n) >> 16); \
+		(b)[(i) + 3] = (uchar) ((ulong)(n) >> 24); \
+		(b)[(i) + 4] = (uchar) ((ulong)(n) >> 32); \
+		(b)[(i) + 5] = (uchar) ((ulong)(n) >> 40); \
+		(b)[(i) + 6] = (uchar) ((ulong)(n) >> 48); \
+		(b)[(i) + 7] = (uchar) ((ulong)(n) >> 56); \
+	}
+
+#define GET_UINT64BE(n, b, i) \
+	{ \
+		(n) = ((ulong) (b)[(i)] << 56) \
+			| ((ulong) (b)[(i) + 1] << 48) \
+			| ((ulong) (b)[(i) + 2] << 40) \
+			| ((ulong) (b)[(i) + 3] << 32) \
+			| ((ulong) (b)[(i) + 4] << 24) \
+			| ((ulong) (b)[(i) + 5] << 16) \
+			| ((ulong) (b)[(i) + 6] << 8) \
+			| ((ulong) (b)[(i) + 7] ); \
+	}
+
+#define PUT_UINT64BE(n, b, i) \
+	{ \
+		(b)[(i) ] = (uchar) ((ulong)(n) >> 56); \
+		(b)[(i) + 1] = (uchar) ((ulong)(n) >> 48); \
+		(b)[(i) + 2] = (uchar) ((ulong)(n) >> 40); \
+		(b)[(i) + 3] = (uchar) ((ulong)(n) >> 32); \
+		(b)[(i) + 4] = (uchar) ((ulong)(n) >> 24); \
+		(b)[(i) + 5] = (uchar) ((ulong)(n) >> 16); \
+		(b)[(i) + 6] = (uchar) ((ulong)(n) >> 8); \
+		(b)[(i) + 7] = (uchar) ((n) ); \
+	}
+
+/*
+ * These require (b) to be aligned! 
+ * They access whole 32-bit or 64-bit words of b; i is a byte offset that is
+ * shifted down to a word index.  A byte swap is applied whenever the
+ * requested byte order differs from the device's (__ENDIAN_LITTLE__).
+ */
+#if __ENDIAN_LITTLE__
+#define GET_UINT32_ALIGNED(n, b, i)	(n) = ((uint*)(b))[(i) >> 2]
+#define PUT_UINT32_ALIGNED(n, b, i)	((uint*)(b))[(i) >> 2] = (n)
+#define GET_UINT32BE_ALIGNED(n, b, i)	(n) = SWAP32(((uint*)(b))[(i) >> 2])
+#define PUT_UINT32BE_ALIGNED(n, b, i)	((uint*)(b))[(i) >> 2] = SWAP32(n)
+#define PUT_UINT64_ALIGNED(n, b, i)	((ulong*)(b))[(i) >> 3] = (n)
+#define GET_UINT64BE_ALIGNED(n, b, i)	(n) = SWAP64(((ulong*)(b))[(i) >> 3])
+#define PUT_UINT64BE_ALIGNED(n, b, i)	((ulong*)(b))[(i) >> 3] = SWAP64(n)
+#else
+#define GET_UINT32_ALIGNED(n, b, i)	(n) = SWAP32(((uint*)(b))[(i) >> 2])
+#define PUT_UINT32_ALIGNED(n, b, i)	((uint*)(b))[(i) >> 2] = SWAP32(n)
+#define GET_UINT32BE_ALIGNED(n, b, i)	(n) = ((uint*)(b))[(i) >> 2]
+#define PUT_UINT32BE_ALIGNED(n, b, i)	((uint*)(b))[(i) >> 2] = (n)
+#define PUT_UINT64_ALIGNED(n, b, i)	((ulong*)(b))[(i) >> 3] = SWAP64(n)
+#define GET_UINT64BE_ALIGNED(n, b, i)	(n) = ((ulong*)(b))[(i) >> 3]
+#define PUT_UINT64BE_ALIGNED(n, b, i)	((ulong*)(b))[(i) >> 3] = (n)
+#endif
+
+/* Any device can do 8-bit reads BUT these macros are scalar only! 
*/
+#define GETCHAR(buf, index) (((uchar*)(buf))[(index)])
+#define GETCHAR_G(buf, index) (((__global uchar*)(buf))[(index)])
+#define GETCHAR_L(buf, index) (((__local uchar*)(buf))[(index)])
+#define GETCHAR_BE(buf, index) (((uchar*)(buf))[(index) ^ 3])
+#define GETCHAR_MC(buf, index) (((MAYBE_CONSTANT uchar*)(buf))[(index)])
+#define LASTCHAR_BE(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & (0xffffff00U << ((((index) & 3) ^ 3) << 3))) + ((val) << ((((index) & 3) ^ 3) << 3))
+
+#if no_byte_addressable(DEVICE_INFO) || !SCALAR || (gpu_amd(DEVICE_INFO) && defined(AMD_PUTCHAR_NOCAST))
+/* 32-bit stores: buf is indexed as 32-bit words and the addressed byte (or 16-bit half) is replaced by mask-and-add.  NOTE(review): PUTSHORT_BE uses a shift count of 32 or 48, which presumably relies on OpenCL C reducing shift counts modulo the operand width -- confirm. */
+#define PUTCHAR(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & ~(0xffU << (((index) & 3) << 3))) + ((val) << (((index) & 3) << 3))
+#define PUTCHAR_G	PUTCHAR
+#define PUTCHAR_L	PUTCHAR
+#define PUTCHAR_BE(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & ~(0xffU << ((((index) & 3) ^ 3) << 3))) + ((val) << ((((index) & 3) ^ 3) << 3))
+#define PUTCHAR_BE_G	PUTCHAR_BE
+#define PUTSHORT(buf, index, val) (buf)[(index)>>1] = ((buf)[(index)>>1] & ~(0xffffU << (((index) & 1) << 4))) + ((val) << (((index) & 1) << 4))
+#define PUTSHORT_BE(buf, index, val) (buf)[(index)>>1] = ((buf)[(index)>>1] & ~(0xffffU << ((((index) & 1) ^ 3) << 4))) + ((val) << ((((index) & 1) ^ 3) << 4))
+#define XORCHAR(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2]) ^ ((val) << (((index) & 3) << 3))
+#define XORCHAR_BE(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2]) ^ ((val) << ((((index) & 3) ^ 3) << 3))
+
+#else
+/* 8-bit stores: buf is cast to a byte (or ushort) pointer and written directly. */
+#define PUTCHAR(buf, index, val) ((uchar*)(buf))[index] = (val)
+#define PUTCHAR_G(buf, index, val) ((__global uchar*)(buf))[(index)] = (val)
+#define PUTCHAR_L(buf, index, val) ((__local uchar*)(buf))[(index)] = (val)
+#define PUTCHAR_BE(buf, index, val) ((uchar*)(buf))[(index) ^ 3] = (val)
+#define PUTCHAR_BE_G(buf, index, val) ((__global uchar*)(buf))[(index) ^ 3] = (val)
+#define PUTSHORT(buf, index, val) ((ushort*)(buf))[index] = (val)
+#define PUTSHORT_BE(buf, index, val) ((ushort*)(buf))[(index) ^ 1] = (val)
+#define XORCHAR(buf, index, val) ((uchar*)(buf))[(index)] ^= (val)
+#define XORCHAR_BE(buf, index, val) ((uchar*)(buf))[(index) ^ 3] ^= (val)
+#endif
+/* Checks the padding at the end of the len bytes in data, where the last byte
+ * gives the number of padding bytes and every padding byte holds that same
+ * value.  Returns the length without the padding, or -1 if len is smaller
+ * than blocksize or fails the "len & (blocksize - 1)" multiple test (which
+ * assumes a power-of-two blocksize), if the pad length is not in
+ * 1..blocksize, or if any padding byte differs. */
+inline int check_pkcs_pad(const uchar *data, int len, int blocksize)
+{
+	int pad_len, padding, real_len;
+
+	if (len & (blocksize - 1) || len < blocksize)
+		return -1;
+
+	pad_len = data[len - 1];
+
+	if (pad_len < 1 || pad_len > blocksize)
+		return -1;
+
+	real_len = len - pad_len;
+	data += real_len;
+
+	padding = pad_len;
+
+	while (pad_len--)
+		if (*data++ != padding)
+			return -1;
+
+	return real_len;
+}
+
+/*
+ * Use with some caution. Memory type agnostic and if both src and dst are
+ * 8-bit types, this works like a normal memcpy.
+ *
+ * If src and dst are larger but same size, it will still work fine but
+ * 'count' is number of ELEMENTS and not BYTES.
+ *
+ * If src and dst are different size types, you will get what you asked for...
+ */
+#define memcpy_macro(dst, src, count) do { \
+		uint c = count; \
+		for (uint _i = 0; _i < c; _i++) \
+			(dst)[_i] = (src)[_i]; \
+	} while (0)
+
+/*
+ * Optimized functions. You need to pick the one that corresponds to the
+ * source- and destination memory type(s).
+ *
+ * Note that for very small sizes, the overhead may make these functions
+ * slower than naive code. On the other hand, due to inlining we will
+ * hopefully have stuff optimized away more often than not! 
+ *
+ * Each function works in three steps: bytes are processed one at a time until
+ * the pointer is 4-byte aligned, then whole 32-bit words, then the remaining
+ * bytes.  The word step is only used by the copy and compare functions when
+ * both pointers have the same alignment.  In the first step the remaining
+ * count is tested before it is decremented: "misaligned && count--" wrapped
+ * the unsigned count around to UINT_MAX when it ran out before alignment was
+ * reached, which sent the following loops far past the end of the buffers.
+ */
+
+/* src and dst are private mem.  Copies count bytes. */
+inline void memcpy_pp(void *dst, const void *src, uint count)
+{
+	union {
+		const uint *w;
+		const uchar *c;
+	} s;
+	union {
+		uint *w;
+		uchar *c;
+	} d;
+
+	s.c = src;
+	d.c = dst;
+
+	if (((size_t)dst & 0x03) == ((size_t)src & 0x03)) {
+		while ((((size_t)s.c) & 0x03) && count) {
+			*d.c++ = *s.c++;
+			count--;
+		}
+
+		while (count >= 4) {
+			*d.w++ = *s.w++;
+			count -= 4;
+		}
+	}
+
+	while (count--) {
+		*d.c++ = *s.c++;
+	}
+}
+
+/* src is private mem, dst is global mem.  Copies count bytes. */
+inline void memcpy_pg(__global void *dst, const void *src, uint count)
+{
+	union {
+		const uint *w;
+		const uchar *c;
+	} s;
+	union {
+		__global uint *w;
+		__global uchar *c;
+	} d;
+
+	s.c = src;
+	d.c = dst;
+
+	if (((size_t)dst & 0x03) == ((size_t)src & 0x03)) {
+		while ((((size_t)s.c) & 0x03) && count) {
+			*d.c++ = *s.c++;
+			count--;
+		}
+
+		while (count >= 4) {
+			*d.w++ = *s.w++;
+			count -= 4;
+		}
+	}
+
+	while (count--) {
+		*d.c++ = *s.c++;
+	}
+}
+
+/* src is global mem, dst is private mem.  Copies count bytes. */
+inline void memcpy_gp(void *dst, __global const void *src, uint count)
+{
+	union {
+		__global const uint *w;
+		__global const uchar *c;
+	} s;
+	union {
+		uint *w;
+		uchar *c;
+	} d;
+
+	s.c = src;
+	d.c = dst;
+
+	if (((size_t)dst & 0x03) == ((size_t)src & 0x03)) {
+		while ((((size_t)s.c) & 0x03) && count) {
+			*d.c++ = *s.c++;
+			count--;
+		}
+
+		while (count >= 4) {
+			*d.w++ = *s.w++;
+			count -= 4;
+		}
+	}
+
+	while (count--) {
+		*d.c++ = *s.c++;
+	}
+}
+
+/* src is constant mem, dst is private mem.  Copies count bytes. */
+inline void memcpy_cp(void *dst, __constant void *src, uint count)
+{
+	union {
+		__constant uint *w;
+		__constant uchar *c;
+	} s;
+	union {
+		uint *w;
+		uchar *c;
+	} d;
+
+	s.c = src;
+	d.c = dst;
+
+	if (((size_t)dst & 0x03) == ((size_t)src & 0x03)) {
+		while ((((size_t)s.c) & 0x03) && count) {
+			*d.c++ = *s.c++;
+			count--;
+		}
+
+		while (count >= 4) {
+			*d.w++ = *s.w++;
+			count -= 4;
+		}
+	}
+
+	while (count--) {
+		*d.c++ = *s.c++;
+	}
+}
+
+/* src is MAYBE_CONSTANT mem, dst is private mem.  Copies count bytes. */
+inline void memcpy_mcp(void *dst, MAYBE_CONSTANT void *src, uint count)
+{
+	union {
+		MAYBE_CONSTANT uint *w;
+		MAYBE_CONSTANT uchar *c;
+	} s;
+	union {
+		uint *w;
+		uchar *c;
+	} d;
+
+	s.c = src;
+	d.c = dst;
+
+	if (((size_t)dst & 0x03) == ((size_t)src & 0x03)) {
+		while ((((size_t)s.c) & 0x03) && count) {
+			*d.c++ = *s.c++;
+			count--;
+		}
+
+		while (count >= 4) {
+			*d.w++ = *s.w++;
+			count -= 4;
+		}
+	}
+
+	while (count--) {
+		*d.c++ = *s.c++;
+	}
+}
+
+/* dst is private mem.  Sets count bytes to the low 8 bits of val; callers are
+ * expected to pass a value in 0..255, since the word-sized fill pattern is
+ * built from val without masking it. */
+inline void memset_p(void *p, uint val, uint count)
+{
+	const uint val4 = val | (val << 8) | (val << 16) | (val << 24);
+	union {
+		uint *w;
+		uchar *c;
+	} d;
+
+	d.c = p;
+
+	while ((((size_t)d.c) & 0x03) && count) {
+		*d.c++ = val;
+		count--;
+	}
+
+	while (count >= 4) {
+		*d.w++ = val4;
+		count -= 4;
+	}
+
+	while (count--)
+		*d.c++ = val;
+}
+
+/* dst is global mem.  Same contract as memset_p(). */
+inline void memset_g(__global void *p, uint val, uint count)
+{
+	const uint val4 = val | (val << 8) | (val << 16) | (val << 24);
+	union {
+		__global uint *w;
+		__global uchar *c;
+	} d;
+
+	d.c = p;
+
+	while ((((size_t)d.c) & 0x03) && count) {
+		*d.c++ = val;
+		count--;
+	}
+
+	while (count >= 4) {
+		*d.w++ = val4;
+		count -= 4;
+	}
+
+	while (count--)
+		*d.c++ = val;
+}
+
+/* s1 and s2 are private mem.  Returns 0 if the first size bytes are equal and
+ * 1 otherwise; unlike the C library's memcmp() the result carries no
+ * ordering information. */
+inline int memcmp_pp(const void *s1, const void *s2, uint size)
+{
+	union {
+		const uint *w;
+		const uchar *c;
+	} a;
+	union {
+		const uint *w;
+		const uchar *c;
+	} b;
+
+	a.c = s1;
+	b.c = s2;
+
+	if (((size_t)s1 & 0x03) == ((size_t)s2 & 0x03)) {
+		while ((((size_t)a.c) & 0x03) && size) {
+			if (*b.c++ != *a.c++)
+				return 1;
+			size--;
+		}
+
+		while (size >= 4) {
+			if (*b.w++ != *a.w++)
+				return 1;
+			size -= 4;
+		}
+	}
+
+	while (size--)
+		if (*b.c++ != *a.c++)
+			return 1;
+
+	return 0;
+}
+
+/* s1 is private mem, s2 is constant mem */
+inline int memcmp_pc(const void *s1, __constant const void *s2, uint size)
+{
+	union {
+		const uint *w;
+		const uchar *c;
+	} a;
+	union {
+		__constant const uint *w;
+		__constant const uchar *c;
+	} b;
+
+	a.c = s1;
+	b.c = s2;
+
+	if (((size_t)s1 & 
0x03) == ((size_t)s2 & 0x03)) { + while (((size_t)a.c) & 0x03 && size--) + if (*b.c++ != *a.c++) + return 1; + + while (size >= 4) { + if (*b.w++ != *a.w++) + return 1; + size -= 4; + } + } + + while (size--) + if (*b.c++ != *a.c++) + return 1; + + return 0; +} + +/* s1 is private mem, s2 is MAYBE_CONSTANT mem */ +inline int memcmp_pmc(const void *s1, MAYBE_CONSTANT void *s2, uint size) +{ + union { + const uint *w; + const uchar *c; + } a; + union { + MAYBE_CONSTANT uint *w; + MAYBE_CONSTANT uchar *c; + } b; + + a.c = s1; + b.c = s2; + + if (((size_t)s1 & 0x03) == ((size_t)s2 & 0x03)) { + while (((size_t)a.c) & 0x03 && size--) + if (*b.c++ != *a.c++) + return 1; + + while (size >= 4) { + if (*b.w++ != *a.w++) + return 1; + size -= 4; + } + } + + while (size--) + if (*b.c++ != *a.c++) + return 1; + + return 0; +} + +/* haystack is private mem, needle is constant mem */ +inline int memmem_pc(const void *haystack, size_t haystack_len, + __constant const void *needle, size_t needle_len) +{ + char* haystack_ = (char*)haystack; + __constant const char* needle_ = (__constant const char*)needle; + int hash = 0; + int hay_hash = 0; + char* last; + size_t i; + + if (haystack_len < needle_len) + return 0; + + if (!needle_len) + return 1; + + for (i = needle_len; i; --i) { + hash += *needle_++; + hay_hash += *haystack_++; + } + + haystack_ = (char*)haystack; + needle_ = (__constant char*)needle; + last = haystack_+(haystack_len - needle_len + 1); + for (; haystack_ < last; ++haystack_) { + if (hash == hay_hash && + *haystack_ == *needle_ && + !memcmp_pc (haystack_, needle_, needle_len)) + return 1; + + hay_hash -= *haystack_; + hay_hash += *(haystack_+needle_len); + } + + return 0; +} + +#define STRINGIZE2(s) #s +#define STRINGIZE(s) STRINGIZE2(s) + +/* + * The reason the functions below are macros is it's the only way we can use + * them regardless of memory type (eg. __local or __global). The downside is + * we can't cast them so we need eg. 
/*
 * Debug helpers that print a buffer as hex. They are macros so that they
 * can be used on a buffer regardless of its memory type.
 */

/* Dump an array (or variable) as hex */
#define dump(x)   dump_stuff_msg(STRINGIZE(x), x, sizeof(x))
#define dump_stuff(x, size) dump_stuff_msg(STRINGIZE(x), x, size)

/*
 * This clumsy beast finally hides the problem from user: it picks the
 * printer that matches the element size of x. 'size' is in bytes.
 * Element sizes other than 1, 2, 4 or 8 print nothing.
 */
#define dump_stuff_msg(msg, x, size) do {	  \
		switch (sizeof((x)[0])) { \
		case 8: \
			dump_stuff64_msg(msg, x, size); \
			break; \
		case 4: \
			dump_stuff32_msg(msg, x, size); \
			break; \
		case 2: \
			dump_stuff16_msg(msg, x, size); \
			break; \
		case 1: \
			dump_stuff8_msg(msg, x, size); \
			break; \
		default: \
			break; \
		} \
	} while (0)

/*
 * requires char/uchar
 *
 * Each element is masked to 8 bits so that a signed char with the top bit
 * set prints as two hex digits rather than a sign-extended 32-bit value.
 */
#define dump_stuff8_msg(msg, x, size) do {	  \
		uint ii; \
		printf("%s : ", msg); \
		for (ii = 0; ii < (uint)(size); ii++) { \
			printf("%02x", (uint)(x)[ii] & 0xffU); \
			if (ii % 4 == 3) \
				printf(" "); \
		} \
		printf("\n"); \
	} while (0)

/*
 * requires short/ushort
 *
 * Each element is masked to 16 bits for the same reason as in
 * dump_stuff8_msg.
 */
#define dump_stuff16_msg(msg, x, size) do {	  \
		uint ii; \
		printf("%s : ", msg); \
		for (ii = 0; ii < (uint)(size)/2; ii++) { \
			printf("%04x", (uint)(x)[ii] & 0xffffU); \
			if (ii % 2 == 1) \
				printf(" "); \
		} \
		printf("\n"); \
	} while (0)

/* requires int/uint; each word goes through SWAP32 before printing */
#define dump_stuff32_msg(msg, x, size) do {	  \
		uint ii; \
		printf("%s : ", msg); \
		for (ii = 0; ii < (uint)(size)/4; ii++) \
			printf("%08x ", SWAP32((x)[ii])); \
		printf("\n"); \
	} while (0)

/* requires long/ulong; each word goes through SWAP64 before printing */
#define dump_stuff64_msg(msg, x, size) do {	  \
		uint ii; \
		printf("%s : ", msg); \
		for (ii = 0; ii < (uint)(size)/8; ii++) \
			printf("%016lx ", SWAP64((x)[ii])); \
		printf("\n"); \
	} while (0)
/*
 * Target description knobs. In the part of the file visible here they are
 * referenced only by the "//#if ..." selector lines in front of each S-box
 * variant, and those selectors are commented out, so the values below do not
 * currently choose which variant gets compiled.
 */

/* presumably: an AND-NOT gate is available -- may be predefined by the includer */
#ifndef andn
#define andn 1
#endif

/* presumably: three-operand instructions are available (0 on x86) -- TODO confirm */
#undef triop
#if defined(__x86_64__) || defined(__i386__)
#define triop 0
#else
#define triop 1
#endif

/* presumably: number of registers assumed for the target, cf. "NN regs" in the variant headers */
#undef regs
#if defined(__x86_64__) && defined(__SSE2__)
/* Also for AVX, XOP (we assume that these imply/define SSE2) */
#define regs 16
#elif defined(__x86_64__)
#define regs 15
#elif defined(__i386__)
/* Hopefully, at least MMX */
#define regs 8
#else
/* PowerPC with AltiVec, etc. */
#define regs 32
#endif

#undef latency
/* Latency 2 may also mean dual-issue with latency 1 */
#define latency 2
/* s1-00484, 49 gates, 17 regs, 11 andn, 4/9/39/79/120 stalls, 74 biop */
/* Currently used for MMX/SSE2 and x86-64 SSE2 */
/*
 * Gate-level expression of an S-box (presumably DES S-box 1, going by the
 * name and the file header).
 *
 * a1..a6: the six inputs
 * out:    private array that receives the four outputs
 * c1..c4: indices into out for outputs 1..4
 *
 * Each output is written as vxor(out[c], out[c], value), i.e. presumably
 * XORed into what out[c] already holds. vxor/vand/vor/vandn/vnot are
 * defined outside this chunk -- TODO confirm their operand order.
 *
 * NOTE(review): the temporaries look like they are named after the hex
 * truth-table value each gate produces; a2 only enters in the final gate of
 * each output.
 */
inline void
s1(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
	__private vtype * out,
	vtype c1, vtype c2 ,vtype c3 , vtype c4)
{
	vtype x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, x25202160;
	vtype x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93;
	vtype x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69;
	vtype x0A0A0000, x0AD80096, x00999900, x0AD99996;
	vtype x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC;
	vtype x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0;
	vtype x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A;
	vtype x5AFF5AFF, x52B11215, x4201C010, x10B0D205;
	vtype x00, x01, x10, x11, x20, x21, x30, x31;

	/* Shared intermediate terms */
	vandn(x55005500, a1, a5);
	vxor(x5A0F5A0F, a4, x55005500);
	vor(x3333FFFF, a3, a6);
	vxor(x66666666, a1, a3);
	vand(x22226666, x3333FFFF, x66666666);
	vxor(x2D2D6969, a4, x22226666);
	vandn(x25202160, x2D2D6969, x5A0F5A0F);

	vxor(x00FFFF00, a5, a6);
	vxor(x33CCCC33, a3, x00FFFF00);
	vandn(x4803120C, x5A0F5A0F, x33CCCC33);
	vor(x2222FFFF, a6, x22226666);
	vxor(x6A21EDF3, x4803120C, x2222FFFF);
	vandn(x4A01CC93, x6A21EDF3, x25202160);

	vor(x5555FFFF, a1, a6);
	vor(x7F75FFFF, x6A21EDF3, x5555FFFF);
	vandn(x00D20096, a5, x2D2D6969);
	vxor(x7FA7FF69, x7F75FFFF, x00D20096);

	vandn(x0A0A0000, a4, x5555FFFF);
	vxor(x0AD80096, x00D20096, x0A0A0000);
	vandn(x00999900, x00FFFF00, x66666666);
	vor(x0AD99996, x0AD80096, x00999900);

	/* Output 3 -> out[c3] */
	vandn(x22332233, a3, x55005500);
	vxor(x257AA5F0, x5A0F5A0F, x7F75FFFF);
	vandn(x054885C0, x257AA5F0, x22332233);
	vnot(xFAB77A3F, x054885C0);
	vand(x2221EDF3, x3333FFFF, x6A21EDF3);
	vxor(xD89697CC, xFAB77A3F, x2221EDF3);
	vandn(x20, x7FA7FF69, a2);
	vxor(x21, x20, xD89697CC);
	vxor(out[c3], out[c3], x21);


	/* Output 1 -> out[c1] */
	vxor(x05B77AC0, x00FFFF00, x054885C0);
	vor(x05F77AD6, x00D20096, x05B77AC0);
	vxor(x36C48529, x3333FFFF, x05F77AD6);
	vxor(x6391D07C, a1, x36C48529);
	vxor(xBB0747B0, xD89697CC, x6391D07C);
	vor(x00, x25202160, a2);
	vxor(x01, x00, xBB0747B0);
	vxor(out[c1], out[c1], x01);

	/* Output 2 -> out[c2] */
	vxor(x4C460000, x3333FFFF, x7F75FFFF);
	vor(x4EDF9996, x0AD99996, x4C460000);
	vxor(x2D4E49EA, x6391D07C, x4EDF9996);
	vor(xBBFFFFB0, x00FFFF00, xBB0747B0);
	vxor(x96B1B65A, x2D4E49EA, xBBFFFFB0);
	vor(x10, x4A01CC93, a2);
	vxor(x11, x10, x96B1B65A);
	vxor(out[c2], out[c2], x11);

	/* Output 4 -> out[c4] */
	vor(x5AFF5AFF, a5, x5A0F5A0F);
	vandn(x52B11215, x5AFF5AFF, x2D4E49EA);
	vand(x4201C010, x4A01CC93, x6391D07C);
	vxor(x10B0D205, x52B11215, x4201C010);
	vor(x30, x10B0D205, a2);
	vxor(x31, x30, x0AD99996);
	vxor(out[c4], out[c4], x31);
}
x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + vtype x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + vtype x0A0A0000, x0AD80096, x00999900, x0AD99996; + vtype x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + vtype x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + vtype x50064209, x55B138C9, x361685BF, x89014200, xDCB07AC9; + vtype x33555533, xDC54BDFF, xCC00A8CC, x10B0D205; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x55005500, a1, a5); + vxor(x5A0F5A0F, a4, x55005500); + vor(x3333FFFF, a3, a6); + vxor(x66666666, a1, a3); + vand(x22226666, x3333FFFF, x66666666); + vxor(x2D2D6969, a4, x22226666); + vandn(x25202160, x2D2D6969, x5A0F5A0F); + + vxor(x00FFFF00, a5, a6); + vxor(x33CCCC33, a3, x00FFFF00); + vandn(x4803120C, x5A0F5A0F, x33CCCC33); + vor(x2222FFFF, a6, x22226666); + vxor(x6A21EDF3, x4803120C, x2222FFFF); + vandn(x4A01CC93, x6A21EDF3, x25202160); + + vor(x5555FFFF, a1, a6); + vor(x7F75FFFF, x6A21EDF3, x5555FFFF); + vandn(x00D20096, a5, x2D2D6969); + vxor(x7FA7FF69, x7F75FFFF, x00D20096); + + vandn(x0A0A0000, a4, x5555FFFF); + vxor(x0AD80096, x00D20096, x0A0A0000); + vandn(x00999900, x00FFFF00, x66666666); + vor(x0AD99996, x0AD80096, x00999900); + + vandn(x22332233, a3, x55005500); + vxor(x257AA5F0, x5A0F5A0F, x7F75FFFF); + vandn(x054885C0, x257AA5F0, x22332233); + vnot(xFAB77A3F, x054885C0); + vand(x2221EDF3, x3333FFFF, x6A21EDF3); + vxor(xD89697CC, xFAB77A3F, x2221EDF3); + vandn(x20, x7FA7FF69, a2); + vxor(x21, x20, xD89697CC); + vxor(*out3, *out3, x21); + + vxor(x05B77AC0, x00FFFF00, x054885C0); + vor(x05F77AD6, x00D20096, x05B77AC0); + vxor(x36C48529, x3333FFFF, x05F77AD6); + vxor(x6391D07C, a1, x36C48529); + vxor(xBB0747B0, xD89697CC, x6391D07C); + vor(x00, x25202160, a2); + vxor(x01, x00, xBB0747B0); + vxor(*out1, *out1, x01); + + vandn(x50064209, x5A0F5A0F, x0AD99996); + vxor(x55B138C9, x05B77AC0, x50064209); + vxor(x361685BF, x00D20096, x36C48529); + vandn(x89014200, xBB0747B0, x361685BF); + vxor(xDCB07AC9, 
x55B138C9, x89014200); + vandn(x10, a2, x4A01CC93); + vxor(x11, x10, xDCB07AC9); + vxor(*out2, *out2, x11); + + vxor(x33555533, x33CCCC33, x00999900); + vxor(xDC54BDFF, x5555FFFF, x89014200); + vandn(xCC00A8CC, xDC54BDFF, x33555533); + vxor(x10B0D205, xDCB07AC9, xCC00A8CC); + vor(x30, x10B0D205, a2); + vxor(x31, x30, x0AD99996); + vxor(*out4, *out4, x31); +}*/ +//#endif + +//#if andn && triop && latency <= 4 +/* s2-016251, 44 gates, 14 regs, 13 andn, 1/9/22/61/108 stalls, 66 biop */ +/* +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x33CC33CC; + vtype x55550000, x00AA00FF, x33BB33FF; + vtype x33CC0000, x11441144, x11BB11BB, x003311BB; + vtype x00000F0F, x336600FF, x332200FF, x332200F0; + vtype x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + vtype x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + vtype x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + vtype x0A042084, x12752248, x1A7522CC, x00301083, x1A45324F; + vtype x0A451047, xBBDFDD7B, xB19ACD3C; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vandn(x55550000, a1, a6); + vandn(x00AA00FF, a5, x55550000); + vor(x33BB33FF, a2, x00AA00FF); + + vandn(x33CC0000, x33CC33CC, a6); + vand(x11441144, a1, x33CC33CC); + vxor(x11BB11BB, a5, x11441144); + vandn(x003311BB, x11BB11BB, x33CC0000); + + vand(x00000F0F, a3, a6); + vxor(x336600FF, x00AA00FF, x33CC0000); + vand(x332200FF, x33BB33FF, x336600FF); + vandn(x332200F0, x332200FF, x00000F0F); + + vand(x0302000F, a3, x332200FF); + vnot(xAAAAAAAA, a1); + vxor(xA9A8AAA5, x0302000F, xAAAAAAAA); + vxor(x33CCCC33, a6, x33CC33CC); + vandn(x33CCC030, x33CCCC33, x00000F0F); + vxor(x9A646A95, xA9A8AAA5, x33CCC030); + vandn(x10, a4, x332200F0); + vxor(x11, x10, x9A646A95); + vxor(*out2, *out2, x11); + + vandn(x00333303, a2, x33CCC030); + vxor(x118822B8, x11BB11BB, x00333303); + vandn(xA8208805, xA9A8AAA5, x118822B8); + 
/* s2-016276, 44 gates, 15 regs, 11 andn, 1/9/24/59/104 stalls, 67 biop */

/*
 * Gate-level expression of an S-box (presumably DES S-box 2, going by the
 * name and the file header).
 *
 * a1..a6: the six inputs
 * out:    private array that receives the four outputs
 * c1..c4: indices into out for outputs 1..4
 *
 * Each output is written as vxor(out[c], out[c], value), i.e. presumably
 * XORed into what out[c] already holds. vxor/vand/vor/vandn/vnot are
 * defined outside this chunk -- TODO confirm their operand order.
 *
 * NOTE(review): the temporaries look like they are named after the hex
 * truth-table value each gate produces; a4 only enters in the final gate of
 * each output.
 */
inline void
s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
	__private vtype * out,
	vtype c1, vtype c2 ,vtype c3 , vtype c4)
{
	vtype x33CC33CC;
	vtype x55550000, x00AA00FF, x33BB33FF;
	vtype x33CC0000, x11441144, x11BB11BB, x003311BB;
	vtype x00000F0F, x336600FF, x332200FF, x332200F0;
	vtype x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95;
	vtype x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39;
	vtype x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53;
	vtype xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F;
	vtype x0A451047, xBBDFDD7B, xB19ACD3C;
	vtype x00, x01, x10, x11, x20, x21, x30, x31;

	/* Shared intermediate terms */
	vxor(x33CC33CC, a2, a5);

	vandn(x55550000, a1, a6);
	vandn(x00AA00FF, a5, x55550000);
	vor(x33BB33FF, a2, x00AA00FF);

	vandn(x33CC0000, x33CC33CC, a6);
	vand(x11441144, a1, x33CC33CC);
	vxor(x11BB11BB, a5, x11441144);
	vandn(x003311BB, x11BB11BB, x33CC0000);

	vand(x00000F0F, a3, a6);
	vxor(x336600FF, x00AA00FF, x33CC0000);
	vand(x332200FF, x33BB33FF, x336600FF);
	vandn(x332200F0, x332200FF, x00000F0F);

	/* Output 2 -> out[c2] */
	vand(x0302000F, a3, x332200FF);
	vnot(xAAAAAAAA, a1);
	vxor(xA9A8AAA5, x0302000F, xAAAAAAAA);
	vxor(x33CCCC33, a6, x33CC33CC);
	vandn(x33CCC030, x33CCCC33, x00000F0F);
	vxor(x9A646A95, xA9A8AAA5, x33CCC030);
	vandn(x10, a4, x332200F0);
	vxor(x11, x10, x9A646A95);
	vxor(out[c2], out[c2], x11);

	/* Output 1 -> out[c1] */
	vandn(x00333303, a2, x33CCC030);
	vxor(x118822B8, x11BB11BB, x00333303);
	vandn(xA8208805, xA9A8AAA5, x118822B8);
	vxor(x3CC3C33C, a3, x33CCCC33);
	vxor(x94E34B39, xA8208805, x3CC3C33C);
	vandn(x00, x33BB33FF, a4);
	vxor(x01, x00, x94E34B39);
	vxor(out[c1], out[c1], x01);

	vxor(x0331330C, x0302000F, x00333303);
	vor(x3FF3F33C, x3CC3C33C, x0331330C);
	vxor(xA9DF596A, x33BB33FF, x9A646A95);
	vor(xA9DF5F6F, x00000F0F, xA9DF596A);
	vxor(x962CAC53, x3FF3F33C, xA9DF5F6F);

	/* Output 3 -> out[c3] */
	vxor(xA9466A6A, x332200FF, x9A646A95);
	vxor(x3DA52153, x94E34B39, xA9466A6A);
	vand(x29850143, xA9DF5F6F, x3DA52153);
	vand(x33C0330C, x33CC33CC, x3FF3F33C);
	vxor(x1A45324F, x29850143, x33C0330C);
	vor(x20, x1A45324F, a4);
	vxor(x21, x20, x962CAC53);
	vxor(out[c3], out[c3], x21);

	/* Output 4 -> out[c4] */
	vandn(x0A451047, x1A45324F, x118822B8);
	vor(xBBDFDD7B, x33CCCC33, xA9DF596A);
	vxor(xB19ACD3C, x0A451047, xBBDFDD7B);
	vor(x30, x003311BB, a4);
	vxor(x31, x30, xB19ACD3C);
	vxor(out[c4], out[c4], x31);
}
vtype x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + vtype x97D27835, x81D25825, x812D58DA, x802158DA, x1A45324F; + vtype x0A451047, xBBDFDD7B, xB19ACD3C; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vandn(x55550000, a1, a6); + vandn(x00AA00FF, a5, x55550000); + vor(x33BB33FF, a2, x00AA00FF); + + vandn(x33CC0000, x33CC33CC, a6); + vand(x11441144, a1, x33CC33CC); + vxor(x11BB11BB, a5, x11441144); + vandn(x003311BB, x11BB11BB, x33CC0000); + + vand(x00000F0F, a3, a6); + vxor(x336600FF, x00AA00FF, x33CC0000); + vand(x332200FF, x33BB33FF, x336600FF); + vandn(x332200F0, x332200FF, x00000F0F); + + vand(x0302000F, a3, x332200FF); + vnot(xAAAAAAAA, a1); + vxor(xA9A8AAA5, x0302000F, xAAAAAAAA); + vxor(x33CCCC33, a6, x33CC33CC); + vandn(x33CCC030, x33CCCC33, x00000F0F); + vxor(x9A646A95, xA9A8AAA5, x33CCC030); + vandn(x10, a4, x332200F0); + vxor(x11, x10, x9A646A95); + vxor(out[c2], out[c2], x11); + + vandn(x00333303, a2, x33CCC030); + vxor(x118822B8, x11BB11BB, x00333303); + vandn(xA8208805, xA9A8AAA5, x118822B8); + vxor(x3CC3C33C, a3, x33CCCC33); + vxor(x94E34B39, xA8208805, x3CC3C33C); + vandn(x00, x33BB33FF, a4); + vxor(x01, x00, x94E34B39); + vxor(out[c1], out[c1], x01); + + vxor(x0331330C, x0302000F, x00333303); + vor(x3FF3F33C, x3CC3C33C, x0331330C); + vxor(xA9DF596A, x33BB33FF, x9A646A95); + vor(xA9DF5F6F, x00000F0F, xA9DF596A); + vxor(x962CAC53, x3FF3F33C, xA9DF5F6F); + + vxor(x97D27835, x94E34B39, x0331330C); + vand(x81D25825, xA9DF5F6F, x97D27835); + vxor(x812D58DA, a5, x81D25825); + vandn(x802158DA, x812D58DA, x33CC0000); + vxor(x1A45324F, x9A646A95, x802158DA); + vor(x20, x1A45324F, a4); + vxor(x21, x20, x962CAC53); + vxor(out[c3], out[c3], x21); + + vandn(x0A451047, x1A45324F, x118822B8); + vor(xBBDFDD7B, x33CCCC33, xA9DF596A); + vxor(xB19ACD3C, x0A451047, xBBDFDD7B); + vor(x30, x003311BB, a4); + vxor(x31, x30, xB19ACD3C); + vxor(out[c4], out[c4], x31); +}*/ + +//#elif !andn || (triop && latency >= 5) +/* s2-016380, 
44 gates, 14/15 regs, 12 andn, 1/9/27/55/99 stalls, 68 biop */ +/* +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x33CC33CC; + vtype x55550000, x00AA00FF, x33BB33FF; + vtype x33CC0000, x11441144, x11BB11BB, x003311BB; + vtype x00000F0F, x336600FF, x332200FF, x332200F0; + vtype x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + vtype x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + vtype x33333030, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + vtype xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + vtype x0A451047, xBBDFDD7B, xB19ACD3C; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vandn(x55550000, a1, a6); + vandn(x00AA00FF, a5, x55550000); + vor(x33BB33FF, a2, x00AA00FF); + + vandn(x33CC0000, x33CC33CC, a6); + vand(x11441144, a1, x33CC33CC); + vxor(x11BB11BB, a5, x11441144); + vandn(x003311BB, x11BB11BB, x33CC0000); + + vand(x00000F0F, a3, a6); + vxor(x336600FF, x00AA00FF, x33CC0000); + vand(x332200FF, x33BB33FF, x336600FF); + vandn(x332200F0, x332200FF, x00000F0F); + + vand(x0302000F, a3, x332200FF); + vnot(xAAAAAAAA, a1); + vxor(xA9A8AAA5, x0302000F, xAAAAAAAA); + vxor(x33CCCC33, a6, x33CC33CC); + vandn(x33CCC030, x33CCCC33, x00000F0F); + vxor(x9A646A95, xA9A8AAA5, x33CCC030); + vandn(x10, a4, x332200F0); + vxor(x11, x10, x9A646A95); + vxor(out[c2], out[c2], x11); + + vandn(x00333303, a2, x33CCC030); + vxor(x118822B8, x11BB11BB, x00333303); + vandn(xA8208805, xA9A8AAA5, x118822B8); + vxor(x3CC3C33C, a3, x33CCCC33); + vxor(x94E34B39, xA8208805, x3CC3C33C); + vandn(x00, x33BB33FF, a4); + vxor(x01, x00, x94E34B39); + vxor(out[c1], out[c1], x01); + + vandn(x33333030, a2, x00000F0F); + vor(x3FF3F33C, x3CC3C33C, x33333030); + vxor(xA9DF596A, x33BB33FF, x9A646A95); + vor(xA9DF5F6F, x00000F0F, xA9DF596A); + vxor(x962CAC53, x3FF3F33C, xA9DF5F6F); + + vxor(xA9466A6A, x332200FF, x9A646A95); + 
vxor(x3DA52153, x94E34B39, xA9466A6A); + vand(x29850143, xA9DF5F6F, x3DA52153); + vand(x33C0330C, x33CC33CC, x3FF3F33C); + vxor(x1A45324F, x29850143, x33C0330C); + vor(x20, x1A45324F, a4); + vxor(x21, x20, x962CAC53); + vxor(out[c3], out[c3], x21); + + vandn(x0A451047, x1A45324F, x118822B8); + vor(xBBDFDD7B, x33CCCC33, xA9DF596A); + vxor(xB19ACD3C, x0A451047, xBBDFDD7B); + vor(x30, x003311BB, a4); + vxor(x31, x30, xB19ACD3C); + vxor(out[c4], out[c4], x31); +}*/ + +//#else +/* s2-016520, 44 gates, 15 regs, 13 andn, 5/17/41/78/125 stalls, 68 biop */ +/* Currently used for MMX/SSE2 */ +/* +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x33CC33CC; + vtype x55550000, x00AA00FF, x33BB33FF; + vtype x33CC0000, x11441144, x11BB11BB, x003311BB; + vtype x00000F0F, x336600FF, x332200FF, x332200F0; + vtype x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + vtype x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + vtype x03303003, xA9DF596A, xAAEF6969, xAAEF6F6F, x962CAC53; + vtype x0903030C, x093012B7, x19B832BF, x03FD00F0, x1A45324F; + vtype x0A451047, xBBDFDD7B, xB19ACD3C; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vandn(x55550000, a1, a6); + vandn(x00AA00FF, a5, x55550000); + vor(x33BB33FF, a2, x00AA00FF); + + vandn(x33CC0000, x33CC33CC, a6); + vand(x11441144, a1, x33CC33CC); + vxor(x11BB11BB, a5, x11441144); + vandn(x003311BB, x11BB11BB, x33CC0000); + + vand(x00000F0F, a3, a6); + vxor(x336600FF, x00AA00FF, x33CC0000); + vand(x332200FF, x33BB33FF, x336600FF); + vandn(x332200F0, x332200FF, x00000F0F); + + vand(x0302000F, a3, x332200FF); + vnot(xAAAAAAAA, a1); + vxor(xA9A8AAA5, x0302000F, xAAAAAAAA); + vxor(x33CCCC33, a6, x33CC33CC); + vandn(x33CCC030, x33CCCC33, x00000F0F); + vxor(x9A646A95, xA9A8AAA5, x33CCC030); + vandn(x10, a4, x332200F0); + vxor(x11, x10, x9A646A95); + vxor(out[c2], out[c2], x11); + + 
vandn(x00333303, a2, x33CCC030); + vxor(x118822B8, x11BB11BB, x00333303); + vandn(xA8208805, xA9A8AAA5, x118822B8); + vxor(x3CC3C33C, a3, x33CCCC33); + vxor(x94E34B39, xA8208805, x3CC3C33C); + vandn(x00, x33BB33FF, a4); + vxor(x01, x00, x94E34B39); + vxor(out[c1], out1[c1], x01); + + vandn(x03303003, a2, x3CC3C33C); + vxor(xA9DF596A, x33BB33FF, x9A646A95); + vxor(xAAEF6969, x03303003, xA9DF596A); + vor(xAAEF6F6F, x00000F0F, xAAEF6969); + vxor(x962CAC53, x3CC3C33C, xAAEF6F6F); + + vandn(x0903030C, a3, x962CAC53); + vxor(x093012B7, x003311BB, x0903030C); + vor(x19B832BF, x118822B8, x093012B7); + vxor(x03FD00F0, a5, x0302000F); + vxor(x1A45324F, x19B832BF, x03FD00F0); + vor(x20, x1A45324F, a4); + vxor(x21, x20, x962CAC53); + vxor(out[c3], out[c3], x21); + + vandn(x0A451047, x1A45324F, x118822B8); + vor(xBBDFDD7B, x33CCCC33, xA9DF596A); + vxor(xB19ACD3C, x0A451047, xBBDFDD7B); + vor(x30, x003311BB, a4); + vxor(x31, x30, xB19ACD3C); + vxor(out[c4], out[c4], x31); +}*/ +//#endif + +//#if andn && !triop && regs < 16 +/* s3-000406, 46 gates, 15 regs, 12 andn, 3/7/19/50/89 stalls, 70 biop */ +/* Currently used for MMX/SSE2 */ +/*inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + vtype x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + vtype x00005EF4, x00FF5EFF, x00555455, x3C699796; + vtype x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + vtype x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + vtype x204000D0, x3C3CC3FF, x1C3CC32F, x4969967A; + vtype x3F3F3F3F, x40C040C0, x69963C69, x9669C396, xD6A98356; + vtype x7A855A0A, xFEEDDB9E, xB108856A, x8D6112FC, xB25E2DC3; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x44444444, a1, a2); + vxor(x0F0FF0F0, a3, a6); + vor(x4F4FF4F4, x44444444, x0F0FF0F0); + vxor(x00FFFF00, a4, a6); + vandn(x00AAAA00, x00FFFF00, a1); + vxor(x4FE55EF4, x4F4FF4F4, 
x00AAAA00); + + vxor(x3C3CC3C3, a2, x0F0FF0F0); + vandn(x3C3C0000, x3C3CC3C3, a6); + vxor(x7373F4F4, x4F4FF4F4, x3C3C0000); + vandn(x0C840A00, x4FE55EF4, x7373F4F4); + + vand(x00005EF4, a6, x4FE55EF4); + vor(x00FF5EFF, a4, x00005EF4); + vand(x00555455, a1, x00FF5EFF); + vxor(x3C699796, x3C3CC3C3, x00555455); + vandn(x30, x4FE55EF4, a5); + vxor(x31, x30, x3C699796); + vxor(*out4, *out4, x31); + + vand(x000FF000, x0F0FF0F0, x00FFFF00); + vxor(x55AA55AA, a1, a4); + vxor(x26D9A15E, x7373F4F4, x55AA55AA); + vor(x2FDFAF5F, a3, x26D9A15E); + vandn(x2FD00F5F, x2FDFAF5F, x000FF000); + + vor(x55AAFFAA, x00AAAA00, x55AA55AA); + vandn(x28410014, x3C699796, x55AAFFAA); + vand(x000000FF, a4, a6); + vandn(x000000CC, x000000FF, a2); + vxor(x284100D8, x28410014, x000000CC); + + vandn(x204000D0, x284100D8, a3); + vor(x3C3CC3FF, x3C3CC3C3, x000000FF); + vandn(x1C3CC32F, x3C3CC3FF, x204000D0); + vxor(x4969967A, a1, x1C3CC32F); + vand(x10, x2FD00F5F, a5); + vxor(x11, x10, x4969967A); + vxor(*out2, *out2, x11); + + vor(x3F3F3F3F, a2, a3); + vandn(x40C040C0, x4FE55EF4, x3F3F3F3F); + vxor(x69963C69, x3C3CC3C3, x55AAFFAA); + vnot(x9669C396, x69963C69); + vxor(xD6A98356, x40C040C0, x9669C396); + vandn(x00, a5, x0C840A00); + vxor(x01, x00, xD6A98356); + vxor(*out1, *out1, x01); + + vxor(x7A855A0A, a1, x2FD00F5F); + vor(xFEEDDB9E, x9669C396, x7A855A0A); + vxor(xB108856A, x4FE55EF4, xFEEDDB9E); + vxor(x8D6112FC, x3C699796, xB108856A); + vxor(xB25E2DC3, x3F3F3F3F, x8D6112FC); + vor(x20, x284100D8, a5); + vxor(x21, x20, xB25E2DC3); + vxor(*out3, *out3, x21); +}*/ +//#elif andn && !triop && regs >= 16 +/* s3-000426, 46 gates, 16 regs, 14 andn, 2/5/12/35/75 stalls, 68 biop */ +/* Currently used for x86-64 SSE2 */ +/*inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + vtype x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + vtype x00005EF4, 
x00FF5EFF, x00555455, x3C699796; + vtype x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + vtype x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + vtype x204000D0, x3C3CC3FF, x1C3CC32F, x4969967A; + vtype x4CC44CC4, x40C040C0, x69963C69, x9669C396, xD6A98356; + vtype x000F00F0, xFEBDC3D7, xFEB0C307, x4CEEEEC4, xB25E2DC3; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x44444444, a1, a2); + vxor(x0F0FF0F0, a3, a6); + vor(x4F4FF4F4, x44444444, x0F0FF0F0); + vxor(x00FFFF00, a4, a6); + vandn(x00AAAA00, x00FFFF00, a1); + vxor(x4FE55EF4, x4F4FF4F4, x00AAAA00); + + vxor(x3C3CC3C3, a2, x0F0FF0F0); + vandn(x3C3C0000, x3C3CC3C3, a6); + vxor(x7373F4F4, x4F4FF4F4, x3C3C0000); + vandn(x0C840A00, x4FE55EF4, x7373F4F4); + + vand(x00005EF4, a6, x4FE55EF4); + vor(x00FF5EFF, a4, x00005EF4); + vand(x00555455, a1, x00FF5EFF); + vxor(x3C699796, x3C3CC3C3, x00555455); + vandn(x30, x4FE55EF4, a5); + vxor(x31, x30, x3C699796); + vxor(*out4, *out4, x31); + + vand(x000FF000, x0F0FF0F0, x00FFFF00); + vxor(x55AA55AA, a1, a4); + vxor(x26D9A15E, x7373F4F4, x55AA55AA); + vor(x2FDFAF5F, a3, x26D9A15E); + vandn(x2FD00F5F, x2FDFAF5F, x000FF000); + + vor(x55AAFFAA, x00AAAA00, x55AA55AA); + vandn(x28410014, x3C699796, x55AAFFAA); + vand(x000000FF, a4, a6); + vandn(x000000CC, x000000FF, a2); + vxor(x284100D8, x28410014, x000000CC); + + vandn(x204000D0, x284100D8, a3); + vor(x3C3CC3FF, x3C3CC3C3, x000000FF); + vandn(x1C3CC32F, x3C3CC3FF, x204000D0); + vxor(x4969967A, a1, x1C3CC32F); + vand(x10, x2FD00F5F, a5); + vxor(x11, x10, x4969967A); + vxor(*out2, *out2, x11); + + vandn(x4CC44CC4, x4FE55EF4, a2); + vandn(x40C040C0, x4CC44CC4, a3); + vxor(x69963C69, x3C3CC3C3, x55AAFFAA); + vnot(x9669C396, x69963C69); + vxor(xD6A98356, x40C040C0, x9669C396); + vandn(x00, a5, x0C840A00); + vxor(x01, x00, xD6A98356); + vxor(*out1, *out1, x01); + + vand(x000F00F0, a4, x0F0FF0F0); + vor(xFEBDC3D7, x3C3CC3C3, xD6A98356); + vandn(xFEB0C307, xFEBDC3D7, x000F00F0); + vor(x4CEEEEC4, x00AAAA00, 
x4CC44CC4); + vxor(xB25E2DC3, xFEB0C307, x4CEEEEC4); + vor(x20, x284100D8, a5); + vxor(x21, x20, xB25E2DC3); + vxor(*out3, *out3, x21); +}*/ +//#elif andn && triop && !(regs >= 17 && latency == 3) +/* s3-000470, 46 gates, 15 regs, 15 andn, 2/5/10/30/69 stalls, 69 biop */ +/*inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + vtype x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + vtype x00005EF4, x00FF5EFF, x00555455, x3C699796; + vtype x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + vtype x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + vtype x204000D0, x3C3CC3FF, x1C3CC32F, x4969967A; + vtype x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + vtype xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x44444444, a1, a2); + vxor(x0F0FF0F0, a3, a6); + vor(x4F4FF4F4, x44444444, x0F0FF0F0); + vxor(x00FFFF00, a4, a6); + vandn(x00AAAA00, x00FFFF00, a1); + vxor(x4FE55EF4, x4F4FF4F4, x00AAAA00); + + vxor(x3C3CC3C3, a2, x0F0FF0F0); + vandn(x3C3C0000, x3C3CC3C3, a6); + vxor(x7373F4F4, x4F4FF4F4, x3C3C0000); + vandn(x0C840A00, x4FE55EF4, x7373F4F4); + + vand(x00005EF4, a6, x4FE55EF4); + vor(x00FF5EFF, a4, x00005EF4); + vand(x00555455, a1, x00FF5EFF); + vxor(x3C699796, x3C3CC3C3, x00555455); + vandn(x30, x4FE55EF4, a5); + vxor(x31, x30, x3C699796); + vxor(*out4, *out4, x31); + + vand(x000FF000, x0F0FF0F0, x00FFFF00); + vxor(x55AA55AA, a1, a4); + vxor(x26D9A15E, x7373F4F4, x55AA55AA); + vor(x2FDFAF5F, a3, x26D9A15E); + vandn(x2FD00F5F, x2FDFAF5F, x000FF000); + + vor(x55AAFFAA, x00AAAA00, x55AA55AA); + vandn(x28410014, x3C699796, x55AAFFAA); + vand(x000000FF, a4, a6); + vandn(x000000CC, x000000FF, a2); + vxor(x284100D8, x28410014, x000000CC); + + vandn(x204000D0, x284100D8, a3); + vor(x3C3CC3FF, x3C3CC3C3, x000000FF); + vandn(x1C3CC32F, 
x3C3CC3FF, x204000D0); + vxor(x4969967A, a1, x1C3CC32F); + vand(x10, x2FD00F5F, a5); + vxor(x11, x10, x4969967A); + vxor(*out2, *out2, x11); + + vandn(x4CC44CC4, x4FE55EF4, a2); + vandn(x40C040C0, x4CC44CC4, a3); + vnot(xC3C33C3C, x3C3CC3C3); + vxor(x9669C396, x55AAFFAA, xC3C33C3C); + vxor(xD6A98356, x40C040C0, x9669C396); + vandn(x00, a5, x0C840A00); + vxor(x01, x00, xD6A98356); + vxor(*out1, *out1, x01); + + vor(xD6E9C3D6, x40C040C0, x9669C396); + vor(x4CEEEEC4, x00AAAA00, x4CC44CC4); + vxor(x9A072D12, xD6E9C3D6, x4CEEEEC4); + vandn(x001A000B, a4, x4FE55EF4); + vor(x9A1F2D1B, x9A072D12, x001A000B); + vandn(x20, a5, x284100D8); + vxor(x21, x20, x9A1F2D1B); + vxor(*out3, *out3, x21); +}*/ +//#elif !andn && triop && regs >= 17 && latency >= 4 +/* s3-001117, 46 gates, 17 regs, 10 andn, 2/4/19/47/92 stalls, 69 biop */ +/*inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + vtype x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + vtype x00005EF4, x00FF5EFF, x00555455, x3C699796; + vtype x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + vtype x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + vtype x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + vtype x3F3F3F3F, xB01AA10B, xBF3FBF3F, x83037CFC, xD6A98356; + vtype x001A000B, x3C73979D, xBF73FFFD, x0D2DD23E, xB25E2DC3; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x44444444, a1, a2); + vxor(x0F0FF0F0, a3, a6); + vor(x4F4FF4F4, x44444444, x0F0FF0F0); + vxor(x00FFFF00, a4, a6); + vandn(x00AAAA00, x00FFFF00, a1); + vxor(x4FE55EF4, x4F4FF4F4, x00AAAA00); + + vxor(x3C3CC3C3, a2, x0F0FF0F0); + vandn(x3C3C0000, x3C3CC3C3, a6); + vxor(x7373F4F4, x4F4FF4F4, x3C3C0000); + vandn(x0C840A00, x4FE55EF4, x7373F4F4); + + vand(x00005EF4, a6, x4FE55EF4); + vor(x00FF5EFF, a4, x00005EF4); + vand(x00555455, a1, x00FF5EFF); + vxor(x3C699796, x3C3CC3C3, x00555455); + 
vandn(x30, x4FE55EF4, a5); + vxor(x31, x30, x3C699796); + vxor(*out4, *out4, x31); + + vand(x000FF000, x0F0FF0F0, x00FFFF00); + vxor(x55AA55AA, a1, a4); + vxor(x26D9A15E, x7373F4F4, x55AA55AA); + vor(x2FDFAF5F, a3, x26D9A15E); + vandn(x2FD00F5F, x2FDFAF5F, x000FF000); + + vor(x55AAFFAA, x00AAAA00, x55AA55AA); + vandn(x28410014, x3C699796, x55AAFFAA); + vand(x000000FF, a4, a6); + vandn(x000000CC, x000000FF, a2); + vxor(x284100D8, x28410014, x000000CC); + + vand(x204100D0, x7373F4F4, x284100D8); + vor(x3C3CC3FF, x3C3CC3C3, x000000FF); + vandn(x1C3CC32F, x3C3CC3FF, x204100D0); + vxor(x4969967A, a1, x1C3CC32F); + vand(x10, x2FD00F5F, a5); + vxor(x11, x10, x4969967A); + vxor(*out2, *out2, x11); + + vor(x3F3F3F3F, a2, a3); + vnot(xB01AA10B, x4FE55EF4); + vor(xBF3FBF3F, x3F3F3F3F, xB01AA10B); + vxor(x83037CFC, x3C3CC3C3, xBF3FBF3F); + vxor(xD6A98356, x55AAFFAA, x83037CFC); + vandn(x00, a5, x0C840A00); + vxor(x01, x00, xD6A98356); + vxor(*out1, *out1, x01); + + vand(x001A000B, a4, xB01AA10B); + vxor(x3C73979D, x3C699796, x001A000B); + vor(xBF73FFFD, x83037CFC, x3C73979D); + vxor(x0D2DD23E, x44444444, x4969967A); + vxor(xB25E2DC3, xBF73FFFD, x0D2DD23E); + vor(x20, x284100D8, a5); + vxor(x21, x20, xB25E2DC3); + vxor(*out3, *out3, x21); +}*/ +//#elif triop && regs >= 17 && latency <= 3 +/* s3-001172, 46 gates, 17 regs, 10 andn, 2/3/19/55/98 stalls, 69 biop */ +/* +inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + vtype x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + vtype x00005EF4, x00FF5EFF, x00555455, x3C699796; + vtype x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + vtype x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + vtype x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + vtype xB01AA10B, xB33BB33B, xBF3FBF3F, x83037CFC, xD6A98356; + vtype x001A000B, x3C73979D, xBF73FFFD, x0D2DD23E, 
xB25E2DC3; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x44444444, a1, a2); + vxor(x0F0FF0F0, a3, a6); + vor(x4F4FF4F4, x44444444, x0F0FF0F0); + vxor(x00FFFF00, a4, a6); + vandn(x00AAAA00, x00FFFF00, a1); + vxor(x4FE55EF4, x4F4FF4F4, x00AAAA00); + + vxor(x3C3CC3C3, a2, x0F0FF0F0); + vandn(x3C3C0000, x3C3CC3C3, a6); + vxor(x7373F4F4, x4F4FF4F4, x3C3C0000); + vandn(x0C840A00, x4FE55EF4, x7373F4F4); + + vand(x00005EF4, a6, x4FE55EF4); + vor(x00FF5EFF, a4, x00005EF4); + vand(x00555455, a1, x00FF5EFF); + vxor(x3C699796, x3C3CC3C3, x00555455); + vandn(x30, x4FE55EF4, a5); + vxor(x31, x30, x3C699796); + vxor(out[c4], out[c4], x31); + + vand(x000FF000, x0F0FF0F0, x00FFFF00); + vxor(x55AA55AA, a1, a4); + vxor(x26D9A15E, x7373F4F4, x55AA55AA); + vor(x2FDFAF5F, a3, x26D9A15E); + vandn(x2FD00F5F, x2FDFAF5F, x000FF000); + + vor(x55AAFFAA, x00AAAA00, x55AA55AA); + vandn(x28410014, x3C699796, x55AAFFAA); + vand(x000000FF, a4, a6); + vandn(x000000CC, x000000FF, a2); + vxor(x284100D8, x28410014, x000000CC); + + vand(x204100D0, x7373F4F4, x284100D8); + vor(x3C3CC3FF, x3C3CC3C3, x000000FF); + vandn(x1C3CC32F, x3C3CC3FF, x204100D0); + vxor(x4969967A, a1, x1C3CC32F); + vand(x10, x2FD00F5F, a5); + vxor(x11, x10, x4969967A); + vxor(out[c2], out[c2], x11); + + vnot(xB01AA10B, x4FE55EF4); + vor(xB33BB33B, a2, xB01AA10B); + vor(xBF3FBF3F, a3, xB33BB33B); + vxor(x83037CFC, x3C3CC3C3, xBF3FBF3F); + vxor(xD6A98356, x55AAFFAA, x83037CFC); + vandn(x00, a5, x0C840A00); + vxor(x01, x00, xD6A98356); + vxor(out[c1], out[c1], x01); + + vand(x001A000B, a4, xB01AA10B); + vxor(x3C73979D, x3C699796, x001A000B); + vor(xBF73FFFD, x83037CFC, x3C73979D); + vxor(x0D2DD23E, x44444444, x4969967A); + vxor(xB25E2DC3, xBF73FFFD, x0D2DD23E); + vor(x20, x284100D8, a5); + vxor(x21, x20, xB25E2DC3); + vxor(out[c3], out[c3], x21); +}*/ +//#else +/* s3-001283, 46 gates, 16 regs, 14 andn, 2/5/10/30/69 stalls, 69 biop */ /* s3 (active variant; the alternatives around it are commented out): feeds inputs a1..a6 through the vand, vor, vxor, vandn and vnot gate macros (defined outside this section) and XORs x31, x11, x01, x21 into out[c4], out[c2], out[c1], out[c3], in that order. The out[] slots are read-modify-written and nothing is returned; c1..c4 are used only as indices into out. a5 is used only when forming x00, x10, x20, x30. Presumably DES S-box 3 in bitslice form - confirm against the vtype and macro definitions. */ + +inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, +
__private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + vtype x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + vtype x00005EF4, x00FF5EFF, x00555455, x3C699796; + vtype x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + vtype x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + vtype x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + vtype x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + vtype xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x44444444, a1, a2); + vxor(x0F0FF0F0, a3, a6); + vor(x4F4FF4F4, x44444444, x0F0FF0F0); + vxor(x00FFFF00, a4, a6); + vandn(x00AAAA00, x00FFFF00, a1); + vxor(x4FE55EF4, x4F4FF4F4, x00AAAA00); + + vxor(x3C3CC3C3, a2, x0F0FF0F0); + vandn(x3C3C0000, x3C3CC3C3, a6); + vxor(x7373F4F4, x4F4FF4F4, x3C3C0000); + vandn(x0C840A00, x4FE55EF4, x7373F4F4); + + vand(x00005EF4, a6, x4FE55EF4); + vor(x00FF5EFF, a4, x00005EF4); + vand(x00555455, a1, x00FF5EFF); + vxor(x3C699796, x3C3CC3C3, x00555455); + vandn(x30, x4FE55EF4, a5); + vxor(x31, x30, x3C699796); + vxor(out[c4], out[c4], x31); + + vand(x000FF000, x0F0FF0F0, x00FFFF00); + vxor(x55AA55AA, a1, a4); + vxor(x26D9A15E, x7373F4F4, x55AA55AA); + vor(x2FDFAF5F, a3, x26D9A15E); + vandn(x2FD00F5F, x2FDFAF5F, x000FF000); + + vor(x55AAFFAA, x00AAAA00, x55AA55AA); + vandn(x28410014, x3C699796, x55AAFFAA); + vand(x000000FF, a4, a6); + vandn(x000000CC, x000000FF, a2); + vxor(x284100D8, x28410014, x000000CC); + + vand(x204100D0, x7373F4F4, x284100D8); + vor(x3C3CC3FF, x3C3CC3C3, x000000FF); + vandn(x1C3CC32F, x3C3CC3FF, x204100D0); + vxor(x4969967A, a1, x1C3CC32F); + vand(x10, x2FD00F5F, a5); + vxor(x11, x10, x4969967A); + vxor(out[c2], out[c2], x11); + + vandn(x4CC44CC4, x4FE55EF4, a2); + vandn(x40C040C0, x4CC44CC4, a3); + vnot(xC3C33C3C, x3C3CC3C3); + vxor(x9669C396, x55AAFFAA, xC3C33C3C); + vxor(xD6A98356, x40C040C0, x9669C396); +
vandn(x00, a5, x0C840A00); + vxor(x01, x00, xD6A98356); + vxor(out[c1], out[c1], x01); + + vor(xD6E9C3D6, x40C040C0, x9669C396); + vor(x4CEEEEC4, x00AAAA00, x4CC44CC4); + vxor(x9A072D12, xD6E9C3D6, x4CEEEEC4); + vandn(x001A000B, a4, x4FE55EF4); + vor(x9A1F2D1B, x9A072D12, x001A000B); + vandn(x20, a5, x284100D8); + vxor(x21, x20, x9A1F2D1B); + vxor(out[c3], out[c3], x21); +} +//#endif + +/* s4, 33 gates, 11/12 regs, 9 andn, 2/21/53/86/119 stalls, 52 biop */ /* s4: feeds inputs a1..a6 through the gate macros (defined outside this section) and XORs x01, x11, x21, x31 into out[c1], out[c2], out[c3], out[c4], in that order. The out[] slots are read-modify-written and nothing is returned; c1..c4 are used only as indices into out. a6 is used only when forming x00, x10, x20, x30. x21 and x31 share the intermediate xB35C94A6 and differ only in how a6 is combined with x61C8F93C (vor for x20, vand for x30). Presumably DES S-box 4 in bitslice form - confirm against the vtype and macro definitions. */ +inline void +s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x5A5A5A5A, x0F0FF0F0; + vtype x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + vtype x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + vtype x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + vtype x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x5A5A5A5A, a1, a3); + vxor(x0F0FF0F0, a3, a5); + vor(x33FF33FF, a2, a4); + vxor(x33FFCC00, a5, x33FF33FF); + vandn(x0C0030F0, x0F0FF0F0, x33FFCC00); + vandn(x0C0CC0C0, x0F0FF0F0, a2); + vxor(x0CF3C03F, a4, x0C0CC0C0); + vor(x5EFBDA7F, x5A5A5A5A, x0CF3C03F); + vandn(x52FBCA0F, x5EFBDA7F, x0C0030F0); + vxor(x61C8F93C, a2, x52FBCA0F); + + vand(x00C0C03C, x0CF3C03F, x61C8F93C); + vandn(x0F0F30C0, x0F0FF0F0, x00C0C03C); + vxor(x3B92A366, x5A5A5A5A, x61C8F93C); + vandn(x30908326, x3B92A366, x0F0F30C0); + vxor(x3C90B3D6, x0C0030F0, x30908326); + + vxor(x33CC33CC, a2, a4); + vor(x0C0CFFFF, a5, x0C0CC0C0); + vxor(x379E5C99, x3B92A366, x0C0CFFFF); + vandn(x04124C11, x379E5C99, x33CC33CC); + vxor(x56E9861E, x52FBCA0F, x04124C11); + vandn(x00, a6, x3C90B3D6); + vxor(x01, x00, x56E9861E); + vxor(out[c1], out[c1], x01); + + vnot(xA91679E1, x56E9861E); + vandn(x10, x3C90B3D6, a6); + vxor(x11, x10, xA91679E1); + vxor(out[c2], out[c2], x11); + + vxor(x9586CA37, x3C90B3D6, xA91679E1); + vandn(x8402C833, x9586CA37,
x33CC33CC); + vor(x84C2C83F, x00C0C03C, x8402C833); + vxor(xB35C94A6, x379E5C99, x84C2C83F); + vor(x20, x61C8F93C, a6); + vxor(x21, x20, xB35C94A6); + vxor(out[c3], out[c3], x21); + + vand(x30, a6, x61C8F93C); + vxor(x31, x30, xB35C94A6); + vxor(out[c4], out[c4], x31); +} + +//#if triop && latency >= 3 && latency <= 5 +/* s5-02432, 48 gates, 15/16 regs, 9 andn, 6/22/61/109/160 stalls, 72 biop */ +/*inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + vtype x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + vtype x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + vtype x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + vtype x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + vtype x101884B4, x0FF8EB24, x41413113, x4FF9FB37, x4FC2FBC2; + vtype x43E9BBC2, x16BCEE97, x0F080B04, x19B4E593; + vtype x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vor(x77777777, a1, a3); + vandn(x77770000, x77777777, a6); + vxor(x22225555, a1, x77770000); + vxor(x11116666, a3, x22225555); + vor(x1F1F6F6F, a4, x11116666); + + vandn(x70700000, x77770000, a4); + vxor(x43433333, a3, x70700000); + vand(x00430033, a5, x43433333); + vor(x55557777, a1, x11116666); + vxor(x55167744, x00430033, x55557777); + vxor(x5A19784B, a4, x55167744); + + vxor(x5A1987B4, a6, x5A19784B); + vor(x7A3BD7F5, x22225555, x5A1987B4); + vand(x003B00F5, a5, x7A3BD7F5); + vxor(x221955A0, x22225555, x003B00F5); + vand(x05050707, a4, x55557777); + vxor(x271C52A7, x221955A0, x05050707); + + vandn(x2A2A82A0, x7A3BD7F5, a1); + vxor(x6969B193, x43433333, x2A2A82A0); + vxor(x1FE06F90, a5, x1F1F6F6F); + vandn(x16804E00, x1FE06F90, x6969B193); + vnot(xE97FB1FF, x16804E00); + vandn(x20, xE97FB1FF, a2); + vxor(x21, x20, x5A19784B); + vxor(*out3, *out3, x21); + + vandn(x43403302, x43433333,
x003B00F5); + vxor(x35CAED30, x2A2A82A0, x1FE06F90); + vor(x37DEFFB7, x271C52A7, x35CAED30); + vandn(x349ECCB5, x37DEFFB7, x43403302); + vandn(x0B01234A, x1F1F6F6F, x349ECCB5); + + vand(x101884B4, x5A1987B4, x349ECCB5); + vxor(x0FF8EB24, x1FE06F90, x101884B4); + vand(x41413113, x43433333, x6969B193); + vor(x4FF9FB37, x0FF8EB24, x41413113); + vxor(x4FC2FBC2, x003B00F5, x4FF9FB37); + vand(x30, x4FC2FBC2, a2); + vxor(x31, x30, x271C52A7); + vxor(*out4, *out4, x31); + + vxor(x43E9BBC2, x77777777, x349ECCB5); + vxor(x16BCEE97, a1, x43E9BBC2); + vand(x0F080B04, a4, x0FF8EB24); + vxor(x19B4E593, x16BCEE97, x0F080B04); + vor(x00, x0B01234A, a2); + vxor(x01, x00, x19B4E593); + vxor(*out1, *out1, x01); + + vxor(x5C5C5C5C, x1F1F6F6F, x43433333); + vandn(x4448184C, x5C5C5C5C, x19B4E593); + vxor(x2DDABE71, x22225555, x0FF8EB24); + vxor(x6992A63D, x4448184C, x2DDABE71); + vand(x10, x1F1F6F6F, a2); + vxor(x11, x10, x6992A63D); + vxor(*out2, *out2, x11); +}*/ +//#elif (!triop && regs >= 16) || (triop && latency <= 2) +/* s5-04829, 48 gates, 15/16 regs, 9 andn, 4/24/65/113/163 stalls, 72 biop */ +/* Currently used for x86-64 SSE2 */ +/* +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + vtype x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + vtype x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + vtype x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + vtype x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + vtype x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + vtype x22222222, x16BCEE97, x0F080B04, x19B4E593; + vtype x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vor(x77777777, a1, a3); + vandn(x77770000, x77777777, a6); + vxor(x22225555, a1, x77770000); + vxor(x11116666, a3, x22225555); + vor(x1F1F6F6F, a4, 
x11116666); + + vandn(x70700000, x77770000, a4); + vxor(x43433333, a3, x70700000); + vand(x00430033, a5, x43433333); + vor(x55557777, a1, x11116666); + vxor(x55167744, x00430033, x55557777); + vxor(x5A19784B, a4, x55167744); + + vxor(x5A1987B4, a6, x5A19784B); + vor(x7A3BD7F5, x22225555, x5A1987B4); + vand(x003B00F5, a5, x7A3BD7F5); + vxor(x221955A0, x22225555, x003B00F5); + vand(x05050707, a4, x55557777); + vxor(x271C52A7, x221955A0, x05050707); + + vandn(x2A2A82A0, x7A3BD7F5, a1); + vxor(x6969B193, x43433333, x2A2A82A0); + vxor(x1FE06F90, a5, x1F1F6F6F); + vandn(x16804E00, x1FE06F90, x6969B193); + vnot(xE97FB1FF, x16804E00); + vandn(x20, xE97FB1FF, a2); + vxor(x21, x20, x5A19784B); + vxor(out[c3], out[c3], x21); + + vandn(x43403302, x43433333, x003B00F5); + vxor(x35CAED30, x2A2A82A0, x1FE06F90); + vor(x37DEFFB7, x271C52A7, x35CAED30); + vandn(x349ECCB5, x37DEFFB7, x43403302); + vandn(x0B01234A, x1F1F6F6F, x349ECCB5); + + vand(x101884B4, x5A1987B4, x349ECCB5); + vxor(x0FF8EB24, x1FE06F90, x101884B4); + vand(x41413333, x43433333, x55557777); + vor(x4FF9FB37, x0FF8EB24, x41413333); + vxor(x4FC2FBC2, x003B00F5, x4FF9FB37); + vand(x30, x4FC2FBC2, a2); + vxor(x31, x30, x271C52A7); + vxor(out[c4], out[c4], x31); + + vxor(x22222222, a1, x77777777); + vxor(x16BCEE97, x349ECCB5, x22222222); + vand(x0F080B04, a4, x0FF8EB24); + vxor(x19B4E593, x16BCEE97, x0F080B04); + vor(x00, x0B01234A, a2); + vxor(x01, x00, x19B4E593); + vxor(out[c1], out[c1], x01); + + vxor(x5C5C5C5C, x1F1F6F6F, x43433333); + vandn(x4448184C, x5C5C5C5C, x19B4E593); + vxor(x2DDABE71, x22225555, x0FF8EB24); + vxor(x6992A63D, x4448184C, x2DDABE71); + vand(x10, x1F1F6F6F, a2); + vxor(x11, x10, x6992A63D); + vxor(out[c2], out[c2], x11); +}*/ +//#else +/* s5-04832, 48 gates, 15/16 regs, 9 andn, 5/23/62/109/159 stalls, 72 biop */ +/* Currently used for MMX/SSE2 */ /* s5 (active variant; the alternatives above it are commented out): feeds inputs a1..a6 through the vand, vor, vxor, vandn and vnot gate macros (defined outside this section) and XORs x21, x31, x01, x11 into out[c3], out[c4], out[c1], out[c2], in that order. The out[] slots are read-modify-written and nothing is returned; c1..c4 are used only as indices into out. a2 is used only when forming x00, x10, x20, x30. Presumably DES S-box 5 in bitslice form - confirm against the vtype and macro definitions. NOTE(review): the MMX/SSE2 remark looks inherited from a CPU build, while this signature uses the OpenCL __private qualifier - confirm it still applies. */ + +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{
+ vtype x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + vtype x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + vtype x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + vtype x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + vtype x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + vtype x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + vtype x43E9BBC2, x16BCEE97, x0F080B04, x19B4E593; + vtype x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vor(x77777777, a1, a3); + vandn(x77770000, x77777777, a6); + vxor(x22225555, a1, x77770000); + vxor(x11116666, a3, x22225555); + vor(x1F1F6F6F, a4, x11116666); + + vandn(x70700000, x77770000, a4); + vxor(x43433333, a3, x70700000); + vand(x00430033, a5, x43433333); + vor(x55557777, a1, x11116666); + vxor(x55167744, x00430033, x55557777); + vxor(x5A19784B, a4, x55167744); + + vxor(x5A1987B4, a6, x5A19784B); + vor(x7A3BD7F5, x22225555, x5A1987B4); + vand(x003B00F5, a5, x7A3BD7F5); + vxor(x221955A0, x22225555, x003B00F5); + vand(x05050707, a4, x55557777); + vxor(x271C52A7, x221955A0, x05050707); + + vandn(x2A2A82A0, x7A3BD7F5, a1); + vxor(x6969B193, x43433333, x2A2A82A0); + vxor(x1FE06F90, a5, x1F1F6F6F); + vandn(x16804E00, x1FE06F90, x6969B193); + vnot(xE97FB1FF, x16804E00); + vandn(x20, xE97FB1FF, a2); + vxor(x21, x20, x5A19784B); + vxor(out[c3], out[c3], x21); + + vandn(x43403302, x43433333, x003B00F5); + vxor(x35CAED30, x2A2A82A0, x1FE06F90); + vor(x37DEFFB7, x271C52A7, x35CAED30); + vandn(x349ECCB5, x37DEFFB7, x43403302); + vandn(x0B01234A, x1F1F6F6F, x349ECCB5); + + vand(x101884B4, x5A1987B4, x349ECCB5); + vxor(x0FF8EB24, x1FE06F90, x101884B4); + vand(x41413333, x43433333, x55557777); + vor(x4FF9FB37, x0FF8EB24, x41413333); + vxor(x4FC2FBC2, x003B00F5, x4FF9FB37); + vand(x30, x4FC2FBC2, a2); + vxor(x31, x30, x271C52A7); + vxor(out[c4], out[c4], x31); + + vxor(x43E9BBC2, x77777777, x349ECCB5); + vxor(x16BCEE97, a1,
x43E9BBC2); + vand(x0F080B04, a4, x0FF8EB24); + vxor(x19B4E593, x16BCEE97, x0F080B04); + vor(x00, x0B01234A, a2); + vxor(x01, x00, x19B4E593); + vxor(out[c1], out[c1], x01); + + vxor(x5C5C5C5C, x1F1F6F6F, x43433333); + vandn(x4448184C, x5C5C5C5C, x19B4E593); + vxor(x2DDABE71, x22225555, x0FF8EB24); + vxor(x6992A63D, x4448184C, x2DDABE71); + vand(x10, x1F1F6F6F, a2); + vxor(x11, x10, x6992A63D); + vxor(out[c2], out[c2], x11); +} + +//#endif + +//#if !triop && regs >= 16 +/* s6-000007, 46 gates, 19 regs, 8 andn, 3/19/39/66/101 stalls, 69 biop */ +/* Currently used for x86-64 SSE2 */ +/* +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x33CC33CC; + vtype x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + vtype x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + vtype x09030C06, x09030000, x336622FF, x3A6522FF; + vtype x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + vtype x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + vtype x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + vtype x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + vtype x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vor(x3333FFFF, a2, a6); + vand(x11115555, a1, x3333FFFF); + vxor(x22DD6699, x33CC33CC, x11115555); + vxor(x22DD9966, a6, x22DD6699); + vandn(x00220099, a5, x22DD9966); + + vand(x00551144, a1, x22DD9966); + vxor(x33662277, a2, x00551144); + vxor(x5A5A5A5A, a1, a3); + vor(x7B7E7A7F, x33662277, x5A5A5A5A); + vxor(x59A31CE6, x22DD6699, x7B7E7A7F); + + vand(x09030C06, a3, x59A31CE6); + vandn(x09030000, x09030C06, a6); + vor(x336622FF, x00220099, x33662277); + vxor(x3A6522FF, x09030000, x336622FF); + vand(x30, x3A6522FF, a4); + vxor(x31, x30, x59A31CE6); + vxor(out[c4], out[c4], x31); + + vxor(x484D494C, a2, x7B7E7A7F); + vandn(x0000B6B3, a6, x484D494C); + vxor(x0F0FB9BC, a3,
x0000B6B3); + vandn(x00FC00F9, a5, x09030C06); + vor(x0FFFB9FD, x0F0FB9BC, x00FC00F9); + + vor(x5DF75DF7, a1, x59A31CE6); + vand(x116600F7, x336622FF, x5DF75DF7); + vxor(x1E69B94B, x0F0FB9BC, x116600F7); + vandn(x1668B94B, x1E69B94B, x09030000); + vor(x20, x00220099, a4); + vxor(x21, x20, x1668B94B); + vxor(out[c3], out[c3], x21); + + vor(x7B7B7B7B, a2, x5A5A5A5A); + vxor(x411E5984, x3A6522FF, x7B7B7B7B); + vor(x1FFFFDFD, x11115555, x0FFFB9FD); + vxor(x5EE1A479, x411E5984, x1FFFFDFD); + + vxor(x3CB4DFD2, x22DD6699, x1E69B94B); + vandn(x004B002D, a5, x3CB4DFD2); + vnot(xB7B2B6B3, x484D494C); + vxor(xCCC9CDC8, x7B7B7B7B, xB7B2B6B3); + vxor(xCC82CDE5, x004B002D, xCCC9CDC8); + vandn(x10, xCC82CDE5, a4); + vxor(x11, x10, x5EE1A479); + vxor(out[c2], out[c2], x11); + + vxor(x0055EEBB, a6, x00551144); + vxor(x5A5AECE9, a1, x0F0FB9BC); + vand(x0050ECA9, x0055EEBB, x5A5AECE9); + vxor(xC5CAC1CE, x09030C06, xCCC9CDC8); + vxor(xC59A2D67, x0050ECA9, xC5CAC1CE); + vandn(x00, x0FFFB9FD, a4); + vxor(x01, x00, xC59A2D67); + vxor(out[c1], out[c1], x01); +}*/ +//#elif !triop && regs < 16 +/* s6-000009, 46 gates, 19 regs, 8 andn, 3/20/41/69/110 stalls, 69 biop */ +/* Currently used for MMX/SSE2 */ /* s6 (active variant; the alternatives around it are commented out): feeds inputs a1..a6 through the vand, vor, vxor, vandn and vnot gate macros (defined outside this section) and XORs x31, x21, x11, x01 into out[c4], out[c3], out[c2], out[c1], in that order. The out[] slots are read-modify-written and nothing is returned; c1..c4 are used only as indices into out. a4 is used only when forming x00, x10, x20, x30. Presumably DES S-box 6 in bitslice form - confirm against the vtype and macro definitions. NOTE(review): the MMX/SSE2 remark looks inherited from a CPU build, while this signature uses the OpenCL __private qualifier - confirm it still applies. */ + +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x33CC33CC; + vtype x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + vtype x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + vtype x09030C06, x09030000, x336622FF, x3A6522FF; + vtype x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + vtype x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + vtype x1FFFFDFD, x7B7B7B7B, x64848686, x5EE1A479; + vtype x3CB4DFD2, x004B002D, x33363237, xCCC9CDC8, xCC82CDE5; + vtype x0055EEBB, x5A5AECE9, x0050ECA9, x0953E0AF, xC59A2D67; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vor(x3333FFFF, a2, a6); + vand(x11115555, a1, x3333FFFF); + vxor(x22DD6699,
x33CC33CC, x11115555); + vxor(x22DD9966, a6, x22DD6699); + vandn(x00220099, a5, x22DD9966); + + vand(x00551144, a1, x22DD9966); + vxor(x33662277, a2, x00551144); + vxor(x5A5A5A5A, a1, a3); + vor(x7B7E7A7F, x33662277, x5A5A5A5A); + vxor(x59A31CE6, x22DD6699, x7B7E7A7F); + + vand(x09030C06, a3, x59A31CE6); + vandn(x09030000, x09030C06, a6); + vor(x336622FF, x00220099, x33662277); + vxor(x3A6522FF, x09030000, x336622FF); + vand(x30, x3A6522FF, a4); + vxor(x31, x30, x59A31CE6); + vxor(out[c4], out[c4], x31); + + vxor(x484D494C, a2, x7B7E7A7F); + vandn(x0000B6B3, a6, x484D494C); + vxor(x0F0FB9BC, a3, x0000B6B3); + vandn(x00FC00F9, a5, x09030C06); + vor(x0FFFB9FD, x0F0FB9BC, x00FC00F9); + + vor(x5DF75DF7, a1, x59A31CE6); + vand(x116600F7, x336622FF, x5DF75DF7); + vxor(x1E69B94B, x0F0FB9BC, x116600F7); + vandn(x1668B94B, x1E69B94B, x09030000); + vor(x20, x00220099, a4); + vxor(x21, x20, x1668B94B); + vxor(out[c3], out[c3], x21); + + vor(x1FFFFDFD, x11115555, x0FFFB9FD); + vor(x7B7B7B7B, a2, x5A5A5A5A); + vxor(x64848686, x1FFFFDFD, x7B7B7B7B); + vxor(x5EE1A479, x3A6522FF, x64848686); + + vxor(x3CB4DFD2, x22DD6699, x1E69B94B); + vandn(x004B002D, a5, x3CB4DFD2); + vxor(x33363237, x484D494C, x7B7B7B7B); + vnot(xCCC9CDC8, x33363237); + vxor(xCC82CDE5, x004B002D, xCCC9CDC8); + vandn(x10, xCC82CDE5, a4); + vxor(x11, x10, x5EE1A479); + vxor(out[c2], out[c2], x11); + + vxor(x0055EEBB, a6, x00551144); + vxor(x5A5AECE9, a1, x0F0FB9BC); + vand(x0050ECA9, x0055EEBB, x5A5AECE9); + vxor(x0953E0AF, x09030C06, x0050ECA9); + vxor(xC59A2D67, xCCC9CDC8, x0953E0AF); + vandn(x00, x0FFFB9FD, a4); + vxor(x01, x00, xC59A2D67); + vxor(out[c1], out[c1], x01); +} +//#elif latency >= 3 +/* s6-000028, 46 gates, 19 regs, 8 andn, 4/16/39/65/101 stalls, 69 biop */ +/* +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x33CC33CC; + vtype x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + vtype
x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + vtype x09030C06, x09030000, x336622FF, x3A6522FF; + vtype x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + vtype x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + vtype x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + vtype x3CB4DFD2, x004B002D, x33363237, xCCC9CDC8, xCC82CDE5; + vtype x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vor(x3333FFFF, a2, a6); + vand(x11115555, a1, x3333FFFF); + vxor(x22DD6699, x33CC33CC, x11115555); + vxor(x22DD9966, a6, x22DD6699); + vandn(x00220099, a5, x22DD9966); + + vand(x00551144, a1, x22DD9966); + vxor(x33662277, a2, x00551144); + vxor(x5A5A5A5A, a1, a3); + vor(x7B7E7A7F, x33662277, x5A5A5A5A); + vxor(x59A31CE6, x22DD6699, x7B7E7A7F); + + vand(x09030C06, a3, x59A31CE6); + vandn(x09030000, x09030C06, a6); + vor(x336622FF, x00220099, x33662277); + vxor(x3A6522FF, x09030000, x336622FF); + vand(x30, x3A6522FF, a4); + vxor(x31, x30, x59A31CE6); + vxor(out[c4], out[c4], x31); + + vxor(x484D494C, a2, x7B7E7A7F); + vandn(x0000B6B3, a6, x484D494C); + vxor(x0F0FB9BC, a3, x0000B6B3); + vandn(x00FC00F9, a5, x09030C06); + vor(x0FFFB9FD, x0F0FB9BC, x00FC00F9); + + vor(x7B7B7B7B, a2, x5A5A5A5A); + vxor(x411E5984, x3A6522FF, x7B7B7B7B); + vor(x1FFFFDFD, x11115555, x0FFFB9FD); + vxor(x5EE1A479, x411E5984, x1FFFFDFD); + + vor(x5DF75DF7, a1, x59A31CE6); + vand(x116600F7, x336622FF, x5DF75DF7); + vxor(x1E69B94B, x0F0FB9BC, x116600F7); + vandn(x1668B94B, x1E69B94B, x09030000); + vor(x20, x00220099, a4); + vxor(x21, x20, x1668B94B); + vxor(out[c3], out[c3], x21); + + vxor(x3CB4DFD2, x22DD6699, x1E69B94B); + vandn(x004B002D, a5, x3CB4DFD2); + vxor(x33363237, x484D494C, x7B7B7B7B); + vnot(xCCC9CDC8, x33363237); + vxor(xCC82CDE5, x004B002D, xCCC9CDC8); + vandn(x10, xCC82CDE5, a4); + vxor(x11, x10, x5EE1A479); + vxor(out[c2], out[c2], x11); + + vxor(x0055EEBB, a6, x00551144); + vxor(x5A5AECE9, a1, x0F0FB9BC); 
+ vand(x0050ECA9, x0055EEBB, x5A5AECE9); + vxor(xC5CAC1CE, x09030C06, xCCC9CDC8); + vxor(xC59A2D67, x0050ECA9, xC5CAC1CE); + vandn(x00, x0FFFB9FD, a4); + vxor(x01, x00, xC59A2D67); + vxor(out[c1], out[c1], x01); +}*/ +//#else +/* s6-000031, 46 gates, 19 regs, 8 andn, 3/16/42/68/111 stalls, 69 biop */ +/* +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x33CC33CC; + vtype x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + vtype x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + vtype x09030C06, x09030000, x336622FF, x3A6522FF; + vtype x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + vtype x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + vtype x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + vtype x3CB4DFD2, x004B002D, x84848484, xCCC9CDC8, xCC82CDE5; + vtype x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x33CC33CC, a2, a5); + + vor(x3333FFFF, a2, a6); + vand(x11115555, a1, x3333FFFF); + vxor(x22DD6699, x33CC33CC, x11115555); + vxor(x22DD9966, a6, x22DD6699); + vandn(x00220099, a5, x22DD9966); + + vand(x00551144, a1, x22DD9966); + vxor(x33662277, a2, x00551144); + vxor(x5A5A5A5A, a1, a3); + vor(x7B7E7A7F, x33662277, x5A5A5A5A); + vxor(x59A31CE6, x22DD6699, x7B7E7A7F); + + vand(x09030C06, a3, x59A31CE6); + vandn(x09030000, x09030C06, a6); + vor(x336622FF, x00220099, x33662277); + vxor(x3A6522FF, x09030000, x336622FF); + vand(x30, x3A6522FF, a4); + vxor(x31, x30, x59A31CE6); + vxor(out[c4], out[c4], x31); + + vxor(x484D494C, a2, x7B7E7A7F); + vandn(x0000B6B3, a6, x484D494C); + vxor(x0F0FB9BC, a3, x0000B6B3); + vandn(x00FC00F9, a5, x09030C06); + vor(x0FFFB9FD, x0F0FB9BC, x00FC00F9); + + vor(x7B7B7B7B, a2, x5A5A5A5A); + vxor(x411E5984, x3A6522FF, x7B7B7B7B); + vor(x1FFFFDFD, x11115555, x0FFFB9FD); + vxor(x5EE1A479, x411E5984, x1FFFFDFD); + + vor(x5DF75DF7, a1, x59A31CE6); + 
vand(x116600F7, x336622FF, x5DF75DF7); + vxor(x1E69B94B, x0F0FB9BC, x116600F7); + vandn(x1668B94B, x1E69B94B, x09030000); + vor(x20, x00220099, a4); + vxor(x21, x20, x1668B94B); + vxor(out[c3], out[c3], x21); + + vxor(x3CB4DFD2, x22DD6699, x1E69B94B); + vandn(x004B002D, a5, x3CB4DFD2); + vnot(x84848484, x7B7B7B7B); + vxor(xCCC9CDC8, x484D494C, x84848484); + vxor(xCC82CDE5, x004B002D, xCCC9CDC8); + vandn(x10, xCC82CDE5, a4); + vxor(x11, x10, x5EE1A479); + vxor(out[c2], out[c2], x11); + + vxor(x0055EEBB, a6, x00551144); + vxor(x5A5AECE9, a1, x0F0FB9BC); + vand(x0050ECA9, x0055EEBB, x5A5AECE9); + vxor(xC5CAC1CE, x09030C06, xCCC9CDC8); + vxor(xC59A2D67, x0050ECA9, xC5CAC1CE); + vandn(x00, x0FFFB9FD, a4); + vxor(x01, x00, xC59A2D67); + vxor(out[c1], out[c1], x01); +}*/ +//#endif + +//#if andn && triop && regs <= 16 && latency >= 5 +/* s7-000072, 46 gates, 16 regs, 10 andn, 2/5/17/51/93 stalls, 69 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x00FF0000, x33CC3333, x3FCF3F3F, x55AA55AA, x55AAAA55, x6A65956A; + vtype x5AA5A55A, x05505005, x05AF5005, x018C1001, x01731001; + vtype x33FF33FF, x030F030F, x575F575F, x5250075A; + vtype x5BD6B55B, x04294004, x33D633FB, x54A054A0, x6776675B; + vtype x550A0255, x68E58668, x7DEF867D, x4E39B586; + vtype x50000050, x518C1051, x518C0000, x0B29A55A, x38D696A5; + vtype x63333363, x23132343, x26BC7346, x5B53F53B; + vtype xFFFF0000, xFFFF54A0, xADAF53FA, xA8AA02AA, x8E1671EC; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x00FF0000, a4, a5); + vxor(x33CC3333, a2, x00FF0000); + vor(x3FCF3F3F, a3, x33CC3333); + vxor(x55AA55AA, a1, a4); + vxor(x55AAAA55, a5, x55AA55AA); + vxor(x6A65956A, x3FCF3F3F, x55AAAA55); + + vxor(x5AA5A55A, a3, x55AAAA55); + vandn(x05505005, a1, x5AA5A55A); + vxor(x05AF5005, x00FF0000, x05505005); + vand(x018C1001, x33CC3333, x05AF5005); + vxor(x01731001, x00FF0000, x018C1001); + vandn(x30, a6, 
x01731001); + vxor(x31, x30, x6A65956A); + vxor(*out4, *out4, x31); + + vor(x33FF33FF, a2, a4); + vand(x030F030F, a3, x33FF33FF); + vor(x575F575F, a1, x030F030F); + vandn(x5250075A, x575F575F, x05AF5005); + + vxor(x5BD6B55B, x5AA5A55A, x01731001); + vandn(x04294004, x05AF5005, x5BD6B55B); + vandn(x33D633FB, x33FF33FF, x04294004); + vandn(x54A054A0, x55AA55AA, x030F030F); + vxor(x6776675B, x33D633FB, x54A054A0); + + vand(x550A0255, x55AAAA55, x575F575F); + vxor(x68E58668, a2, x5BD6B55B); + vor(x7DEF867D, x550A0255, x68E58668); + vxor(x4E39B586, x33D633FB, x7DEF867D); + vor(x00, x5250075A, a6); + vxor(x01, x00, x4E39B586); + vxor(*out1, *out1, x01); + + vand(x50000050, x5AA5A55A, x550A0255); + vor(x518C1051, x018C1001, x50000050); + vandn(x518C0000, x518C1051, a5); + vxor(x0B29A55A, x5AA5A55A, x518C0000); + vxor(x38D696A5, x33FF33FF, x0B29A55A); + + vxor(x63333363, a2, x50000050); + vandn(x23132343, x63333363, x54A054A0); + vxor(x26BC7346, x05AF5005, x23132343); + vxor(x5B53F53B, x7DEF867D, x26BC7346); + vand(x20, x5B53F53B, a6); + vxor(x21, x20, x38D696A5); + vxor(*out3, *out3, x21); + + vnot(xFFFF0000, a5); + vor(xFFFF54A0, x54A054A0, xFFFF0000); + vxor(xADAF53FA, x5250075A, xFFFF54A0); + vandn(xA8AA02AA, xADAF53FA, a1); + vxor(x8E1671EC, x26BC7346, xA8AA02AA); + vand(x10, x6776675B, a6); + vxor(x11, x10, x8E1671EC); + vxor(*out2, *out2, x11); +}*/ +//#elif andn && triop && regs <= 16 && latency == 3 +/* s7-000788, 46 gates, 16 regs, 10 andn, 2/3/18/51/94 stalls, 69 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x00FF0000, x33CC3333, x3FCF3F3F, x55AA55AA, x55AAAA55, x6A65956A; + vtype x5AA5A55A, x05505005, x05AF5005, x018C1001, x01731001; + vtype x33FF33FF, x030F030F, x575F575F, x5250075A; + vtype x5BD6B55B, x04294004, x33D633FB, x54A054A0, x6776675B; + vtype x550A0255, x68E58668, x7DEF867D, x4E39B586; + vtype x50000050, x63333363, x23132343, x26BC7346,
x5B53F53B; + vtype x518C1051, x518C0000, x0B29A55A, x38D696A5; + vtype x0000AB5F, x5250AC05, x5755FD55, xD9438CB9, x8E1671EC; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x00FF0000, a4, a5); + vxor(x33CC3333, a2, x00FF0000); + vor(x3FCF3F3F, a3, x33CC3333); + vxor(x55AA55AA, a1, a4); + vxor(x55AAAA55, a5, x55AA55AA); + vxor(x6A65956A, x3FCF3F3F, x55AAAA55); + + vxor(x5AA5A55A, a3, x55AAAA55); + vandn(x05505005, a1, x5AA5A55A); + vxor(x05AF5005, x00FF0000, x05505005); + vand(x018C1001, x33CC3333, x05AF5005); + vxor(x01731001, x00FF0000, x018C1001); + vandn(x30, a6, x01731001); + vxor(x31, x30, x6A65956A); + vxor(*out4, *out4, x31); + + vor(x33FF33FF, a2, a4); + vand(x030F030F, a3, x33FF33FF); + vor(x575F575F, a1, x030F030F); + vandn(x5250075A, x575F575F, x05AF5005); + + vxor(x5BD6B55B, x5AA5A55A, x01731001); + vandn(x04294004, x05AF5005, x5BD6B55B); + vandn(x33D633FB, x33FF33FF, x04294004); + vandn(x54A054A0, x55AA55AA, x030F030F); + vxor(x6776675B, x33D633FB, x54A054A0); + + vand(x550A0255, x55AAAA55, x575F575F); + vxor(x68E58668, a2, x5BD6B55B); + vor(x7DEF867D, x550A0255, x68E58668); + vxor(x4E39B586, x33D633FB, x7DEF867D); + vor(x00, x5250075A, a6); + vxor(x01, x00, x4E39B586); + vxor(*out1, *out1, x01); + + vand(x50000050, x5AA5A55A, x550A0255); + vxor(x63333363, a2, x50000050); + vandn(x23132343, x63333363, x54A054A0); + vxor(x26BC7346, x05AF5005, x23132343); + vxor(x5B53F53B, x7DEF867D, x26BC7346); + + vor(x518C1051, x018C1001, x50000050); + vandn(x518C0000, x518C1051, a5); + vxor(x0B29A55A, x5AA5A55A, x518C0000); + vxor(x38D696A5, x33FF33FF, x0B29A55A); + vand(x20, x5B53F53B, a6); + vxor(x21, x20, x38D696A5); + vxor(*out3, *out3, x21); + + vandn(x0000AB5F, a5, x54A054A0); + vxor(x5250AC05, x5250075A, x0000AB5F); + vor(x5755FD55, a1, x5250AC05); + vnot(xD9438CB9, x26BC7346); + vxor(x8E1671EC, x5755FD55, xD9438CB9); + vand(x10, x6776675B, a6); + vxor(x11, x10, x8E1671EC); + vxor(*out2, *out2, x11); +}*/ +//#elif andn && triop && regs == 18 && 
latency >= 6 +/* s7-002149, 46 gates, 18 regs, 11 andn, 2/5/20/40/66 stalls, 68 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x00FF0000, x33CC3333, x3FCF3F3F, x55AA55AA, x55AAAA55, x6A65956A; + vtype x5AA5A55A, x05505005, x05AF5005, x018C1001, x01731001; + vtype x33FF33FF, x030F030F, x575F575F, x5250075A; + vtype x69969669, x04294004, x33D633FB, x54A054A0, x6776675B; + vtype x68E58668, x550A0255, x7DEF867D, x4E39B586; + vtype x0AA5A50A, x63333363, x23132343, x26BC7346, x5B53F53B; + vtype x018C0000, x63FF33FF, x627333FF, x38D696A5; + vtype x5659A956, x0251A854, x5755FD55, xA8AA02AA, x8E1671EC; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x00FF0000, a4, a5); + vxor(x33CC3333, a2, x00FF0000); + vor(x3FCF3F3F, a3, x33CC3333); + vxor(x55AA55AA, a1, a4); + vxor(x55AAAA55, a5, x55AA55AA); + vxor(x6A65956A, x3FCF3F3F, x55AAAA55); + + vxor(x5AA5A55A, a3, x55AAAA55); + vandn(x05505005, a1, x5AA5A55A); + vxor(x05AF5005, x00FF0000, x05505005); + vand(x018C1001, x33CC3333, x05AF5005); + vxor(x01731001, x00FF0000, x018C1001); + vandn(x30, a6, x01731001); + vxor(x31, x30, x6A65956A); + vxor(*out4, *out4, x31); + + vor(x33FF33FF, a2, a4); + vand(x030F030F, a3, x33FF33FF); + vor(x575F575F, a1, x030F030F); + vandn(x5250075A, x575F575F, x05AF5005); + + vxor(x69969669, a2, x5AA5A55A); + vandn(x04294004, x05AF5005, x69969669); + vandn(x33D633FB, x33FF33FF, x04294004); + vandn(x54A054A0, x55AA55AA, x030F030F); + vxor(x6776675B, x33D633FB, x54A054A0); + + vxor(x68E58668, x01731001, x69969669); + vand(x550A0255, x55AAAA55, x575F575F); + vor(x7DEF867D, x68E58668, x550A0255); + vxor(x4E39B586, x33D633FB, x7DEF867D); + vor(x00, x5250075A, a6); + vxor(x01, x00, x4E39B586); + vxor(*out1, *out1, x01); + + vandn(x0AA5A50A, x5AA5A55A, x550A0255); + vxor(x63333363, x69969669, x0AA5A50A); + vandn(x23132343, x63333363, x54A054A0); + vxor(x26BC7346, x05AF5005, x23132343); + 
vxor(x5B53F53B, x7DEF867D, x26BC7346); + + vandn(x018C0000, x018C1001, a5); + vor(x63FF33FF, a4, x63333363); + vxor(x627333FF, x018C0000, x63FF33FF); + vxor(x38D696A5, x5AA5A55A, x627333FF); + vand(x20, x5B53F53B, a6); + vxor(x21, x20, x38D696A5); + vxor(*out3, *out3, x21); + + vxor(x5659A956, x3FCF3F3F, x69969669); + vandn(x0251A854, x5659A956, x55AA55AA); + vor(x5755FD55, a1, x0251A854); + vnot(xA8AA02AA, x5755FD55); + vxor(x8E1671EC, x26BC7346, xA8AA02AA); + vand(x10, x6776675B, a6); + vxor(x11, x10, x8E1671EC); + vxor(*out2, *out2, x11); +}*/ +//#elif andn && triop && regs >= 18 && latency == 5 +/* s7-002689, 46 gates, 18 regs, 10 andn, 2/5/14/31/69 stalls, 69 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x00FF0000, x33CC3333, x3FCF3F3F, x55AA55AA, x55AAAA55, x6A65956A; + vtype x5AA5A55A, x05505005, x05AF5005, x018C1001, x01731001; + vtype x33FF33FF, x030F030F, x575F575F, x5250075A; + vtype x69969669, x04294004, x33D633FB, x54A054A0, x6776675B; + vtype x68E58668, x550A0255, x7DEF867D, x4E39B586; + vtype x50000050, x63333363, x23132343, x26BC7346, x5B53F53B; + vtype x518C1051, x518C0000, x0B29A55A, x38D696A5; + vtype xFFFF0000, xA8A0575F, xA8FF57FF, xA8AA02AA, x8E1671EC; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x00FF0000, a4, a5); + vxor(x33CC3333, a2, x00FF0000); + vor(x3FCF3F3F, a3, x33CC3333); + vxor(x55AA55AA, a1, a4); + vxor(x55AAAA55, a5, x55AA55AA); + vxor(x6A65956A, x3FCF3F3F, x55AAAA55); + + vxor(x5AA5A55A, a3, x55AAAA55); + vandn(x05505005, a1, x5AA5A55A); + vxor(x05AF5005, x00FF0000, x05505005); + vand(x018C1001, x33CC3333, x05AF5005); + vxor(x01731001, x00FF0000, x018C1001); + vandn(x30, a6, x01731001); + vxor(x31, x30, x6A65956A); + vxor(*out4, *out4, x31); + + vor(x33FF33FF, a2, a4); + vand(x030F030F, a3, x33FF33FF); + vor(x575F575F, a1, x030F030F); + vandn(x5250075A, x575F575F, x05AF5005); + + vxor(x69969669, a2, 
x5AA5A55A); + vandn(x04294004, x05AF5005, x69969669); + vandn(x33D633FB, x33FF33FF, x04294004); + vandn(x54A054A0, x55AA55AA, x030F030F); + vxor(x6776675B, x33D633FB, x54A054A0); + + vxor(x68E58668, x01731001, x69969669); + vand(x550A0255, x55AAAA55, x575F575F); + vor(x7DEF867D, x68E58668, x550A0255); + vxor(x4E39B586, x33D633FB, x7DEF867D); + vor(x00, x5250075A, a6); + vxor(x01, x00, x4E39B586); + vxor(*out1, *out1, x01); + + vand(x50000050, x5AA5A55A, x550A0255); + vxor(x63333363, a2, x50000050); + vandn(x23132343, x63333363, x54A054A0); + vxor(x26BC7346, x05AF5005, x23132343); + vxor(x5B53F53B, x7DEF867D, x26BC7346); + + vor(x518C1051, x018C1001, x50000050); + vandn(x518C0000, x518C1051, a5); + vxor(x0B29A55A, x5AA5A55A, x518C0000); + vxor(x38D696A5, x33FF33FF, x0B29A55A); + vand(x20, x5B53F53B, a6); + vxor(x21, x20, x38D696A5); + vxor(*out3, *out3, x21); + + vnot(xFFFF0000, a5); + vxor(xA8A0575F, x575F575F, xFFFF0000); + vor(xA8FF57FF, a4, xA8A0575F); + vandn(xA8AA02AA, xA8FF57FF, a1); + vxor(x8E1671EC, x26BC7346, xA8AA02AA); + vand(x10, x6776675B, a6); + vxor(x11, x10, x8E1671EC); + vxor(*out2, *out2, x11); +}*/ +//#elif andn && triop && regs >= 19 && latency >= 6 +/* s7-003344, 46 gates, 19 regs, 10 andn, 3/9/14/39/66 stalls, 68 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x00FF0000, x33CC3333, x3FCF3F3F, x55AA55AA, x55AAAA55, x6A65956A; + vtype x5AA5A55A, x05505005, x05AF5005, x018C1001, x01731001; + vtype x33FF33FF, x030F030F, x575F575F, x5250075A; + vtype x69969669, x04294004, x33D633FB, x54A054A0, x6776675B; + vtype x68E58668, x550A0255, x7DEF867D, x4E39B586; + vtype x50000050, x63333363, x23132343, x26BC7346, x5B53F53B; + vtype x518C1051, x518C0000, x695A96A5, x38D696A5; + vtype x5659A956, x0251A854, x5755FD55, xA8AA02AA, x8E1671EC; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x00FF0000, a4, a5); + vxor(x33CC3333, a2, x00FF0000); + 
vor(x3FCF3F3F, a3, x33CC3333); + vxor(x55AA55AA, a1, a4); + vxor(x55AAAA55, a5, x55AA55AA); + vxor(x6A65956A, x3FCF3F3F, x55AAAA55); + + vxor(x5AA5A55A, a3, x55AAAA55); + vandn(x05505005, a1, x5AA5A55A); + vxor(x05AF5005, x00FF0000, x05505005); + vand(x018C1001, x33CC3333, x05AF5005); + vxor(x01731001, x00FF0000, x018C1001); + vandn(x30, a6, x01731001); + vxor(x31, x30, x6A65956A); + vxor(*out4, *out4, x31); + + vor(x33FF33FF, a2, a4); + vand(x030F030F, a3, x33FF33FF); + vor(x575F575F, a1, x030F030F); + vandn(x5250075A, x575F575F, x05AF5005); + + vxor(x69969669, a2, x5AA5A55A); + vandn(x04294004, x05AF5005, x69969669); + vandn(x33D633FB, x33FF33FF, x04294004); + vandn(x54A054A0, x55AA55AA, x030F030F); + vxor(x6776675B, x33D633FB, x54A054A0); + + vxor(x68E58668, x01731001, x69969669); + vand(x550A0255, x55AAAA55, x575F575F); + vor(x7DEF867D, x68E58668, x550A0255); + vxor(x4E39B586, x33D633FB, x7DEF867D); + vor(x00, x5250075A, a6); + vxor(x01, x00, x4E39B586); + vxor(*out1, *out1, x01); + + vand(x50000050, x5AA5A55A, x550A0255); + vxor(x63333363, a2, x50000050); + vandn(x23132343, x63333363, x54A054A0); + vxor(x26BC7346, x05AF5005, x23132343); + vxor(x5B53F53B, x7DEF867D, x26BC7346); + + vor(x518C1051, x018C1001, x50000050); + vandn(x518C0000, x518C1051, a5); + vxor(x695A96A5, x5AA5A55A, x33FF33FF); + vxor(x38D696A5, x518C0000, x695A96A5); + vand(x20, x5B53F53B, a6); + vxor(x21, x20, x38D696A5); + vxor(*out3, *out3, x21); + + vxor(x5659A956, x3FCF3F3F, x69969669); + vandn(x0251A854, x5659A956, x55AA55AA); + vor(x5755FD55, a1, x0251A854); + vnot(xA8AA02AA, x5755FD55); + vxor(x8E1671EC, x26BC7346, xA8AA02AA); + vand(x10, x6776675B, a6); + vxor(x11, x10, x8E1671EC); + vxor(*out2, *out2, x11); +}*/ +//#elif andn && triop && regs >= 17 && latency >= 4 +/* s7-003395, 46 gates, 17 regs, 11 andn, 3/5/10/39/67 stalls, 70 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype 
x00FF0000, x33CC3333, x3FCF3F3F, x55AA55AA, x55AAAA55, x6A65956A; + vtype x5AA5A55A, x05505005, x05AF5005, x018C1001, x01731001; + vtype x33FF33FF, x030F030F, x575F575F, x5250075A; + vtype x69969669, x04294004, x33D633FB, x54A054A0, x6776675B; + vtype x68E58668, x550A0255, x7DEF867D, x4E39B586; + vtype x50000050, x63333363, x23132343, x26BC7346, x5B53F53B; + vtype x518C1051, x518C0000, x695A96A5, x38D696A5; + vtype x0000AB5F, x5250AC05, xAAAAAAAA, xA8AA02AA, x8E1671EC; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x00FF0000, a4, a5); + vxor(x33CC3333, a2, x00FF0000); + vor(x3FCF3F3F, a3, x33CC3333); + vxor(x55AA55AA, a1, a4); + vxor(x55AAAA55, a5, x55AA55AA); + vxor(x6A65956A, x3FCF3F3F, x55AAAA55); + + vxor(x5AA5A55A, a3, x55AAAA55); + vandn(x05505005, a1, x5AA5A55A); + vxor(x05AF5005, x00FF0000, x05505005); + vand(x018C1001, x33CC3333, x05AF5005); + vxor(x01731001, x00FF0000, x018C1001); + vandn(x30, a6, x01731001); + vxor(x31, x30, x6A65956A); + vxor(*out4, *out4, x31); + + vor(x33FF33FF, a2, a4); + vand(x030F030F, a3, x33FF33FF); + vor(x575F575F, a1, x030F030F); + vandn(x5250075A, x575F575F, x05AF5005); + + vxor(x69969669, a2, x5AA5A55A); + vandn(x04294004, x05AF5005, x69969669); + vandn(x33D633FB, x33FF33FF, x04294004); + vandn(x54A054A0, x55AA55AA, x030F030F); + vxor(x6776675B, x33D633FB, x54A054A0); + + vxor(x68E58668, x01731001, x69969669); + vand(x550A0255, x55AAAA55, x575F575F); + vor(x7DEF867D, x68E58668, x550A0255); + vxor(x4E39B586, x33D633FB, x7DEF867D); + vor(x00, x5250075A, a6); + vxor(x01, x00, x4E39B586); + vxor(*out1, *out1, x01); + + vand(x50000050, x5AA5A55A, x550A0255); + vxor(x63333363, a2, x50000050); + vandn(x23132343, x63333363, x54A054A0); + vxor(x26BC7346, x05AF5005, x23132343); + vxor(x5B53F53B, x7DEF867D, x26BC7346); + + vor(x518C1051, x018C1001, x50000050); + vandn(x518C0000, x518C1051, a5); + vxor(x695A96A5, x5AA5A55A, x33FF33FF); + vxor(x38D696A5, x518C0000, x695A96A5); + vand(x20, x5B53F53B, a6); + vxor(x21, x20, 
x38D696A5); + vxor(*out3, *out3, x21); + + vandn(x0000AB5F, a5, x54A054A0); + vxor(x5250AC05, x5250075A, x0000AB5F); + vnot(xAAAAAAAA, a1); + vandn(xA8AA02AA, xAAAAAAAA, x5250AC05); + vxor(x8E1671EC, x26BC7346, xA8AA02AA); + vand(x10, x6776675B, a6); + vxor(x11, x10, x8E1671EC); + vxor(*out2, *out2, x11); +}*/ +//#elif andn && triop && regs >= 17 +/* s7-036457, 46 gates, 17 regs, 9 andn, 1/6/16/50/93 stalls, 71 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x0FF0C0CC, x0FFFC3CF, x2E222B22, x28000802, x27FFCBCD; + vtype x48444844, x4FF4C8CC, x6F9C5F5B, x4F944848, x686B8385; + vtype x0FC3C3F3, x0000C3F3, x0000DBF3, x4F9493BB; + vtype x96B1A572, xB14E6EBF, x00008AA2, xB14EE41D; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(*out4, *out4, x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vandn(x0FF0C0CC, x0FF0CCCC, x00000F00); + vor(x0FFFC3CF, x000F0303, 
x0FF0C0CC); + vxor(x2E222B22, a2, x7B777E77); + vand(x28000802, x699C585B, x2E222B22); + vxor(x27FFCBCD, x0FFFC3CF, x28000802); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(*out3, *out3, x21); + + vandn(x48444844, x5A555A55, a3); + vor(x4FF4C8CC, x0FF0C0CC, x48444844); + vor(x6F9C5F5B, x0F000F00, x699C585B); + vand(x4F944848, x4FF4C8CC, x6F9C5F5B); + vxor(x686B8385, x27FFCBCD, x4F944848); + + vxor(x0FC3C3F3, x003C003C, x0FFFC3CF); + vand(x0000C3F3, a6, x0FC3C3F3); + vor(x0000DBF3, x00001841, x0000C3F3); + vxor(x4F9493BB, x4F944848, x0000DBF3); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(*out1, *out1, x01); + + vnot(x96B1A572, x694E5A8D); + vxor(xB14E6EBF, x27FFCBCD, x96B1A572); + vandn(x00008AA2, x0000DBF3, a2); + vxor(xB14EE41D, xB14E6EBF, x00008AA2); + vandn(x10, a1, x686B8385); + vxor(x11, x10, xB14EE41D); + vxor(*out2, *out2, x11); +}*/ +//#elif !andn && triop && regs >= 17 && latency <= 4 +/* s7-036496, 46 gates, 17 regs, 7 andn, 3/9/20/52/95 stalls, 70 biop */ +/* +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x0FF0C0CC, x0FFFC3CF, x2E222B22, x28000802, x27FFCBCD; + vtype x48444844, x4FF4C8CC, x6F9C5F5B, x4F944848, x686B8385; + vtype x0FC3C3F3, x0000C3F3, x0000DBF3, x4F9493BB; + vtype x00005151, x96B1A572, x96B1F423, xD9256798; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, 
x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(out[c4], out[c4], x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vandn(x0FF0C0CC, x0FF0CCCC, x00000F00); + vor(x0FFFC3CF, x000F0303, x0FF0C0CC); + vxor(x2E222B22, a2, x7B777E77); + vand(x28000802, x699C585B, x2E222B22); + vxor(x27FFCBCD, x0FFFC3CF, x28000802); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(out[c3], out[c3], x21); + + vandn(x48444844, x5A555A55, a3); + vor(x4FF4C8CC, x0FF0C0CC, x48444844); + vor(x6F9C5F5B, x0F000F00, x699C585B); + vand(x4F944848, x4FF4C8CC, x6F9C5F5B); + vxor(x686B8385, x27FFCBCD, x4F944848); + + vxor(x0FC3C3F3, x003C003C, x0FFFC3CF); + vand(x0000C3F3, a6, x0FC3C3F3); + vor(x0000DBF3, x00001841, x0000C3F3); + vxor(x4F9493BB, x4F944848, x0000DBF3); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(out[c1], out[c1], x01); + + vand(x00005151, a2, x0000DBF3); + vnot(x96B1A572, x694E5A8D); + vxor(x96B1F423, x00005151, x96B1A572); + vxor(xD9256798, x4F9493BB, x96B1F423); + vor(x10, x686B8385, a1); + vxor(x11, x10, xD9256798); + vxor(out[c2], out[c2], x11); +}*/ +//#elif !andn && triop && regs >= 17 && latency >= 5 +/* s7-036532, 46 gates, 17 regs, 7 andn, 3/9/23/51/93 stalls, 71 biop */ +/* +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype 
x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x0FF0C0CC, x0FFFC3CF, x2E222B22, x28000802, x27FFCBCD; + vtype x48444844, x4FF4C8CC, x6F9C5F5B, x4F944848, x686B8385; + vtype x0FC3C3F3, x0FC3DBF3, x0000DBF3, x4F9493BB; + vtype xFFFF240C, xFFFF755D, x26DA12C5, xD9256798; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(out[c4], out[c4], x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vandn(x0FF0C0CC, x0FF0CCCC, x00000F00); + vor(x0FFFC3CF, x000F0303, x0FF0C0CC); + vxor(x2E222B22, a2, x7B777E77); + vand(x28000802, x699C585B, x2E222B22); + vxor(x27FFCBCD, x0FFFC3CF, x28000802); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(out[c3], out[c3], x21); + + vandn(x48444844, x5A555A55, a3); + vor(x4FF4C8CC, x0FF0C0CC, x48444844); + vor(x6F9C5F5B, x0F000F00, x699C585B); + vand(x4F944848, x4FF4C8CC, x6F9C5F5B); + vxor(x686B8385, x27FFCBCD, x4F944848); + + vxor(x0FC3C3F3, x003C003C, x0FFFC3CF); + vor(x0FC3DBF3, x00001841, x0FC3C3F3); + vand(x0000DBF3, a6, x0FC3DBF3); + vxor(x4F9493BB, x4F944848, x0000DBF3); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(out[c1], out[c1], x01); + + vnot(xFFFF240C, x0000DBF3); + vor(xFFFF755D, a2, xFFFF240C); + vxor(x26DA12C5, x694E5A8D, 
x4F944848); + vxor(xD9256798, xFFFF755D, x26DA12C5); + vor(x10, x686B8385, a1); + vxor(x11, x10, xD9256798); + vxor(out[c2], out[c2], x11); +}*/ +//#elif andn && triop && regs <= 16 +/* s7-036610, 46 gates, 16 regs, 9 andn, 1/6/16/53/98 stalls, 70 biop */ +/* +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x0FF0C0CC, x0FFFC3CF, x2E222B22, x28000802, x27FFCBCD; + vtype x48444844, x4FF4C8CC, x6F9C5F5B, x4F944848, x686B8385; + vtype x6FFFDBCF, x6FC3DBF3, x0000DBF3, x4F9493BB; + vtype x96B1A572, xB14E6EBF, x00008AA2, xB14EE41D; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(out[c4], out[c4], x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vandn(x0FF0C0CC, x0FF0CCCC, x00000F00); + vor(x0FFFC3CF, x000F0303, x0FF0C0CC); + vxor(x2E222B22, a2, x7B777E77); + vand(x28000802, x699C585B, x2E222B22); + vxor(x27FFCBCD, x0FFFC3CF, x28000802); + vand(x20, x27FFCBCD, a1); 
+ vxor(x21, x20, x699C585B); + vxor(out[c3], out[c3], x21); + + vandn(x48444844, x5A555A55, a3); + vor(x4FF4C8CC, x0FF0C0CC, x48444844); + vor(x6F9C5F5B, x0F000F00, x699C585B); + vand(x4F944848, x4FF4C8CC, x6F9C5F5B); + vxor(x686B8385, x27FFCBCD, x4F944848); + + vor(x6FFFDBCF, x694E5A8D, x0FFFC3CF); + vxor(x6FC3DBF3, x003C003C, x6FFFDBCF); + vand(x0000DBF3, a6, x6FC3DBF3); + vxor(x4F9493BB, x4F944848, x0000DBF3); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(out[c1], out[c1], x01); + + vnot(x96B1A572, x694E5A8D); + vxor(xB14E6EBF, x27FFCBCD, x96B1A572); + vandn(x00008AA2, x0000DBF3, a2); + vxor(xB14EE41D, xB14E6EBF, x00008AA2); + vandn(x10, a1, x686B8385); + vxor(x11, x10, xB14EE41D); + vxor(out[c2], out[c2], x11); +}*/ +//#elif !andn && triop && latency >= 5 +/* s7-036634, 46 gates, 16 regs, 7 andn, 3/9/23/54/98 stalls, 70 biop */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x0FF0C0CC, x0FFFC3CF, x2E222B22, x28000802, x27FFCBCD; + vtype x48444844, x4FF4C8CC, x6F9C5F5B, x4F944848, x686B8385; + vtype x6FFFDBCF, x6FC3DBF3, x0000DBF3, x4F9493BB; + vtype xFFFF240C, xFFFF755D, x26DA12C5, xD9256798; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(*out4, 
*out4, x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vandn(x0FF0C0CC, x0FF0CCCC, x00000F00); + vor(x0FFFC3CF, x000F0303, x0FF0C0CC); + vxor(x2E222B22, a2, x7B777E77); + vand(x28000802, x699C585B, x2E222B22); + vxor(x27FFCBCD, x0FFFC3CF, x28000802); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(*out3, *out3, x21); + + vandn(x48444844, x5A555A55, a3); + vor(x4FF4C8CC, x0FF0C0CC, x48444844); + vor(x6F9C5F5B, x0F000F00, x699C585B); + vand(x4F944848, x4FF4C8CC, x6F9C5F5B); + vxor(x686B8385, x27FFCBCD, x4F944848); + + vor(x6FFFDBCF, x694E5A8D, x0FFFC3CF); + vxor(x6FC3DBF3, x003C003C, x6FFFDBCF); + vand(x0000DBF3, a6, x6FC3DBF3); + vxor(x4F9493BB, x4F944848, x0000DBF3); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(*out1, *out1, x01); + + vnot(xFFFF240C, x0000DBF3); + vor(xFFFF755D, a2, xFFFF240C); + vxor(x26DA12C5, x694E5A8D, x4F944848); + vxor(xD9256798, xFFFF755D, x26DA12C5); + vor(x10, x686B8385, a1); + vxor(x11, x10, xD9256798); + vxor(*out2, *out2, x11); +}*/ +//#elif !andn && triop && latency <= 4 +/* s7-036649, 46 gates, 16 regs, 7 andn, 3/9/20/55/100 stalls, 69 biop */ +/* +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x0FF0C0CC, x0FFFC3CF, x2E222B22, x28000802, x27FFCBCD; + vtype x48444844, x4FF4C8CC, x6F9C5F5B, x4F944848, x686B8385; + vtype x6FFFDBCF, 
x6FC3DBF3, x0000DBF3, x4F9493BB; + vtype x00005151, x96B1A572, x96B1F423, xD9256798; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(out[c4], out[c4], x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vandn(x0FF0C0CC, x0FF0CCCC, x00000F00); + vor(x0FFFC3CF, x000F0303, x0FF0C0CC); + vxor(x2E222B22, a2, x7B777E77); + vand(x28000802, x699C585B, x2E222B22); + vxor(x27FFCBCD, x0FFFC3CF, x28000802); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(out[c3], out[c3], x21); + + vandn(x48444844, x5A555A55, a3); + vor(x4FF4C8CC, x0FF0C0CC, x48444844); + vor(x6F9C5F5B, x0F000F00, x699C585B); + vand(x4F944848, x4FF4C8CC, x6F9C5F5B); + vxor(x686B8385, x27FFCBCD, x4F944848); + + vor(x6FFFDBCF, x694E5A8D, x0FFFC3CF); + vxor(x6FC3DBF3, x003C003C, x6FFFDBCF); + vand(x0000DBF3, a6, x6FC3DBF3); + vxor(x4F9493BB, x4F944848, x0000DBF3); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(out[c1], out[c1], x01); + + vand(x00005151, a2, x0000DBF3); + vnot(x96B1A572, x694E5A8D); + vxor(x96B1F423, x00005151, x96B1A572); + vxor(xD9256798, x4F9493BB, x96B1F423); + vor(x10, x686B8385, a1); + vxor(x11, x10, xD9256798); + vxor(out[c2], out[c2], x11); +}*/ +//#elif andn && !triop && regs >= 16 +/* s7-056931, 46 
gates, 16 regs, 7 andn, 7/24/55/100/149 stalls, 67 biop */ +/* Currently used for x86-64 SSE2 */ +/*inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + vtype x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + vtype x21FF10FF, x21FFCB04, x2625C9C9, x27FFCBCD; + vtype x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(*out4, *out4, x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); + + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vor(x7F878F78, x0F000F00, x74878E78); + vand(x21101013, a3, x699C585B); + vor(x7F979F7B, x7F878F78, x21101013); + vandn(x30030CC0, x3CC33CC3, x0FF0F00F); + vxor(x4F9493BB, x7F979F7B, x30030CC0); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(*out1, *out1, x01); + + vor(x6F9CDBFB, x699C585B, x4F9493BB); + vand(x0000DBFB, a6, x6F9CDBFB); + vand(x00005151, a2, x0000DBFB); + 
vxor(x26DAC936, x694E5A8D, x4F9493BB); + vxor(x26DA9867, x00005151, x26DAC936); + + vor(x21FF10FF, a5, x21101013); + vxor(x21FFCB04, x0000DBFB, x21FF10FF); + vxor(x2625C9C9, a5, x26DAC936); + vor(x27FFCBCD, x21FFCB04, x2625C9C9); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(*out3, *out3, x21); + + vxor(x27FF1036, x0000DBFB, x27FFCBCD); + vor(x27FF103E, x003C003C, x27FF1036); + vnot(xB06B6C44, x4F9493BB); + vxor(x97947C7A, x27FF103E, xB06B6C44); + vandn(x10, x97947C7A, a1); + vxor(x11, x10, x26DA9867); + vxor(*out2, *out2, x11); +}*/ +//#else +/* s7-056945, 46 gates, 16 regs, 7 andn, 10/31/62/107/156 stalls, 67 biop */ +/* Currently used for MMX/SSE2 */ + +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + vtype x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; + vtype x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + vtype x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + vtype x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + vtype x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + vtype x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + vtype x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vxor(x0FF00FF0, a4, a5); + vxor(x3CC33CC3, a3, x0FF00FF0); + vand(x00003CC3, a6, x3CC33CC3); + vand(x0F000F00, a4, x0FF00FF0); + vxor(x5A555A55, a2, x0F000F00); + vand(x00001841, x00003CC3, x5A555A55); + + vand(x00000F00, a6, x0F000F00); + vxor(x33333C33, a3, x00000F00); + vor(x7B777E77, x5A555A55, x33333C33); + vxor(x0FF0F00F, a6, x0FF00FF0); + vxor(x74878E78, x7B777E77, x0FF0F00F); + vandn(x30, a1, x00001841); + vxor(x31, x30, x74878E78); + vxor(out[c4], out[c4], x31); + + vandn(x003C003C, a5, x3CC33CC3); + vor(x5A7D5A7D, x5A555A55, x003C003C); + vxor(x333300F0, x00003CC3, x33333C33); + vxor(x694E5A8D, x5A7D5A7D, x333300F0); 
+ + vxor(x0FF0CCCC, x00003CC3, x0FF0F00F); + vandn(x000F0303, a4, x0FF0CCCC); + vandn(x5A505854, x5A555A55, x000F0303); + vxor(x33CC000F, a5, x333300F0); + vxor(x699C585B, x5A505854, x33CC000F); + + vor(x7F878F78, x0F000F00, x74878E78); + vand(x21101013, a3, x699C585B); + vor(x7F979F7B, x7F878F78, x21101013); + vandn(x30030CC0, x3CC33CC3, x0FF0F00F); + vxor(x4F9493BB, x7F979F7B, x30030CC0); + vandn(x00, x4F9493BB, a1); + vxor(x01, x00, x694E5A8D); + vxor(out[c1], out[c1], x01); + + vor(x6F9CDBFB, x699C585B, x4F9493BB); + vand(x0000DBFB, a6, x6F9CDBFB); + vand(x00005151, a2, x0000DBFB); + vxor(x26DAC936, x694E5A8D, x4F9493BB); + vxor(x26DA9867, x00005151, x26DAC936); + + vor(x27DA9877, x21101013, x26DA9867); + vxor(x27DA438C, x0000DBFB, x27DA9877); + vxor(x2625C9C9, a5, x26DAC936); + vor(x27FFCBCD, x27DA438C, x2625C9C9); + vand(x20, x27FFCBCD, a1); + vxor(x21, x20, x699C585B); + vxor(out[c3], out[c3], x21); + + vxor(x27FF1036, x0000DBFB, x27FFCBCD); + vor(x27FF103E, x003C003C, x27FF1036); + vnot(xB06B6C44, x4F9493BB); + vxor(x97947C7A, x27FF103E, xB06B6C44); + vandn(x10, x97947C7A, a1); + vxor(x11, x10, x26DA9867); + vxor(out[c2], out[c2], x11); +} +//#endif + +//#if andn && !triop && regs <= 8 +/* s8-004798, 41 gates, 14 regs, 7 andn, 7/35/76/118/160 stalls, 59 biop */ +/* Currently used for MMX/SSE2 */ +/*inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + vtype x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + vtype x30EFB74A, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + vtype x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + vtype xF4FF73FF, x33D61AA5, x03E6D56A, x56B3803F; + vtype xC47C3D2F, xF77F3F3F, x693C26D9, x693CD926; + vtype x9EFF19FF, x6100C000, x6151D001, x62B7056B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x0C0C0C0C, a3, a2); + vandn(x0000F0F0, a5, a3); + 
vxor(x00FFF00F, a4, x0000F0F0); + vand(x00555005, a1, x00FFF00F); + vandn(x00515001, x00555005, x0C0C0C0C); + + vandn(x33000330, a2, x00FFF00F); + vor(x77555775, a1, x33000330); + vandn(x30303030, a2, a3); + vxor(x3030CFCF, a5, x30303030); + vand(x30104745, x77555775, x3030CFCF); + vor(x30555745, x00555005, x30104745); + + vxor(x30EFB74A, x00FFF00F, x30104745); + vnot(xCF1048B5, x30EFB74A); + vandn(x080A080A, a3, x77555775); + vxor(xC71A40BF, xCF1048B5, x080A080A); + vxor(xCB164CB3, x0C0C0C0C, xC71A40BF); + vor(x10, x00515001, a6); + vxor(x11, x10, xCB164CB3); + vxor(*out2, *out2, x11); + + vxor(x9E4319E6, a1, xCB164CB3); + vand(x000019E6, a5, x9E4319E6); + vxor(xF429738C, a2, xC71A40BF); + vxor(xF4296A6A, x000019E6, xF429738C); + vxor(xC729695A, x33000330, xF4296A6A); + + vor(xF4FF73FF, a4, xF429738C); + vxor(x33D61AA5, xC729695A, xF4FF73FF); + vxor(x03E6D56A, x3030CFCF, x33D61AA5); + vxor(x56B3803F, a1, x03E6D56A); + vand(x30, x56B3803F, a6); + vxor(x31, x30, xC729695A); + vxor(*out4, *out4, x31); + + vxor(xC47C3D2F, x30555745, xF4296A6A); + vor(xF77F3F3F, a2, xC47C3D2F); + vxor(x693C26D9, x9E4319E6, xF77F3F3F); + vxor(x693CD926, a5, x693C26D9); + vand(x20, x30555745, a6); + vxor(x21, x20, x693CD926); + vxor(*out3, *out3, x21); + + vor(x9EFF19FF, a4, x9E4319E6); + vandn(x6100C000, x693CD926, x9EFF19FF); + vor(x6151D001, x00515001, x6100C000); + vxor(x62B7056B, x03E6D56A, x6151D001); + vor(x00, x62B7056B, a6); + vxor(x01, x00, xC729695A); + vxor(*out1, *out1, x01); +}*/ +//#elif andn && triop && latency <= 2 +/* s8-005322, 41 gates, 14 regs, 11 andn, 3/26/67/109/151 stalls, 62 biop */ +/*inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + vtype x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + vtype x30EFB74A, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + vtype x9E4319E6, x000019E6, x33001AD6, 
xF429738C, xC729695A; + vtype x00332121, x9E4018C6, xC72996A5, x59698E63; + vtype xF4FF73FF, x33D6E55A, x65656565, x56B3803F; + vtype xF40083F0, x03D6640A, x61616161, x62B7056B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x0C0C0C0C, a3, a2); + vandn(x0000F0F0, a5, a3); + vxor(x00FFF00F, a4, x0000F0F0); + vand(x00555005, a1, x00FFF00F); + vandn(x00515001, x00555005, x0C0C0C0C); + + vandn(x33000330, a2, x00FFF00F); + vor(x77555775, a1, x33000330); + vandn(x30303030, a2, a3); + vxor(x3030CFCF, a5, x30303030); + vand(x30104745, x77555775, x3030CFCF); + vor(x30555745, x00555005, x30104745); + + vxor(x30EFB74A, x00FFF00F, x30104745); + vnot(xCF1048B5, x30EFB74A); + vandn(x080A080A, a3, x77555775); + vxor(xC71A40BF, xCF1048B5, x080A080A); + vxor(xCB164CB3, x0C0C0C0C, xC71A40BF); + vor(x10, x00515001, a6); + vxor(x11, x10, xCB164CB3); + vxor(*out2, *out2, x11); + + vxor(x9E4319E6, a1, xCB164CB3); + vand(x000019E6, a5, x9E4319E6); + vxor(x33001AD6, x33000330, x000019E6); + vxor(xF429738C, a2, xC71A40BF); + vxor(xC729695A, x33001AD6, xF429738C); + + vandn(x00332121, a2, x33001AD6); + vandn(x9E4018C6, x9E4319E6, x00332121); + vxor(xC72996A5, a5, xC729695A); + vxor(x59698E63, x9E4018C6, xC72996A5); + vandn(x20, x30555745, a6); + vxor(x21, x20, x59698E63); + vxor(*out3, *out3, x21); + + vor(xF4FF73FF, a4, xF429738C); + vxor(x33D6E55A, xC72996A5, xF4FF73FF); + vxor(x65656565, a1, x30303030); + vxor(x56B3803F, x33D6E55A, x65656565); + vand(x30, x56B3803F, a6); + vxor(x31, x30, xC729695A); + vxor(*out4, *out4, x31); + + vxor(xF40083F0, x00FFF00F, xF4FF73FF); + vandn(x03D6640A, x33D6E55A, xF40083F0); + vandn(x61616161, x65656565, x0C0C0C0C); + vxor(x62B7056B, x03D6640A, x61616161); + vor(x00, x62B7056B, a6); + vxor(x01, x00, xC729695A); + vxor(*out1, *out1, x01); +}*/ +//#elif triop && (latency >= 4 || (!andn && latency == 3)) +/* s8-015415, 41 gates, 14 regs, 7 andn, 5/23/57/98/140 stalls, 60 biop */ +/*inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype 
a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + vtype x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + vtype xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + vtype xF429738C, xC72970BC, x9E4319E6, x000019E6, xC729695A; + vtype xF77C3E1F, xF77F3F3F, x9E43E619, x693CD926; + vtype xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + vtype xF700A600, x61008000, x03B7856B, x62B7056B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x0C0C0C0C, a3, a2); + vandn(x0000F0F0, a5, a3); + vxor(x00FFF00F, a4, x0000F0F0); + vand(x00555005, a1, x00FFF00F); + vandn(x00515001, x00555005, x0C0C0C0C); + + vandn(x33000330, a2, x00FFF00F); + vor(x77555775, a1, x33000330); + vandn(x30303030, a2, a3); + vxor(x3030CFCF, a5, x30303030); + vand(x30104745, x77555775, x3030CFCF); + vor(x30555745, x00555005, x30104745); + + vnot(xFF000FF0, x00FFF00F); + vxor(xCF1048B5, x30104745, xFF000FF0); + vandn(x080A080A, a3, x77555775); + vxor(xC71A40BF, xCF1048B5, x080A080A); + vxor(xCB164CB3, x0C0C0C0C, xC71A40BF); + vor(x10, x00515001, a6); + vxor(x11, x10, xCB164CB3); + vxor(*out2, *out2, x11); + + vxor(xF429738C, a2, xC71A40BF); + vxor(xC72970BC, x33000330, xF429738C); + vxor(x9E4319E6, a1, xCB164CB3); + vand(x000019E6, a5, x9E4319E6); + vxor(xC729695A, xC72970BC, x000019E6); + + vxor(xF77C3E1F, x30555745, xC729695A); + vor(xF77F3F3F, a2, xF77C3E1F); + vxor(x9E43E619, a5, x9E4319E6); + vxor(x693CD926, xF77F3F3F, x9E43E619); + vand(x20, x30555745, a6); + vxor(x21, x20, x693CD926); + vxor(*out3, *out3, x21); + + vxor(xF719A695, x3030CFCF, xC729695A); + vor(xF4FF73FF, a4, xF429738C); + vxor(x03E6D56A, xF719A695, xF4FF73FF); + vxor(x56B3803F, a1, x03E6D56A); + vand(x30, x56B3803F, a6); + vxor(x31, x30, xC729695A); + vxor(*out4, *out4, x31); + + vandn(xF700A600, xF719A695, a4); + vand(x61008000, x693CD926, xF700A600); + vxor(x03B7856B, x00515001, x03E6D56A); + vxor(x62B7056B, x61008000, 
x03B7856B); + vor(x00, x62B7056B, a6); + vxor(x01, x00, xC729695A); + vxor(*out1, *out1, x01); +}*/ +//#elif !andn || !triop +/* s8-019374, 41 gates, 14 regs, 7 andn, 4/25/61/103/145 stalls, 59 biop */ +/* Currently used for x86-64 SSE2 */ +/* + * s8: S-box routine (presumably DES S-box 8 in bitslice form -- confirm). + * Combines the six inputs a1..a6 through the vxor/vand/vor/vandn/vnot macros + * and folds the four results x01, x11, x21, x31 into out[c1], out[c2], + * out[c3], out[c4] via vxor(out[cN], out[cN], ...). The commented-out + * variants around it take four separate out1..out4 pointers instead of one + * indexed __private array. + * NOTE(review): temporary names appear to spell each intermediate's truth + * table, assuming the v* macros are plain bitwise operations -- confirm + * against their definitions. + * NOTE(review): c1..c4 are declared vtype yet used as subscripts; presumably + * vtype is an integer type in this build -- confirm. + */ +inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + vtype x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + vtype xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + vtype x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + vtype xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + vtype xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + vtype xF700A600, x61008000, x03B7856B, x62B7056B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x0C0C0C0C, a3, a2); + vandn(x0000F0F0, a5, a3); + vxor(x00FFF00F, a4, x0000F0F0); + vand(x00555005, a1, x00FFF00F); + vandn(x00515001, x00555005, x0C0C0C0C); + + vandn(x33000330, a2, x00FFF00F); + vor(x77555775, a1, x33000330); + vandn(x30303030, a2, a3); + vxor(x3030CFCF, a5, x30303030); + vand(x30104745, x77555775, x3030CFCF); + vor(x30555745, x00555005, x30104745); + + vnot(xFF000FF0, x00FFF00F); + vxor(xCF1048B5, x30104745, xFF000FF0); + vandn(x080A080A, a3, x77555775); + vxor(xC71A40BF, xCF1048B5, x080A080A); + vxor(xCB164CB3, x0C0C0C0C, xC71A40BF); + vor(x10, x00515001, a6); + vxor(x11, x10, xCB164CB3); + vxor(out[c2], out[c2], x11); + + vxor(x9E4319E6, a1, xCB164CB3); + vand(x000019E6, a5, x9E4319E6); + vxor(xF429738C, a2, xC71A40BF); + vxor(xF4296A6A, x000019E6, xF429738C); + vxor(xC729695A, x33000330, xF4296A6A); + + vxor(xC47C3D2F, x30555745, xF4296A6A); + vor(xF77F3F3F, a2, xC47C3D2F); + vxor(x9E43E619, a5, x9E4319E6); + vxor(x693CD926, xF77F3F3F, x9E43E619); + vand(x20, x30555745, a6); + vxor(x21, x20, x693CD926); + vxor(out[c3], out[c3], x21); + + vxor(xF719A695, x3030CFCF, xC729695A); + vor(xF4FF73FF, 
a4, xF429738C); + vxor(x03E6D56A, xF719A695, xF4FF73FF); + vxor(x56B3803F, a1, x03E6D56A); + vand(x30, x56B3803F, a6); + vxor(x31, x30, xC729695A); + vxor(out[c4], out[c4], x31); + + vandn(xF700A600, xF719A695, a4); + vand(x61008000, x693CD926, xF700A600); + vxor(x03B7856B, x00515001, x03E6D56A); + vxor(x62B7056B, x61008000, x03B7856B); + vor(x00, x62B7056B, a6); + vxor(x01, x00, xC729695A); + vxor(out[c1], out[c1], x01); +} + +//#else +/* s8-019630, 41 gates, 14 regs, 11 andn, 4/21/60/101/143 stalls, 62 biop */ +/*inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + vtype * out1, vtype * out2, vtype * out3, vtype * out4) +{ + vtype x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + vtype x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + vtype xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + vtype x9E4319E6, x000019E6, x33001AD6, xF429738C, xC729695A; + vtype x00332121, x9E4018C6, xC72996A5, x59698E63; + vtype xF4FF73FF, x33D6E55A, x65656565, x56B3803F; + vtype x38299955, x03D6640A, x61616161, x62B7056B; + vtype x00, x01, x10, x11, x20, x21, x30, x31; + + vandn(x0C0C0C0C, a3, a2); + vandn(x0000F0F0, a5, a3); + vxor(x00FFF00F, a4, x0000F0F0); + vand(x00555005, a1, x00FFF00F); + vandn(x00515001, x00555005, x0C0C0C0C); + + vandn(x33000330, a2, x00FFF00F); + vor(x77555775, a1, x33000330); + vandn(x30303030, a2, a3); + vxor(x3030CFCF, a5, x30303030); + vand(x30104745, x77555775, x3030CFCF); + vor(x30555745, x00555005, x30104745); + + vnot(xFF000FF0, x00FFF00F); + vxor(xCF1048B5, x30104745, xFF000FF0); + vandn(x080A080A, a3, x77555775); + vxor(xC71A40BF, xCF1048B5, x080A080A); + vxor(xCB164CB3, x0C0C0C0C, xC71A40BF); + vor(x10, x00515001, a6); + vxor(x11, x10, xCB164CB3); + vxor(*out2, *out2, x11); + + vxor(x9E4319E6, a1, xCB164CB3); + vand(x000019E6, a5, x9E4319E6); + vxor(x33001AD6, x33000330, x000019E6); + vxor(xF429738C, a2, xC71A40BF); + vxor(xC729695A, x33001AD6, xF429738C); + + vandn(x00332121, a2, 
x33001AD6); + vandn(x9E4018C6, x9E4319E6, x00332121); + vxor(xC72996A5, a5, xC729695A); + vxor(x59698E63, x9E4018C6, xC72996A5); + vandn(x20, x30555745, a6); + vxor(x21, x20, x59698E63); + vxor(*out3, *out3, x21); + + vor(xF4FF73FF, a4, xF429738C); + vxor(x33D6E55A, xC72996A5, xF4FF73FF); + vxor(x65656565, a1, x30303030); + vxor(x56B3803F, x33D6E55A, x65656565); + vand(x30, x56B3803F, a6); + vxor(x31, x30, xC729695A); + vxor(*out4, *out4, x31); + + vxor(x38299955, xFF000FF0, xC72996A5); + vandn(x03D6640A, x33D6E55A, x38299955); + vandn(x61616161, x65656565, x0C0C0C0C); + vxor(x62B7056B, x03D6640A, x61616161); + vor(x00, x62B7056B, a6); + vxor(x01, x00, xC729695A); + vxor(*out1, *out1, x01); +}*/ +//#endif diff --git a/opencl_sboxes-s.h b/opencl_sboxes-s.h new file mode 100644 index 0000000..7111d0b --- /dev/null +++ b/opencl_sboxes-s.h @@ -0,0 +1,1458 @@ +/* + * Bitslice DES S-boxes making use of a vector conditional select operation + * (e.g., vsel on PowerPC with AltiVec). + * + * Gate counts: 36 33 33 26 35 34 34 32 + * Average: 32.875 + * + * Several same-gate-count expressions for each S-box are included (for use on + * different CPUs/GPUs). + * + * These Boolean expressions corresponding to DES S-boxes have been generated + * by Roman Rusakov for use in Openwall's + * John the Ripper password cracker: http://www.openwall.com/john/ + * Being mathematical formulas, they are not copyrighted and are free for reuse + * by anyone. + * + * This file (a specific representation of the S-box expressions, surrounding + * logic) is Copyright (c) 2011 by Solar Designer . + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. (This is a heavily cut-down "BSD license".) 
+ * + * The effort has been sponsored by Rapid7: http://www.rapid7.com + */ + +/* + * Register-count and latency hints. NOTE(review): every conditional that + * consulted regs or latency in the visible part of this file is commented + * out (the //#if ... lines), so these macros appear to select nothing + * here -- confirm before relying on them. + */ +#undef regs +#if defined(__x86_64__) && defined(__XOP__) +#define regs 16 +#elif defined(__x86_64__) +#define regs 15 +#elif defined(__i386__) +/* Hopefully, AMD XOP (but in 32-bit mode) */ +#define regs 8 +#else +/* PowerPC with AltiVec, etc. */ +#define regs 32 +#endif + +#undef latency +/* Latency 2 may also mean dual-issue with latency 1 */ +#define latency 2 + +//#if regs >= 17 || latency >= 3 +/* s1-000010, 36 gates, 17 regs, 8/28/65/102/139 stall cycles */ + +/* + * s1: DES S-box 1 in the bitslice, select-based form this header describes. + * Combines a1..a6 through vsel/vxor/vor/vnot and folds the four results + * x0..x3 into out[c1], out[c2], out[c3], out[c4] via + * vxor(out[cN], out[cN], xK); the last vsel of every output is keyed on a6. + * NOTE(review): from the temporary names, vsel(dst, a, b, c) presumably + * yields b where c is set and a elsewhere (x0F0F3333 from a3, a2, a5) -- + * confirm against the macro definition. + * NOTE(review): c1..c4 are declared vtype yet used as subscripts; presumably + * vtype is an integer type in this build -- confirm. + */ +inline void +s1(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, + x5CA9E295; + vtype x55AFD1B7, x3C3C69C3, x6993B874; + vtype x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; + vtype x29EEADC0, x4B8771A3, x428679F3, x6B68D433; + vtype x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; + vtype x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; + vtype x0DBCE883, x3A25A215, x37994A96; + vtype xC9C93B62, x89490F02, xB96C2D16; + vtype x0, x1, x2, x3; + + vsel(x0F0F3333, a3, a2, a5); + vxor(x3C3C3C3C, a2, a3); + vor(x55FF55FF, a1, a4); + vxor(x69C369C3, x3C3C3C3C, x55FF55FF); + vsel(x0903B73F, a5, x0F0F3333, x69C369C3); + vxor(x09FCB7C0, a4, x0903B73F); + vxor(x5CA9E295, a1, x09FCB7C0); + + vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); + vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); + vxor(x6993B874, x55AFD1B7, x3C3C69C3); + + vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); + vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); + vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); + vxor(x529E962D, x0F0F3333, x5D91A51E); + + vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); + vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); + vsel(x428679F3, a5, x4B8771A3, x529E962D); + vxor(x6B68D433, x29EEADC0, x428679F3); + + vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); + vsel(x026F12F3, a4, x0F0F3333, x529E962D); + 
vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); + vnot(x94D83B6C, x6B27C493); + vsel(x0, x94D83B6C, x6B68D433, a6); + vxor(out[c1], out[c1], x0); + + vsel(x965E0B0F, x94D83B6C, a3, x428679F3); + vsel(x3327A113, x5BA7E193, a2, x69C369C3); + vsel(x847F0A1F, x965E0B0F, a4, x3327A113); + vxor(xD6E19C32, x529E962D, x847F0A1F); + vsel(x1, xD6E19C32, x5CA9E295, a6); + vxor(out[c2], out[c2], x1); + + vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); + vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); + vxor(x37994A96, x0DBCE883, x3A25A215); + vsel(x3, x37994A96, x529E962D, a6); + vxor(out[c4], out[c4], x3); + + vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); + vsel(x89490F02, a3, xC9C93B62, x965E0B0F); + vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); + vsel(x2, xB96C2D16, x6993B874, a6); + vxor(out[c3], out[c3], x2); +} + +//#else +/* s1-000011, 36 gates, 16 regs, 10/37/74/111/148 stall cycles */ +/* +inline void +s1(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, + x5CA9E295; + vtype x55AFD1B7, x3C3C69C3, x6993B874; + vtype x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; + vtype x29EEADC0, x4B8771A3, x428679F3, x6B68D433; + vtype x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; + vtype x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; + vtype x0DBCE883, x3A25A215, x37994A96; + vtype x8A487EA7, x8B480F07, xB96C2D16; + vtype x0, x1, x2, x3; + + vsel(x0F0F3333, a3, a2, a5); + vxor(x3C3C3C3C, a2, a3); + vor(x55FF55FF, a1, a4); + vxor(x69C369C3, x3C3C3C3C, x55FF55FF); + vsel(x0903B73F, a5, x0F0F3333, x69C369C3); + vxor(x09FCB7C0, a4, x0903B73F); + vxor(x5CA9E295, a1, x09FCB7C0); + + vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); + vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); + vxor(x6993B874, x55AFD1B7, x3C3C69C3); + + vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); + vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); + vsel(x5D91A51E, 
x5CEDE59F, x6993B874, x09FCE295); + vxor(x529E962D, x0F0F3333, x5D91A51E); + + vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); + vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); + vsel(x428679F3, a5, x4B8771A3, x529E962D); + vxor(x6B68D433, x29EEADC0, x428679F3); + + vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); + vsel(x026F12F3, a4, x0F0F3333, x529E962D); + vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); + vnot(x94D83B6C, x6B27C493); + vsel(x0, x94D83B6C, x6B68D433, a6); + vxor(out[c1], out[c1], x0); + + vsel(x965E0B0F, x94D83B6C, a3, x428679F3); + vsel(x3327A113, x5BA7E193, a2, x69C369C3); + vsel(x847F0A1F, x965E0B0F, a4, x3327A113); + vxor(xD6E19C32, x529E962D, x847F0A1F); + vsel(x1, xD6E19C32, x5CA9E295, a6); + vxor(out[c2], out[c2], x1); + + vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); + vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); + vxor(x37994A96, x0DBCE883, x3A25A215); + vsel(x3, x37994A96, x529E962D, a6); + vxor(out[c4], out[c4], x3); + + vxor(x8A487EA7, x5CA9E295, xD6E19C32); + vsel(x8B480F07, a3, x8A487EA7, x847F0A1F); + vsel(xB96C2D16, x8B480F07, x3C3C3C3C, x3A25A215); + vsel(x2, xB96C2D16, x6993B874, a6); + vxor(out[c3], out[c3], x2); +} +*/ +//#endif + +//#if regs >= 18 && latency <= 2 +/* s2-000000, 33 gates, 18 regs, 3/26/57/90/125 stall cycles */ + +/* + * s2: DES S-box 2 in the bitslice, select-based form this header describes. + * Combines a1..a6 through vsel/vxor/vnot and folds the four results x0..x3 + * into out[c1], out[c2], out[c3], out[c4] via vxor(out[cN], out[cN], xK); + * the last vsel of every output is keyed on a2. + * NOTE(review): from the temporary names, vsel(dst, a, b, c) presumably + * yields b where c is set and a elsewhere -- confirm against the macro + * definition. + * NOTE(review): c1..c4 are declared vtype yet used as subscripts; presumably + * vtype is an integer type in this build -- confirm. + */ +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; + vtype x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; + vtype x0F5AF03C, x6600FF56, x87A5F09C; + vtype xA55A963C, x3C69C30F, xB44BC32D; + vtype x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; + vtype xB46C662D, x278DB412, xB66CB43B; + vtype xD2DC4E52, x27993333, xD2994E33; + vtype x278D0F2D, x2E0E547B, x09976748; + vtype x0, x1, x2, x3; + + vsel(x55553333, a1, a3, a6); + vsel(x0055FF33, a6, x55553333, a5); + vsel(x33270F03, a3, a4, x0055FF33); + vxor(x66725A56, a1, x33270F03); + 
vxor(x00FFFF00, a5, a6); + vxor(x668DA556, x66725A56, x00FFFF00); + + vsel(x0F0F5A56, a4, x66725A56, a6); + vnot(xF0F0A5A9, x0F0F5A56); + vxor(xA5A5969A, x55553333, xF0F0A5A9); + vxor(xA55A699A, x00FFFF00, xA5A5969A); + vsel(x1, xA55A699A, x668DA556, a2); + vxor(out[c2], out[c2], x1); + + vxor(x0F5AF03C, a4, x0055FF33); + vsel(x6600FF56, x66725A56, a6, x00FFFF00); + vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + + vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); + vxor(x3C69C30F, a3, x0F5AF03C); + vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + + vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); + vsel(x0F4B0F2D, a4, xB44BC32D, a5); + vxor(x699CC37B, x66D7CC56, x0F4B0F2D); + vxor(x996C66D2, xF0F0A5A9, x699CC37B); + vsel(x0, x996C66D2, xB44BC32D, a2); + vxor(out[c1], out[c1], x0); + + vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); + vsel(x278DB412, x668DA556, xA5A5969A, a1); + vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + + vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); + vsel(x27993333, x278DB412, a3, x0055FF33); + vsel(xD2994E33, xD2DC4E52, x27993333, a5); + vsel(x3, x87A5F09C, xD2994E33, a2); + vxor(out[c4], out[c4], x3); + + vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); + vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); + vxor(x09976748, x27993333, x2E0E547B); + vsel(x2, xB66CB43B, x09976748, a2); + vxor(out[c3], out[c3], x2); +} + +//#elif regs >= 18 && latency >= 4 +/* s2-000002, 33 gates, 18 regs, 4/22/49/82/117 stall cycles */ +/* +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; + vtype x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; + vtype x0F5AF03C, x6600FF56, x87A5F09C; + vtype xA55A963C, x3C69C30F, xB44BC32D; + vtype x0F4B0F2D, x66D7CC56, x962769FF, x996C66D2; + vtype xB46C662D, x278DB412, xB66CB43B; + vtype xD2DC4E52, x27993333, xD2994E33; + vtype x278D0F2D, x2E0E547B, x09976748; 
+ vtype x0, x1, x2, x3; + + vsel(x55553333, a1, a3, a6); + vsel(x0055FF33, a6, x55553333, a5); + vsel(x33270F03, a3, a4, x0055FF33); + vxor(x66725A56, a1, x33270F03); + vxor(x00FFFF00, a5, a6); + vxor(x668DA556, x66725A56, x00FFFF00); + + vsel(x0F0F5A56, a4, x66725A56, a6); + vnot(xF0F0A5A9, x0F0F5A56); + vxor(xA5A5969A, x55553333, xF0F0A5A9); + vxor(xA55A699A, x00FFFF00, xA5A5969A); + vsel(x1, xA55A699A, x668DA556, a2); + vxor(out[c2], out[c2], x1); + + vxor(x0F5AF03C, a4, x0055FF33); + vsel(x6600FF56, x66725A56, a6, x00FFFF00); + vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + + vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); + vxor(x3C69C30F, a3, x0F5AF03C); + vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + + vsel(x0F4B0F2D, a4, xB44BC32D, a5); + vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); + vxor(x962769FF, xF0F0A5A9, x66D7CC56); + vxor(x996C66D2, x0F4B0F2D, x962769FF); + vsel(x0, x996C66D2, xB44BC32D, a2); + vxor(out[c1], out[c1], x0); + + vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); + vsel(x278DB412, x668DA556, xA5A5969A, a1); + vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + + vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); + vsel(x27993333, x278DB412, a3, x0055FF33); + vsel(xD2994E33, xD2DC4E52, x27993333, a5); + vsel(x3, x87A5F09C, xD2994E33, a2); + vxor(out[c4], out[c4], x3); + + vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); + vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); + vxor(x09976748, x27993333, x2E0E547B); + vsel(x2, xB66CB43B, x09976748, a2); + vxor(out[c3], out[c3], x2); +} +*/ +//#else +/* s2-000012, 33 gates, 17 regs, 5/17/51/86/121 stall cycles */ +/* +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; + vtype x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; + vtype x0F5AF03C, x6600FF56, x87A5F09C; + vtype x875AF03C, xF00F0FA9, xB44BC32D; + vtype x6627A556, xD26C667B, 
x278DB412, xB66CB43B; + vtype x668DC32D, x99723CD2, x996C66D2; + vtype xD20E4EA9, x27993333, xD2994E33; + vtype x9927C3E1, x089F3F0C, x09976748; + vtype x0, x1, x2, x3; + + vsel(x55553333, a1, a3, a6); + vsel(x0055FF33, a6, x55553333, a5); + vsel(x33270F03, a3, a4, x0055FF33); + vxor(x66725A56, a1, x33270F03); + vxor(x00FFFF00, a5, a6); + vxor(x668DA556, x66725A56, x00FFFF00); + + vsel(x0F0F5A56, a4, x66725A56, a6); + vnot(xF0F0A5A9, x0F0F5A56); + vxor(xA5A5969A, x55553333, xF0F0A5A9); + vxor(xA55A699A, x00FFFF00, xA5A5969A); + vsel(x1, xA55A699A, x668DA556, a2); + vxor(out[c2], out[c2], x1); + + vxor(x0F5AF03C, a4, x0055FF33); + vsel(x6600FF56, x66725A56, a6, x00FFFF00); + vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + + vsel(x875AF03C, x87A5F09C, x0F5AF03C, a5); + vsel(xF00F0FA9, xF0F0A5A9, a4, x00FFFF00); + vsel(xB44BC32D, x875AF03C, xF00F0FA9, a3); + + vsel(x6627A556, x66725A56, x668DA556, x0055FF33); + vxor(xD26C667B, xB44BC32D, x6627A556); + vsel(x278DB412, x668DA556, xA5A5969A, a1); + vsel(xB66CB43B, xD26C667B, x278DB412, x6600FF56); + + vsel(x668DC32D, x668DA556, xB44BC32D, a6); + vnot(x99723CD2, x668DC32D); + vsel(x996C66D2, x99723CD2, xD26C667B, x00FFFF00); + vsel(x0, x996C66D2, xB44BC32D, a2); + vxor(out[c1], out[c1], x0); + + vsel(xD20E4EA9, xF00F0FA9, xD26C667B, x668DC32D); + vsel(x27993333, x278DB412, a3, x0055FF33); + vsel(xD2994E33, xD20E4EA9, x27993333, a5); + vsel(x3, x87A5F09C, xD2994E33, a2); + vxor(out[c4], out[c4], x3); + + vxor(x9927C3E1, x0055FF33, x99723CD2); + vsel(x089F3F0C, a4, x00FFFF00, x27993333); + vsel(x09976748, x089F3F0C, x9927C3E1, x0F0F5A56); + vsel(x2, xB66CB43B, x09976748, a2); + vxor(out[c3], out[c3], x2); +} +*/ +//#endif + +//#if latency >= 3 +/* s3-000000, 33 gates, 17 regs, 6/10/33/66/102 stall cycles */ +/* +inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0F330F33, x0F33F0CC, x5A66A599; + vtype x2111B7BB, 
x03FF3033, x05BB50EE, x074F201F, x265E97A4; + vtype x556BA09E, x665A93AC, x99A56C53; + vtype x25A1A797, x5713754C, x66559355, x47B135C6; + vtype x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; + vtype x9E48CDE4, x655B905E, x00A55CFF, x9E49915E; + vtype xD6599874, x05330022, xD2699876; + vtype x665F9364, xD573F0F2, xB32C6396; + vtype x0, x1, x2, x3; + + vsel(x0F330F33, a4, a3, a5); + vxor(x0F33F0CC, a6, x0F330F33); + vxor(x5A66A599, a2, x0F33F0CC); + + vsel(x2111B7BB, a3, a6, x5A66A599); + vsel(x03FF3033, a5, a3, x0F33F0CC); + vsel(x05BB50EE, a5, x0F33F0CC, a2); + vsel(x074F201F, x03FF3033, a4, x05BB50EE); + vxor(x265E97A4, x2111B7BB, x074F201F); + + vsel(x556BA09E, x5A66A599, x05BB50EE, a4); + vsel(x665A93AC, x556BA09E, x265E97A4, a3); + vnot(x99A56C53, x665A93AC); + vsel(x1, x265E97A4, x99A56C53, a1); + vxor(out[c2], out[c2], x1); + + vxor(x25A1A797, x03FF3033, x265E97A4); + vsel(x5713754C, a2, x0F33F0CC, x074F201F); + vsel(x66559355, x665A93AC, a2, a5); + vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + + vxor(x9A5A5C60, x03FF3033, x99A56C53); + vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); + vxor(x87698DB4, x5713754C, xD07AF8F8); + vxor(xE13C1EE1, x66559355, x87698DB4); + + vsel(x9E48CDE4, x9A5A5C60, x87698DB4, x265E97A4); + vsel(x655B905E, x66559355, x05BB50EE, a4); + vsel(x00A55CFF, a5, a6, x9A5A5C60); + vsel(x9E49915E, x9E48CDE4, x655B905E, x00A55CFF); + vsel(x0, x9E49915E, xE13C1EE1, a1); + vxor(out[c1], out[c1], x0); + + vsel(xD6599874, xD07AF8F8, x66559355, x0F33F0CC); + vand(x05330022, x0F330F33, x05BB50EE); + vsel(xD2699876, xD6599874, x00A55CFF, x05330022); + vsel(x3, x5A66A599, xD2699876, a1); + vxor(out[c4], out[c4], x3); + + vsel(x665F9364, x265E97A4, x66559355, x47B135C6); + vsel(xD573F0F2, xD07AF8F8, x05330022, a4); + vxor(xB32C6396, x665F9364, xD573F0F2); + vsel(x2, xB32C6396, x47B135C6, a1); + vxor(out[c3], out[c3], x2); +} +*/ +//#else +/* s3-000004, 33 gates, 17 regs, 3/13/41/77/113 stall cycles */ +/* + * s3: DES S-box 3 in the bitslice, select-based form this header describes. + * Combines a1..a6 through vsel/vxor/vnot and folds the four results x0..x3 + * into out[c1], out[c2], out[c3], out[c4] via vxor(out[cN], out[cN], xK); + * the last vsel of every output is keyed on a1. + * NOTE(review): from the temporary names, vsel(dst, a, b, c) presumably + * yields b where c is set and a elsewhere -- confirm against the macro + * definition. + * NOTE(review): c1..c4 are declared vtype yet used as subscripts; presumably + * vtype is an integer type in this build -- confirm. + */ +inline void +s3(vtype a1, vtype a2, vtype 
a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0F330F33, x0F33F0CC, x5A66A599; + vtype x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; + vtype x556BA09E, x665A93AC, x99A56C53; + vtype x25A1A797, x5713754C, x66559355, x47B135C6; + vtype x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; + vtype x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; + vtype xD069F8B4, x030FF0C3, xD2699876; + vtype xD579DDF4, xD579F0C3, xB32C6396; + vtype x0, x1, x2, x3; + + vsel(x0F330F33, a4, a3, a5); + vxor(x0F33F0CC, a6, x0F330F33); + vxor(x5A66A599, a2, x0F33F0CC); + + vsel(x2111B7BB, a3, a6, x5A66A599); + vsel(x03FF3033, a5, a3, x0F33F0CC); + vsel(x05BB50EE, a5, x0F33F0CC, a2); + vsel(x074F201F, x03FF3033, a4, x05BB50EE); + vxor(x265E97A4, x2111B7BB, x074F201F); + + vsel(x556BA09E, x5A66A599, x05BB50EE, a4); + vsel(x665A93AC, x556BA09E, x265E97A4, a3); + vnot(x99A56C53, x665A93AC); + vsel(x1, x265E97A4, x99A56C53, a1); + vxor(out[c2], out[c2], x1); + + vxor(x25A1A797, x03FF3033, x265E97A4); + vsel(x5713754C, a2, x0F33F0CC, x074F201F); + vsel(x66559355, x665A93AC, a2, a5); + vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + + vxor(x9A5A5C60, x03FF3033, x99A56C53); + vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); + vxor(x87698DB4, x5713754C, xD07AF8F8); + vxor(xE13C1EE1, x66559355, x87698DB4); + + vsel(x000CFFCF, a4, a6, x0F33F0CC); + vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); + vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); + vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); + vsel(x0, x9E49915E, xE13C1EE1, a1); + vxor(out[c1], out[c1], x0); + + vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); + vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); + vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); + vsel(x3, x5A66A599, xD2699876, a1); + vxor(out[c4], out[c4], x3); + + vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); + vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); + vxor(xB32C6396, x66559355, xD579F0C3); + vsel(x2, xB32C6396, x47B135C6, a1); 
+ vxor(out[c3], out[c3], x2); +} +//#endif + +//#if regs >= 13 +/* s4-000014, 26 gates, 13 regs, 2/17/42/70/98 stall cycles */ + +/* + * s4: DES S-box 4 in the bitslice, select-based form this header describes. + * Combines a1..a6 through vsel/vxor/vnot and folds the four results x0..x3 + * into out[c1], out[c2], out[c3], out[c4] via vxor(out[cN], out[cN], xK); + * the last vsel of every output is keyed on a6, and each pair of outputs + * (x2 and x3, x0 and x1) is selected from shared intermediates. + * NOTE(review): from the temporary names, vsel(dst, a, b, c) presumably + * yields b where c is set and a elsewhere -- confirm against the macro + * definition. + * NOTE(review): c1..c4 are declared vtype yet used as subscripts; presumably + * vtype is an integer type in this build -- confirm. + */ +inline void +s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, + x0AF50F0F, x4CA36B59; + vtype xB35C94A6; + vtype x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; + vtype x56E9861E; + vtype x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; + vtype x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; + vtype x0, x1, x2, x3; + + vsel(x0505AFAF, a5, a3, a1); + vsel(x0555AF55, x0505AFAF, a1, a4); + vxor(x0A5AA05A, a3, x0555AF55); + vsel(x46566456, a1, x0A5AA05A, a2); + vsel(x0A0A5F5F, a3, a5, a1); + vxor(x0AF55FA0, a4, x0A0A5F5F); + vsel(x0AF50F0F, x0AF55FA0, a3, a5); + vxor(x4CA36B59, x46566456, x0AF50F0F); + + vnot(xB35C94A6, x4CA36B59); + + vsel(x01BB23BB, a4, a2, x0555AF55); + vxor(x5050FAFA, a1, x0505AFAF); + vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); + vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + + vnot(x56E9861E, xA91679E1); + + vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); + vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); + vsel(x827D9784, xB35C94A6, x0AF55F00, a2); + vxor(xD2946D9A, x50E9FA1E, x827D9784); + vsel(x2, xD2946D9A, x4CA36B59, a6); + vxor(out[c3], out[c3], x2); + vsel(x3, xB35C94A6, xD2946D9A, a6); + vxor(out[c4], out[c4], x3); + + vsel(x31F720B3, a2, a4, x0AF55FA0); + vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); + vxor(x4712A7AD, x56E9861E, x11FB21B3); + vxor(x9586CA37, xD2946D9A, x4712A7AD); + vsel(x0, x56E9861E, x9586CA37, a6); + vxor(out[c1], out[c1], x0); + vsel(x1, x9586CA37, xA91679E1, a6); + vxor(out[c2], out[c2], x1); +} + +//#else +/* s4-000033, 26 gates, 12 regs, 4/22/48/76/104 stall cycles */ +/* +inline void +s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0505AFAF, 
x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, + x0AF50F0F, x4CA36B59; + vtype xB35C94A6; + vtype x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; + vtype x56E9861E; + vtype x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; + vtype xD2F56D00, x46F9870F, x4773A737, x9586CA37; + vtype x0, x1, x2, x3; + + vsel(x0505AFAF, a5, a3, a1); + vsel(x0555AF55, x0505AFAF, a1, a4); + vxor(x0A5AA05A, a3, x0555AF55); + vsel(x46566456, a1, x0A5AA05A, a2); + vsel(x0A0A5F5F, a3, a5, a1); + vxor(x0AF55FA0, a4, x0A0A5F5F); + vsel(x0AF50F0F, x0AF55FA0, a3, a5); + vxor(x4CA36B59, x46566456, x0AF50F0F); + + vnot(xB35C94A6, x4CA36B59); + + vsel(x01BB23BB, a4, a2, x0555AF55); + vxor(x5050FAFA, a1, x0505AFAF); + vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); + vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + + vnot(x56E9861E, xA91679E1); + + vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); + vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); + vsel(x827D9784, xB35C94A6, x0AF55F00, a2); + vxor(xD2946D9A, x50E9FA1E, x827D9784); + vsel(x2, xD2946D9A, x4CA36B59, a6); + vxor(out[c3], out[c3], x2); + vsel(x3, xB35C94A6, xD2946D9A, a6); + vxor(out[c4], out[c4], x3); + + vsel(xD2F56D00, xD2946D9A, x0AF55F00, a4); + vsel(x46F9870F, x56E9861E, x0AF50F0F, a2); + vsel(x4773A737, x46F9870F, a2, x01BB23BB); + vxor(x9586CA37, xD2F56D00, x4773A737); + vsel(x0, x56E9861E, x9586CA37, a6); + vxor(out[c1], out[c1], x0); + vsel(x1, x9586CA37, xA91679E1, a6); + vxor(out[c2], out[c2], x1); +} +*/ +//#endif + +//#if regs <= 18 && latency <= 2 +/* s5-000000, 35 gates, 18 regs, 7/33/68/105/142 stall cycles */ +/* +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C; + vtype xAC81CFB2, xF72577AF, x5BA4B81D; + vtype x5BA477AF, x4895469F, x3A35273A, 
x1A35669A; + vtype x12E6283D, x9E47D3D4, x1A676AB4; + vtype x2E3C69C6, x92C7C296, x369CC1D6; + vtype x891556DF, xE5E77F82, x6CF2295D; + vtype x0, x1, x2, x3; + + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); + + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); + + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(out[c3], out[c3], x2); + + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(out[c2], out[c2], x1); + + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + + vsel(x2E3C69C6, x2E3C2E3C, x963969C6, a6); + vsel(x92C7C296, x96C696C6, x1A676AB4, a1); + vsel(x369CC1D6, x2E3C69C6, x92C7C296, x5BA4B81D); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(out[c1], out[c1], x0); + + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(out[c4], out[c4], x3); +} +*/ +//#elif regs == 19 || (regs >= 20 && latency >= 3) +/* s5-000005, 35 gates, 19 regs, 7/29/60/95/132 stall cycles */ +/* +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + 
__private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C; + vtype xAC81CFB2, xF72577AF, x5BA4B81D; + vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A; + vtype x12E6283D, x9E47D3D4, x1A676AB4; + vtype x2E3CD3D4, x9697C1C6, x369CC1D6; + vtype x891556DF, xE5E77F82, x6CF2295D; + vtype x0, x1, x2, x3; + + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); + + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); + + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(out[c3], out[c3], x2); + + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(out[c2], out[c2], x1); + + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + + vsel(x2E3CD3D4, x2E3C2E3C, x9E47D3D4, a6); + vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); + vsel(x369CC1D6, x2E3CD3D4, x9697C1C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(out[c1], out[c1], x0); + + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, 
xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(out[c4], out[c4], x3); +} +*/ +//#elif regs <= 18 && latency >= 5 +/* s5-000011, 35 gates, 18 regs, 9/31/62/95/132 stall cycles */ +/* +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C; + vtype xAC81CFB2, xF72577AF, x5BA4B81D; + vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A; + vtype x12E6283D, x9E47D3D4, x1A676AB4; + vtype x2E3CD3D4, x96DF41C6, x369CC1D6; + vtype x891556DF, xE5E77F82, x6CF2295D; + vtype x0, x1, x2, x3; + + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); + + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); + + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(out[c3], out[c3], x2); + + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(out[c2], out[c2], x1); + + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + 
vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + + vsel(x2E3CD3D4, x2E3C2E3C, x9E47D3D4, a6); + vsel(x96DF41C6, x963969C6, x96C696C6, x12E6283D); + vsel(x369CC1D6, x2E3CD3D4, x96DF41C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(out[c1], out[c1], x0); + + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(out[c4], out[c4], x3); +} +*/ +//#elif regs >= 20 +/* s5-000016, 35 gates, 20 regs, 6/30/62/98/135 stall cycles */ + +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C; + vtype xAC81CFB2, xF72577AF, x5BA4B81D; + vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A; + vtype x12E6283D, x9E47D3D4, x1A676AB4; + vtype x891556DF, xE5E77F82, x6CF2295D; + vtype x2E3CA5F5, x9697C1C6, x369CC1D6; + vtype x0, x1, x2, x3; + + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); + + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); + + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(out[c3], out[c3], x2); + + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + 
vxor(out[c2], out[c2], x1); + + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(out[c4], out[c4], x3); + + vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); + vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); + vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(out[c1], out[c1], x0); +} + +//#else +/* s5-000023, 35 gates, 18 regs, 9/30/61/96/133 stall cycles */ +/* +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; + vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; + vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C; + vtype xAC81CFB2, xF72577AF, x5BA4B81D; + vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A; + vtype x12E6283D, x9E47D3D4, x1A676AB4; + vtype x891556DF, xE5E77F82, x6CF2295D; + vtype x2E3CD3D4, x96DF41C6, x369CC1D6; + vtype x0, x1, x2, x3; + + vsel(x550F550F, a1, a3, a5); + vnot(xAAF0AAF0, x550F550F); + vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); + vxor(x96C696C6, a2, xA5F5A5F5); + vxor(x00FFFF00, a5, a6); + vxor(x963969C6, x96C696C6, x00FFFF00); + + vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); + vsel(xB73121F7, a2, x963969C6, x96C696C6); + vsel(x1501DF0F, a6, x550F550F, xB73121F7); + vsel(x00558A5F, x1501DF0F, a5, a1); + vxor(x2E69A463, x2E3C2E3C, x00558A5F); + + vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); + vsel(x045157FD, a6, a1, x0679ED42); + vsel(xB32077FF, xB73121F7, a6, 
x045157FD); + vxor(x9D49D39C, x2E69A463, xB32077FF); + vsel(x2, x9D49D39C, x2E69A463, a4); + vxor(out[c3], out[c3], x2); + + vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); + vsel(xF72577AF, xB32077FF, x550F550F, a1); + vxor(x5BA4B81D, xAC81CFB2, xF72577AF); + vsel(x1, x5BA4B81D, x963969C6, a4); + vxor(out[c2], out[c2], x1); + + vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); + vsel(x4895469F, x5BA477AF, x00558A5F, a2); + vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); + vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + + vsel(x12E6283D, a5, x5BA4B81D, x963969C6); + vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); + vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + + vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); + vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); + vxor(x6CF2295D, x891556DF, xE5E77F82); + vsel(x3, x1A35669A, x6CF2295D, a4); + vxor(out[c4], out[c4], x3); + + vsel(x2E3CD3D4, x2E3C2E3C, x9E47D3D4, a6); + vsel(x96DF41C6, x963969C6, x96C696C6, x12E6283D); + vsel(x369CC1D6, x2E3CD3D4, x96DF41C6, x5BA477AF); + vsel(x0, x369CC1D6, x1A676AB4, a4); + vxor(out[c1], out[c1], x0); +} +*/ +//#endif + +//#if regs >= 16 && latency <= 2 +/* s6-000000, 34 gates, 16 regs, 5/34/70/107/144 stall cycles */ + +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + vtype x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; + vtype x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; + vtype x86CD4C9B, x12E0FFFD, x942D9A67; + vtype x142956AB, x455D45DF, x1C3EE619; + vtype x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; + vtype x840DBB67, x6DA19C1E, x925E63E1; + vtype x9C3CA761, x257A75D5, xB946D2B4; + vtype x0, x1, x2, x3; + + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); + + 
vsel(x353A6565, x353A659A, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x353A6565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); + + vsel(x35FF659A, a4, x353A659A, x353A6565); + vxor(x3AF06A95, a3, x35FF659A); + vsel(x05CF0A9F, a4, a3, x353A9A65); + vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + + vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); + vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(out[c1], out[c1], x0); + + vsel(x142956AB, x353A659A, x942D9A67, a2); + vsel(x455D45DF, a1, x86CD4C9B, x142956AB); + vxor(x1C3EE619, x5963A3C6, x455D45DF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(out[c4], out[c4], x3); + + vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); + vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); + vxor(x3CF19C86, x1C3EE619, x20CF7A9F); + vxor(x69A49C79, x555500FF, x3CF19C86); + + vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); + vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); + vnot(x925E63E1, x6DA19C1E); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(out[c2], out[c2], x1); + + vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); + vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); + vxor(xB946D2B4, x9C3CA761, x257A75D5); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(out[c3], out[c3], x2); +} + +//#elif regs == 15 +/* s6-000008, 34 gates, 15 regs, 6/25/57/94/131 stall cycles */ +/* +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + vtype x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; + vtype x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; + vtype x86CD4C9B, x12E0FFFD, x942D9A67; + vtype x142956AB, x455D45DF, x1C3EE619; + vtype xC3C36393, x2D1B471E, xC70B631E, x925E63E1; + vtype x8C2F1A67, x965B6386, x69A49C79; + vtype x1C2E8201, xA56850B5, xB946D2B4; + vtype 
x0, x1, x2, x3; + + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); + + vsel(x353A6565, x353A659A, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x353A6565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); + + vsel(x35FF659A, a4, x353A659A, x353A6565); + vxor(x3AF06A95, a3, x35FF659A); + vsel(x05CF0A9F, a4, a3, x353A9A65); + vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + + vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); + vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(out[c1], out[c1], x0); + + vsel(x142956AB, x353A659A, x942D9A67, a2); + vsel(x455D45DF, a1, x86CD4C9B, x142956AB); + vxor(x1C3EE619, x5963A3C6, x455D45DF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(out[c4], out[c4], x3); + + vsel(xC3C36393, xCAC5659A, a2, a3); + vsel(x2D1B471E, x353A659A, a3, x5963A3C6); + vsel(xC70B631E, xC3C36393, x2D1B471E, x05CF0A9F); + vxor(x925E63E1, x555500FF, xC70B631E); + + vsel(x8C2F1A67, x942D9A67, x0A3F0A6F, x5963A3C6); + vsel(x965B6386, x925E63E1, xC70B631E, x8C2F1A67); + vnot(x69A49C79, x965B6386); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(out[c2], out[c2], x1); + + vsel(x1C2E8201, x942D9A67, x1C3EE619, x8C2F1A67); + vxor(xA56850B5, a2, x965B6386); + vxor(xB946D2B4, x1C2E8201, xA56850B5); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(out[c3], out[c3], x2); +} +*/ +//#elif regs <= 14 +/* s6-000082, 34 gates, 14 regs, 8/31/65/102/139 stall cycles */ +/* +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + vtype x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; + vtype x35FF659A, x3AF06A95, x066F0CCF, x16E94A97; + vtype x1872E297, 
x35BE6539, x1C3EE619; + vtype x86CD4C9B, x12E0FFFD, x942D9A67; + vtype x0A63C087, x9E4E5AE0, x02FA65FD, x925E63E1; + vtype xAB756193, x8A75E187, xB946D2B4; + vtype x375A7BA0, x965B6386, x69A49C79; + vtype x0, x1, x2, x3; + + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); + + vsel(x353A6565, x353A659A, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x353A6565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); + + vsel(x35FF659A, a4, x353A659A, x353A6565); + vxor(x3AF06A95, a3, x35FF659A); + vsel(x066F0CCF, a3, a4, x5963A3C6); + vsel(x16E94A97, x3AF06A95, x066F0CCF, x6C5939A3); + + vsel(x1872E297, x5963A3C6, x3AF06A95, a1); + vsel(x35BE6539, x35FF659A, x353A6565, x6C5939A3); + vsel(x1C3EE619, x1872E297, x35BE6539, x066F0CCF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(out[c4], out[c4], x3); + + vsel(x86CD4C9B, xCAC5659A, x066F0CCF, x6C5939A3); + vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(out[c1], out[c1], x0); + + vsel(x0A63C087, x1872E297, x066F0CCF, a2); + vxor(x9E4E5AE0, x942D9A67, x0A63C087); + vsel(x02FA65FD, x12E0FFFD, a4, x353A9A65); + vsel(x925E63E1, x9E4E5AE0, x02FA65FD, x6C5939A3); + + vsel(xAB756193, a2, xCAC5659A, x9E4E5AE0); + vsel(x8A75E187, x0A63C087, xAB756193, x925E63E1); + vxor(xB946D2B4, a2, x8A75E187); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(out[c3], out[c3], x2); + + vsel(x375A7BA0, a2, x9E4E5AE0, x16E94A97); + vsel(x965B6386, x8A75E187, x375A7BA0, x1C3EE619); + vnot(x69A49C79, x965B6386); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(out[c2], out[c2], x1); +} +*/ +//#else +/* s6-000461, 34 gates, 16 regs, 7/23/48/82/118 stall cycles */ +/* +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype 
c3 , vtype c4) +{ + vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; + vtype x553A5565, x0A3F0A6F, x6C5939A3, x5963A3C6; + vtype x15FF459A, x1AF04A95, x066F0CCF, x16E94A97; + vtype x1872E297, x55BE5539, x1C3EE619; + vtype x86CD4C9B, x12E0FFFD, x942D9A67; + vtype x2FCAD0F0, x1BF21BB1, x466E4C89, x69A49C79; + vtype x965B6386, x12769BE1, x925E63E1; + vtype x9867CA97, x69339C33, xB946D2B4; + vtype x0, x1, x2, x3; + + vsel(x555500FF, a1, a4, a5); + vxor(x666633CC, a2, x555500FF); + vsel(x606F30CF, x666633CC, a4, a3); + vxor(x353A659A, a1, x606F30CF); + vxor(x353A9A65, a5, x353A659A); + vnot(xCAC5659A, x353A9A65); + + vsel(x553A5565, a1, x353A9A65, a4); + vsel(x0A3F0A6F, a3, a4, x553A5565); + vxor(x6C5939A3, x666633CC, x0A3F0A6F); + vxor(x5963A3C6, x353A9A65, x6C5939A3); + + vsel(x15FF459A, a4, x353A659A, x553A5565); + vxor(x1AF04A95, a3, x15FF459A); + vsel(x066F0CCF, a3, a4, x5963A3C6); + vsel(x16E94A97, x1AF04A95, x066F0CCF, x6C5939A3); + + vsel(x1872E297, x5963A3C6, x1AF04A95, a1); + vsel(x55BE5539, x15FF459A, x553A5565, x6C5939A3); + vsel(x1C3EE619, x1872E297, x55BE5539, x066F0CCF); + vsel(x3, x5963A3C6, x1C3EE619, a6); + vxor(out[c4], out[c4], x3); + + vsel(x86CD4C9B, xCAC5659A, x066F0CCF, x6C5939A3); + vsel(x12E0FFFD, a5, x1AF04A95, x16E94A97); + vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); + vsel(x0, xCAC5659A, x942D9A67, a6); + vxor(out[c1], out[c1], x0); + + vxor(x2FCAD0F0, x353A9A65, x1AF04A95); + vsel(x1BF21BB1, x1AF04A95, a2, x553A5565); + vsel(x466E4C89, x55BE5539, x066F0CCF, x1BF21BB1); + vxor(x69A49C79, x2FCAD0F0, x466E4C89); + + vnot(x965B6386, x69A49C79); + vsel(x12769BE1, x1BF21BB1, x942D9A67, x69A49C79); + vsel(x925E63E1, x965B6386, x12769BE1, x555500FF); + vsel(x1, x925E63E1, x69A49C79, a6); + vxor(out[c2], out[c2], x1); + + vsel(x9867CA97, x942D9A67, x1872E297, x2FCAD0F0); + vsel(x69339C33, x69A49C79, a2, a4); + vsel(xB946D2B4, x9867CA97, x2FCAD0F0, x69339C33); + vsel(x2, x16E94A97, xB946D2B4, a6); + vxor(out[c3], out[c3], 
x2); +}*/ +//#endif + +//#if regs <= 16 || latency >= 3 +/* s7-000013, 34 gates, 15 regs, 9/27/56/88/119 stall cycles */ +/* +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; + vtype x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; + vtype x738F9C63, x11EF9867, x26DA9867; + vtype x4B4B9C63, x4B666663, x4E639396; + vtype x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; + vtype xD728827B, x6698807B, x699C585B; + vtype x738C847B, xA4A71E18, x74878E78; + vtype x333D9639, x74879639, x8B7869C6; + vtype x0, x1, x2, x3; + + vsel(x44447777, a2, a6, a3); + vxor(x4B4B7878, a4, x44447777); + vsel(x22772277, a3, a5, a2); + vsel(x0505F5F5, a6, a2, a4); + vsel(x220522F5, x22772277, x0505F5F5, a5); + vxor(x694E5A8D, x4B4B7878, x220522F5); + + vxor(x00FFFF00, a5, a6); + vxor(x66666666, a2, a3); + vsel(x32353235, a3, x220522F5, a4); + vsel(x26253636, x66666666, x32353235, x4B4B7878); + vxor(x26DAC936, x00FFFF00, x26253636); + vsel(x0, x26DAC936, x694E5A8D, a1); + vxor(out[c1], out[c1], x0); + + vxor(x738F9C63, a2, x26DAC936); + vsel(x11EF9867, x738F9C63, a5, x66666666); + vsel(x26DA9867, x26DAC936, x11EF9867, a6); + + vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); + vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); + vxor(x4E639396, x0505F5F5, x4B666663); + + vsel(x4E4B393C, x4B4B7878, x4E639396, a2); + vnot(xFF00FF00, a5); + vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); + vxor(xB14EE41D, x4E4B393C, xFF05DD21); + vsel(x1, xB14EE41D, x26DA9867, a1); + vxor(out[c2], out[c2], x1); + + vxor(xD728827B, x66666666, xB14EE41D); + vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); + vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); + vsel(x2, x699C585B, x4E639396, a1); + vxor(out[c3], out[c3], x2); + + vsel(x738C847B, x738F9C63, xD728827B, x4B4B7878); + vxor(xA4A71E18, x738F9C63, xD728827B); + vsel(x74878E78, x738C847B, 
xA4A71E18, a4); + + vsel(x333D9639, x32353235, x738C847B, xB14EE41D); + vsel(x74879639, x74878E78, x333D9639, a6); + vnot(x8B7869C6, x74879639); + vsel(x3, x74878E78, x8B7869C6, a1); + vxor(out[c4], out[c4], x3); +} +*/ +//#else +/* s7-000019, 34 gates, 17 regs, 5/28/57/88/119 stall cycles */ +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; + vtype x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; + vtype x738F9C63, x11EF9867, x26DA9867; + vtype x4B4B9C63, x4B666663, x4E639396; + vtype x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; + vtype xD728827B, x6698807B, x699C585B; + vtype x778A8877, xA4A71E18, x74878E78; + vtype x204A5845, x74879639, x8B7869C6; + vtype x0, x1, x2, x3; + + vsel(x44447777, a2, a6, a3); + vxor(x4B4B7878, a4, x44447777); + vsel(x22772277, a3, a5, a2); + vsel(x0505F5F5, a6, a2, a4); + vsel(x220522F5, x22772277, x0505F5F5, a5); + vxor(x694E5A8D, x4B4B7878, x220522F5); + + vxor(x00FFFF00, a5, a6); + vxor(x66666666, a2, a3); + vsel(x32353235, a3, x220522F5, a4); + vsel(x26253636, x66666666, x32353235, x4B4B7878); + vxor(x26DAC936, x00FFFF00, x26253636); + vsel(x0, x26DAC936, x694E5A8D, a1); + vxor(out[c1], out[c1], x0); + + vxor(x738F9C63, a2, x26DAC936); + vsel(x11EF9867, x738F9C63, a5, x66666666); + vsel(x26DA9867, x26DAC936, x11EF9867, a6); + + vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); + vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); + vxor(x4E639396, x0505F5F5, x4B666663); + + vsel(x4E4B393C, x4B4B7878, x4E639396, a2); + vnot(xFF00FF00, a5); + vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); + vxor(xB14EE41D, x4E4B393C, xFF05DD21); + vsel(x1, xB14EE41D, x26DA9867, a1); + vxor(out[c2], out[c2], x1); + + vxor(xD728827B, x66666666, xB14EE41D); + vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); + vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); + vsel(x2, x699C585B, 
x4E639396, a1); + vxor(out[c3], out[c3], x2); + + vsel(x778A8877, x738F9C63, x26DAC936, x26253636); + vxor(xA4A71E18, x738F9C63, xD728827B); + vsel(x74878E78, x778A8877, xA4A71E18, a4); + + vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); + vsel(x74879639, x74878E78, a3, x204A5845); + vnot(x8B7869C6, x74879639); + vsel(x3, x74878E78, x8B7869C6, a1); + vxor(out[c4], out[c4], x3); +} +//#endif + +//#if latency >= 3 +/* s8-000035, 32 gates, 15 regs, 6/15/47/79/111 stall cycles */ +/* +inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; + vtype x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; + vtype x3001F74E, x30555745, x693CD926; + vtype x0C0CD926, x0C3F25E9, x38D696A5; + vtype xC729695A; + vtype x03D2117B, xC778395B, xCB471CB2; + vtype x5425B13F, x56B3803F, x919AE965; + vtype x03DA807F, x613CD515, x62E6556A, xA59E6C31; + vtype x0, x1, x2, x3; + + vsel(x0505F5F5, a5, a1, a3); + vxor(x05FAF50A, a4, x0505F5F5); + vsel(x0F0F00FF, a3, a4, a5); + vsel(x22227777, a2, a5, a1); + vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); + vxor(x34E9B34C, a2, x07DA807F); + + vsel(x00FFF00F, x05FAF50A, a4, a3); + vsel(x0033FCCF, a5, x00FFF00F, a2); + vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); + vsel(x0C0C3F3F, a3, a5, a2); + vxor(x59698E63, x5565B15C, x0C0C3F3F); + + vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); + vsel(x30555745, x3001F74E, a1, x00FFF00F); + vxor(x693CD926, x59698E63, x30555745); + vsel(x2, x693CD926, x59698E63, a6); + vxor(out[c3], out[c3], x2); + + vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); + vxor(x0C3F25E9, x0033FCCF, x0C0CD926); + vxor(x38D696A5, x34E9B34C, x0C3F25E9); + + vnot(xC729695A, x38D696A5); + + vsel(x03D2117B, x07DA807F, a2, x0C0CD926); + vsel(xC778395B, xC729695A, x03D2117B, x30555745); + vxor(xCB471CB2, x0C3F25E9, xC778395B); + vsel(x1, xCB471CB2, x34E9B34C, a6); + vxor(out[c2], 
out[c2], x1); + + vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); + vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); + vxor(x919AE965, xC729695A, x56B3803F); + vsel(x3, xC729695A, x919AE965, a6); + vxor(out[c4], out[c4], x3); + + vsel(x03DA807F, x03D2117B, x07DA807F, x693CD926); + vsel(x613CD515, a1, x693CD926, x34E9B34C); + vxor(x62E6556A, x03DA807F, x613CD515); + vxor(xA59E6C31, xC778395B, x62E6556A); + vsel(x0, xA59E6C31, x38D696A5, a6); + vxor(out[c1], out[c1], x0); +}*/ +//#else +/* s8-000037, 32 gates, 15 regs, 3/17/49/81/113 stall cycles */ +inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype * out, + vtype c1, vtype c2 ,vtype c3 , vtype c4) +{ + vtype x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; + vtype x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; + vtype x3001F74E, x30555745, x693CD926; + vtype x0C0CD926, x0C3F25E9, x38D696A5; + vtype xC729695A; + vtype x03D2117B, xC778395B, xCB471CB2; + vtype x5425B13F, x56B3803F, x919AE965; + vtype x17B3023F, x75555755, x62E6556A, xA59E6C31; + vtype x0, x1, x2, x3; + + vsel(x0505F5F5, a5, a1, a3); + vxor(x05FAF50A, a4, x0505F5F5); + vsel(x0F0F00FF, a3, a4, a5); + vsel(x22227777, a2, a5, a1); + vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); + vxor(x34E9B34C, a2, x07DA807F); + + vsel(x00FFF00F, x05FAF50A, a4, a3); + vsel(x0033FCCF, a5, x00FFF00F, a2); + vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); + vsel(x0C0C3F3F, a3, a5, a2); + vxor(x59698E63, x5565B15C, x0C0C3F3F); + + vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); + vsel(x30555745, x3001F74E, a1, x00FFF00F); + vxor(x693CD926, x59698E63, x30555745); + vsel(x2, x693CD926, x59698E63, a6); + vxor(out[c3], out[c3], x2); + + vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); + vxor(x0C3F25E9, x0033FCCF, x0C0CD926); + vxor(x38D696A5, x34E9B34C, x0C3F25E9); + + vnot(xC729695A, x38D696A5); + + vsel(x03D2117B, x07DA807F, a2, x0C0CD926); + vsel(xC778395B, xC729695A, x03D2117B, x30555745); + vxor(xCB471CB2, 
x0C3F25E9, xC778395B); + vsel(x1, xCB471CB2, x34E9B34C, a6); + vxor(out[c2], out[c2], x1); + + vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); + vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); + vxor(x919AE965, xC729695A, x56B3803F); + vsel(x3, xC729695A, x919AE965, a6); + vxor(out[c4], out[c4], x3); + + vsel(x17B3023F, x07DA807F, a2, x59698E63); + vor(x75555755, a1, x30555745); + vxor(x62E6556A, x17B3023F, x75555755); + vxor(xA59E6C31, xC778395B, x62E6556A); + vsel(x0, xA59E6C31, x38D696A5, a6); + vxor(out[c1], out[c1], x0); +} +//#endif diff --git a/opencl_sboxes.h b/opencl_sboxes.h new file mode 100644 index 0000000..5b528a1 --- /dev/null +++ b/opencl_sboxes.h @@ -0,0 +1,345 @@ +#include "opencl_misc.h" + +#if HAVE_LUT3 + +/* + * Bitslice DES S-boxes with LOP3.LUT instructions + * For NVIDIA Maxwell architecture and CUDA 7.5 RC + * by DeepLearningJohnDoe, version 0.1.6, 2015/07/19 + * + * Gate counts: 25 24 25 18 25 24 24 23 + * Average: 23.5 + * Depth: 8 7 7 6 8 10 10 8 + * Average: 8 + * + * These Boolean expressions corresponding to DES S-boxes were + * discovered by + * + * Copyright (c) 2012-2015 Sayantan Datta + * Copyright (c) 2015 + * Copyright (c) 2015 magnum + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * The underlying mathematical formulas are NOT copyrighted. 
+ */ +inline void +s1(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype xAA55AA5500550055 = lut3(a1, a4, a6, 0xC1); + vtype xA55AA55AF0F5F0F5 = lut3(a3, a6, xAA55AA5500550055, 0x9E); + vtype x5F5F5F5FA5A5A5A5 = lut3(a1, a3, a6, 0xD6); + vtype xF5A0F5A0A55AA55A = lut3(a4, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x56); + vtype x947A947AD1E7D1E7 = lut3(a2, xA55AA55AF0F5F0F5, xF5A0F5A0A55AA55A, 0x6C); + vtype x5FFF5FFFFFFAFFFA = lut3(a6, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x7B); + vtype xB96CB96C69936993 = lut3(a2, xF5A0F5A0A55AA55A, x5FFF5FFFFFFAFFFA, 0xD6); + vtype x3 = lut3(a5, x947A947AD1E7D1E7, xB96CB96C69936993, 0x6A); + vtype x55EE55EE55EE55EE = lut3(a1, a2, a4, 0x7A); + vtype x084C084CB77BB77B = lut3(a2, a6, xF5A0F5A0A55AA55A, 0xC9); + vtype x9C329C32E295E295 = lut3(x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x72); + vtype xA51EA51E50E050E0 = lut3(a3, a6, x55EE55EE55EE55EE, 0x29); + vtype x4AD34AD3BE3CBE3C = lut3(a2, x947A947AD1E7D1E7, xA51EA51E50E050E0, 0x95); + vtype x2 = lut3(a5, x9C329C32E295E295, x4AD34AD3BE3CBE3C, 0xC6); + vtype xD955D95595D195D1 = lut3(a1, a2, x9C329C32E295E295, 0xD2); + vtype x8058805811621162 = lut3(x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x90); + vtype x7D0F7D0FC4B3C4B3 = lut3(xA51EA51E50E050E0, xD955D95595D195D1, x8058805811621162, 0x76); + vtype x0805080500010001 = lut3(a3, xAA55AA5500550055, xD955D95595D195D1, 0x80); + vtype x4A964A96962D962D = lut3(xB96CB96C69936993, x4AD34AD3BE3CBE3C, x0805080500010001, 0xA6); + vtype x4 = lut3(a5, x7D0F7D0FC4B3C4B3, x4A964A96962D962D, 0xA6); + vtype x148014807B087B08 = lut3(a1, xAA55AA5500550055, x947A947AD1E7D1E7, 0x21); + vtype x94D894D86B686B68 = lut3(xA55AA55AF0F5F0F5, x8058805811621162, x148014807B087B08, 0x6A); + vtype x5555555540044004 = lut3(a1, a6, x084C084CB77BB77B, 0x70); + vtype xAFB4AFB4BF5BBF5B = lut3(x5F5F5F5FA5A5A5A5, xA51EA51E50E050E0, x5555555540044004, 0x97); + 
vtype x1 = lut3(a5, x94D894D86B686B68, xAFB4AFB4BF5BBF5B, 0x6C); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +inline void +s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype xEEEEEEEE99999999 = lut3(a1, a2, a6, 0x97); + vtype xFFFFEEEE66666666 = lut3(a5, a6, xEEEEEEEE99999999, 0x67); + vtype x5555FFFFFFFF0000 = lut3(a1, a5, a6, 0x76); + vtype x6666DDDD5555AAAA = lut3(a2, xFFFFEEEE66666666, x5555FFFFFFFF0000, 0x69); + vtype x6969D3D35353ACAC = lut3(a3, xFFFFEEEE66666666, x6666DDDD5555AAAA, 0x6A); + vtype xCFCF3030CFCF3030 = lut3(a2, a3, a5, 0x65); + vtype xE4E4EEEE9999F0F0 = lut3(a3, xEEEEEEEE99999999, x5555FFFFFFFF0000, 0x8D); + vtype xE5E5BABACDCDB0B0 = lut3(a1, xCFCF3030CFCF3030, xE4E4EEEE9999F0F0, 0xCA); + vtype x3 = lut3(a4, x6969D3D35353ACAC, xE5E5BABACDCDB0B0, 0xC6); + vtype x3333CCCC00000000 = lut3(a2, a5, a6, 0x14); + vtype xCCCCDDDDFFFF0F0F = lut3(a5, xE4E4EEEE9999F0F0, x3333CCCC00000000, 0xB5); + vtype x00000101F0F0F0F0 = lut3(a3, a6, xFFFFEEEE66666666, 0x1C); + vtype x9A9A64646A6A9595 = lut3(a1, xCFCF3030CFCF3030, x00000101F0F0F0F0, 0x96); + vtype x2 = lut3(a4, xCCCCDDDDFFFF0F0F, x9A9A64646A6A9595, 0x6A); + vtype x3333BBBB3333FFFF = lut3(a1, a2, x6666DDDD5555AAAA, 0xDE); + vtype x1414141441410000 = lut3(a1, a3, xE4E4EEEE9999F0F0, 0x90); + vtype x7F7FF3F3F5F53939 = lut3(x6969D3D35353ACAC, x9A9A64646A6A9595, x3333BBBB3333FFFF, 0x79); + vtype x9494E3E34B4B3939 = lut3(a5, x1414141441410000, x7F7FF3F3F5F53939, 0x29); + vtype x1 = lut3(a4, x3333BBBB3333FFFF, x9494E3E34B4B3939, 0xA6); + vtype xB1B1BBBBCCCCA5A5 = lut3(a1, a1, xE4E4EEEE9999F0F0, 0x4A); + vtype xFFFFECECEEEEDDDD = lut3(a2, x3333CCCC00000000, x9A9A64646A6A9595, 0xEF); + vtype xB1B1A9A9DCDC8787 = lut3(xE5E5BABACDCDB0B0, xB1B1BBBBCCCCA5A5, xFFFFECECEEEEDDDD, 0x8D); + vtype xFFFFCCCCEEEE4444 = lut3(a2, a5, xFFFFEEEE66666666, 0x2B); + vtype x4 = lut3(a4, xB1B1A9A9DCDC8787, 
xFFFFCCCCEEEE4444, 0x6C); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +inline void +s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype xA50FA50FA50FA50F = lut3(a1, a3, a4, 0xC9); + vtype xF0F00F0FF0F0F0F0 = lut3(a3, a5, a6, 0x4B); + vtype xAF0FA0AAAF0FAF0F = lut3(a1, xA50FA50FA50FA50F, xF0F00F0FF0F0F0F0, 0x4D); + vtype x5AA5A55A5AA55AA5 = lut3(a1, a4, xF0F00F0FF0F0F0F0, 0x69); + vtype xAA005FFFAA005FFF = lut3(a3, a5, xA50FA50FA50FA50F, 0xD6); + vtype x5AA5A55A0F5AFAA5 = lut3(a6, x5AA5A55A5AA55AA5, xAA005FFFAA005FFF, 0x9C); + vtype x1 = lut3(a2, xAF0FA0AAAF0FAF0F, x5AA5A55A0F5AFAA5, 0xA6); + vtype xAA55AA5500AA00AA = lut3(a1, a4, a6, 0x49); + vtype xFAFAA50FFAFAA50F = lut3(a1, a5, xA50FA50FA50FA50F, 0x9B); + vtype x50AF0F5AFA50A5A5 = lut3(a1, xAA55AA5500AA00AA, xFAFAA50FFAFAA50F, 0x66); + vtype xAFAFAFAFFAFAFAFA = lut3(a1, a3, a6, 0x6F); + vtype xAFAFFFFFFFFAFAFF = lut3(a4, x50AF0F5AFA50A5A5, xAFAFAFAFFAFAFAFA, 0xEB); + vtype x4 = lut3(a2, x50AF0F5AFA50A5A5, xAFAFFFFFFFFAFAFF, 0x6C); + vtype x500F500F500F500F = lut3(a1, a3, a4, 0x98); + vtype xF0505A0505A5050F = lut3(x5AA5A55A0F5AFAA5, xAA55AA5500AA00AA, xAFAFAFAFFAFAFAFA, 0x1D); + vtype xF0505A05AA55AAFF = lut3(a6, x500F500F500F500F, xF0505A0505A5050F, 0x9A); + vtype xFF005F55FF005F55 = lut3(a1, a4, xAA005FFFAA005FFF, 0xB2); + vtype xA55F5AF0A55F5AF0 = lut3(a5, xA50FA50FA50FA50F, x5AA5A55A5AA55AA5, 0x3D); + vtype x5A5F05A5A55F5AF0 = lut3(a6, xFF005F55FF005F55, xA55F5AF0A55F5AF0, 0xA6); + vtype x3 = lut3(a2, xF0505A05AA55AAFF, x5A5F05A5A55F5AF0, 0xA6); + vtype x0F0F0F0FA5A5A5A5 = lut3(a1, a3, a6, 0xC6); + vtype x5FFFFF5FFFA0FFA0 = lut3(x5AA5A55A5AA55AA5, xAFAFAFAFFAFAFAFA, x0F0F0F0FA5A5A5A5, 0xDB); + vtype xF5555AF500A05FFF = lut3(a5, xFAFAA50FFAFAA50F, xF0505A0505A5050F, 0xB9); + vtype x05A5AAF55AFA55A5 = lut3(xF0505A05AA55AAFF, x0F0F0F0FA5A5A5A5, xF5555AF500A05FFF, 0x9B); + vtype x2 = lut3(a2, 
x5FFFFF5FFFA0FFA0, x05A5AAF55AFA55A5, 0xA6); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +#if 1 +/* Roman Rusakov’s s4 */ +inline void +s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype x55AAFF00=lut3(a1, a4, a5, 0x36); + vtype x00F00F00=lut3(a3, a4, a5, 0x24); + vtype x1926330C=lut3(a2, a3, x55AAFF00, 0xA4); + vtype x4CA36B59=lut3(x00F00F00, a1, x1926330C, 0xB6); + + vtype x00FF55AA=lut3(a1, a4, a5, 0x6C); + vtype x3FCC6E9D=lut3(a2, a3, x00FF55AA, 0x5E); + vtype x6A7935C8=lut3(a1, x00F00F00, x3FCC6E9D, 0xD6); + + vtype x5D016B55=lut3(a1, x4CA36B59, x00FF55AA, 0xD4); + vtype x07AE9F5A=lut3(a3, x55AAFF00, x5D016B55, 0xD6); + vtype x61C8F93C=lut3(a1, a2, x07AE9F5A, 0x96); + + vtype x3=lut3(a6, x4CA36B59, x61C8F93C, 0xC9); + vtype x4=lut3(a6, x4CA36B59, x61C8F93C, 0x93); + out[c3]^=x3; + out[c4]^=x4; + + vtype x26DA5E91=x4CA36B59^x6A7935C8; + vtype x37217F22=lut3(a2, a4, x26DA5E91, 0x72); + vtype x56E9861E=x37217F22^x61C8F93C; + + vtype x1=lut3(a6, x56E9861E, x6A7935C8, 0x5C); + vtype x2=lut3(a6, x56E9861E, x6A7935C8, 0x35); + out[c1]^=x1; + out[c2]^=x2; +} +#else +/* DeepLearningJohnDoe's s4 */ +inline void +s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype x55F055F055F055F0 = lut3(a1, a3, a4, 0x72); + vtype xA500F5F0A500F5F0 = lut3(a3, a5, x55F055F055F055F0, 0xAD); + vtype xF50AF50AF50AF50A = lut3(a1, a3, a4, 0x59); + vtype xF5FA0FFFF5FA0FFF = lut3(a3, a5, xF50AF50AF50AF50A, 0xE7); + vtype x61C8F93C61C8F93C = lut3(a2, xA500F5F0A500F5F0, xF5FA0FFFF5FA0FFF, 0xC6); + vtype x9999666699996666 = lut3(a1, a2, a5, 0x69); + vtype x22C022C022C022C0 = lut3(a2, a4, x55F055F055F055F0, 0x18); + vtype xB35C94A6B35C94A6 = lut3(xF5FA0FFFF5FA0FFF, x9999666699996666, x22C022C022C022C0, 0x63); + vtype x4 = lut3(a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x6A); + vtype 
x4848484848484848 = lut3(a1, a2, a3, 0x12); + vtype x55500AAA55500AAA = lut3(a1, a5, xF5FA0FFFF5FA0FFF, 0x28); + vtype x3C90B3D63C90B3D6 = lut3(x61C8F93C61C8F93C, x4848484848484848, x55500AAA55500AAA, 0x1E); + vtype x8484333384843333 = lut3(a1, x9999666699996666, x4848484848484848, 0x14); + vtype x4452F1AC4452F1AC = lut3(xF50AF50AF50AF50A, xF5FA0FFFF5FA0FFF, xB35C94A6B35C94A6, 0x78); + vtype x9586CA379586CA37 = lut3(x55500AAA55500AAA, x8484333384843333, x4452F1AC4452F1AC, 0xD6); + vtype x2 = lut3(a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0x6A); + vtype x1 = lut3(a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0xA9); + vtype x3 = lut3(a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x56); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} +#endif + +inline void +s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype xA0A0A0A0FFFFFFFF = lut3(a1, a3, a6, 0xAB); + vtype xFFFF00005555FFFF = lut3(a1, a5, a6, 0xB9); + vtype xB3B320207777FFFF = lut3(a2, xA0A0A0A0FFFFFFFF, xFFFF00005555FFFF, 0xE8); + vtype x50505A5A5A5A5050 = lut3(a1, a3, xFFFF00005555FFFF, 0x34); + vtype xA2A2FFFF2222FFFF = lut3(a1, a5, xB3B320207777FFFF, 0xCE); + vtype x2E2E6969A4A46363 = lut3(a2, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, 0x29); + vtype x3 = lut3(a4, xB3B320207777FFFF, x2E2E6969A4A46363, 0xA6); + vtype xA5A50A0AA5A50A0A = lut3(a1, a3, a5, 0x49); + vtype x969639396969C6C6 = lut3(a2, a6, xA5A50A0AA5A50A0A, 0x96); + vtype x1B1B1B1B1B1B1B1B = lut3(a1, a2, a3, 0xCA); + vtype xBFBFBFBFF6F6F9F9 = lut3(a3, xA0A0A0A0FFFFFFFF, x969639396969C6C6, 0x7E); + vtype x5B5BA4A4B8B81D1D = lut3(xFFFF00005555FFFF, x1B1B1B1B1B1B1B1B, xBFBFBFBFF6F6F9F9, 0x96); + vtype x2 = lut3(a4, x969639396969C6C6, x5B5BA4A4B8B81D1D, 0xCA); + vtype x5555BBBBFFFF5555 = lut3(a1, a2, xFFFF00005555FFFF, 0xE5); + vtype x6D6D9C9C95956969 = lut3(x50505A5A5A5A5050, xA2A2FFFF2222FFFF, x969639396969C6C6, 0x97); + vtype x1A1A67676A6AB4B4 = 
lut3(xA5A50A0AA5A50A0A, x5555BBBBFFFF5555, x6D6D9C9C95956969, 0x47); + vtype xA0A0FFFFAAAA0000 = lut3(a3, xFFFF00005555FFFF, xA5A50A0AA5A50A0A, 0x3B); + vtype x36369C9CC1C1D6D6 = lut3(x969639396969C6C6, x6D6D9C9C95956969, xA0A0FFFFAAAA0000, 0xD9); + vtype x1 = lut3(a4, x1A1A67676A6AB4B4, x36369C9CC1C1D6D6, 0xCA); + vtype x5555F0F0F5F55555 = lut3(a1, a3, xFFFF00005555FFFF, 0xB1); + vtype x79790202DCDC0808 = lut3(xA2A2FFFF2222FFFF, xA5A50A0AA5A50A0A, x969639396969C6C6, 0x47); + vtype x6C6CF2F229295D5D = lut3(xBFBFBFBFF6F6F9F9, x5555F0F0F5F55555, x79790202DCDC0808, 0x6E); + vtype xA3A3505010101A1A = lut3(a2, xA2A2FFFF2222FFFF, x36369C9CC1C1D6D6, 0x94); + vtype x7676C7C74F4FC7C7 = lut3(a1, x2E2E6969A4A46363, xA3A3505010101A1A, 0xD9); + vtype x4 = lut3(a4, x6C6CF2F229295D5D, x7676C7C74F4FC7C7, 0xC6); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +inline void +s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype x5050F5F55050F5F5 = lut3(a1, a3, a5, 0xB2); + vtype x6363C6C66363C6C6 = lut3(a1, a2, x5050F5F55050F5F5, 0x66); + vtype xAAAA5555AAAA5555 = lut3(a1, a1, a5, 0xA9); + vtype x3A3A65653A3A6565 = lut3(a3, x6363C6C66363C6C6, xAAAA5555AAAA5555, 0xA9); + vtype x5963A3C65963A3C6 = lut3(a4, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xC6); + vtype xE7E76565E7E76565 = lut3(a5, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xAD); + vtype x455D45DF455D45DF = lut3(a1, a4, xE7E76565E7E76565, 0xE4); + vtype x4 = lut3(a6, x5963A3C65963A3C6, x455D45DF455D45DF, 0x6C); + vtype x1101220211012202 = lut3(a2, xAAAA5555AAAA5555, x5963A3C65963A3C6, 0x20); + vtype xF00F0FF0F00F0FF0 = lut3(a3, a4, a5, 0x69); + vtype x16E94A9716E94A97 = lut3(xE7E76565E7E76565, x1101220211012202, xF00F0FF0F00F0FF0, 0x9E); + vtype x2992922929929229 = lut3(a1, a2, xF00F0FF0F00F0FF0, 0x49); + vtype xAFAF9823AFAF9823 = lut3(a5, x5050F5F55050F5F5, x2992922929929229, 0x93); + vtype x3 = lut3(a6, x16E94A9716E94A97, 
xAFAF9823AFAF9823, 0x6C); + vtype x4801810248018102 = lut3(a4, x5963A3C65963A3C6, x1101220211012202, 0xA4); + vtype x5EE8FFFD5EE8FFFD = lut3(a5, x16E94A9716E94A97, x4801810248018102, 0x76); + vtype xF0FF00FFF0FF00FF = lut3(a3, a4, a5, 0xCD); + vtype x942D9A67942D9A67 = lut3(x3A3A65653A3A6565, x5EE8FFFD5EE8FFFD, xF0FF00FFF0FF00FF, 0x86); + vtype x1 = lut3(a6, x5EE8FFFD5EE8FFFD, x942D9A67942D9A67, 0xA6); + vtype x6A40D4ED6F4DD4EE = lut3(a2, x4, xAFAF9823AFAF9823, 0x2D); + vtype x6CA89C7869A49C79 = lut3(x1101220211012202, x16E94A9716E94A97, x6A40D4ED6F4DD4EE, 0x26); + vtype xD6DE73F9D6DE73F9 = lut3(a3, x6363C6C66363C6C6, x455D45DF455D45DF, 0x6B); + vtype x925E63E1965A63E1 = lut3(x3A3A65653A3A6565, x6CA89C7869A49C79, xD6DE73F9D6DE73F9, 0xA2); + vtype x2 = lut3(a6, x6CA89C7869A49C79, x925E63E1965A63E1, 0xCA); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +inline void +s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype x88AA88AA88AA88AA = lut3(a1, a2, a4, 0x0B); + vtype xAAAAFF00AAAAFF00 = lut3(a1, a4, a5, 0x27); + vtype xADAFF8A5ADAFF8A5 = lut3(a3, x88AA88AA88AA88AA, xAAAAFF00AAAAFF00, 0x9E); + vtype x0A0AF5F50A0AF5F5 = lut3(a1, a3, a5, 0xA6); + vtype x6B69C5DC6B69C5DC = lut3(a2, xADAFF8A5ADAFF8A5, x0A0AF5F50A0AF5F5, 0x6B); + vtype x1C69B2DC1C69B2DC = lut3(a4, x88AA88AA88AA88AA, x6B69C5DC6B69C5DC, 0xA9); + vtype x1 = lut3(a6, xADAFF8A5ADAFF8A5, x1C69B2DC1C69B2DC, 0x6A); + vtype x9C9C9C9C9C9C9C9C = lut3(a1, a2, a3, 0x63); + vtype xE6E63BFDE6E63BFD = lut3(a2, xAAAAFF00AAAAFF00, x0A0AF5F50A0AF5F5, 0xE7); + vtype x6385639E6385639E = lut3(a4, x9C9C9C9C9C9C9C9C, xE6E63BFDE6E63BFD, 0x93); + vtype x5959C4CE5959C4CE = lut3(a2, x6B69C5DC6B69C5DC, xE6E63BFDE6E63BFD, 0x5D); + vtype x5B53F53B5B53F53B = lut3(a4, x0A0AF5F50A0AF5F5, x5959C4CE5959C4CE, 0x6E); + vtype x3 = lut3(a6, x6385639E6385639E, x5B53F53B5B53F53B, 0xC6); + vtype xFAF505FAFAF505FA = lut3(a3, a4, 
x0A0AF5F50A0AF5F5, 0x6D); + vtype x6A65956A6A65956A = lut3(a3, x9C9C9C9C9C9C9C9C, xFAF505FAFAF505FA, 0xA6); + vtype x8888CCCC8888CCCC = lut3(a1, a2, a5, 0x23); + vtype x94E97A9494E97A94 = lut3(x1C69B2DC1C69B2DC, x6A65956A6A65956A, x8888CCCC8888CCCC, 0x72); + vtype x4 = lut3(a6, x6A65956A6A65956A, x94E97A9494E97A94, 0xAC); + vtype xA050A050A050A050 = lut3(a1, a3, a4, 0x21); + vtype xC1B87A2BC1B87A2B = lut3(xAAAAFF00AAAAFF00, x5B53F53B5B53F53B, x94E97A9494E97A94, 0xA4); + vtype xE96016B7E96016B7 = lut3(x8888CCCC8888CCCC, xA050A050A050A050, xC1B87A2BC1B87A2B, 0x96); + vtype xE3CF1FD5E3CF1FD5 = lut3(x88AA88AA88AA88AA, x6A65956A6A65956A, xE96016B7E96016B7, 0x3E); + vtype x6776675B6776675B = lut3(xADAFF8A5ADAFF8A5, x94E97A9494E97A94, xE3CF1FD5E3CF1FD5, 0x6B); + vtype x2 = lut3(a6, xE96016B7E96016B7, x6776675B6776675B, 0xC6); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +inline void +s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6, + __private vtype *out, vtype c1, vtype c2, vtype c3, vtype c4) +{ + vtype xEEEE3333EEEE3333 = lut3(a1, a2, a5, 0x9D); + vtype xBBBBBBBBBBBBBBBB = lut3(a1, a1, a2, 0x83); + vtype xDDDDAAAADDDDAAAA = lut3(a1, a2, a5, 0x5B); + vtype x29295A5A29295A5A = lut3(a3, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0x85); + vtype xC729695AC729695A = lut3(a4, xEEEE3333EEEE3333, x29295A5A29295A5A, 0xA6); + vtype x3BF77B7B3BF77B7B = lut3(a2, a5, xC729695AC729695A, 0xF9); + vtype x2900FF002900FF00 = lut3(a4, a5, x29295A5A29295A5A, 0x0E); + vtype x56B3803F56B3803F = lut3(xBBBBBBBBBBBBBBBB, x3BF77B7B3BF77B7B, x2900FF002900FF00, 0x61); + vtype x4 = lut3(a6, xC729695AC729695A, x56B3803F56B3803F, 0x6C); + vtype xFBFBFBFBFBFBFBFB = lut3(a1, a2, a3, 0xDF); + vtype x3012B7B73012B7B7 = lut3(a2, a5, xC729695AC729695A, 0xD4); + vtype x34E9B34C34E9B34C = lut3(a4, xFBFBFBFBFBFBFBFB, x3012B7B73012B7B7, 0x69); + vtype xBFEAEBBEBFEAEBBE = lut3(a1, x29295A5A29295A5A, x34E9B34C34E9B34C, 0x6F); + vtype xFFAEAFFEFFAEAFFE = lut3(a3, 
xBBBBBBBBBBBBBBBB, xBFEAEBBEBFEAEBBE, 0xB9); + vtype x2 = lut3(a6, x34E9B34C34E9B34C, xFFAEAFFEFFAEAFFE, 0xC6); + vtype xCFDE88BBCFDE88BB = lut3(a2, xDDDDAAAADDDDAAAA, x34E9B34C34E9B34C, 0x5C); + vtype x3055574530555745 = lut3(a1, xC729695AC729695A, xCFDE88BBCFDE88BB, 0x71); + vtype x99DDEEEE99DDEEEE = lut3(a4, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0xB9); + vtype x693CD926693CD926 = lut3(x3BF77B7B3BF77B7B, x34E9B34C34E9B34C, x99DDEEEE99DDEEEE, 0x69); + vtype x3 = lut3(a6, x3055574530555745, x693CD926693CD926, 0x6A); + vtype x9955EE559955EE55 = lut3(a1, a4, x99DDEEEE99DDEEEE, 0xE2); + vtype x9D48FA949D48FA94 = lut3(x3BF77B7B3BF77B7B, xBFEAEBBEBFEAEBBE, x9955EE559955EE55, 0x9C); + vtype x1 = lut3(a6, xC729695AC729695A, x9D48FA949D48FA94, 0x39); + + out[c1] ^= x1; + out[c2] ^= x2; + out[c3] ^= x3; + out[c4] ^= x4; +} + +#else + +#undef andn +#define andn 0 +#include "opencl_nonstd.h" + +#endif /* HAVE_LUT3 */ diff --git a/opencl_setup.c b/opencl_setup.c new file mode 100644 index 0000000..49e77cc --- /dev/null +++ b/opencl_setup.c @@ -0,0 +1,353 @@ +/* + * Rainbow Crackalack: opencl_setup.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + + +#ifdef _WIN32 +#include +static char dlerror_buffer[256] = {0}; +static HMODULE ocl = NULL; /* TODO: release on program exit. */ +#else +#include +static void *ocl = NULL; /* TODO: release on program exit. 
*/ +#endif + +#include "misc.h" +#include "opencl_setup.h" + + +/* Toggled when the OpenCL library is loaded and initialized. */ +static unsigned int opencl_initialized = 0; + + +/* Pointers to OpenCL functions. */ +cl_int (*rc_clBuildProgram)(cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *) = NULL; +cl_mem (*rc_clCreateBuffer)(cl_context, cl_mem_flags, size_t, void *, cl_int *) = NULL; +cl_context (*rc_clCreateContext)(cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *) = NULL; +cl_command_queue (*rc_clCreateCommandQueueWithProperties)(cl_context, cl_device_id, const cl_queue_properties *, cl_int *) = NULL; +cl_kernel (*rc_clCreateKernel)(cl_program, const char *, cl_int *) = NULL; +cl_program (*rc_clCreateProgramWithSource)(cl_context, cl_uint, const char **, const size_t *, cl_int *) = NULL; +cl_int (*rc_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL; +cl_int (*rc_clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) = NULL; +cl_int (*rc_clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) = NULL; +cl_int (*rc_clFinish)(cl_command_queue) = NULL; +cl_int (*rc_clFlush)(cl_command_queue) = NULL; +cl_int (*rc_clGetDeviceIDs)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) = NULL; +cl_int (*rc_clGetDeviceInfo)(cl_device_id, cl_device_info, size_t, void *, size_t *) = NULL; +cl_int (*rc_clGetKernelWorkGroupInfo)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *) = NULL; +cl_int (*rc_clGetPlatformIDs)(cl_uint, cl_platform_id *, cl_uint *) = NULL; +cl_int (*rc_clGetProgramBuildInfo)(cl_program, cl_device_id, cl_program_build_info, 
size_t, void *, size_t *) = NULL; +cl_int (*rc_clReleaseCommandQueue)(cl_command_queue) = NULL; +cl_int (*rc_clReleaseContext)(cl_context) = NULL; +cl_int (*rc_clReleaseDevice)(cl_device_id) = NULL; +cl_int (*rc_clReleaseKernel)(cl_kernel) = NULL; +cl_int (*rc_clReleaseMemObject)(cl_mem) = NULL; +cl_int (*rc_clReleaseProgram)(cl_program) = NULL; +cl_int (*rc_clSetKernelArg)(cl_kernel, cl_uint, size_t, const void *) = NULL; + + +#ifdef _WIN32 + +void *rc_dlopen(char *library_name) { + return LoadLibraryA(library_name); +} + +int rc_dlclose(void *module) { + return FreeLibrary(module); +} + +void *rc_dlsym(void *module, char *function_name) { + return GetProcAddress(module, function_name); +} + +char *rc_dlerror(void) { + dlerror_buffer[0] = '\0'; + FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + dlerror_buffer, 0, NULL); + return dlerror_buffer; +} + +#else + +void *rc_dlopen(char *library_name) { + return dlopen(library_name, RTLD_NOW); +} + +int rc_dlclose(void *module) { + return dlclose(module); +} + +void *rc_dlsym(void *module, char *function_name) { + return dlsym(module, function_name); +} + +char * rc_dlerror(void) { + return dlerror(); +} + +#endif + + +void context_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) { + printf("\n\n\tError callback invoked!\n\n\terrinfo: %s\n\n", errinfo); + return; +} + + +void get_device_bool(cl_device_id device, cl_device_info param, cl_bool *b) { + if (rc_clGetDeviceInfo(device, param, sizeof(cl_bool), b, NULL) < 0) { + fprintf(stderr, "Error while getting device info.\n"); + exit(-1); + } +} + + +void get_device_str(cl_device_id device, cl_device_info param, char *buf, int buf_len) { + if (rc_clGetDeviceInfo(device, param, buf_len, buf, NULL) < 0) { + fprintf(stderr, "Error while getting device info.\n"); + exit(-1); + } +} + + +void get_device_uint(cl_device_id device, cl_device_info param, 
cl_uint *u) { + if (rc_clGetDeviceInfo(device, param, sizeof(cl_uint), u, NULL) < 0) { + fprintf(stderr, "Error while getting device info.\n"); + exit(-1); + } +} + + +void get_device_ulong(cl_device_id device, cl_device_info param, cl_ulong *ul) { + if (rc_clGetDeviceInfo(device, param, sizeof(cl_ulong), ul, NULL) < 0) { + fprintf(stderr, "Error while getting device info.\n"); + exit(-1); + } +} + + +/* Returns the array of platforms and devices. */ +void get_platforms_and_devices(cl_uint platforms_buffer_size, cl_platform_id *platforms, cl_uint *num_platforms, cl_uint devices_buffer_size, cl_device_id *devices, cl_uint *num_devices, unsigned int verbose) { + unsigned int i = 0; + cl_int err = 0; + cl_uint n = 0; + + + *num_platforms = 0; + *num_devices = 0; + + if (opencl_initialized == 0) { +#ifdef _WIN32 + ocl = rc_dlopen("OpenCL"); /* Windows */ +#else + ocl = rc_dlopen("libOpenCL.so"); /* Linux */ +#endif + if (ocl == NULL) { + fprintf(stderr, "\nFailed to open OpenCL library. Are the GPU drivers properly installed?\nError: %s\n\n", rc_dlerror()); + exit(-1); + } + rc_dlerror(); /* Clear error codes. 
*/ + + LOADFUNC(ocl, clBuildProgram); + LOADFUNC(ocl, clCreateBuffer); + LOADFUNC(ocl, clCreateCommandQueueWithProperties); + LOADFUNC(ocl, clCreateContext); + LOADFUNC(ocl, clCreateKernel); + LOADFUNC(ocl, clCreateProgramWithSource); + LOADFUNC(ocl, clEnqueueNDRangeKernel); + LOADFUNC(ocl, clEnqueueReadBuffer); + LOADFUNC(ocl, clEnqueueWriteBuffer); + LOADFUNC(ocl, clFinish); + LOADFUNC(ocl, clFlush); + LOADFUNC(ocl, clGetDeviceIDs); + LOADFUNC(ocl, clGetDeviceInfo); + LOADFUNC(ocl, clGetKernelWorkGroupInfo); + LOADFUNC(ocl, clGetPlatformIDs); + LOADFUNC(ocl, clGetProgramBuildInfo); + LOADFUNC(ocl, clReleaseCommandQueue); + LOADFUNC(ocl, clReleaseContext); + LOADFUNC(ocl, clReleaseDevice); + LOADFUNC(ocl, clReleaseKernel); + LOADFUNC(ocl, clReleaseMemObject); + LOADFUNC(ocl, clReleaseProgram); + LOADFUNC(ocl, clSetKernelArg); + + opencl_initialized = 1; + } + + + if (rc_clGetPlatformIDs(platforms_buffer_size, platforms, num_platforms) < 0) { + fprintf(stderr, "Failed to get platform IDs. Are the OpenCL drivers installed?\n"); + exit(-1); + } else if (*num_platforms < 1) { + fprintf(stderr, "Number of platforms is < 1!\n"); + exit(-1); + } + if (verbose) + printf("Found %u platforms.\n", *num_platforms); + + for (i = 0; ((i < *num_platforms) && (*num_devices < devices_buffer_size)); i++) { + err = rc_clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, devices_buffer_size - *num_devices, &(devices[*num_devices]), &n); + if (err == CL_DEVICE_NOT_FOUND) + printf("No GPUs found on platform #%u\n", i); + else if (err < 0) + fprintf(stderr, "Error while getting device IDs on platform #%u; clGetDeviceIDs() returned: %d.\n", i, err); + else if (n < 1) + fprintf(stderr, "Platform #%u has < 1 devices!\n", i); + + *num_devices += n; + if (verbose) + printf("Found %u devices on platform #%u.\n", *num_devices, i); + } + + if (verbose) + print_device_info(devices, *num_devices); + + return; +} + + +/* Loads a kernel onto a device. 
*/ +void load_kernel(cl_context context, cl_uint num_devices, const cl_device_id *devices, const char *source_filename, const char *kernel_name, cl_program *program, cl_kernel *kernel, unsigned int hash_type) { + FILE *f = NULL; + char *source = NULL; + int file_size = 0, bytes_read = 0, n = 0; + int err = 0; + char build_options[512] = {0}; + char device_vendor[128] = {0}; + char path[256] = {0}; + + + filepath_join(path, sizeof(path), "CL", source_filename); + f = fopen(path, "r"); + if (f == NULL) { + perror("Failed to open kernel."); + exit(-1); + } + + fseek(f, 0, SEEK_END); + file_size = ftell(f); + rewind(f); + + if (file_size < 1) { + fprintf(stderr, "File size of kernel is invalid.\n"); + exit(-1); + } + + source = calloc(file_size + 1, sizeof(char)); + if (source == NULL) { + fprintf(stderr, "Failed to allocate file buffer.\n"); + exit(-1); + } + + while (bytes_read < file_size) { + n = fread(source + bytes_read, sizeof(char), file_size - bytes_read, f); + if (n <= 0) { + fprintf(stderr, "Error while reading kernel.\n"); + exit(-1); + } + + bytes_read += n; + } + + FCLOSE(f); + + *program = rc_clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &err); + if (err < 0) { + fprintf(stderr, "clCreateProgramWithSource failed.\n"); + exit(-1); + } + + snprintf(build_options, sizeof(build_options) - 1, "%s -DHASH_TYPE=%u", DEFAULT_BUILD_OPTIONS, hash_type); +#ifdef USE_DES_BITSLICE + strncat(build_options, " -DUSE_DES_BITSLICE=1", sizeof(build_options) - 1); +#endif + + /* If the first device is AMD, we will assume that its the ROCm driver. This isn't + * correct all of the time (i.e.: if Catalyst/Crimson drivers are in use), but + * until we can differentiate between the drivers, this is what we'll use... 
*/ + get_device_str(devices[0], CL_DEVICE_VENDOR, device_vendor, sizeof(device_vendor) - 1); + if (strcmp(device_vendor, "Advanced Micro Devices, Inc.") == 0) + strncat(build_options, " -DAMD_ROCM=1", sizeof(build_options) - 1); + + /*printf("Building program with options: %s\n", build_options);*/ + if (rc_clBuildProgram(*program, num_devices, devices, build_options, NULL, NULL) < 0) { + size_t log_size = 0; + char *error_str = NULL; + + fprintf(stderr, "clBuildProgram failed.\n"); + rc_clGetProgramBuildInfo(*program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + error_str = calloc(log_size + 1, sizeof(char)); + rc_clGetProgramBuildInfo(*program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, error_str, NULL); + fprintf(stderr, "%s\n", error_str); + FREE(error_str); + exit(-1); + } + + *kernel = rc_clCreateKernel(*program, kernel_name, &err); + if (err < 0) { + fprintf(stderr, "clCreateKernel failed.\n"); + exit(-1); + } + + FREE(source); + return; +} + + +/* Prints debugging information about devices. 
*/ +void print_device_info(cl_device_id *devices, cl_uint num_devices) { + int i = 0; + char device_name[64] = {0}; + char device_version[64] = {0}; + char device_vendor[128] = {0}; + char device_driver[128] = {0}; + cl_bool b = 0; + cl_uint max_compute_units = 0; + cl_ulong global_memsize = 0; + cl_ulong max_work_group_size = 0; + + + for (i = 0; i < num_devices; i++) { + get_device_str(devices[i], CL_DEVICE_NAME, device_name, sizeof(device_name) - 1); + get_device_str(devices[i], CL_DEVICE_VERSION, device_version, sizeof(device_version) - 1); + get_device_str(devices[i], CL_DEVICE_VENDOR, device_vendor, sizeof(device_vendor) - 1); + get_device_bool(devices[i], CL_DEVICE_AVAILABLE, &b); + get_device_uint(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, &max_compute_units); + get_device_ulong(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, &global_memsize); + get_device_ulong(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, &max_work_group_size); + get_device_str(devices[i], CL_DRIVER_VERSION, device_driver, sizeof(device_driver) - 1); + + printf("Device #%d:\n", i); + printf("\tVendor: %s\n", device_vendor); + printf("\tName: %s\n", device_name); + printf("\tVersion: %s\n", device_version); + printf("\tDriver: %s\n", device_driver); + printf("\tMax compute units: %u\n", max_compute_units); + printf("\tMax work group size: %"PRIu64"\n", max_work_group_size); + printf("\tGlobal memory size: %"PRIu64"\n", global_memsize); + if (b == 0) + printf("\t---> NOT AVAILABLE!\n"); + printf("\n"); + } + fflush(stdout); +} diff --git a/opencl_setup.h b/opencl_setup.h new file mode 100644 index 0000000..4938364 --- /dev/null +++ b/opencl_setup.h @@ -0,0 +1,118 @@ +#ifndef _OPENCL_SETUP_H +#define _OPENCL_SETUP_H + +#define MAX_NUM_PLATFORMS 32 +#define MAX_NUM_DEVICES 32 + +#define CL_RO CL_MEM_READ_ONLY +#define CL_WO CL_MEM_WRITE_ONLY +#define CL_RW CL_MEM_READ_WRITE + +/* Default build options for kernels. */ +#define DEFAULT_BUILD_OPTIONS "-Werror -I. 
-ICL" + +/* Enable USE_DES_BITSLICE to use the DES bitslice code from JohnTheRipper. At this time, it somehow runs at half the speed of unoptimized DES on NVIDIA. Anyone else want to look into what's going on? */ +/*#define USE_DES_BITSLICE 1*/ + + +void *rc_dlopen(char *library_name); +int rc_dlclose(void *module); +void *rc_dlsym(void *module, char *function_name); +char *rc_dlerror(void); + +extern cl_int (*rc_clBuildProgram)(cl_program, cl_uint, const cl_device_id *, const char *, void (CL_CALLBACK *)(cl_program, void *), void *); +extern cl_mem (*rc_clCreateBuffer)(cl_context, cl_mem_flags, size_t, void *, cl_int *); +extern cl_context (*rc_clCreateContext)(cl_context_properties *, cl_uint, const cl_device_id *, void (CL_CALLBACK *)(const char *, const void *, size_t, void *), void *, cl_int *); +extern cl_command_queue (*rc_clCreateCommandQueueWithProperties)(cl_context, cl_device_id, const cl_queue_properties *, cl_int *); +extern cl_kernel (*rc_clCreateKernel)(cl_program, const char *, cl_int *); +extern cl_program (*rc_clCreateProgramWithSource)(cl_context, cl_uint, const char **, const size_t *, cl_int *); +extern cl_int (*rc_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +extern cl_int (*rc_clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +extern cl_int (*rc_clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +extern cl_int (*rc_clFinish)(cl_command_queue); +extern cl_int (*rc_clFlush)(cl_command_queue); +extern cl_int (*rc_clGetDeviceIDs)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); +extern cl_int (*rc_clGetDeviceInfo)(cl_device_id, cl_device_info, size_t, void *, size_t *); +extern cl_int (*rc_clGetKernelWorkGroupInfo)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, 
void *, size_t *); +extern cl_int (*rc_clGetPlatformIDs)(cl_uint, cl_platform_id *, cl_uint *); +extern cl_int (*rc_clGetProgramBuildInfo)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); +extern cl_int (*rc_clReleaseCommandQueue)(cl_command_queue); +extern cl_int (*rc_clReleaseContext)(cl_context); +extern cl_int (*rc_clReleaseDevice)(cl_device_id); +extern cl_int (*rc_clReleaseKernel)(cl_kernel); +extern cl_int (*rc_clReleaseMemObject)(cl_mem); +extern cl_int (*rc_clReleaseProgram)(cl_program); +extern cl_int (*rc_clSetKernelArg)(cl_kernel, cl_uint, size_t, const void *); + + +#define CLMAKETESTVARS() \ + int err = 0; \ + size_t global_work_size = 1; \ + cl_command_queue queue = NULL; + +#define _CLCREATEARG(_arg_index, _buffer, _flags, _arg_ptr, _arg_size) \ + { _buffer = rc_clCreateBuffer(context, _flags, _arg_size, NULL, &err); \ + if (err < 0) { fprintf(stderr, "Error while creating buffer for \"%s\". Error code: %d\n", #_arg_ptr, err); exit(-1); } \ + err = rc_clEnqueueWriteBuffer(queue, _buffer, CL_TRUE, 0, _arg_size, _arg_ptr, 0, NULL, NULL); \ + if (err < 0) { fprintf(stderr, "Error while writing to buffer for \"%s\". 
Error code: %d\n", #_arg_ptr, err); exit(-1); } \ + err = rc_clSetKernelArg(kernel, _arg_index, sizeof(cl_mem), &_buffer); \ + if (err < 0) { fprintf(stderr, "Error setting kernel argument for %s at index %u.\n", #_arg_ptr, _arg_index); exit(-1); } } + +#define CLCREATEARG_ARRAY(_arg_index, _buffer, _flags, _arg, _len) \ + _CLCREATEARG(_arg_index, _buffer, _flags, _arg, _len); + +#define CLCREATEARG(_arg_index, _buffer, _flags, _arg, _arg_size) \ + _CLCREATEARG(_arg_index, _buffer, _flags, &_arg, _arg_size); + +#define CLCREATEARG_DEBUG(_arg_index, _debug_buffer, _debug_ptr) \ + { _debug_ptr = calloc(DEBUG_LEN, sizeof(unsigned char)); \ + CLCREATEARG(_arg_index, _debug_buffer, CL_MEM_READ_WRITE, _debug_ptr, DEBUG_LEN); } + +#define CLCREATECONTEXT(_context_callback, _device_ptr) \ + rc_clCreateContext(NULL, 1, _device_ptr, _context_callback, NULL, &err); if (err < 0) { fprintf(stderr, "Failed to create context: %d\n", err); exit(-1); } + +#define CLCREATEQUEUE(_context, _device) \ + rc_clCreateCommandQueueWithProperties(_context, _device, NULL, &err); if (err < 0) { fprintf(stderr, "clCreateCommandQueueWithProperties failed: %d\n", err); exit(-1); } + +#define CLRUNKERNEL(_queue, _kernel, _gws_ptr) \ + { err = rc_clEnqueueNDRangeKernel(_queue, _kernel, 1, NULL, _gws_ptr, NULL, 0, NULL, NULL); if (err < 0) { fprintf(stderr, "clEnqueueNDRangeKernel failed: %d\n", err); exit(-1); } } + +#define CLFLUSH(_queue) \ + { err = rc_clFlush(_queue); if (err < 0) { fprintf(stderr, "clFlush failed: %d\n", err); exit(-1); } } + +#define CLWAIT(_queue) \ + { err = rc_clFinish(_queue); if (err == CL_INVALID_COMMAND_QUEUE) { fprintf(stderr, "\nError: clFinish() returned CL_INVALID_COMMAND_QUEUE (%d). This is often caused by running out of host memory. Sometimes, it can be worked around by lowering the GWS setting (see command line options; hint: try setting it to a multiple of the max compute units reported at the beginning of the program output. 
For example, if the MCU is 15, try setting the GWS parameter to 15 * 256 = 3840, 15 * 1024 = 15360, etc).\n", err); exit(-1); } else if (err < 0) { fprintf(stderr, "clFinish failed: %d\n", err); exit(-1); } } + +#define CLREADBUFFER(_buffer, _len, _ptr) \ + { err = rc_clEnqueueReadBuffer(queue, _buffer, CL_TRUE, 0, _len, _ptr, 0, NULL, NULL); if (err < 0) { fprintf(stderr, "clEnqueueReadBuffer failed: %d\n", err); exit(-1); } } + +#define CLFREEBUFFER(_buffer) \ + if (_buffer != NULL) { rc_clReleaseMemObject(_buffer); _buffer = NULL; } + +#define CLRELEASEQUEUE(_queue) \ + if (_queue != NULL) { rc_clReleaseCommandQueue(_queue); _queue = NULL; } + +#define CLRELEASECONTEXT(_context) \ + if (_context != NULL) { rc_clReleaseContext(_context); _context = NULL; } + +#define CLRELEASEKERNEL(_kernel) \ + if (_kernel != NULL) { rc_clReleaseKernel(_kernel); _kernel = NULL; } + +#define CLRELEASEPROGRAM(_program) \ + if (_program != NULL) { rc_clReleaseProgram(_program); _program = NULL; } + +#define LOADFUNC(_ocl, _func_name) \ + { rc_##_func_name = rc_dlsym(_ocl, #_func_name); \ + if (rc_##_func_name == NULL) { fprintf(stderr, "Error while loading function %s: %s\n", #_func_name, rc_dlerror()); exit(-1); } } + + +void context_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data); +void get_device_bool(cl_device_id device, cl_device_info param, cl_bool *b); +void get_platforms_and_devices(cl_uint platforms_buffer_size, cl_platform_id *platforms, cl_uint *num_platforms, cl_uint devices_buffer_size, cl_device_id *devices, cl_uint *num_devices, unsigned int verbose); +void get_device_str(cl_device_id device, cl_device_info param, char *buf, int buf_len); +void get_device_uint(cl_device_id device, cl_device_info param, cl_uint *u); +void get_device_ulong(cl_device_id device, cl_device_info param, cl_ulong *ul); +void load_kernel(cl_context context, cl_uint num_devices, const cl_device_id *devices, const char *path, const char *kernel_name, 
cl_program *program, cl_kernel *kernel, unsigned int hash_type); +void print_device_info(cl_device_id *devices, cl_uint num_devices); + +#endif diff --git a/perfectify.c b/perfectify.c new file mode 100644 index 0000000..d6b15c5 --- /dev/null +++ b/perfectify.c @@ -0,0 +1,416 @@ +/* + * Rainbow Crackalack: perfectify.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/* This program will compare a sorted rainbow table with a directory of sorted + * tables, and strip out chains with identical end points. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clock.h" + +#define CHAIN_SIZE (sizeof(uint64_t) * 2) +#define FREE(_ptr) { free(_ptr); _ptr = NULL; } +#define FCLOSE(_f) { if (_f != NULL) { fclose(_f); _f = NULL; } } +#define CLOSEDIR(_d) { if (_d != NULL) { closedir(_d); _d = NULL; } } + + +struct source_rt { + char *filename; + unsigned long num_chains; + uint64_t *table; + unsigned long num_pruned_chains; + unsigned long pruned_since_last_compression; + unsigned long in_table_duplicates; +}; + + +/* TODO: this is in misc.c too. Merge! 
/* Returns 1 if str ends with suffix, and 0 otherwise (including when either
 * argument is NULL).
 *
 * TODO: this is in misc.c too.  Merge! */
int str_ends_with(const char *str, const char *suffix) {
  size_t str_len;
  size_t suffix_len;


  if ((str == NULL) || (suffix == NULL))
    return 0;

  str_len = strlen(str);
  suffix_len = strlen(suffix);
  if (suffix_len > str_len)
    return 0;

  return strncmp(str + str_len - suffix_len, suffix, suffix_len) == 0;
}


/* Compacts a rainbow table in place (this involves stripping out chains that
 * have endpoints of zero).  The table is laid out as num_chains pairs of
 * (start, end) values.  Returns the new number of chains in the table. */
unsigned int compact_table(uint64_t *rainbow_table, unsigned int num_chains) {
  /* Indices are size_t so that "chain * 2" cannot wrap around when the table
   * holds 2^31 or more chains. */
  size_t current_chain = 0, compacted_chain = 0;
  uint64_t end = 0;


  if (rainbow_table == NULL)
    return 0;

  for (current_chain = 0; current_chain < num_chains; current_chain++) {
    end = rainbow_table[(current_chain * 2) + 1];
    if (end != 0) {
      rainbow_table[(compacted_chain * 2)] = rainbow_table[(current_chain * 2)];
      rainbow_table[(compacted_chain * 2) + 1] = end;
      compacted_chain++;
    }
  }

  /* compacted_chain <= num_chains, so this always fits. */
  return (unsigned int)compacted_chain;
}


/* Loads a rainbow table from disk, and returns it as a heap-allocated array
 * of (start, end) pairs, setting *num_chains accordingly.  The caller owns
 * the returned buffer and must free() it.  Trailing bytes that do not form a
 * complete chain are ignored.  Returns NULL (with *num_chains set to zero) on
 * error. */
uint64_t *load_rainbow_table(char *filename, unsigned long *num_chains) {
  const size_t chain_size = sizeof(uint64_t) * 2;
  FILE *f = NULL;
  uint64_t *rainbow_table = NULL;
  long file_size = 0;
  size_t num_values = 0;


  *num_chains = 0;

  f = fopen(filename, "rb");
  if (f == NULL) {
    fprintf(stderr, "Failed to open rainbow table file: %s\n", filename);
    return NULL;
  }

  /* Determine the file size.  ftell() returns -1 on failure, which must not
   * be used to size the allocation below. */
  if ((fseek(f, 0, SEEK_END) != 0) || ((file_size = ftell(f)) < 0) || (fseek(f, 0, SEEK_SET) != 0)) {
    fprintf(stderr, "Error while reading file: %s (%d)\n", strerror(errno), errno);
    fclose(f);
    return NULL;
  }

  /* Load the entire rainbow table at once.  At least one element is always
   * requested so that an empty file still yields a non-NULL buffer. */
  num_values = ((size_t)file_size / chain_size) * 2;
  rainbow_table = calloc((num_values > 0) ? num_values : 1, sizeof(uint64_t));
  if (rainbow_table == NULL) {
    fprintf(stderr, "Failed to allocate buffer for rainbow table file: %s\n", filename);
    fclose(f);
    return NULL;
  }

  if (fread(rainbow_table, sizeof(uint64_t), num_values, f) != num_values) {
    fprintf(stderr, "Error while reading file: %s (%d)\n", strerror(errno), errno);
    free(rainbow_table);  /* Do not leak the partially-filled table. */
    fclose(f);
    return NULL;
  }
  fclose(f);

  *num_chains = (unsigned long)(num_values / 2);
  return rainbow_table;
}
*/ +void prune_tables(struct source_rt *source_rts, unsigned int num_source_files, uint64_t *compare_table, unsigned long compare_num_chains) { + unsigned int i = 0; + long compare_current_chain = 0; + unsigned int pruned_chains = 0; + uint64_t compare_end = 0, process_end = 0; + unsigned long process_current_chain = 0, process_num_chains = 0; + uint64_t *process_table = NULL; + + + /* If compare table is NULL, look for in-table duplicates only. */ + if ((compare_table == NULL) && (compare_num_chains == 0)) { + for (i = 0; i < num_source_files; i++) { + uint64_t previous_process_end = 0; + uint64_t *process_table = source_rts[i].table; + + + for (process_current_chain = 0; process_current_chain < source_rts[i].num_chains; process_current_chain++) { + process_end = process_table[(process_current_chain * 2) + 1]; + + if ((process_end != 0) && (process_end == previous_process_end)) { + process_table[(process_current_chain * 2) + 1] = 0; + source_rts[i].in_table_duplicates++; + source_rts[i].num_pruned_chains++; + } else + previous_process_end = process_end; + } + } + + return; + } + + + for (i = 0; i < num_source_files; i++) { + process_table = source_rts[i].table; + process_num_chains = source_rts[i].num_chains; + process_current_chain = 0; + compare_current_chain = 0; + pruned_chains = 0; + + while ((process_current_chain < process_num_chains) && (compare_current_chain < compare_num_chains)) { + process_end = process_table[(process_current_chain * 2) + 1]; + compare_end = compare_table[(compare_current_chain * 2) + 1]; + + if (process_end == compare_end) { + /*printf("Dup: %lu %lu (%"PRIu64" == %"PRIu64")\n", process_current_chain, compare_current_chain, process_end, compare_end);*/ + + /* Set the endpoint of this chain to zero, so we know to delete it later. */ + process_table[(process_current_chain * 2) + 1] = 0; + + pruned_chains++; /* Number pruned with respect to current comparison table. */ + source_rts[i].num_pruned_chains++; /* Total number pruned w.r.t. 
all tables compared so far. */ + source_rts[i].pruned_since_last_compression++; /* Number of chains pruned since the process table was last compressed. */ + + process_current_chain++; + /*compare_current_chain++;*/ + } else if (process_end < compare_end) + process_current_chain++; + else if (process_end > compare_end) + compare_current_chain++; + else { + fprintf(stderr, "\n !!!! Error: this should never happen!\n\n"); fflush(stderr); + exit(-1); + } + } + + if (pruned_chains > 0) { + printf("Pruned %u chains from %s\n", pruned_chains, source_rts[i].filename); fflush(stdout); + } else { + printf("\n !! WARNING: no chains pruned while processing %s!\n\n", source_rts[i].filename); fflush(stdout); + } + + /* Every 8M chains pruned, compress the table to remove empty chains. This + * slightly speeds up comparisons. */ + if (source_rts[i].pruned_since_last_compression > (8 * 1024 * 1024)) { + printf(" -> Table compression reduced number of chains from %lu to ", process_num_chains); + source_rts[i].num_chains = process_num_chains = compact_table(process_table, process_num_chains); + printf("%lu.\n", process_num_chains); fflush(stdout); + source_rts[i].pruned_since_last_compression = 0; + } + } +} + + +int main(int ac, char **av) { + unsigned int i = 0, num_source_files = 0; + FILE *f = NULL; + char *rt_dir = NULL; + DIR *dir = NULL; + struct dirent *de = NULL; + uint64_t *compare_table = NULL; + unsigned long compare_num_chains = 0; + unsigned int num_total_rt_files = 0, num_compared_rt_files = 0; + + struct source_rt *source_rts = NULL; + struct timespec start_time = {0}; + struct stat st = {0}; + + + if (ac < 3) { + printf("Usage: %s rt_to_process.rt [rt_to_process2.rt ...] rt_dir/\n", av[0]); + return 0; + } + + /* Ensure all arguments except the last one refer to a path to a regular file. 
*/ + for (i = 1; i < ac - 1; i++) { + + if (stat(av[i], &st) < 0) { + fprintf(stderr, "Error: stat(%s): %s\n", av[i], strerror(errno)); fflush(stderr); + goto err; + } + + if ((st.st_mode & S_IFMT) == S_IFREG) + num_source_files++; + else { + fprintf(stderr, "Error: %s is not a file!\n", av[i]); fflush(stderr); + goto err; + } + } + + /* Ensure the last argument is a directory. */ + if (stat(av[ac - 1], &st) < 0) { + fprintf(stderr, "Error: stat(%s): %s\n", av[ac - 1], strerror(errno)); fflush(stderr); + goto err; + } + + if ((st.st_mode & S_IFMT) != S_IFDIR) { + fprintf(stderr, "Error: %s is not a directory!\n", av[ac - 1]); fflush(stderr); + goto err; + } + + /* Load the source files into an array of source_rt structs. */ + source_rts = calloc(num_source_files, sizeof(struct source_rt)); + if (source_rts == NULL) { + fprintf(stderr, "Failed to allocate buffer.\n"); fflush(stderr); + goto err; + } + + start_timer(&start_time); + for (i = 0; i < num_source_files; i++) { + source_rts[i].filename = av[i + 1]; + source_rts[i].table = load_rainbow_table(source_rts[i].filename, &source_rts[i].num_chains); + if (source_rts[i].table == NULL) { + fprintf(stderr, "Error: failed to load table: %s\n", source_rts[i].filename); fflush(stderr); + goto err; + } + } + printf("Loaded %u sources files in %.2f seconds.\n\n", num_source_files, get_elapsed(&start_time)); fflush(stdout); + + /* Set reference to compare directory. */ + rt_dir = av[ac - 1]; + + /* Prune sources with respect to each other. */ + if (num_source_files > 1) { + start_timer(&start_time); + for (i = 0; i < num_source_files; i++) { + compare_table = source_rts[i].table; + compare_num_chains = source_rts[i].num_chains; + prune_tables(&source_rts[i + 1], num_source_files - i - 1, compare_table, compare_num_chains); + } + printf("Self-pruned %u sources files in %.2f seconds.\n\n", num_source_files, get_elapsed(&start_time)); fflush(stdout); + } + + /* Open a handle to the directory. 
*/ + dir = opendir(rt_dir); + if (dir == NULL) { + fprintf(stderr, "Error while opening directory: %s: %s (%d)\n", rt_dir, strerror(errno), errno); fflush(stderr); + goto err; + } + + /* Count the number of *.rt files. */ + while ((de = readdir(dir)) != NULL) { + if (str_ends_with(de->d_name, ".rt")) + num_total_rt_files++; + } + rewinddir(dir); + printf("%u rainbow tables found in %s\n", num_total_rt_files, rt_dir); fflush(stdout); + + while ((de = readdir(dir)) != NULL) { + char filename[384] = {0}; + + + if (!str_ends_with(de->d_name, ".rt")) + continue; + + snprintf(filename, sizeof(filename) - 1, "%s/%s", rt_dir, de->d_name); + printf("[%u of %u] Comparing %u source files to table: %s...\n", num_compared_rt_files, num_total_rt_files, num_source_files, filename); fflush(stdout); + + start_timer(&start_time); + compare_table = load_rainbow_table(filename, &compare_num_chains); + if (compare_table == NULL) { + fprintf(stderr, "Error: failed to load table: %s\n", filename); fflush(stderr); + continue; + } + + prune_tables(source_rts, num_source_files, compare_table, compare_num_chains); + FREE(compare_table); + + printf("Finished processing in %.2f seconds.\n\n", get_elapsed(&start_time)); fflush(stdout); + num_compared_rt_files++; + } + CLOSEDIR(dir); + + + /* Prune duplicates within each table. */ + prune_tables(source_rts, num_source_files, NULL, 0); + + for (i = 0; i < num_source_files; i++) { + printf("Total pruned chains for %s: %lu; in-table duplicates: %lu\n", source_rts[i].filename, source_rts[i].num_pruned_chains, source_rts[i].in_table_duplicates); fflush(stdout); + + /* If any chains were pruned since we last compressed the table, its time to + * compress it again. */ + if ((source_rts[i].pruned_since_last_compression > 0) || (source_rts[i].in_table_duplicates > 0)) + source_rts[i].num_chains = compact_table(source_rts[i].table, source_rts[i].num_chains); + + /* If any chains were pruned at all, its time to update the source table. 
*/ + if ((source_rts[i].num_pruned_chains > 0) || (source_rts[i].in_table_duplicates > 0)) { + if (source_rts[i].num_chains == 0) { + printf("\n!!! NOTE: resulting file is empty! Deleting original...\n\n"); fflush(stdout); + if (unlink(source_rts[i].filename) < 0) { + fprintf(stderr, "Failed to delete file: %s: %s (%d)\n", source_rts[i].filename, strerror(errno), errno); + goto err; + } + } else { + f = fopen(source_rts[i].filename, "w"); + if (f == NULL) { + fprintf(stderr, "Failed to open %s for writing!\n", source_rts[i].filename); fflush(stderr); + goto err; + } + + if (fwrite(source_rts[i].table, sizeof(uint64_t), source_rts[i].num_chains * 2, f) != source_rts[i].num_chains * 2) { + fprintf(stderr, "Failed to write to %s!\n", source_rts[i].filename); fflush(stderr); + goto err; + } + + FCLOSE(f); + + /* + * Make a copy of the original filename we are processing, since we are + * going to modify it. * + temp_str = strdup(rt_to_process); + if (temp_str == NULL) { + fprintf(stderr, "Failed to copy rt_to_process string!\n"); fflush(stderr); + goto err; + } + + xpos = strchr(temp_str, 'x'); + uspos = strrchr(temp_str, '_'); + if ((xpos != NULL) && (uspos != NULL)) { + * Cut off everything after the x character. Example: + * "/path/ntlm_ascii-32-95#8-8_0_422000x67108864_0.rt" -> + * "/path/ntlm_ascii-32-95#8-8_0_422000x" * + xpos++; + *xpos = '\0'; + + * Make a new filename with the correct number of chains. * + snprintf(rt_to_process_new, sizeof(rt_to_process_new) - 1, "%s%lu%s", temp_str, process_num_chains, uspos); + + * Rename the file using the updated number of chains. 
* + if (rename(rt_to_process, rt_to_process_new) < 0) { + fprintf(stderr, "Failed to rename %s to %s!\n", rt_to_process, rt_to_process_new); fflush(stderr); + goto err; + } + + printf("\nSuccessfully wrote pruned table to %s.\n\n", rt_to_process_new); fflush(stdout); + } else { + + fprintf(stderr, "Failed to parse filename: %s\n", temp_str); fflush(stderr); + goto err; + } + FREE(temp_str); + */ + printf("\nSuccessfully wrote pruned table to %s.\n\n", source_rts[i].filename); fflush(stdout); + } + } else { + printf("\n !! WARNING: no chains pruned from %s.\n\n", source_rts[i].filename); fflush(stdout); + } + } + + + for (i = 0; i < num_source_files; i++) { + FREE(source_rts[i].table); + } + FREE(source_rts); + return 0; + err: + FCLOSE(f); + CLOSEDIR(dir); + + if (source_rts != NULL) { + for (i = 0; i < num_source_files; i++) { + FREE(source_rts[i].table); + } + FREE(source_rts); + } + + FREE(compare_table); + /*FREE(temp_str);*/ + return -1; +} diff --git a/rtc_decompress.c b/rtc_decompress.c new file mode 100644 index 0000000..04e6745 --- /dev/null +++ b/rtc_decompress.c @@ -0,0 +1,159 @@ +/* + * Rainbow Crackalack: rtc_decompress.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include + +#include "rtc_decompress.h" + + +/* Uncompresses an RTC file and returns a pointer to the rainbow table, along with the + * number of chains in it. 
/* Uncompresses an RTC file and returns a pointer to the rainbow table, along
 * with the number of chains in it.  Returns 0 on success, or a negative error
 * code.
 *
 * The number of chains is parsed from the file name, which must end in
 * "x<num_chains>_<index>.rtc".  On success *ret_uncompressed_table points to
 * a heap buffer of num_chains (start, end) pairs that the caller must free().
 * On any error *ret_uncompressed_table is NULL and *ret_num_chains is 0.
 *
 * Each chain is read from the file straight into native 64-bit words, so the
 * file is interpreted in the byte order of the host. */
int rtc_decompress(char *filename, uint64_t **ret_uncompressed_table, unsigned int *ret_num_chains) {
  const char *fn_ptr = NULL;
  FILE *f = NULL;
  unsigned int i = 0, chain_size = 0, unused = 0, num_chains = 0;
  size_t table_ptr = 0;
  int ret = 0;
  uint64_t *uncompressed_table = NULL;

  unsigned int uVersion = 0;
  unsigned short uIndexSBits = 0;
  unsigned short uIndexEBits = 0;
  uint64_t uIndexSMin = 0, uIndexEMin = 0, uIndexEInterval = 0;

  uint64_t s = 0, e = 0, s_mask = 0, buf[2] = {0};


  *ret_uncompressed_table = NULL;
  *ret_num_chains = 0;

  /* sscanf(), below, is greedy when parsing "%s".  So we will skip past all
   * the strings in the filename by starting just after the last 'x'. */
  if (filename != NULL)
    fn_ptr = strrchr(filename, 'x');

  if (fn_ptr == NULL) {
    fprintf(stderr, "Error: failed to parse number of chains from filename: %s\n", (filename != NULL) ? filename : "(null)");
    ret = -1;
    goto done;
  }
  fn_ptr++;

  if (sscanf(fn_ptr, "%u_%u.rtc", &num_chains, &unused) != 2) {
    fprintf(stderr, "Error: failed to parse number of chains from filename: %s\n", fn_ptr);
    ret = -1;
    goto done;
  }

  uncompressed_table = calloc(num_chains, sizeof(uint64_t) * 2);
  if (uncompressed_table == NULL) {
    fprintf(stderr, "Error: could not allocate %"PRIu64" bytes in memory for uncompressed table.\n", (uint64_t)num_chains * sizeof(uint64_t) * 2);
    ret = -2;
    goto done;
  }

  f = fopen(filename, "rb");
  if (f == NULL) {
    fprintf(stderr, "Error: failed to open RTC file %s: %s\n", filename, strerror(errno));
    ret = -3;
    goto done;
  }

  if ((fread(&uVersion, sizeof(unsigned int), 1, f) != 1) || \
      (fread(&uIndexSBits, sizeof(unsigned short), 1, f) != 1) || \
      (fread(&uIndexEBits, sizeof(unsigned short), 1, f) != 1) || \
      (fread(&uIndexSMin, sizeof(uint64_t), 1, f) != 1) || \
      (fread(&uIndexEMin, sizeof(uint64_t), 1, f) != 1) || \
      (fread(&uIndexEInterval, sizeof(uint64_t), 1, f) != 1)) {
    fprintf(stderr, "Error while reading RTC header: %s (%d)\n", strerror(errno), errno);
    ret = -4;
    goto done;
  }

  /* The header must start with this 32-bit magic value. */
  if (uVersion != 0x30435452) {
    fprintf(stderr, "Error: RTC header invalid.\n");
    ret = -5;
    goto done;
  }

  if ((uIndexSBits > 64) || (uIndexEBits > 64)) {
    fprintf(stderr, "Error: uIndexSBits and/or uIndexEBits is greater than 64: %u %u\n", uIndexSBits, uIndexEBits);
    ret = -5;
    goto done;
  }

  /* Each chain occupies just enough whole bytes to hold both bit fields. */
  chain_size = (uIndexSBits + uIndexEBits + 7) / 8;
  if (chain_size > 16) {
    fprintf(stderr, "Error: chain size is somehow greater than 16: %u\n", chain_size);
    ret = -6;
    goto done;
  }

  /* Mask covering the low uIndexSBits bits.  A shift by 64 is undefined, so
   * the full-width case is handled separately. */
  s_mask = (uIndexSBits >= 64) ? UINT64_MAX : ((UINT64_C(1) << uIndexSBits) - 1);

  for (i = 0; i < num_chains; i++) {
    buf[0] = 0;
    buf[1] = 0;
    if (fread(buf, chain_size, 1, f) != 1) {
      fprintf(stderr, "Error while reading chain: %s (%d)\n", strerror(errno), errno);
      ret = -7;
      goto done;
    }

    s = (buf[0] & s_mask) + uIndexSMin;

    /* The end field starts right after the start field and may spill into
     * buf[1].  Widths of 0 and 64 are special-cased because they would
     * otherwise require an (undefined) shift by 64 bits. */
    if (uIndexSBits == 0)
      e = buf[0];
    else if (uIndexSBits == 64)
      e = buf[1];
    else
      e = (buf[0] >> uIndexSBits) | (buf[1] << (64 - uIndexSBits));
    e += uIndexEMin + (uIndexEInterval * i);

    uncompressed_table[table_ptr] = s;
    table_ptr++;
    uncompressed_table[table_ptr] = e;
    table_ptr++;
  }

 done:
  if (f != NULL) {
    fclose(f);
    f = NULL;
  }

  /* On error, free the table.  Set the table pointer to NULL along with
   * num_chains to zero so that the caller gets correct output, even when the
   * failure happened before the table was allocated. */
  if (ret != 0) {
    free(uncompressed_table);
    uncompressed_table = NULL;
    num_chains = 0;
  }

  *ret_uncompressed_table = uncompressed_table;
  *ret_num_chains = num_chains;
  return ret;
}
+TEMPDIR=`$MKTEMP -d` +SUBDIR="Rainbow Crackalack $VERSION" +$MKDIR $TEMPDIR/"$SUBDIR" + +# Copy in the exe files, along with the entire CL directory. +$CP *.exe $TEMPDIR/"$SUBDIR" +$CP shared.h $TEMPDIR/"$SUBDIR" +$CP -R CL $TEMPDIR/"$SUBDIR" +$RM -f $TEMPDIR/"$SUBDIR"/CL/*~ + +# Make a zip file of the "Rainbow Crackalack vX" directory. +pushd $TEMPDIR +ZIP_FILENAME=Rainbow_Crackalack_Win64_$VERSION.zip +$ZIP -r $ZIP_FILENAME "$SUBDIR" +popd + + +if [ ! -f $TEMPDIR/$ZIP_FILENAME ]; then + echo -e "\n\nFailed to create $ZIP_FILENAME!\n" + exit -1 +fi + +$MV $TEMPDIR/$ZIP_FILENAME . +$RM -rf $TEMPDIR + +echo -e "\n\nSuccessfully created $ZIP_FILENAME!\n" +exit 0 + diff --git a/scripts/compress.py b/scripts/compress.py new file mode 100755 index 0000000..ae8a9bb --- /dev/null +++ b/scripts/compress.py @@ -0,0 +1,189 @@ +#!/usr/bin/python3 +# +# Rainbow Crackalack: compress.py +# Copyright (C) 2019 Joe Testa +# +# This program is free software: you can redistribute it and/or modify +# it under the terms version 3 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +# This program *only* works on the snap-packaged version of the rainbowcrack tools. +# Install them with: +# +# # snap install --beta rainbowcrack + + +# TODO: verify RTC files. + +import os, shutil, subprocess, sys, tempfile, time + + +RT2RTC = 'rainbowcrack.rt2rtc' +COMMON_DIR = 'snap/rainbowcrack/common' + + +def p(s): + print(s, flush=True) + + +# Compresses a single RT file in a temp directory using the most efficient values for +# the start and end bits. 
# Compresses a single RT file in a temp directory using the most efficient values for
# the start and end bits.
#
# Starting with values of 1 each, the rt2rtc program is run on the RT file.  It will
# output the optimal values, which we will parse.  rt2rtc is invoked up to three
# times; as soon as an invocation reports that it is writing an RTC file, the loop
# stops.
#
# Terminates the program on error, or returns a tuple containing: the path of the RTC
# file (as printed by rt2rtc), the start bits, and the end bits.
def compress_rt(relative_temp_dir):
    start_marker = 'minimal value of start_point_bits is '
    end_marker = 'minimal value of end_point_bits is '
    write_marker = 'writing '

    start_bits = 1
    end_bits = 1
    rtc_path = None
    for _ in range(3):
        args = [RT2RTC, relative_temp_dir, '-s', str(start_bits), '-e', str(end_bits)]

        proc = subprocess.run(args, stdout=subprocess.PIPE)
        output = proc.stdout.decode('ascii')

        # Loop through all the lines of the output of the rt2rtc program.  Parse out
        # the start and end bit values.  The number is taken from just after the
        # marker text, rather than from a fixed column, since the two markers have
        # different lengths.
        for line in output.split("\n"):
            pos = line.find(start_marker)
            if pos != -1:
                start_bits = int(line[pos + len(start_marker):])

            pos = line.find(end_marker)
            if pos != -1:
                end_bits = int(line[pos + len(end_marker):])

            if line.find(write_marker) != -1:
                # Drop the leading 'writing ' and the last three characters of
                # the line.
                rtc_path = line.strip()[8:-3]

        if rtc_path is not None:
            break

    if rtc_path is None:
        print("ERROR: failed to compress file. :(")
        sys.exit(-1)

    return rtc_path, start_bits, end_bits
# Moves an RTC file from the 'common' directory to the result directory.  Renames the
# file (replacing everything after its last underscore with "_<n>.rtc", n = 1, 2, ...)
# if the destination already exists.  Returns the path of the RTC file in the result
# directory.
def move_rtc(rtc_filename, result_dir):
    rtc_src_path = os.path.join(COMMON_DIR, rtc_filename)

    index = 1
    while True:
        rtc_dst_path = os.path.join(result_dir, rtc_filename)
        if not os.path.exists(rtc_dst_path):
            shutil.move(rtc_src_path, rtc_dst_path)
            return rtc_dst_path

        # Destination is taken; try the next index.
        upos = rtc_filename.rfind('_')
        rtc_filename = "%s_%u.rtc" % (rtc_filename[:upos], index)
        index += 1


# Sort key for RT filenames: orders by the table index (i.e.: 'table_0.rt',
# 'table_1.rt', etc).  Names without a numeric index sort last, by name, instead of
# aborting the program.
def _table_index_key(filename):
    try:
        return (0, int(filename[filename.rfind('_') + 1:filename.rfind('.')]), filename)
    except ValueError:
        return (1, 0, filename)


# Compresses every RT file in the directory given by sys.argv[1] (relative to
# COMMON_DIR) and stores the resulting RTC files in the directory given by
# sys.argv[2].
def main():
    if len(sys.argv) < 3:
        print("\nThis program runs rainbowcrack's rt2rtc program on a directory of RT files.\n\nBecause rt2rtc does not easily handle the case when two RT files produce the same RTC filename, and because it is not clear if it automatically uses the optimal start and end bit values, this program is preferred for compressing rainbow tables.\n\nIt depends on the 'rainbowcrack' snap package (as root, run \"snap install --beta rainbowcrack\"). Hence, it works on Linux only.\n\nPlace a directory of RT files in the ~/snap/rainbowcrack/common/ directory, and run with:\n\n $ python3 %s relative_rt_directory absolute_rtc_directory\n\nNote that the RT input directory path must be relative to the 'common' directory, but the RTC output directory path is absolute. i.e.: If the RT files are in ~/snap/rainbowcrack/common/rt_files/, and the output should go in ~/rtc, run the program with:\n\n $ python3 %s rt_files ~/rtc\n" % (sys.argv[0], sys.argv[0]))
        # No arguments at all is a request for help; a single argument is an error.
        sys.exit(0 if len(sys.argv) == 1 else -1)

    rt_dir = sys.argv[1]      # Relative to COMMON_DIR
    result_dir = sys.argv[2]  # Absolute path.

    # Ensure that the common directory exists.
    if not os.path.exists(COMMON_DIR):
        print("Error: cannot find rainbowcrack\'s common directory: %s. Did you install the rainbowcrack snap package? (Hint: snap install --beta rainbowcrack)\n\nOtherwise, ensure you are in the current user's base home directory.\n" % COMMON_DIR)
        sys.exit(-1)

    # Look for existing *.rtc files in the common dir.  Terminate if any are found.
    for source_filename in os.listdir(COMMON_DIR):
        if source_filename.endswith('.rtc'):
            p("\nError: an RTC file already exists in %s: %s" % (COMMON_DIR, source_filename))
            sys.exit(-1)

    # Check that the RT directory exists within the common directory.
    if not os.path.isdir(os.path.join(COMMON_DIR, rt_dir)):
        print("Error: could not find directory named %s in %s." % (rt_dir, COMMON_DIR))
        sys.exit(-1)

    # If the result directory path exists, ensure it is a directory.
    if os.path.exists(result_dir) and not os.path.isdir(result_dir):
        print('Error: %s exists, but is not a directory.' % result_dir)
        sys.exit(-1)

    # Create a temp dir to copy each RT file into during processing.
    temp_dir_abs = tempfile.mkdtemp(dir=COMMON_DIR)

    # Get the relative path (with respect to the common dir) of the temp dir.
    relative_temp_dir = os.path.basename(temp_dir_abs)

    # Create the result directory if it does not already exist.
    if not os.path.isdir(result_dir):
        os.mkdir(result_dir)

    print("\nRTC files will be stored in %s." % result_dir)
    total_rt_bytes = 0
    total_rtc_bytes = 0

    # Put all the source filenames into a list up front.  Since files will be
    # moving in and out of the directory, this ensures that os.listdir() doesn't
    # get confused and return the same file multiple times.
    source_filenames = [name for name in os.listdir(os.path.join(COMMON_DIR, rt_dir)) if name.endswith('.rt')]
    source_filenames.sort(key=_table_index_key)

    try:
        # Process each table.
        for source_filename in source_filenames:
            source_filename_abs = os.path.join(COMMON_DIR, rt_dir, source_filename)
            source_filename_temp_abs = os.path.join(temp_dir_abs, source_filename)
            shutil.move(source_filename_abs, source_filename_temp_abs)

            try:
                compress_start = time.time()
                p("\nCompressing %s..." % source_filename)
                rtc_filename_relative, _unused1, _unused2 = compress_rt(relative_temp_dir)
                p(" Compression finished in %.1f seconds." % float(time.time() - compress_start))

                # Move the RTC into result directory.
                rtc_filename_abs = move_rtc(rtc_filename_relative, result_dir)
            finally:
                # Move the source file back, even when compression failed, so
                # that it is never left stranded in the temp directory.
                shutil.move(source_filename_temp_abs, source_filename_abs)

            rtc_size = os.path.getsize(rtc_filename_abs)
            p(" Compressed size is %.2f GB." % float(rtc_size / (1024 ** 3)))

            total_rtc_bytes += rtc_size
            total_rt_bytes += os.path.getsize(source_filename_abs)
    finally:
        os.rmdir(temp_dir_abs)

    print("\n\n-------------------------------------\n")
    print("Raw table size: %.2f GB" % (total_rt_bytes / (1024 ** 3)))
    print("Compressed table size: %.2f GB" % (total_rtc_bytes / (1024 ** 3)))
    if total_rt_bytes > 0:
        print("Compression rate: %.0f%%" % (((total_rt_bytes - total_rtc_bytes) / total_rt_bytes) * 100))
    else:
        # Nothing was processed; avoid dividing by zero.
        print("Compression rate: n/a (no RT files were processed)")


if __name__ == '__main__':
    main()
    sys.exit(0)
# This tool generates random passwords of a certain length and calculates their NTLM
# hashes.  It is useful for testing the effectiveness of a set of rainbow tables.

import hashlib, os, secrets, sys

CHARSET_ASCII_32_95 = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"


# Returns a random password of num_chars characters, each drawn uniformly from
# CHARSET_ASCII_32_95.  (Reducing a random byte modulo 95 would favour the first
# 66 characters of the charset, since 256 is not a multiple of 95.)
def generate_password(num_chars):
    return ''.join(secrets.choice(CHARSET_ASCII_32_95) for _ in range(num_chars))


# Returns the NTLM hash (MD4 over the UTF-16LE encoding) of a password, as a hex
# string.
def ntlm_hash(password):
    return hashlib.new('md4', password.encode('utf-16le')).hexdigest()


# Parses the command line, generates the passwords, and writes the hashes and the
# plaintexts to two separate files in the current directory.
def main():
    if (len(sys.argv) != 3):
        print("Usage: %s [number of chars] [number of passwords]\n\nProgram will generate the specified number random passwords, each composed of the specified number of characters. For example, to generate 1,000 passwords of length 8:\n\n %s 8 1000\n" % (sys.argv[0], sys.argv[0]))
        sys.exit(-1)

    try:
        num_pw_chars = int(sys.argv[1])
        num_passwords = int(sys.argv[2])
    except ValueError:
        print("Number of characters and number of passwords must both be integers.")
        sys.exit(-1)

    if (num_pw_chars < 1) or (num_pw_chars > 16):
        print("Number of characters must be between 1 and 16.")
        sys.exit(-1)
    elif (num_passwords < 1):
        print("Number of passwords must be greater than 0.")
        sys.exit(-1)

    print("Generating %d %d-character random passwords..." % (num_passwords, num_pw_chars))
    passwords = [generate_password(num_pw_chars) for _ in range(num_passwords)]
    hashes = [ntlm_hash(password) for password in passwords]

    hash_filename = "random_ntlm_hashes_%d_chars.txt" % num_pw_chars
    passwords_filename = "random_passwords_%d_chars.txt" % num_pw_chars

    # Write the hashes to the file.
    with open(hash_filename, 'w') as f:
        f.write("\n".join(hashes))

    # In a separate file, write the corresponding plaintext passwords.
    with open(passwords_filename, 'w') as f:
        f.write("\n".join(passwords))

    print("\nNTLM hashes stored in: %s\nPlaintext passwords stored in: %s" % (hash_filename, passwords_filename))


if __name__ == '__main__':
    main()
+ +import hashlib, sys + + +def table_index_to_reduction_offset(table_index): + return table_index * 65536 + + +def fill_plaintext_space_table(charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index): + n = 1 + + plaintext_space_up_to_index[0] = 0 + for i in range(1, plaintext_len_max + 1): + n = n * charset_len + if (i < plaintext_len_min): + plaintext_space_up_to_index[i] = 0 + else: + plaintext_space_up_to_index[i] = plaintext_space_up_to_index[i - 1] + n + + return plaintext_space_up_to_index[plaintext_len_max] + + +def index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index): + + plaintext = '' + plaintext_len = 0 + + + if plaintext_len_min == 9: + for i in range(0, 9): + plaintext = plaintext + charset[ (index & 0xff) % charset_len ] + index = index >> 7 + + return plaintext + + + for i in range(plaintext_len_max - 1, plaintext_len_min - 1 - 1, -1): + if (index >= plaintext_space_up_to_index[i]): + plaintext_len = i + 1 + break + + index_x = index - plaintext_space_up_to_index[plaintext_len - 1] + for i in range(plaintext_len - 1, -1, -1): + plaintext = charset[index_x % charset_len] + plaintext + index_x = int(index_x / charset_len) + + return plaintext + + +def do_hash(plaintext): + return hashlib.new('md4', plaintext.encode('utf-16le')).digest() + + +def hash_to_index(hash, reduction_offset, plaintext_space_total, pos): + return (int.from_bytes(hash, byteorder='little') + reduction_offset + pos) % plaintext_space_total; + + +def print_plaintexts(start_index, end_index, charset, charset_len, plaintext_space_up_to_index, plaintext_len_min, plaintext_len_max, reduction_offset, chain_len): + + print("Position\tPlaintext\tHash\tHash Index") + + index = start_index + for pos in range(0, chain_len - 1): + plaintext = index_to_plaintext(index, charset, charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index) + + hash = do_hash(plaintext) + + print("%u\t%s\t%s\t" 
% (pos, plaintext, hash.hex()), end='') + index = hash_to_index(hash[0:8], reduction_offset, plaintext_space_total, pos) + print(index) + + if index != end_index: + print("\nERROR: end index is expected to be %u, but computed %u.\n" % (end_index, index)) + + return + + +def run_self_tests(): + plaintext_space_up_to_index = [None] * 16 + charset = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" + charset_len = len(charset) + + plaintext_space_total = fill_plaintext_space_table(charset_len, 8, 8, plaintext_space_up_to_index); + + # Self-test: index_to_plaintext() + plaintext = index_to_plaintext(5222991064626285, charset, charset_len, 8, 8, plaintext_space_up_to_index) + if plaintext != 'jk5(J-f\\': + print("index_to_plaintext() failed self-test! Expected: %s. Got: %s." % ('jk5(J-f\\', plaintext)) + exit(-1) + + # Self-test: do_hash() + hash = do_hash('C1t1z3n#')[0:8] + if hash.hex() != 'ff0bc475edd85a6a': + print("do_hash() failed self-test! Expected: %s. Got: %s." % ('ff0bc475edd85a6a', hash.hex())) + exit(-1) + + # Self-test: hash_to_index() + index = hash_to_index(b"\x12\x34\x56\x78\x9a\xbc\xde\xf0", 0, plaintext_space_total, 666) + if (index != 1438903040496756): + print("hash_to_index() failed self-test! Expected: %u. Got: %u.\n" % (1438903040496756, index)) + exit(-1) + + +run_self_tests() + +if len(sys.argv) != 5: + print("\nThis script takes a start index, end index, the plaintext min & max (which must both be the same), and a chain length, then outputs all the plaintexts. 
It is assumed that the us-ascii-32-95 character set is in use, along with a table index of 0.") + print() + print("Usage:\n\n %s start_index end_index plaintext_min_max chain_length\n" % sys.argv[0]) + exit(-1) + +start_index = int(sys.argv[1]) +end_index = int(sys.argv[2]) +min_max = int(sys.argv[3]) +chain_len = int(sys.argv[4]) + +table_index = 0 +plaintext_len_min = plaintext_len_max = min_max + +plaintext_space_up_to_index = [None] * 16 +charset = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" +charset_len = len(charset) + + +print("\nEnumerating chain (%u, %u) using the following parameters:\n\tcharset: ascii-32-95\n\tplaintext len min: %u\n\tplaintext len max: %u\n\ttable index: %u\n\tchain len: %u\n" % (start_index, end_index, plaintext_len_min, plaintext_len_max, table_index, chain_len)) + +plaintext_space_total = fill_plaintext_space_table(charset_len, plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + +print_plaintexts(start_index, end_index, charset, charset_len, plaintext_space_up_to_index, plaintext_len_min, plaintext_len_max, table_index_to_reduction_offset(table_index), chain_len) diff --git a/scripts/generate_ntlm_8char_script.sh b/scripts/generate_ntlm_8char_script.sh new file mode 100755 index 0000000..2e23929 --- /dev/null +++ b/scripts/generate_ntlm_8char_script.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +if [[ $# != 2 ]]; then + echo "This tool creates bash scripts to generate NTLM 8-character rainbow tables." 
+ echo + echo "To make a script that creates tables from part index 50 to 75:" + echo + echo " $0 50 75 > generate_ntlm_8char_50-75.sh; chmod 0700 *.sh" + exit 1 +fi + +start=$1 +end=$2 + +echo "#!/bin/bash" +echo +for i in `seq $start $end`; do + echo "./crackalack_gen ntlm ascii-32-95 8 8 0 422000 67108864 $i" +done diff --git a/scripts/make_rts.py b/scripts/make_rts.py new file mode 100755 index 0000000..6811a06 --- /dev/null +++ b/scripts/make_rts.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 +# +# Rainbow Crackalack: make_rts.py +# Copyright (C) 2018-2019 Joe Testa +# +# This program is free software: you can redistribute it and/or modify +# it under the terms version 3 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +# This will create 96 rainbow tables with 1K chains each. Useful for +# testing the rt_compress.py script. + +import subprocess + +for i in range(0, 96): + proc = subprocess.run(['rainbowcrack.rtgen', 'ntlm', 'ascii-32-95', '8', '8', '0', '100', '1024', str(i)]) diff --git a/scripts/merge.py b/scripts/merge.py new file mode 100755 index 0000000..7536470 --- /dev/null +++ b/scripts/merge.py @@ -0,0 +1,252 @@ +#!/usr/bin/python3 + +import os, shutil, subprocess, sys + +CRACKALACK_VERIFY = './crackalack_verify' +ONE_GB = 1024 * 1024 * 1024 +MERGE_DIR_RELATIVE = 'merge' +COMMON_DIR_ABS = os.path.join(os.getcwd(), 'snap/rainbowcrack/common') + + +# Returns the lowest-indexed file from the list of input files. The returned file name +# is removed from the dictionary, hence subsequent calls will return the next lowest- +# numbered file. 
# Returns None when no file names remain.
#
# input_files: dict mapping table index (int) -> path of the table file.  The
#              entry that is returned is removed from the dict.
def get_next_input_file(input_files):
    if not input_files:
        return None

    # Iterating a dict yields its keys, so min() picks the lowest table index
    # without relying on a magic "larger than any index" sentinel.
    return input_files.pop(min(input_files))


# Merges tables into a single 1 GB file, then moves it to the output directory.
#
# merge_dir_abs:  working directory holding the tables to merge; it is deleted
#                 and re-created (empty) once the merged file has been moved.
# input_dir_abs:  unused by this function; kept so existing callers still work.
# output_dir_abs: directory that receives the merged table.
# output_index:   index number used as the suffix of the merged table's name.
#
# Terminates the program if rtmerge or crackalack_verify cannot be run or return
# a non-zero exit code, if no merged *.rt file is found, if the merged file is
# larger than 1 GB, or if its name cannot be parsed.
def merge_tables(merge_dir_abs, input_dir_abs, output_dir_abs, output_index):
    try:
        proc = subprocess.run(['rainbowcrack.rtmerge', MERGE_DIR_RELATIVE])
        if proc.returncode != 0:
            print("rtmerge returned non-zero exit code (%u). Terminating..." % proc.returncode)
            exit(-1)
    except FileNotFoundError as e:
        print("Error: could not run rainbowcrack.rtmerge!")
        exit(-1)
    except subprocess.SubprocessError as e:
        print("Error: SubprocessError: %s" % e)
        exit(-1)

    # Find the file that rtmerge created.  If several *.rt files exist, the last
    # one listed wins (the driver refuses to start when any are present).
    merged_filename_abs = None
    for f in os.listdir(COMMON_DIR_ABS):
        if f.endswith('.rt'):
            merged_filename_abs = os.path.join(COMMON_DIR_ABS, f)

    if merged_filename_abs is None:
        print("Error: failed to find merged filename!")
        exit(-1)

    # Run crackalack_verify --sorted on the merged file.
    try:
        proc = subprocess.run([CRACKALACK_VERIFY, '--sorted', merged_filename_abs])
        if proc.returncode != 0:
            print("crackalack_verify returned non-zero exit code (%u). Terminating..." % proc.returncode)
            exit(-1)
    except FileNotFoundError as e:
        print("Error: could not run crackalack_verify!")
        exit(-1)
    except subprocess.SubprocessError as e:
        print("Error: SubprocessError: %s" % e)
        exit(-1)

    # If the file is greater than 1 GB, then this is a hard error. Otherwise, it is
    # normal for the last run to yield a single file less than 1 GB.
    merged_size = os.path.getsize(merged_filename_abs)
    if merged_size > ONE_GB:
        print("\nError: %s is greater than 1 GB!: %u" % (merged_filename_abs, merged_size))
        exit(-1)
    elif merged_size < ONE_GB:
        print("\n !! Warning: %s is less than 1 GB (%u). Make sure this is only happening once at the end.\n\n" % (merged_filename_abs, merged_size))

    # Find the last underscore so we can rename the output file with the appropriate
    # index number.
    filename = os.path.basename(merged_filename_abs)
    upos = filename.rfind('_')
    if upos == -1:
        print("Error: could not parse filename: %s" % filename)
        exit(-1)

    # Move the merged file from the working directory to the output directory.
    merged_filename_abs_new = os.path.join(output_dir_abs, "%s_%u.rt" % (filename[0:upos], output_index))
    print("Moving merged table from %s to %s..." % (merged_filename_abs, merged_filename_abs_new))
    shutil.move(merged_filename_abs, merged_filename_abs_new)

    # Delete all the working files.
    print("Deleting merge directory...\n")
    shutil.rmtree(merge_dir_abs)
    os.mkdir(merge_dir_abs)


# Puts rainbow tables into the "merge" directory. Ensures that their sizes add up to
# at most 1 GB, truncating the last copied table if needed, and returns the
# left-over chains.
#
# input_files:    dict of table index -> path; consumed lowest index first.
# merge_dir_abs:  working directory the tables are copied into.
# extra_filename: file name to store extra_data under (ignored when extra_data
#                 is None).
# extra_data:     bytes carried over from the previous call, or None.
#
# Returns a tuple:
# boolean: True when there are files to be merged, False when no files are left.
# filename: The filename that contained the left-over bytes from this block
# data: The chains that overflowed from this block.
def set_merged_dir(input_files, merge_dir_abs, extra_filename, extra_data):
    ret = False
    leftover_filename = None
    leftover_data = None
    merge_size = 0

    # If any data from the last invocation needs to be carried over, add it to the
    # working directory first.
    if extra_data is not None:
        extra_filename_abs = os.path.join(merge_dir_abs, extra_filename)
        print("Writing chains from previous block to: %s" % extra_filename_abs)
        with open(extra_filename_abs, "wb") as f:
            merge_size = f.write(extra_data)
        print("merge_size starting at: %u" % merge_size)
        ret = True

    # Copy files to the working directory until >= 1 GB is reached.
    while True:
        rt_file = get_next_input_file(input_files)
        if rt_file is None:
            print("Input files exhausted.")
            break

        rt_file_size = os.path.getsize(rt_file)
        merge_size += rt_file_size

        print("Copying %s to %s..." % (rt_file, merge_dir_abs))
        shutil.copy(rt_file, merge_dir_abs)
        ret = True

        # If we surpassed 1 GB, store the excess for the next invocation.
        if merge_size > ONE_GB:
            extra_bytes = merge_size - ONE_GB
            offset = rt_file_size - extra_bytes

            print("Source files are greater than 1GB: %u; extra bytes: %u" % (merge_size, extra_bytes))
            # From here on, operate on the copy in the merge directory; the source
            # table is left untouched.
            rt_file = os.path.join(merge_dir_abs, os.path.basename(rt_file))
            print("Truncating %s from %u to %u..." % (rt_file, rt_file_size, offset))
            with open(rt_file, "r+b") as f:
                f.seek(offset, 0)
                leftover_data = f.read(extra_bytes)
                f.truncate(offset)

            leftover_filename = os.path.basename(rt_file)
            xpos = leftover_filename.find('x')
            upos = leftover_filename.rfind('_')
            if (xpos == -1) or (upos == -1):
                print("Error: could not parse filename: %s" % leftover_filename)
                exit(-1)

            # Rewrite the number between the first 'x' and the last '_' with the
            # left-over byte count divided by 16.  Integer division keeps the
            # value an int instead of a float.
            # NOTE(review): 16 is presumably the size of one chain in bytes -- confirm.
            leftover_filename = "%s%u%s" % (leftover_filename[0:xpos+1], len(leftover_data) // 16, leftover_filename[upos:])
            print("Leftover filename: %s; leftover data: %u bytes" % (leftover_filename, len(leftover_data)))
            break
        elif merge_size == ONE_GB:
            print("Source files are exactly 1 GB!")
            break

    return ret, leftover_filename, leftover_data



if len(sys.argv) != 3:
    print("Usage: %s /path/to/input_dir /path/to/output_dir" % sys.argv[0])
    print()
    print(" input_dir: the directory holding pruned rainbow tables to merge.")
    print(" output_dir: the directory to put the merged files in.")
    print()
    exit(-1)

input_dir = sys.argv[1]
output_dir = sys.argv[2]


# Ensure we can find the common directory.
if not os.path.isdir(COMMON_DIR_ABS):
    print("Error: could not find directory!: %s" % COMMON_DIR_ABS)
    exit(-1)

# Create the working directory, if it doesn't already exist.
merge_dir_abs = os.path.join(COMMON_DIR_ABS, MERGE_DIR_RELATIVE)
if not os.path.isdir(merge_dir_abs):
    os.mkdir(merge_dir_abs)

# Create the output directory if it doesn't exist already.
if not os.path.isdir(output_dir):
    print("Creating output directory: %s" % output_dir)
    os.mkdir(output_dir)

# Ensure we can find the crackalack_verify program.
if not os.path.isfile(CRACKALACK_VERIFY):
    print('Error: could not find the %s program. This needs to be in the current directory in order to continue.' % CRACKALACK_VERIFY)
    exit(-1)

# Ensure that the common directory does not have any *.rt nor *.rtc files before
# starting. This way, we don't end up confused if anything goes wrong during
# processing.
for f in os.listdir(COMMON_DIR_ABS):
    if f.endswith('.rt') or f.endswith('.rtc'):
        print("Error: %s must not have any *.rt or *.rtc files! Terminating before doing any processing." % COMMON_DIR_ABS)
        exit(-1)

# Keys are table indices, values are absolute paths to tables. Example:
# 1000: '/path/to/dir/ntlm_ascii-32-95#8-8_0_422000x6830556_1000.rt',
# 1001: '/path/to/dir/ntlm_ascii-32-95#8-8_0_422000x6830556_1001.rt',
input_files = {}
lowest_index = 9999999999
for f in os.listdir(input_dir):
    upos = f.rfind('_')
    dotpos = f.rfind('.')
    if (upos == -1) or (dotpos == -1):
        print(" !! Warning: could not parse index from filename: %s. Skipping..." % f)
        continue

    # A name with '_' and '.' but no number between them is just as unparsable;
    # warn and skip it instead of crashing with ValueError.
    try:
        index = int(f[upos+1:dotpos])
    except ValueError:
        print(" !! Warning: could not parse index from filename: %s. Skipping..." % f)
        continue

    if index < lowest_index:
        lowest_index = index

    input_file = os.path.join(input_dir, f)
    input_file_size = os.path.getsize(input_file)
    if input_file_size > ONE_GB:
        print("Error: input file is larger than 1 GB: %s" % input_file)
        exit(-1)
    elif input_file_size == ONE_GB:
        print("Note: skipping file because it is exactly 1 GB already: %s" % input_file)
        continue

    input_files[index] = input_file


# Output tables are numbered starting from the lowest index seen in the input
# directory.
output_index = lowest_index
extra_data = None
extra_filename = None
while True:
    # Copy tables from our source directory to the working directory. Truncate the
    # last file, if necessary, to ensure that no more than a 1 GB file is created.
    ret, extra_filename, extra_data = set_merged_dir(input_files, merge_dir_abs, extra_filename, extra_data)

    # If no more files are left to process, we're done.
    if not ret:
        break

    # Merge the tables into a single 1 GB file. The last invocation will likely create
    # a smaller file.
    merge_tables(merge_dir_abs, input_dir, output_dir, output_index)
    output_index += 1

# The merge directory should be empty, so remove it.
+os.rmdir(merge_dir_abs) +print("\nFinished merging successfully.\n") diff --git a/scripts/old/rt_compress.py b/scripts/old/rt_compress.py new file mode 100755 index 0000000..0b312cf --- /dev/null +++ b/scripts/old/rt_compress.py @@ -0,0 +1,868 @@ +#!/usr/bin/python3 +# +# Rainbow Crackalack: rt_compress.py +# Copyright (C) 2018-2019 Joe Testa +# +# This program is free software: you can redistribute it and/or modify +# it under the terms version 3 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + + +# This program converts raw rainbow tables into optimized, sorted, and compressed +# tables that are ready for production use. +# +# This program *only* works on the snap-packaged version of the rainbowcrack tools. +# Install them with: +# +# # snap install --beta rainbowcrack + + +# +# Output of production run on raw 8-character NTLM tables. This was run on an +# AMD Threadripper 1950X with 32 GB of memory: +# + + +# +# RTC files will be stored in /home/pwn0r/rtc. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_0.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_31.rt +# Merging completed in 151 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 4114 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 510 seconds. +# Size of compressed blob is 16.85 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 128 seconds. 
+# Size of uncompressed blob is 29.95 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_0.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_28.rt. +# Short file found (537782448 bytes / 33611403 chains). Processing next round. +# Split 1 GB chunks in 133 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_0.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_28.rtc +# Re-compressed 1 GB blocks in 172 seconds. +# +# Fully processed 32 GB of tables in 5213 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 537782448 bytes / 33611403 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_32.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_63.rt +# Merging completed in 152 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 8396 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 519 seconds. +# Size of compressed blob is 19.00 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 142 seconds. +# Size of uncompressed blob is 30.39 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_32.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_60.rt. +# Short file found (1011327440 bytes / 63207965 chains). Processing next round. +# Split 1 GB chunks in 137 seconds. +# +# Re-compressing 1 GB blocks... 
+# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_29.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_57.rtc +# Re-compressed 1 GB blocks in 166 seconds. +# +# Fully processed 32 GB of tables in 9517 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 1011327440 bytes / 63207965 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_64.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_95.rt +# Merging completed in 154 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 12010 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 533 seconds. +# Size of compressed blob is 21.17 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 147 seconds. +# Size of uncompressed blob is 30.79 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_64.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_93.rt. +# Short file found (340691024 bytes / 21293189 chains). Processing next round. +# Split 1 GB chunks in 137 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_58.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_87.rtc +# Re-compressed 1 GB blocks in 173 seconds. +# +# Fully processed 32 GB of tables in 13159 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 340691024 bytes / 21293189 chains left over from last block. 
+# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_96.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_127.rt +# Merging completed in 154 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 1669 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 516 seconds. +# Size of compressed blob is 18.90 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 142 seconds. +# Size of uncompressed blob is 30.24 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_96.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_124.rt. +# Short file found (839761408 bytes / 52485088 chains). Processing next round. +# Split 1 GB chunks in 137 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_88.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_116.rtc +# Re-compressed 1 GB blocks in 163 seconds. +# +# Fully processed 32 GB of tables in 2786 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 839761408 bytes / 52485088 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_128.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_159.rt +# Merging completed in 151 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 1383 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 529 seconds. +# Size of compressed blob is 21.07 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 147 seconds. 
+# Size of uncompressed blob is 30.65 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_128.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_157.rt. +# Short file found (190164752 bytes / 11885297 chains). Processing next round. +# Split 1 GB chunks in 135 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_117.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_146.rtc +# Re-compressed 1 GB blocks in 175 seconds. +# +# Fully processed 32 GB of tables in 2525 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 190164752 bytes / 11885297 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_160.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_191.rt +# Merging completed in 155 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 10880 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 519 seconds. +# Size of compressed blob is 18.82 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 137 seconds. +# Size of uncompressed blob is 30.11 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_160.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_188.rt. +# Short file found (707347744 bytes / 44209234 chains). Processing next round. +# Split 1 GB chunks in 139 seconds. +# +# Re-compressing 1 GB blocks... 
+# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_147.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_175.rtc +# Re-compressed 1 GB blocks in 164 seconds. +# +# Fully processed 32 GB of tables in 11999 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 707347744 bytes / 44209234 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_192.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_223.rt +# Merging completed in 156 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 1320 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 528 seconds. +# Size of compressed blob is 20.99 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 147 seconds. +# Size of uncompressed blob is 30.54 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_192.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_221.rt. +# Short file found (72765600 bytes / 4547850 chains). Processing next round. +# Split 1 GB chunks in 139 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_176.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_205.rtc +# Re-compressed 1 GB blocks in 174 seconds. +# +# Fully processed 32 GB of tables in 2470 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 72765600 bytes / 4547850 chains left over from last block. 
+# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_224.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_255.rt +# Merging completed in 159 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 1231 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 517 seconds. +# Size of compressed blob is 18.76 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 136 seconds. +# Size of uncompressed blob is 30.01 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_224.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_252.rt. +# Short file found (603276144 bytes / 37704759 chains). Processing next round. +# Split 1 GB chunks in 137 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_206.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_234.rtc +# Re-compressed 1 GB blocks in 163 seconds. +# +# Fully processed 32 GB of tables in 2348 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 603276144 bytes / 37704759 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_256.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_287.rt +# Merging completed in 154 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 5216 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 527 seconds. +# Size of compressed blob is 20.93 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 147 seconds. 
+# Size of uncompressed blob is 30.45 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_256.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_284.rt. +# Short file found (1070153168 bytes / 66884573 chains). Processing next round. +# Split 1 GB chunks in 137 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_235.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_263.rtc +# Re-compressed 1 GB blocks in 166 seconds. +# +# Fully processed 32 GB of tables in 6353 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 1070153168 bytes / 66884573 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_288.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_319.rt +# Merging completed in 154 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 1418 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 537 seconds. +# Size of compressed blob is 21.20 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 144 seconds. +# Size of uncompressed blob is 30.84 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_288.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_317.rt. +# Short file found (393331488 bytes / 24583218 chains). Processing next round. +# Split 1 GB chunks in 140 seconds. +# +# Re-compressing 1 GB blocks... 
+# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_264.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_293.rtc +# Re-compressed 1 GB blocks in 175 seconds. +# +# Fully processed 32 GB of tables in 2573 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 393331488 bytes / 24583218 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_320.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_351.rt +# Merging completed in 158 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 9823 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 529 seconds. +# Size of compressed blob is 20.82 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 141 seconds. +# Size of uncompressed blob is 30.28 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_320.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_348.rt. +# Short file found (887543344 bytes / 55471459 chains). Processing next round. +# Split 1 GB chunks in 135 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_294.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_322.rtc +# Re-compressed 1 GB blocks in 167 seconds. +# +# Fully processed 32 GB of tables in 10958 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 887543344 bytes / 55471459 chains left over from last block. 
+# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_352.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_383.rt +# Merging completed in 162 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 1365 seconds. +# +# Compressing 32 GB blob... +# Compressed 32 GB blob in 532 seconds. +# Size of compressed blob is 21.10 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 148 seconds. +# Size of uncompressed blob is 30.69 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_352.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_381.rt. +# Short file found (233625696 bytes / 14601606 chains). Processing next round. +# Split 1 GB chunks in 136 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_323.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_352.rtc +# Re-compressed 1 GB blocks in 170 seconds. +# +# Fully processed 32 GB of tables in 2517 seconds. +# +# +# ------------------------------------- +# +# Merging RT files into 32 GB blob... +# Added 233625696 bytes / 14601606 chains left over from last block. +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_384.rt +# [...] +# Adding to 32 GB blob.rt: snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_399.rt +# +# Source tables exhausted (snap/rainbowcrack/common/NTLM_8chars_raw/ntlm_ascii-32-95#8-8_0_422000x67108864_400.rt not found). +# Merging completed in 77 seconds. +# +# Sorting 32GB blob... +# Sorting 32 GB completed in 471 seconds. +# +# Compressing 32 GB blob... 
+# Compressed 32 GB blob in 74 seconds. +# Size of compressed blob is 10.78 GB. +# +# Uncompressing 32 GB blob... +# Uncompressed 32 GB blob in 25 seconds. +# Size of uncompressed blob is 15.68 GB. +# +# Splitting uncompressed, sorted, and pruned blob into 1 GB chunks... +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_384.rt. +# [...] +# Wrote 1.00 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_398.rt. +# Wrote 0.44 GB to snap/rainbowcrack/common/tmpjqu9i95k/ntlm_ascii-32-95#8-8_0_422000x67108864_399.rt. +# Split 1 GB chunks in 32 seconds. +# +# Re-compressing 1 GB blocks... +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_353.rtc +# [...] +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x67108864_367.rtc +# Created final RTC: /home/pwn0r/rtc/ntlm_ascii-32-95#8-8_0_422000x29730227_0.rtc +# Re-compressed 1 GB blocks in 70 seconds. +# +# Fully processed 32 GB of tables in 756 seconds. +# +# +# ------------------------------------- +# +# Raw table size: 400.00 GB +# Compressed table size: 207.49 GB +# Compression rate: 48% +# + + +import os, shutil, subprocess, sys, tempfile, time + + +RT2RTC = 'rainbowcrack.rt2rtc' +RTC2RT = 'rainbowcrack.rtc2rt' +COMMON_DIR = 'snap/rainbowcrack/common' + +TABLE_SIZE = 1024 * 1024 * 1024 # 1 GB +FILE_BUF_SIZE = 1024 * 1024 * 16 # 16 MB + + +# Compresses a single RT file in a temp directory using the most efficient values for +# the start and end bits. +# +# Starting with values of 1 each, the rt2rtc program is run on the RT file. It will +# output the optimal values, which we will parse. On the third invokation, we will +# have all of the optimal values, and the RTC file will be successfully written to the +# snap/rainbowcrack/common directory. 
#
# Terminates the program on error, or returns a tuple containing: the path of the RTC
# file (relative to the 'common' directory), the start bits, and the end bits.
def compress_rt(relative_temp_dir):
    """Compress the RT data in `relative_temp_dir` by running the rt2rtc tool.

    The tool is invoked up to three times.  Each run is given the current
    start/end bit values (both begin at 1); any "minimal value of ..." lines
    in its output update those values for the next run.  As soon as a run
    prints a "writing " line, the RTC path is taken from it and the loop ends.

    Args:
        relative_temp_dir: directory argument handed to rt2rtc as-is.

    Returns:
        A tuple (rtc_path, start_bits, end_bits), where rtc_path is the text
        extracted from the tool's "writing " line.

    Exits the process with status -1 if no run reported a written file.
    """
    start_marker = 'minimal value of start_point_bits is '
    end_marker = 'minimal value of end_point_bits is '

    start_bits = 1
    end_bits = 1
    rtc_path = None
    for _ in range(3):
        args = [RT2RTC, relative_temp_dir, '-s', str(start_bits), '-e', str(end_bits), '-c', '32768', '-p']

        proc = subprocess.run(args, stdout=subprocess.PIPE)
        # Only the ASCII marker lines matter; do not crash on a stray non-ASCII byte.
        output = proc.stdout.decode('ascii', errors='replace')

        # Scan every output line for the suggested bit values and the output file name.
        for line in output.split("\n"):
            # Parse the number relative to where the marker was found rather than at
            # a fixed column, since the two markers differ in length.  int() ignores
            # surrounding whitespace, so padded values still parse.
            pos = line.find(start_marker)
            if pos != -1:
                start_bits = int(line[pos + len(start_marker):])

            pos = line.find(end_marker)
            if pos != -1:
                end_bits = int(line[pos + len(end_marker):])

            if line.find('writing ') != -1:
                # Drops the first 8 and last 3 characters of the stripped line;
                # presumably the line reads "writing <file>..." -- TODO confirm
                # against real rt2rtc output.
                rtc_path = line.strip()[8:-3]

        if rtc_path is not None:
            break

    if rtc_path is None:
        print("ERROR: failed to compress file. :(")
        # sys.exit() is used because the bare exit() helper is only injected by the
        # site module and is not guaranteed to exist.
        sys.exit(-1)

    return rtc_path, start_bits, end_bits


# Moves an RTC file from the 'common' directory to the result directory. Renames the
# file if the destination already exists. Returns the absolute path of the RTC file
# in the result directory.
def move_rtc(rtc_filename, result_dir):
    """Move an RTC file from COMMON_DIR into `result_dir` without overwriting.

    If a file with the same name already exists in `result_dir`, everything
    after the last underscore in the name is replaced by "_<index>.rtc", with
    index counting up from 1, until an unused name is found.

    Args:
        rtc_filename: name of the RTC file, relative to COMMON_DIR.
        result_dir: directory that receives the file.

    Returns:
        The destination path (`result_dir` joined with the final file name).
    """
    rtc_src_path = os.path.join(COMMON_DIR, rtc_filename)

    index = 1
    while True:
        rtc_dst_path = os.path.join(result_dir, rtc_filename)
        if not os.path.exists(rtc_dst_path):
            shutil.move(rtc_src_path, rtc_dst_path)
            return rtc_dst_path

        # Name already taken: swap the trailing "_<n>.rtc" part for the next index.
        upos = rtc_filename.rfind('_')
        rtc_filename = "%s_%u.rtc" % (rtc_filename[:upos], index)
        index += 1


# Both positional arguments are read below, so show the help text whenever either one
# is missing instead of failing later with an IndexError.
if len(sys.argv) < 3:
    print("\nThis program runs rainbowcrack's rt2rtc program on a directory of RT files.\n\nBecause rt2rtc does not easily handle the case when two RT files produce the same RTC filename, and because it is not clear if it automatically uses the optimal start and end bit values, this program is preferred for compressing rainbow tables.\n\nIt depends on the 'rainbowcrack' snap package (as root, run \"snap install --beta rainbowcrack\"). Hence, it works on Linux only.\n\nPlace a directory of RT files in the ~/snap/rainbowcrack/common/ directory, and run with:\n\n $ python3 %s relative_rt_directory absolute_rtc_directory\n\nNote that the RT input directory path must be relative to the 'common' directory, but the RTC output directory path is absolute. i.e.: If the RT files are in ~/snap/rainbowcrack/common/rt_files/, and the output should go in ~/rtc, run the program with:\n\n $ python3 %s rt_files ~/rtc\n" % (sys.argv[0], sys.argv[0]))
    # Running with no arguments is a plain help request (status 0, as before); a
    # partial command line is a usage error.
    sys.exit(0 if len(sys.argv) == 1 else -1)

rt_dir = sys.argv[1] # Relative to COMMON_DIR
result_dir = sys.argv[2] # Absolute path.
#original_cwd = os.getcwd()


# Ensure that the common directory exists.
if not os.path.exists(COMMON_DIR):
    print("Error: cannot find rainbowcrack\'s common directory: %s. Did you install the rainbowcrack snap package? (Hint: snap install --beta rainbowcrack)\n\nOtherwise, ensure you are in the current user's base home directory.\n" % COMMON_DIR)
    sys.exit(-1)

# Check that the RT directory exists within the common directory.
+if not os.path.isdir(os.path.join(COMMON_DIR, rt_dir)): + print("Error: could not find directory named %s in %s." % (rt_dir, COMMON_DIR)) + exit(-1) + +# If the result directory path exists, ensure it is a diretory. +if os.path.exists(result_dir) and not os.path.isdir(result_dir): + print('Error: %s exists, but is not a directory.' % result_dir) + exit(-1) + +# Create a temp dir to copy each RT file into during processing. +temp_dir = tempfile.mkdtemp(dir='snap/rainbowcrack/common') +temp_dir_single = tempfile.mkdtemp(dir='snap/rainbowcrack/common') + +# Get the relative path (with respect to the common dir) of the temp dir. +relative_temp_dir = temp_dir[temp_dir.rfind('/') + 1:] +relative_temp_dir_single = temp_dir_single[temp_dir_single.rfind('/') + 1:] + +# Create the result directory if it does not already exist. +if not os.path.isdir(result_dir): + os.mkdir(result_dir) + +print("\nRTC files will be stored in %s." % result_dir) +#print("Processing %s/*.rt...\n\n" % os.path.join(COMMON_DIR, rt_dir)) +total_rt_bytes = 0 +total_rtc_bytes = 0 + +# Get the basename of the first *.rt file ("ntlm_ascii-32-95#8-8_0_100x1024_"). With +# this, we can iterate from _0.rt, _1.rt, etc., in order (note that directory listings +# are sometimes unordered). +base_filename = None +for filename in os.listdir(os.path.join(COMMON_DIR, rt_dir)): + if filename.endswith('.rt'): + + upos = filename.rfind('_') + if upos != -1: + base_filename = filename[0:upos + 1] + break + +if base_filename is None: + print("Error: could not determine base filename!") + exit(-1) + +#print(base_filename) +current_file_num = 0 + +blob_filename = os.path.join(temp_dir, 'blob.rt') +blob_filename_relative = os.path.join(relative_temp_dir, 'blob.rt') + +out_of_files = False +while not out_of_files: + + # + # Merge the unsorted tables into one 32 GB blob. 
+ # + + print("\n\n-------------------------------------", flush=True) + print("\nMerging RT files into 32 GB blob...", flush=True) + block_start = time.time() + merge_start = time.time() + with open(blob_filename, 'wb') as blob: + + # If any bytes from the previous run were left over, add that to this blob first. + if os.path.exists('leftover.rt'): + leftover_len = 0 + with open('leftover.rt', 'rb') as lo: + buf = lo.read(FILE_BUF_SIZE) + while buf != b'': + leftover_len += blob.write(buf) + buf = lo.read(FILE_BUF_SIZE) + os.unlink('leftover.rt') + print(" Added %u bytes / %u chains left over from last block." % (leftover_len, leftover_len / 16)) + + # Iterate over the next 32 unsorted tables and append them to the blob. + for i in range(current_file_num, current_file_num + 32): + rt_filename = os.path.join(COMMON_DIR, rt_dir, "%s%d.rt" % (base_filename, i)) + + if os.path.exists(rt_filename): + print(" Adding to 32 GB blob.rt: %s" % rt_filename, flush=True) + + # Read blocks out of the raw table file, and write them to the blob file. + with open(rt_filename, 'rb') as rt: + buf = rt.read(FILE_BUF_SIZE) + while buf != b'': + total_rt_bytes += blob.write(buf) + buf = rt.read(FILE_BUF_SIZE) + else: + print("\nSource tables exhausted (%s not found)." % rt_filename, flush=True) + out_of_files = True + break + + print(" Merging completed in %u seconds." % int(time.time() - merge_start), flush=True) + + if out_of_files and os.path.getsize(blob_filename) == 0: + print("DONE", flush=True) + os.unlink(blob_filename) + break + + # Rename from 'blob.rt' to something like 'ntlm_ascii-32-95#8-8_0_100x32768_0.rt', + # since the compression tool doesn't like vague file names. 
+ xpos = base_filename.rfind('x') + if xpos == -1: + print("Error: failed to parse basename: %s" % base_filename, flush=True) + exit(-1) + + new_blob_filename = "%s%d_0.rt" % (base_filename[0:xpos + 1], os.path.getsize(blob_filename) / 16) + new_path = os.path.join(temp_dir, new_blob_filename) + shutil.move(blob_filename, new_path) + blob_filename = new_path + blob_filename_relative = os.path.join(relative_temp_dir, new_blob_filename) + #print(" Renamed unsorted blob from 'blob.rt' to: %s." % blob_filename) + + + # + # Now sort the blob. + # + + print("\nSorting 32GB blob...", flush=True) + sort_start = time.time() + proc = subprocess.run(['rainbowcrack.rtsort', relative_temp_dir], stdout=subprocess.PIPE) + if proc.stdout.decode('ascii').find('writing sorted data') == -1: + print("\nSorting failed. :(", flush=True) + exit(-1) + print(" Sorting 32 GB completed in %u seconds." % int(time.time() - sort_start), flush=True) + + + # + # Compress the blob. + # + + print("\nCompressing 32 GB blob...", flush=True) + compress_start = time.time() + rtc_filename_relative, unused1, unused2 = compress_rt(relative_temp_dir) + print(" Compressed 32 GB blob in %u seconds." % int(time.time() - compress_start), flush=True) + + # Remove the uncompressed blob. + os.unlink(blob_filename) + + # Move the RTC into the temp dir. + shutil.move(os.path.join(COMMON_DIR, rtc_filename_relative), temp_dir) + + + rtc_filename_relative = os.path.join(temp_dir, rtc_filename_relative) + print(" Size of compressed blob is %.2f GB." % (os.path.getsize(rtc_filename_relative) / (1024 ** 3)), flush=True) + + + # + # Uncompress it again. + # + + print("\nUncompressing 32 GB blob...", flush=True) + uncompress_start = time.time() + proc = subprocess.run(['rainbowcrack.rtc2rt', relative_temp_dir], stdout=subprocess.PIPE) + if proc.stdout.decode('ascii').find('converting') == -1: + print("\nUncompressing failed. :(", flush=True) + exit(-1) + + print(" Uncompressed 32 GB blob in %u seconds." 
% int(time.time() - uncompress_start), flush=True) + + + # Remove the compressed & sorted blob. + os.unlink(rtc_filename_relative) + #print(" Deleted: %s" % rtc_filename_relative) + + # The filename may be different because chains were pruned... + for f in os.listdir(temp_dir): + if f.endswith('.rt'): + blob_filename = os.path.join(temp_dir, f) + + print(" Size of uncompressed blob is %.2f GB." % (os.path.getsize(blob_filename) / (1024 ** 3)), flush=True) + + + # + # Split 1 GB chunks from 32 GB uncompressed sorted blob. + # + + print("\nSplitting uncompressed, sorted, and pruned blob into 1 GB chunks...", flush=True) + split_start = time.time() + with open(blob_filename, 'rb') as blob: + for i in range(current_file_num, current_file_num + 32): + rt_filename = os.path.join(temp_dir, "%s%d.rt" % (base_filename, i)) + with open(rt_filename, 'wb') as rt: + + # While debugging with small tables, just read the chunk entirely at once. + if FILE_BUF_SIZE > TABLE_SIZE: + rt.write(blob.read(TABLE_SIZE)) + else: # Production mode with really big files: read in 16 MB blocks. + buf = blob.read(FILE_BUF_SIZE) + len_written = 0 + while (buf != b'') and (len_written < TABLE_SIZE): + len_written += rt.write(buf) + buf = blob.read(FILE_BUF_SIZE) + + rt_size = os.path.getsize(rt_filename) + if rt_size == 0: + os.unlink(rt_filename) + + # If a partial table was written, rename it to 'leftover.rt' so its handled in + # the next loop. + elif (rt_size != TABLE_SIZE) and not out_of_files: + print(" Short file found (%u bytes / %u chains). Processing next round." % (rt_size, rt_size / 16)) + shutil.move(rt_filename, 'leftover.rt') + else: + print(" Wrote %.2f GB to %s." % (os.path.getsize(rt_filename) / (1024 ** 3), rt_filename), flush=True) + + # Write (or append) what's left in the blob to 'leftover.rt' for the next loop to + # handle. 
+ leftover = blob.read(FILE_BUF_SIZE) + leftover_len = 0 + f = None + if len(leftover) > 0: + if os.path.exists('leftover.rt'): + f = open('leftover.rt', 'r+b') + else: + f = open('leftover.rt', 'wb') + + while leftover != b'': + leftover_len += f.write(leftover) + leftover = blob.read(FILE_BUF_SIZE) + f.close() + print(" Left-over %u bytes / %u chains will be processed next round." % (leftover_len, leftover_len / 16), flush=True) + leftover = None + + print(" Split 1 GB chunks in %u seconds." % int(time.time() - split_start), flush=True) + + # Delete the uncompressed sorted blob. + os.unlink(blob_filename) + + + # + # Compress the individual 1 GB tables. + # + + print("\nRe-compressing 1 GB blocks...", flush=True) + + recompress_start = time.time() + for i in range(current_file_num, current_file_num + 32): + rt_filename = os.path.join(temp_dir, "%s%d.rt" % (base_filename, i)) + if os.path.exists(rt_filename): + shutil.move(rt_filename, temp_dir_single) + + rtc_filename_relative, unused1, unused2 = compress_rt(relative_temp_dir_single) + rtc_path = move_rtc(rtc_filename_relative, result_dir) + print(" Created final RTC: %s" % rtc_path, flush=True) + total_rtc_bytes += os.path.getsize(rtc_path) + + for f in os.listdir(temp_dir_single): + if f.endswith('.rt'): + os.unlink(os.path.join(temp_dir_single, f)) + + print(" Re-compressed 1 GB blocks in %u seconds." % int(time.time() - recompress_start), flush=True) + + print("\nFully processed 32 GB of tables in %u seconds." 
% int(time.time() - block_start), flush=True) + current_file_num += 32 + +os.rmdir(temp_dir_single) +os.rmdir(temp_dir) + +print("\n\n-------------------------------------\n") +print("Raw table size: %.2f GB" % (total_rt_bytes / (1024 ** 3))) +print("Compressed table size: %.2f GB" % (total_rtc_bytes / (1024 ** 3))) +print("Compression rate: %.0f%%" % (((total_rt_bytes - total_rtc_bytes) / total_rt_bytes) * 100)) +exit(0) diff --git a/scripts/perfectify.py b/scripts/perfectify.py new file mode 100755 index 0000000..536a0f1 --- /dev/null +++ b/scripts/perfectify.py @@ -0,0 +1,161 @@ +#!/usr/bin/python3 + +# +# This is a wrapper script for the perfectify executable. Responsible for +# moving unpruned files around, running perfectify on them, then verifying them +# for correctness. +# + +import os, shutil, subprocess, sys, time + + +# The number of input rainbow tables that should be processed at a time. +num_input_files = 0 + + +def p(msg=''): + print(msg, flush=True) + + +# Returns a list of highest-numbered unpruned files, or None if no more exist. +def get_unpruned_files(unpruned_dir): + highest_index = -1 + ret = None + + rt_dict = {} + for filename in os.listdir(unpruned_dir): + upos = filename.rfind('_') + if filename.endswith('.rt') and upos != -1: + # Parse the index from the filename. Example: + # 'ntlm_ascii-32-95#8-8_0_422000x66966623_799.rt' => 799 + index = int(filename[upos+1:-3]) + + rt_dict[index] = os.path.join(unpruned_dir, filename) + + # If this is the highest index we've seen so far, update it, and + # remember its filename. 
+ if index > highest_index: + highest_index = index + + if highest_index == -1: + return None + + ret = [rt_dict[highest_index]] + while (len(ret) < num_input_files) and (highest_index >= 0): + highest_index -= 1 + if highest_index in rt_dict: + ret.insert(0, rt_dict[highest_index]) + + return ret + + +if len(sys.argv) != 3: + p("Usage: %s unpruned_dir pruned_dir" % sys.argv[0]) + p() + p(" unpruned_dir: the directory with unpruned RT files.") + p(" pruned_dir: the directory to put pruned tables.") + p() + p("This script will automate the perfectify executable to prune one file at a time by comparing it to all the pruned tables. Then the newly-pruned table will be moved to the pruned directory. This is repeated until no more files exist in the unpruned directory.") + p() + exit(-1) + +unpruned_dir = sys.argv[1] +pruned_dir = sys.argv[2] + +# Ensure our executables exist. +if not os.path.exists("../perfectify"): + p("Error: could not find 'perfectify' executable.") + exit(-1) + +if not os.path.exists("../crackalack_verify"): + p("Error: could not find 'crackalack_verify' executable.") + exit(-1) + + +# Base the number of input files to process at a time on the total memory size of the +# system. Reserve 2 GB for the system, and 1 GB for the table to compare against. +total_memory_gb = int((os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')) / (1024**3)) +num_input_files = total_memory_gb - 2 - 1 +if num_input_files < 1: + num_input_files = 1 + + +# Print the options. +p() +p("Unpruned RT directory: %s" % unpruned_dir) +p("Pruned RT directory: %s" % pruned_dir) +p("Number of input files processed at a time: %u" % num_input_files) + +# Loop through the sorted & unpruned source RT files. 
+files_pruned = 0 +total_start_time = time.time() +while True: + sources = [] + destinations = [] + + block_start_time = time.time() + rt_source_abs = get_unpruned_files(unpruned_dir) + if rt_source_abs is None: + p("No unpruned tables remain.") + break + + #shutil.move(rt_source_abs, rt_destination_abs) + p("\n\n----------------\n") + p("Pruning %s...\n\n" % "\n".join(rt_source_abs)) + + try: + args = ['../perfectify'] + rt_source_abs + [pruned_dir] + proc = subprocess.run(args) + if proc.returncode != 0: + p("perfectify returned non-zero exit code (%u). Terminating..." % proc.returncode) + exit(-1) + except FileNotFoundError as e: + p("Error: could not run ../perfectify!") + exit(-1) + except subprocess.SubprocessError as e: + p("Error: SubprocessError: %s" % e) + exit(-1) + + for source_file in rt_source_abs: + sources.append(source_file) + + if not os.path.exists(source_file): + p("\nFile was possibly fully pruned(!): %s\n" % source_file) + else: + + # Ensure that the pruned file is a multiple of the chain size. + file_size = os.path.getsize(source_file) + if (file_size % 16) != 0: + p("Error: pruned file is not a multiple of the chain size!: %u" % file_size) + exit(-1) + + num_chains = file_size / 16 + + # Parses the prefix and suffix around the number of chains. + # Example: ntlm_ascii-32-95#8-8_0_422000x[num chains]_0.rt + prefix = source_file[0:source_file.rfind('x')+1] + suffix = source_file[source_file.rfind('_'):] + rt_source_abs_new = "%s%u%s" % (prefix, num_chains, suffix) + + # Rename the file with the correct number of chains. Then move it to + # the pruned directory. + rt_destination_abs = shutil.move(source_file, rt_source_abs_new) + rt_destination_abs = shutil.move(rt_destination_abs, pruned_dir) + destinations.append(rt_destination_abs) + try: + proc = subprocess.run(['../crackalack_verify', '--sorted', rt_destination_abs]) + if proc.returncode != 0: + p("crackalack_verify returned non-zero exit code (%u). Terminating..." 
% proc.returncode) + exit(-1) + except FileNotFoundError as e: + p("Error: could not run ../crackalack_verify!") + exit(-1) + except subprocess.SubprocessError as e: + p("Error: SubprocessError: %s" % e) + exit(-1) + + p("\nSuccessfully pruned in %u seconds: \n %s \n =>\n %s\n" % (int(time.time() - block_start_time), "\n ".join(sources), "\n ".join(destinations))) + files_pruned += len(rt_source_abs) + +p("\n\nAll %u files pruned in %u seconds.\n" % (files_pruned, int(time.time() - total_start_time))) +exit(0) diff --git a/scripts/perfectify_estimator.py b/scripts/perfectify_estimator.py new file mode 100755 index 0000000..d51b9ca --- /dev/null +++ b/scripts/perfectify_estimator.py @@ -0,0 +1,36 @@ +#!/usr/bin/python3 + +import sys + +if len(sys.argv) != 5: + print("Usage: %s table_start table_end total_tables time_per_table" % sys.argv[0]) + print() + print(" table_start: the number of the table to start with") + print(" table_end: the number of the table to end on") + print(" total_tables: the total number of tables") + print(" time_per_table: the number of seconds spent processing each table") + print() + exit(-1) + +start = int(sys.argv[1]) +end = int(sys.argv[2]) +total = int(sys.argv[3]) +time_per_table = float(sys.argv[4]) + +num_steps = 0 +for i in range(start, end): + for j in range(i + 1, total): + num_steps = num_steps + 1 + +num_seconds = time_per_table * num_steps +num_minutes = num_seconds / 60 +num_hours = num_minutes / 60 +num_days = num_hours / 24 + +print("Number of steps: %u" % num_steps) +print() +print("Estimated time to complete:") +print(" Seconds: %.1f" % num_seconds) +print(" Minutes: %.1f" % num_minutes) +print(" Hours: %.1f" % num_hours) +print(" Days: %.1f" % num_days) diff --git a/scripts/rename.py b/scripts/rename.py new file mode 100755 index 0000000..7d5333e --- /dev/null +++ b/scripts/rename.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import os, shutil, sys + +rtc_dir = sys.argv[1] + +for rtc in os.listdir(rtc_dir): + if not 
rtc.endswith(".rtc"): + continue + + upos = rtc.rfind('_') + dotpos = rtc.rfind('.') + old_index = int(rtc[upos+1:dotpos]) + new_index = old_index + 3600 + + rtc_new = '%s%d.rtc' % (rtc[0:upos+1], new_index) + + print("Renaming %s to %s..." % (rtc, rtc_new)) + shutil.move(os.path.join(rtc_dir, rtc), os.path.join(rtc_dir, rtc_new)) diff --git a/scripts/rt_sort.sh b/scripts/rt_sort.sh new file mode 100755 index 0000000..c898b22 --- /dev/null +++ b/scripts/rt_sort.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# +# This script will sort a directory of rainbow tables in parallel. +# + +GREP=/bin/grep +MKDIR=/bin/mkdir +MV=/bin/mv +PS=/bin/ps +RMDIR=/bin/rmdir +RTSORT=rainbowcrack.rtsort +SLEEP=/bin/sleep + +# Each core loads 1 GB of tables. Physical machine has 32 GB of RAM, so we subtract +# two and divide by two to ensure everything stays out of swap. +NUM_CORES=15 + +COMMON_DIR=snap/rainbowcrack/common + + +if [[ $# != 2 ]]; then + echo "Usage: $0 raw_rt_dir sorted_rt_dir" + echo + echo "Note: the raw & unsorted RT directory must be owned and writeable by the invoking process. The tables are moved to the specified sorted directory and then replaced with sorted chains. The specified sorted directory is relative to the snap/rainbowcrack/common/ directory, whereas the unsorted dir is not." + echo + exit -1 +fi + +raw_dir=$1 +sorted_dir=$2 + +pushd $COMMON_DIR > /dev/null +if [[ ! -d $sorted_dir ]]; then + $MKDIR $sorted_dir + + if [[ -d $sorted_dir ]]; then + echo "Created output directory: $sorted_dir" + else + popd > /dev/null + echo "Failed to create output directory: $sorted_dir" + exit -1 + fi +fi + +# Create the temporary directories. +temp_dirs=() +for i in `seq 1 $NUM_CORES`; do + temp_dir=$sorted_dir/$i + if [[ ! -d $temp_dir ]]; then + $MKDIR $temp_dir + echo "Created temp dir: $temp_dir" + fi + temp_dirs+=($temp_dir) +done +popd > /dev/null + +# Move each RT file into one of the temp dirs. 
+i=0 +for rt in $raw_dir/*.rt; do + dest_dir=${temp_dirs[$i]} + let "i++" + if [[ $i == $NUM_CORES ]]; then + i=0 + fi + + $MV $rt $COMMON_DIR/$dest_dir +done + +# Spawn one process per temp dir. +echo "Spawning processes..." +let "core_minus_one=$NUM_CORES-1" +for i in `seq 0 $core_minus_one`; do + process_dir=${temp_dirs[$i]} + echo "Process dir: $process_dir" + $RTSORT $process_dir & +done + +# Sleep 15 seconds, then check if all the sort processes have finished. +echo "Waiting for processes to finish..." +while true; do + jobs=`$PS ax | $GREP rtsort | $GREP -v grep` + if [[ $jobs == "" ]]; then + break + fi + $SLEEP 15 +done + +echo "All processes finished." + +# Move the sorted files from their temporary dirs into the final sorted dir. +pushd $COMMON_DIR > /dev/null +for i in `seq 0 $core_minus_one`; do + process_dir=${temp_dirs[$i]} + echo "Moving $process_dir/*.rt to $sorted_dir..." + $MV $process_dir/*.rt $sorted_dir + $RMDIR $process_dir +done +popd > /dev/null + +echo "Done." + diff --git a/scripts/verify_all.py b/scripts/verify_all.py new file mode 100755 index 0000000..ca6fa33 --- /dev/null +++ b/scripts/verify_all.py @@ -0,0 +1,71 @@ +#!/usr/bin/python3 +# +# Rainbow Crackalack: verify_all.py +# Copyright (C) 2019 Joe Testa +# +# This program is free software: you can redistribute it and/or modify +# it under the terms version 3 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# +# This script will run "crackalack_verify" on all *.rt and *.rtc files +# in a target directory. 
+# + +import os, subprocess, sys, time + + +CRACKALACK_VERIFY = '../crackalack_verify' + +if len(sys.argv) != 2: + print("This script will run crackalack_verify on all sorted *.rt and *.rtc files in a target directory.\n\nUsage: %s /path/to/dir" % sys.argv[0]) + exit(-1) + +target_dir = sys.argv[1] + +if not os.path.exists(CRACKALACK_VERIFY): + print("Error: %s not found!" % CRACKALACK_VERIFY) + exit(-1) + +# Count the total number of files in the directory. +num_files = 0 +for file in os.listdir(target_dir): + if not (file.endswith('.rt') or file.endswith('.rtc')): + continue + + num_files += 1 + +start_time = time.time() +file_number = 0 +for file in os.listdir(target_dir): + if not (file.endswith('.rt') or file.endswith('.rtc')): + continue + + try: + file_number += 1 + print("[%u of %u] Verifying %s..." % (file_number, num_files, file)) + + file_start_time = time.time() + proc = subprocess.run([CRACKALACK_VERIFY, '--sorted', os.path.join(target_dir, file)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if proc.returncode != 0: + print("\n\nFAILED! %s return non-zero exit code: %d\nStdout: [%s]\nStderr: [%s]\n" % (CRACKALACK_VERIFY, proc.returncode, proc.stdout.decode('ascii'), proc.stdout.decode('ascii'))) + exit(-1) + + print("Finished in %.1f seconds.\n" % float(time.time() - file_start_time)) + except FileNotFoundError as e: + print("Error: could not run %s!" % CRACKALACK_VERIFY) + exit(-1) + except subprocess.SubprocessError as e: + print("Error: SubprocessError: %s" % e) + exit(-1) + +print("\n\nSUCCESS. Processed %u files in %.1f minutes" % (num_files, float(time.time() - start_time) / 60.0)) diff --git a/shared.h b/shared.h new file mode 100644 index 0000000..472df45 --- /dev/null +++ b/shared.h @@ -0,0 +1,21 @@ +/* Constants shared between host programs and OpenCL kernels. 
*/ + +#ifndef _SHARED_H +#define _SHARED_H + +#define HASH_UNDEFINED 0 +#define HASH_LM 1 +#define HASH_NTLM 2 +#define HASH_MD5 3 +#define HASH_SHA1 4 + +#define MAX_PLAINTEXT_LEN 16 +#define MAX_HASH_OUTPUT_LEN 16 +#define MAX_CHARSET_LEN 96 + +#define DEBUG_LEN 32 + +/* Converts a table index to a reduction offset. */ +#define TABLE_INDEX_TO_REDUCTION_OFFSET(_table_index) (_table_index * 65536) + +#endif diff --git a/terminal_color.h b/terminal_color.h new file mode 100644 index 0000000..c001373 --- /dev/null +++ b/terminal_color.h @@ -0,0 +1,43 @@ +#ifndef _TERMINAL_COLOR_H +#define _TERMINAL_COLOR_H + + +char *CLR = "\033[0m"; +char *WHITEB = "\033[1;97m"; /* White + bold */ +char *ITALICIZE = "\033[3m"; +char *GREEN = "\033[0;32m"; +char *RED = "\033[0;31m"; +char *GREENB = "\033[1;32m"; /* Green + bold */ +char *REDB = "\033[1;31m"; /* Red + bold */ + + +#ifdef _WIN32 + +#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING +#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004 +#endif + +#define ENABLE_CONSOLE_COLOR() \ + /* Attempt to enable console colors. This succeeds in Windows 10. For other OSes \ + * color is disabled. */ \ + HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); \ + DWORD consoleMode = 0; \ + if ((hConsole == INVALID_HANDLE_VALUE) || (!GetConsoleMode(hConsole, &consoleMode)) || (!SetConsoleMode(hConsole, consoleMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING))) { \ + CLR = ""; \ + WHITEB = ""; \ + ITALICIZE = ""; \ + GREEN = ""; \ + RED = ""; \ + GREENB = ""; \ + REDB = ""; \ + } + +#else + +/* Do nothing: Linux consoles have color enabled by default. 
*/ +#define ENABLE_CONSOLE_COLOR() + +#endif + + +#endif diff --git a/test_chain.c b/test_chain.c new file mode 100644 index 0000000..d3a8776 --- /dev/null +++ b/test_chain.c @@ -0,0 +1,172 @@ +/* + * Rainbow Crackalack: test_chain.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include "cpu_rt_functions.h" +#include "misc.h" +#include "opencl_setup.h" +#include "shared.h" +#include "test_shared.h" +#include "test_chain.h" + + +struct chain_test { + char charset[MAX_CHARSET_LEN]; + unsigned int plaintext_len_min; + unsigned int plaintext_len_max; + unsigned int table_index; + unsigned int chain_len; + uint64_t start; + uint64_t end; +}; + +struct chain_test lm_chain_tests[] = { + /* From open-source RainbowCrack v1.2. */ + {CHARSET_ALPHA, 1, 7, 0, 2, 4216457714UL, 4110420946UL}, + {CHARSET_ALPHA, 1, 7, 0, 10000, 5134134059UL, 4643897595UL}, + {CHARSET_ALPHA, 1, 7, 5, 25777, 1223762207UL, 3058691277UL}, + + /* From closed-source RainbowCrack v1.7. */ + {CHARSET_ALPHA, 1, 7, 3, 999, 0UL, 6846105348UL}, + {CHARSET_ALPHA, 1, 7, 6, 5000, 999UL, 2849418373UL}, + {CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE, 1, 7, 1, 2, 0UL, 5175055677957UL}, + /*{CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE, 1, 7, 2, 10, 0, 4634515320952},*/ + {CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE, 1, 7, 2, 345, 9001UL, 6921712277323UL}, + {CHARSET_ASCII_32_65_123_4, 1, 7, 4, 666, 6969UL, 481794222594UL}, + + /* Misc. 
*/ + /*{CHARSET_ALPHA, 1, 7, 0, 100, 319, 2872914729},*/ + /*{CHARSET_ALPHA, 1, 7, 0, 100, 100, 1074664166},*/ + /*{CHARSET_ALPHA, 1, 7, 0, 100, 666, 6566110770},*/ +}; + +struct chain_test ntlm_chain_tests[] = { + {CHARSET_ASCII_32_65_123_4, 1, 8, 0, 1000, 0UL, 495620913785177UL}, + {CHARSET_ASCII_32_95, 8, 8, 0, 666, 456UL, 6003715575086450UL}, + {CHARSET_ASCII_32_95, 9, 9, 0, 1000000, 9999UL, 197723360287561186UL}, +}; + + +/* Test a chain using the CPU. */ +int cpu_test_chain(char *charset, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int table_index, unsigned int chain_len, uint64_t start, uint64_t expected_end) { + uint64_t plaintext_space_up_to_index[MAX_PLAINTEXT_LEN] = {0}; + uint64_t computed_end = 0, plaintext_space_total = 0; + unsigned char hash[16] = {0}; + char plaintext[MAX_PLAINTEXT_LEN] = {0}; + unsigned int hash_len = sizeof(hash), plaintext_len = sizeof(plaintext); + + + plaintext_space_total = fill_plaintext_space_table(strlen(charset), plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + computed_end = generate_rainbow_chain(HASH_NTLM, charset, strlen(charset), plaintext_len_min, plaintext_len_max, TABLE_INDEX_TO_REDUCTION_OFFSET(table_index), chain_len, start, plaintext_space_up_to_index, plaintext_space_total, plaintext, &plaintext_len, hash, &hash_len); + + if (computed_end != expected_end) { + fprintf(stderr, "\n\nCPU error:\n\tExpected chain end: %"PRIu64"\n\tComputed chain end: %"PRIu64"\n\n", expected_end, computed_end); + return 0; + } + + return 1; +} + + +/* Test a chain using the GPU. 
*/ +int gpu_test_chain(cl_device_id device, cl_context context, cl_kernel kernel, char *charset, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int table_index, unsigned int chain_len, uint64_t start, uint64_t expected_end) { + CLMAKETESTVARS(); + + int test_passed = 0; + + cl_mem charset_buffer = NULL, plaintext_len_min_buffer = NULL, plaintext_len_max_buffer = NULL, table_index_buffer = NULL, chain_len_buffer = NULL, start_buffer = NULL, end_buffer = NULL, debug_buffer = NULL; + + unsigned char *debug_ptr = NULL; + cl_ulong *end_ptr = NULL; + cl_ulong end = 0; + + + queue = CLCREATEQUEUE(context, device); + + CLCREATEARG_ARRAY(0, charset_buffer, CL_RO, charset, strlen(charset) + 1); + CLCREATEARG(1, plaintext_len_min_buffer, CL_RO, plaintext_len_min, sizeof(plaintext_len_min)); + CLCREATEARG(2, plaintext_len_max_buffer, CL_RO, plaintext_len_max, sizeof(plaintext_len_max)); + CLCREATEARG(3, table_index_buffer, CL_RO, table_index, sizeof(table_index)); + CLCREATEARG(4, chain_len_buffer, CL_RO, chain_len, sizeof(chain_len)); + CLCREATEARG(5, start_buffer, CL_RO, start, sizeof(start)); + CLCREATEARG(6, end_buffer, CL_WO, end, sizeof(end)); + CLCREATEARG_DEBUG(7, debug_buffer, debug_ptr); + + CLRUNKERNEL(queue, kernel, &global_work_size); + CLFLUSH(queue); + CLWAIT(queue); + + end_ptr = calloc(1, sizeof(cl_ulong)); + if (end_ptr == NULL) { + fprintf(stderr, "Error while creating output buffer.\n"); + exit(-1); + } + + CLREADBUFFER(end_buffer, sizeof(cl_ulong), end_ptr); + + if (*end_ptr == expected_end) + test_passed = 1; + else + printf("\n\n\tStart: %"PRIu64"\n\tExpected end: %"PRIu64"\n\tComputed end: %"PRIu64"\n\n", start, expected_end, *end_ptr); + + /* + READBUFFER(debug_buffer, DEBUG_LEN, debug_ptr); + printf("debug: "); + for (i = 0; i < DEBUG_LEN; i++) + printf("%x ", debug_ptr[i]); + printf("\n"); + */ + + CLFREEBUFFER(charset_buffer); + CLFREEBUFFER(plaintext_len_min_buffer); + CLFREEBUFFER(plaintext_len_max_buffer); + 
CLFREEBUFFER(table_index_buffer); + CLFREEBUFFER(chain_len_buffer); + CLFREEBUFFER(start_buffer); + CLFREEBUFFER(end_buffer); + CLFREEBUFFER(debug_buffer); + + CLRELEASEQUEUE(queue); + + FREE(end_ptr); + FREE(debug_ptr); + return test_passed; +} + + +int test_chain(cl_device_id device, cl_context context, cl_kernel kernel, unsigned int hash_type) { + int tests_passed = 1; + unsigned int i = 0; + + if (hash_type == HASH_LM) { + for (i = 0; i < (sizeof(lm_chain_tests) / sizeof(struct chain_test)); i++) { + tests_passed &= gpu_test_chain(device, context, kernel, lm_chain_tests[i].charset, lm_chain_tests[i].plaintext_len_min, lm_chain_tests[i].plaintext_len_max, lm_chain_tests[i].table_index, lm_chain_tests[i].chain_len, lm_chain_tests[i].start, lm_chain_tests[i].end); + } + } else if (hash_type == HASH_NTLM) { + for (i = 0; i < (sizeof(ntlm_chain_tests) / sizeof(struct chain_test)); i++) { + tests_passed &= gpu_test_chain(device, context, kernel, ntlm_chain_tests[i].charset, ntlm_chain_tests[i].plaintext_len_min, ntlm_chain_tests[i].plaintext_len_max, ntlm_chain_tests[i].table_index, ntlm_chain_tests[i].chain_len, ntlm_chain_tests[i].start, ntlm_chain_tests[i].end); + tests_passed &= cpu_test_chain(ntlm_chain_tests[i].charset, ntlm_chain_tests[i].plaintext_len_min, ntlm_chain_tests[i].plaintext_len_max, ntlm_chain_tests[i].table_index, ntlm_chain_tests[i].chain_len, ntlm_chain_tests[i].start, ntlm_chain_tests[i].end); + } + } + + return tests_passed; +} diff --git a/test_chain.h b/test_chain.h new file mode 100644 index 0000000..4c765ed --- /dev/null +++ b/test_chain.h @@ -0,0 +1,6 @@ +#ifndef TEST_CHAIN_H +#define TEST_CHAIN_H + +int test_chain(cl_device_id device, cl_context context, cl_kernel kernel, unsigned int hash_type); + +#endif diff --git a/test_des.c b/test_des.c new file mode 100644 index 0000000..0f99bed --- /dev/null +++ b/test_des.c @@ -0,0 +1,142 @@ +/* + * Rainbow Crackalack: test_des.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is 
free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +#include "opencl_setup.h" +#include "test_des.h" + + +#define DES_TEST_INPUT_SIZE 8 +#define DES_TEST_KEY_SIZE 8 +#define DES_TEST_OUTPUT_SIZE 8 + +struct des_test { + unsigned char key[16]; + unsigned char plaintext[16]; + unsigned char ciphertext[16]; +}; + +/* From https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nbsspecialpublication500-20e1980.pdf, page 33: */ +struct des_test des_tests[] = { + /*{"4FB05E1515AB73A7", "072D43A077075292", "2F22E49BAB7CA1AC"}, + {"49E95D6D4CA229BF", "02FE55778117F12A", "5A6B612CC26CCE4A"}, + {"018310DC409B26D6", "1D9D5C5018F728C2", "5F4C038ED12B2E41"}, + {"1C587F1C13924FEF", "305532286D6F295A", "63FAC0D034D9F793"}*/ + {"0000000000000000", "unused", "aad3b435b51404ee"} +}; + + +int _test_des(cl_device_id device, cl_context context, cl_kernel kernel, unsigned char *plaintext, unsigned char *testkey, unsigned char *ciphertext) { + CLMAKETESTVARS(); + int test_passed = 0; + + cl_mem input_buffer = NULL, key_buffer = NULL, output_buffer = NULL; + cl_mem debug_buffer = NULL; + + unsigned char *input = NULL, *key = NULL, *output = NULL; + unsigned int *debug = NULL; + unsigned int i = 0, u = 0; + + unsigned char expected_output[8] = {0}; + char hex[3] = {0}; + + + memset(expected_output, 0, sizeof(expected_output)); + memset(hex, 0, sizeof(hex)); + + queue = CLCREATEQUEUE(context, device); + + input = calloc(DES_TEST_INPUT_SIZE, sizeof(unsigned char)); + key = 
calloc(DES_TEST_KEY_SIZE, sizeof(unsigned char)); + output = calloc(DES_TEST_OUTPUT_SIZE, sizeof(unsigned char)); + #define DEBUG_LEN 16 + debug = calloc(DEBUG_LEN, sizeof(unsigned int)); + if ((input == NULL) || (key == NULL) || (output == NULL) || (debug == NULL)) { + fprintf(stderr, "Failed to create I/O arrays.\n"); + exit(-1); + } + + /* Convert the string hex into bytes. */ + for (i = 0; i < 8; i++) { + memcpy(hex, plaintext + (i * 2), 2); + sscanf(hex, "%2x", &u); + input[i] = u; + + memcpy(hex, testkey + (i * 2), 2); + sscanf(hex, "%2x", &u); + key[i] = u; + + memcpy(hex, ciphertext + (i * 2), 2); + sscanf(hex, "%2x", &u); + expected_output[i] = u; + } + + CLCREATEARG_ARRAY(0, input_buffer, CL_RO, input, DES_TEST_INPUT_SIZE * sizeof(unsigned char)); + CLCREATEARG_ARRAY(1, key_buffer, CL_RO, key, DES_TEST_KEY_SIZE * sizeof(unsigned char)); + CLCREATEARG_ARRAY(2, output_buffer, CL_WO, output, DES_TEST_OUTPUT_SIZE * sizeof(unsigned char)); + CLCREATEARG_ARRAY(3, debug_buffer, CL_WO, debug, DEBUG_LEN * sizeof(unsigned int)); + + CLRUNKERNEL(queue, kernel, &global_work_size); + CLFLUSH(queue); + CLWAIT(queue); + + CLREADBUFFER(output_buffer, DES_TEST_OUTPUT_SIZE * sizeof(unsigned char), output); + CLREADBUFFER(debug_buffer, DEBUG_LEN * sizeof(unsigned int), debug); + + printf("\nDEBUG: %x %x\n\n", debug[0], debug[1]); + + if (memcmp(output, expected_output, 8) == 0) + test_passed = 1; + else { + test_passed = 0; + + printf("\n\n\tExpected: "); + for(i = 0; i < 8; i++) + printf("%02x", expected_output[i]); + printf("\n\tComputed: "); + for(i = 0; i < DES_TEST_OUTPUT_SIZE; i++) + printf("%02x", output[i]); + printf("\n\n"); + } + + CLFREEBUFFER(input_buffer); + CLFREEBUFFER(key_buffer); + CLFREEBUFFER(output_buffer); + + CLRELEASEQUEUE(queue); + + FREE(input); + FREE(key); + FREE(output); + FREE(debug); + return test_passed; +} + + +int test_des(cl_device_id device, cl_context context, cl_kernel kernel) { + int tests_passed = 1; + unsigned int i = 0; + + for (i = 
0; i < (sizeof(des_tests) / sizeof(struct des_test)); i++) { + tests_passed &= _test_des(device, context, kernel, des_tests[i].plaintext, des_tests[i].key, des_tests[i].ciphertext); + } + + return tests_passed; +} diff --git a/test_des.h b/test_des.h new file mode 100644 index 0000000..86eb7e9 --- /dev/null +++ b/test_des.h @@ -0,0 +1,6 @@ +#ifndef TEST_DES_H +#define TEST_DES_H + +int test_des(cl_device_id device, cl_context context, cl_kernel kernel); + +#endif diff --git a/test_hash.c b/test_hash.c new file mode 100644 index 0000000..f972376 --- /dev/null +++ b/test_hash.c @@ -0,0 +1,189 @@ +/* + * Rainbow Crackalack: test_hash.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include + +#include "cpu_rt_functions.h" +#include "misc.h" +#include "opencl_setup.h" +#include "shared.h" +#include "test_shared.h" + +#include "test_hash.h" + + +struct hash_test { + char input[MAX_PLAINTEXT_LEN]; + char output[(MAX_HASH_OUTPUT_LEN * 2) + 1]; +}; + +struct hash_test lm_hash_tests[] = { + {"", "aad3b435b51404ee"}, + {"passwor", "9b39717fb8d352de"}, + {"PASSWOR", "e52cac67419a9a22"}, + {"ABC", "8c6f5d02deb21501"}, + {"ABCD", "e165f0192ef85ebb"}, + {"MPVVOLO", "d663e6cf87a4a45c"}, + {"0123456", "5645f13f500882b2"}, + {"()", "a31ffd349f83b8ca"}, + {"!!!!", "844181211e2dd527"}, + {"Z1Y2X3:", "04c20b9a5e0c54ce"}, + {"!@#$%^&", "d0daebaf1cff9d12"}, + {"@#$%^&*", "60faab48f42dcd8b"} +}; + +struct hash_test ntlm_hash_tests[] = { + {"", "31d6cfe0d16ae931b73c59d7e0c089c0"}, + {"12345", "7a21990fcd3d759941e45c490f143d5f"}, + {"abc123", "f9e37e83b83c47a93c2f09f66408631b"}, + {"password", "8846f7eaee8fb117ad06bdd830b7586c"}, + {"computer", "2b2ac2d1c7c8fda6cea80b5fad7563aa"}, + {"123456", "32ed87bdb5fdc5e9cba88547376818d4"}, + {"tigger", "b7e0ea9fbffcf6dd83086e905089effd"}, + {"1234", "7ce21f17c0aee7fb9ceba532d0546ad6"}, + {"Hockey7!", "ff91fb3186204bbd651dc6d163d7f113"}, + {"C1t1z3n#", "ff0bc475edd85a6af13afd6e4c5039a9"}, + {"London101#", "fe8b0b8163e31388ed16680f5bc6a086"}, + {"Holiday!1234", "fddf95b2194203ddc84d53e822510005"}, +}; + + +/* Creates and tests a hash using the CPU. */ +int cpu_test_hash_ntlm(char *input, char *expected_output_hex) { + unsigned char hash[16] = {0}; + char hash_hex[(sizeof(hash) * 2) + 1] = {0}; + + + ntlm_hash(input, strlen(input), hash); + if (!bytes_to_hex(hash, sizeof(hash), hash_hex, sizeof(hash_hex))) + return 0; + + if (strcmp(hash_hex, expected_output_hex) != 0) { + printf("\n\nCPU Error:\n\tPlaintext: %s\n\tExpected hash: %s\n\tComputed hash: %s\n\n", input, expected_output_hex, hash_hex); + return 0; + } else + return 1; +} + + +/* Creates and tests a hash using the GPU. 
*/ +int gpu_test_hash(cl_device_id device, cl_context context, cl_kernel kernel, char *_input, char *expected_output_hex) { + CLMAKETESTVARS(); + int test_passed = 0; + int i = 0; + + unsigned char expected_output[32] = {0}; + unsigned int expected_output_len = 0; + + cl_mem alg_buffer = NULL, input_buffer = NULL, input_len_buffer = NULL, output_buffer = NULL, output_len_buffer = NULL, debug_buffer = NULL; + + char *input = NULL; + unsigned char *output = NULL, *debug_ptr = NULL; + /*unsigned int *debug_ptr = NULL;*/ + + cl_uint hash_type = HASH_UNDEFINED; + cl_uint input_len = 0, output_len = 0; + + + queue = CLCREATEQUEUE(context, device); + + output = calloc(MAX_HASH_OUTPUT_LEN, sizeof(unsigned char)); + debug_ptr = calloc(DEBUG_LEN, sizeof(unsigned char)); + if ((output == NULL) || (debug_ptr == NULL)) { + fprintf(stderr, "Error creating I/O arrays.\n"); + exit(-1); + } + + + /* clCreateBuffer() doesn't like zero-length buffers... */ + if (strlen(_input) == 0) { + input = strdup("X"); + input_len = 0; + } else { + input = strdup(_input); + input_len = strlen(input); + /*input = calloc(input_len + 1, sizeof(unsigned char)); + strncpy(input, _input, input_len);*/ + } + + CLCREATEARG(0, alg_buffer, CL_RO, hash_type, sizeof(hash_type)); + CLCREATEARG_ARRAY(1, input_buffer, CL_RO, input, strlen(input) + 1); + CLCREATEARG(2, input_len_buffer, CL_RO, input_len, sizeof(cl_uint)); + CLCREATEARG_ARRAY(3, output_buffer, CL_WO, output, MAX_HASH_OUTPUT_LEN); + CLCREATEARG(4, output_len_buffer, CL_WO, output_len, sizeof(cl_uint)); + CLCREATEARG_DEBUG(5, debug_buffer, debug_ptr); + /*CLCREATEARG_ARRAY(5, debug_buffer, CL_WO, debug_ptr, DEBUG_LEN * sizeof(unsigned int));*/ + + CLRUNKERNEL(queue, kernel, &global_work_size); + CLFLUSH(queue); + CLWAIT(queue); + + CLREADBUFFER(output_buffer, MAX_HASH_OUTPUT_LEN, output); + CLREADBUFFER(output_len_buffer, sizeof(cl_uint), &output_len); + CLREADBUFFER(debug_buffer, DEBUG_LEN, debug_ptr); + + expected_output_len = 
hex_to_bytes(expected_output_hex, sizeof(expected_output), expected_output); + if ((expected_output_len == output_len) && (memcmp(output, expected_output, expected_output_len) == 0)) + test_passed = 1; + else { + printf("\n\nGPU Error:\n\tPlaintext: %s\n\tExpected hash: %s\n\tComputed hash: ", input, expected_output_hex); + for (i = 0; i < output_len; i++) + printf("%02x", output[i]); + printf("\n\n"); + } + /* + printf("debug: "); + for (i = 0; i < DEBUG_LEN; i++) + printf("%x ", debug_ptr[i]); + printf("\n"); + */ + CLFREEBUFFER(alg_buffer); + CLFREEBUFFER(input_buffer); + CLFREEBUFFER(input_len_buffer); + CLFREEBUFFER(output_buffer); + CLFREEBUFFER(output_len_buffer); + CLFREEBUFFER(debug_buffer); + + CLRELEASEQUEUE(queue); + + FREE(input); + FREE(debug_ptr); + return test_passed; +} + + +int test_hash(cl_device_id device, cl_context context, cl_kernel kernel, unsigned int hash_type) { + int tests_passed = 1; + unsigned int i = 0; + + if (hash_type == HASH_LM) { + for (i = 0; i < (sizeof(lm_hash_tests) / sizeof(struct hash_test)); i++) + tests_passed &= gpu_test_hash(device, context, kernel, lm_hash_tests[i].input, lm_hash_tests[i].output); + } else if (hash_type == HASH_NTLM) { + for (i = 0; i < (sizeof(ntlm_hash_tests) / sizeof(struct hash_test)); i++) { + tests_passed &= gpu_test_hash(device, context, kernel, ntlm_hash_tests[i].input, ntlm_hash_tests[i].output); + tests_passed &= cpu_test_hash_ntlm(ntlm_hash_tests[i].input, ntlm_hash_tests[i].output); + } + } else { + fprintf(stderr, "Error: unimplemented hash: %u\n", hash_type); + tests_passed = 0; + } + + return tests_passed; +} diff --git a/test_hash.h b/test_hash.h new file mode 100644 index 0000000..34ce82f --- /dev/null +++ b/test_hash.h @@ -0,0 +1,6 @@ +#ifndef TEST_HASH_H +#define TEST_HASH_H + +int test_hash(cl_device_id device, cl_context context, cl_kernel kernel, unsigned int hash_type); + +#endif diff --git a/test_hash_to_index.c b/test_hash_to_index.c new file mode 100644 index 0000000..b021b42 
--- /dev/null +++ b/test_hash_to_index.c @@ -0,0 +1,185 @@ +/* + * Rainbow Crackalack: test_hash_to_index.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include "cpu_rt_functions.h" +#include "misc.h" +#include "opencl_setup.h" +#include "shared.h" +#include "test_shared.h" +#include "test_hash_to_index.h" + + +struct h2i_test { + char hash[MAX_HASH_OUTPUT_LEN * 2]; + unsigned int charset_len; + unsigned int plaintext_len_min; + unsigned int plaintext_len_max; + unsigned int table_index; + unsigned int pos; + uint64_t index; +}; + +struct h2i_test lm_h2i_tests[] = { + {"aabbccddeeff0011", CHARSET_ALPHA_LEN, 1, 7, 0, 0, 156475956UL}, + {"aabbccddeeff0011", CHARSET_ALPHA_LEN, 1, 7, 0, 100, 156476056UL}, + {"aabbccddeeff0011", CHARSET_ALPHA_LEN, 1, 7, 8, 0, 157000244UL}, + {"d663e6cf87a4a45c", CHARSET_ALPHA_LEN, 1, 7, 0, 0, 4110420946UL}, + {"0102030405060708", CHARSET_ALPHA_LEN, 1, 7, 5, 750, 8350441011UL}, + {"0102030405060708", CHARSET_ALPHA_LEN, 1, 7, 30, 20000, 8352098661UL}, + {"ffeeddccbbaa9988", CHARSET_ASCII_32_65_123_4_LEN, 1, 7, 0, 0, 1381910712028UL}, + {"123456789abcdef0", CHARSET_ASCII_32_65_123_4_LEN, 1, 7, 7, 9999, 281009513815UL}, +}; +struct h2i_test ntlm_h2i_tests[] = { + /*{"", CHARSET_ASCII_32_65_123_4_LEN, 1, 8, 0, 0, 0},*/ + {"123456789abcdef0", CHARSET_ASCII_32_95_LEN, 8, 8, 0, 666, 1438903040496756UL}, +}; + + +int cpu_test_h2i(char *hash_hex, 
uint64_t plaintext_space_total, unsigned int table_index, unsigned int pos, uint64_t expected_index) { + unsigned char hash[MAX_HASH_OUTPUT_LEN] = {0}; + unsigned int hash_len = hex_to_bytes(hash_hex, sizeof(hash), hash); + uint64_t computed_index = hash_to_index(hash, hash_len, TABLE_INDEX_TO_REDUCTION_OFFSET(table_index), plaintext_space_total, pos); + + + if (computed_index == expected_index) + return 1; + else { + printf("\n\nCPU error:\n\tExpected index: %"PRIu64"\n\tComputed index: %"PRIu64"\n", expected_index, computed_index); + return 0; + } +} + + +int gpu_test_h2i(cl_device_id device, cl_context context, cl_kernel kernel, char *hash_hex, unsigned int charset_len, unsigned int plaintext_len_min, unsigned int plaintext_len_max, unsigned int table_index, unsigned int pos, uint64_t expected_index) { + CLMAKETESTVARS(); + + int test_passed = 0; + + cl_mem hash_buffer = NULL, hash_len_buffer = NULL, charset_len_buffer = NULL, plaintext_len_min_buffer = NULL, plaintext_len_max_buffer = NULL, table_index_buffer = NULL, pos_buffer = NULL, index_buffer = NULL, debug_buffer = NULL; + + unsigned char hash[MAX_HASH_OUTPUT_LEN] = {0}; + unsigned int hash_len = 0; + cl_ulong index = 0; + + unsigned char *debug_ptr = NULL; + cl_ulong *index_ptr = NULL; + + queue = CLCREATEQUEUE(context, device); + + hash_len = hex_to_bytes(hash_hex, sizeof(hash), hash); + CLCREATEARG_ARRAY(0, hash_buffer, CL_RO, hash, hash_len); + CLCREATEARG(1, hash_len_buffer, CL_RO, hash_len, sizeof(hash_len)); + CLCREATEARG(2, charset_len_buffer, CL_RO, charset_len, sizeof(charset_len)); + CLCREATEARG(3, plaintext_len_min_buffer, CL_RO, plaintext_len_min, sizeof(plaintext_len_min)); + CLCREATEARG(4, plaintext_len_max_buffer, CL_RO, plaintext_len_max, sizeof(plaintext_len_max)); + CLCREATEARG(5, table_index_buffer, CL_RO, table_index, sizeof(table_index)); + CLCREATEARG(6, pos_buffer, CL_RO, pos, sizeof(pos)); + CLCREATEARG(7, index_buffer, CL_WO, index, sizeof(index)); + CLCREATEARG_DEBUG(8, 
debug_buffer, debug_ptr); + + CLRUNKERNEL(queue, kernel, &global_work_size); + CLFLUSH(queue); + CLWAIT(queue); + + index_ptr = calloc(1, sizeof(cl_ulong)); + if (index_ptr == NULL) { + fprintf(stderr, "Error while creating output buffer.\n"); + exit(-1); + } + + CLREADBUFFER(index_buffer, sizeof(cl_ulong), index_ptr); + + if (*index_ptr == expected_index) + test_passed = 1; + else { + printf("\n\nGPU Error:\n\tExpected index: %"PRIu64"\n\tComputed index: %"PRIu64"\n\n", expected_index, *index_ptr); + } + + /* + READBUFFER(debug_buffer, DEBUG_LEN, debug_ptr); + printf("debug: "); + for (i = 0; i < DEBUG_LEN; i++) + printf("%x ", debug_ptr[i]); + printf("\n"); + */ + + CLFREEBUFFER(hash_buffer); + CLFREEBUFFER(hash_len_buffer); + CLFREEBUFFER(charset_len_buffer); + CLFREEBUFFER(plaintext_len_min_buffer); + CLFREEBUFFER(plaintext_len_max_buffer); + CLFREEBUFFER(table_index_buffer); + CLFREEBUFFER(pos_buffer); + CLFREEBUFFER(index_buffer); + CLFREEBUFFER(debug_buffer); + CLRELEASEQUEUE(queue); + + FREE(index_ptr); + FREE(debug_ptr); + return test_passed; +} + +int test_h2i(cl_device_id device, cl_context context, cl_kernel kernel, unsigned int hash_type) { + int tests_passed = 1; + unsigned int i = 0; + uint64_t plaintext_space_total = 0; + uint64_t plaintext_space_up_to_index[MAX_PLAINTEXT_LEN] = {0}; + + + if (hash_type == HASH_LM) { + for (i = 0; i < (sizeof(lm_h2i_tests) / sizeof(struct h2i_test)); i++) { + + /* For LM tests, ensure that the hex is 16 characters long (8 bytes). + * Otherwise, the test is broken. 
*/ + /*if ((lm_h2i_tests[i].hash_type == HASH_LM) && + (strlen(lm_h2i_tests[i].hash) != 16)) { + fprintf(stderr, "Error: h2i_test has invalid hash length: %zu [%s]\n", strlen(lm_h2i_tests[i].hash), lm_h2i_tests[i].hash); + exit(-1); + }*/ + + tests_passed &= gpu_test_h2i(device, context, kernel, lm_h2i_tests[i].hash, lm_h2i_tests[i].charset_len, lm_h2i_tests[i].plaintext_len_min, lm_h2i_tests[i].plaintext_len_max, lm_h2i_tests[i].table_index, lm_h2i_tests[i].pos, lm_h2i_tests[i].index); + + plaintext_space_total = fill_plaintext_space_table(lm_h2i_tests[i].charset_len, lm_h2i_tests[i].plaintext_len_min, lm_h2i_tests[i].plaintext_len_max, plaintext_space_up_to_index); + + tests_passed &= cpu_test_h2i(lm_h2i_tests[i].hash, plaintext_space_total, lm_h2i_tests[i].table_index, lm_h2i_tests[i].pos, lm_h2i_tests[i].index); + } + } else if (hash_type == HASH_NTLM) { + for (i = 0; i < (sizeof(ntlm_h2i_tests) / sizeof(struct h2i_test)); i++) { + + /* For NTLM tests, ensure that the hex is 32 characters long (16 bytes). + * Otherwise, the test is broken. 
*/ + /*if ((ntlm_h2i_tests[i].hash_type == HASH_LM) && + (strlen(ntlm_h2i_tests[i].hash) != 32)) { + fprintf(stderr, "Error: h2i_test has invalid hash length: %zu [%s]\n", strlen(ntlm_h2i_tests[i].hash), ntlm_h2i_tests[i].hash); + exit(-1); + }*/ + + tests_passed &= gpu_test_h2i(device, context, kernel, ntlm_h2i_tests[i].hash, ntlm_h2i_tests[i].charset_len, ntlm_h2i_tests[i].plaintext_len_min, ntlm_h2i_tests[i].plaintext_len_max, ntlm_h2i_tests[i].table_index, ntlm_h2i_tests[i].pos, ntlm_h2i_tests[i].index); + + plaintext_space_total = fill_plaintext_space_table(ntlm_h2i_tests[i].charset_len, ntlm_h2i_tests[i].plaintext_len_min, ntlm_h2i_tests[i].plaintext_len_max, plaintext_space_up_to_index); + + tests_passed &= cpu_test_h2i(ntlm_h2i_tests[i].hash, plaintext_space_total, ntlm_h2i_tests[i].table_index, ntlm_h2i_tests[i].pos, ntlm_h2i_tests[i].index); + } + } + + return tests_passed; +} diff --git a/test_hash_to_index.h b/test_hash_to_index.h new file mode 100644 index 0000000..0c6eb8d --- /dev/null +++ b/test_hash_to_index.h @@ -0,0 +1,6 @@ +#ifndef TEST_HASH_TO_INDEX_H +#define TEST_HASH_TO_INDEX_H + +int test_h2i(cl_device_id device, cl_context context, cl_kernel kernel, unsigned int hash_type); + +#endif diff --git a/test_index_to_plaintext.c b/test_index_to_plaintext.c new file mode 100644 index 0000000..3ee64c5 --- /dev/null +++ b/test_index_to_plaintext.c @@ -0,0 +1,156 @@ +/* + * Rainbow Crackalack: test_index_to_plaintext.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include "cpu_rt_functions.h" +#include "misc.h" +#include "opencl_setup.h" +#include "shared.h" +#include "test_shared.h" +#include "test_index_to_plaintext.h" + +struct i2p_test { + char charset[MAX_CHARSET_LEN]; + cl_uint plaintext_len_min; + cl_uint plaintext_len_max; + cl_ulong index; + char expected_plaintext[MAX_PLAINTEXT_LEN]; +}; + +struct i2p_test i2p_tests[] = { + {CHARSET_ALPHA, 1, 7, 0, "A"}, + {CHARSET_ALPHA, 1, 7, 13, "N"}, + {CHARSET_ALPHA, 1, 7, 25, "Z"}, + {CHARSET_ALPHA, 1, 7, 1234, "AUM"}, + {CHARSET_ALPHA, 1, 7, 9999, "NTP"}, + {CHARSET_ALPHA, 1, 7, 8675309, "RYOGT"}, + {CHARSET_ALPHA, 1, 7, 123456789, "JJDDJB"}, + {CHARSET_ALPHA, 1, 7, 2813308004, "IBTIHFA"}, + {CHARSET_ALPHA, 1, 7, 4216457714, "MPVVOLO"}, + {CHARSET_ALPHA, 1, 7, 9876543210, "EYFULZO"}, + {CHARSET_ALPHA, 1, 7, 10550285459311888740UL, "EWBFATI"}, + {CHARSET_ASCII_32_95, 1, 8, 666666666666666, "(Rv!f^-+"}, + {CHARSET_MIXALPHA_NUMERIC, 1, 8, 131313131313131313, "yUsFdo5P"}, + {CHARSET_ALPHA_NUMERIC_SYMBOL32_SPACE, 1, 8, 10450885951331886755UL, "36$&'L.."}, + {CHARSET_ASCII_32_95, 8, 8, 5222991064626285, "jk5(J-f\\"}, + {CHARSET_ASCII_32_95, 9, 9, 381435424925352145, "3!u]YO*f%"} +}; + + +int cpu_test_index_to_plaintext(char *charset, cl_uint plaintext_len_min, cl_uint plaintext_len_max, cl_ulong index, char *expected_plaintext) { + uint64_t plaintext_space_up_to_index[MAX_PLAINTEXT_LEN] = {0}; + char computed_plaintext[MAX_PLAINTEXT_LEN] = {0}; + unsigned int computed_plaintext_len = 0; + + + fill_plaintext_space_table(strlen(charset), plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index); + + index_to_plaintext(index, charset, strlen(charset), plaintext_len_min, plaintext_len_max, plaintext_space_up_to_index, computed_plaintext, &computed_plaintext_len); + if (strcmp(computed_plaintext, 
expected_plaintext) == 0) + return 1; + else { + printf("\n\nCPU error:\n\tIndex: %"PRIu64"\n\tExpected: [%"PRIu64"][%s]\n\tCalculated: [%u][%s]\n\n", index, strlen(expected_plaintext), expected_plaintext, computed_plaintext_len, computed_plaintext); + return 0; + } +} + + +int gpu_test_index_to_plaintext(cl_device_id device, cl_context context, cl_kernel kernel, char *charset, cl_uint plaintext_len_min, cl_uint plaintext_len_max, cl_ulong index, char *expected_plaintext) { + CLMAKETESTVARS(); + int test_passed = 0; + + cl_mem charset_buffer = NULL, charset_len_buffer = NULL, plaintext_len_min_buffer = NULL, plaintext_len_max_buffer = NULL, index_buffer = NULL, plaintext_buffer = NULL, plaintext_len_buffer = NULL, debug_buffer = NULL; + + unsigned char *plaintext = NULL; + unsigned char *debug_ptr = NULL; + + cl_uint charset_len = strlen(charset); + cl_uint plaintext_len = MAX_PLAINTEXT_LEN; + + + queue = CLCREATEQUEUE(context, device); + + plaintext = calloc(MAX_PLAINTEXT_LEN, sizeof(unsigned char)); + if (plaintext == NULL) { + fprintf(stderr, "Failed to create I/O buffers.\n"); + exit(-1); + } + + CLCREATEARG_ARRAY(0, charset_buffer, CL_RO, charset, strlen(charset) + 1); + CLCREATEARG(1, charset_len_buffer, CL_RO, charset_len, sizeof(cl_uint)); + CLCREATEARG(2, plaintext_len_min_buffer, CL_RO, plaintext_len_min, sizeof(cl_uint)); + CLCREATEARG(3, plaintext_len_max_buffer, CL_RO, plaintext_len_max, sizeof(cl_uint)); + CLCREATEARG(4, index_buffer, CL_RO, index, sizeof(cl_ulong)); + CLCREATEARG_ARRAY(5, plaintext_buffer, CL_WO, plaintext, MAX_PLAINTEXT_LEN); + CLCREATEARG(6, plaintext_len_buffer, CL_WO, plaintext_len, sizeof(cl_uint)); + CLCREATEARG_DEBUG(7, debug_buffer, debug_ptr); + + CLRUNKERNEL(queue, kernel, &global_work_size); + CLFLUSH(queue); + CLWAIT(queue); + + CLREADBUFFER(plaintext_buffer, MAX_PLAINTEXT_LEN, plaintext); + CLREADBUFFER(plaintext_len_buffer, sizeof(cl_uint), &plaintext_len); + CLREADBUFFER(debug_buffer, DEBUG_LEN, debug_ptr); + + /* + 
printf("\ndebug: "); + for (i = 0; i < DEBUG_LEN; i++) { + printf("%x ", debug_ptr[i]); + } + printf("\n\n"); + */ + + if ((plaintext_len == strlen(expected_plaintext)) && (strcmp(expected_plaintext, (char *)plaintext) == 0)) + test_passed = 1; + else { + printf("\n\nGPU error:\n\tIndex: %"PRIu64"\n\tExpected: [%"PRIu64"][%s]\n\tCalculated: [%u][%s]\n\n", index, strlen(expected_plaintext), expected_plaintext, plaintext_len, plaintext); + } + + CLFREEBUFFER(charset_buffer); + CLFREEBUFFER(charset_len_buffer); + CLFREEBUFFER(plaintext_len_min_buffer); + CLFREEBUFFER(plaintext_len_max_buffer); + CLFREEBUFFER(index_buffer); + CLFREEBUFFER(plaintext_buffer); + CLFREEBUFFER(plaintext_len_buffer); + CLFREEBUFFER(debug_buffer); + + CLRELEASEQUEUE(queue); + + FREE(plaintext); + FREE(debug_ptr); + return test_passed; +} + + +int test_index_to_plaintext(cl_device_id device, cl_context context, cl_kernel kernel) { + int tests_passed = 1; + unsigned int i = 0; + + for (i = 0; i < (sizeof(i2p_tests) / sizeof(struct i2p_test)); i++) { + tests_passed &= gpu_test_index_to_plaintext(device, context, kernel, i2p_tests[i].charset, i2p_tests[i].plaintext_len_min, i2p_tests[i].plaintext_len_max, i2p_tests[i].index, i2p_tests[i].expected_plaintext); + + tests_passed &= cpu_test_index_to_plaintext(i2p_tests[i].charset, i2p_tests[i].plaintext_len_min, i2p_tests[i].plaintext_len_max, i2p_tests[i].index, i2p_tests[i].expected_plaintext); + } + + return tests_passed; +} diff --git a/test_index_to_plaintext.h b/test_index_to_plaintext.h new file mode 100644 index 0000000..c7e510c --- /dev/null +++ b/test_index_to_plaintext.h @@ -0,0 +1,6 @@ +#ifndef TEST_INDEX_TO_PLAINTEXT_H +#define TEST_INDEX_TO_PLAINTEXT_H + +int test_index_to_plaintext(cl_device_id device, cl_context context, cl_kernel kernel); + +#endif diff --git a/test_shared.c b/test_shared.c new file mode 100644 index 0000000..a38ec9f --- /dev/null +++ b/test_shared.c @@ -0,0 +1,53 @@ +/* + * Rainbow Crackalack: test_shared.c + * 
Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include "test_shared.h" + + +/* Converts a byte array to hex. Returns 1 on success, or 0 if output buffer is too + * small. */ +unsigned int bytes_to_hex(unsigned char *bytes, unsigned int num_bytes, char *hex, unsigned int hex_size) { + unsigned int i = 0; + + + /* Ensure the output buffer is big enough for all the hex characters. */ + if (hex_size < (num_bytes * 2) + 1) + return 0; + + for (i = 0; i < num_bytes; i++) + snprintf(hex + (i * 2), 3, "%02x", bytes[i]); + + return 1; +} + + +/* Converts a hex string to bytes and returns the number of bytes converted. */ +unsigned int hex_to_bytes(char *hex_str, unsigned int bytes_len, unsigned char *bytes) { + unsigned int i = 0, u = 0; + char hex[3]; + + /* Convert the string hex into bytes. 
*/ + for (i = 0; (i < strlen(hex_str) / 2) && (i < bytes_len); i++) { + memcpy(hex, hex_str + (i * 2), 2); + sscanf(hex, "%2x", &u); + bytes[i] = u; + } + + return i; +} diff --git a/test_shared.h b/test_shared.h new file mode 100644 index 0000000..9b778ee --- /dev/null +++ b/test_shared.h @@ -0,0 +1,9 @@ +#ifndef TEST_SHARED_H +#define TEST_SHARED_H + +#include "charset.h" + +unsigned int bytes_to_hex(unsigned char *bytes, unsigned int num_bytes, char *hex, unsigned int hex_size); +unsigned int hex_to_bytes(char *hex_str, unsigned int bytes_len, unsigned char *bytes); + +#endif diff --git a/verify.c b/verify.c new file mode 100644 index 0000000..9f11748 --- /dev/null +++ b/verify.c @@ -0,0 +1,270 @@ +/* + * Rainbow Crackalack: verify.c + * Copyright (C) 2018-2019 Joe Testa + * + * This program is free software: you can redistribute it and/or modify + * it under the terms version 3 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "charset.h" +#include "cpu_rt_functions.h" +#include "file_lock.h" +#include "misc.h" +#include "rtc_decompress.h" +#include "shared.h" +#include "verify.h" + + +/* Verifies a rainbow table already loaded from disk. 
*/ +int verify_rainbowtable(uint64_t *rainbowtable, unsigned int num_chains, unsigned int table_type, uint64_t expected_start, uint64_t plaintext_space_total, unsigned int *error_chain_num) { + unsigned int i = 0; + uint64_t start = 0, end = 0; + + + if (table_type == VERIFY_TABLE_TYPE_GENERATED) { + /* Newly-generated tables must have sequential start indices, and end indices that are not zero. */ + for (i = 0; i < num_chains; i++) { + start = rainbowtable[i * 2]; + end = rainbowtable[(i * 2) + 1]; + + if (start != expected_start) { + fprintf(stderr, "Start index at chain #%u is not the expected value! Expected %"PRIu64", but found %"PRIu64".\n", i, expected_start, start); + *error_chain_num = i; + return 0; + } + + if (end == 0) { + fprintf(stderr, "Chain #%u has an end value of zero!\n", i); + *error_chain_num = i; + return 0; + } + + /* The indices must not be equal or greater than the plaintext space total. */ + if ((plaintext_space_total > 0) && \ + ((start >= plaintext_space_total) || (end >= plaintext_space_total))) { + fprintf(stderr, "Start and/or end indices are greater or equal to the plaintext space total!\n\n\tStart index: %"PRIu64"\n\tEnd index: %"PRIu64"\nPlaintext space total: %"PRIu64"\n\n", start, end, plaintext_space_total); + *error_chain_num = i; + return 0; + } + expected_start++; + } + } else if (table_type == VERIFY_TABLE_TYPE_LOOKUP) { + uint64_t last_end = 0; + + + /* For tables ready to be used for lookups (i.e.: sorted tables), the end indices must be sorted in ascending order. */ + for (i = 0; i < num_chains; i++) { + start = rainbowtable[i * 2]; + end = rainbowtable[(i * 2) + 1]; + + if (end == 0) { + fprintf(stderr, "Error: end index for chain #%u is zero.\n", i); + return 0; + } + + if (end < last_end) { + fprintf(stderr, "Error: table end indices are not sorted. 
Current end index (at chain #%u) is not greater or equal to last end index.\n\n\tCurrent end index: %"PRIu64"\n\tLast end index: %"PRIu64"\n\n", i, end, last_end); + return 0; + } + + /* The indices must not be equal or greater than the plaintext space total. */ + if ((plaintext_space_total > 0) && \ + ((start >= plaintext_space_total) || (end >= plaintext_space_total))) { + fprintf(stderr, "Start and/or end indices are greater or equal to the plaintext space total!\n\n\tStart index: %"PRIu64"\n\tEnd index: %"PRIu64"\nPlaintext space total: %"PRIu64"\n\n", start, end, plaintext_space_total); + return 0; + } + + last_end = end; + } + } else { + fprintf(stderr, "Invalid value for table type: %u\n", table_type); + return 0; + } + + + return 1; +} + + +/* Verifies a rainbow table file. + * + * When 'table_type' is VERIFY_TABLE_TYPE_GENERATED, the table is understood to have + * just been generated. Hence, the start point must begin at (total_chains_in_table * + * part_index), then must increment sequentially. The end points must not be zero. + * When 'table_type' is VERIFY_TABLE_TYPE_LOOKUP, the table is checked for increasing + * end indices (as they must be sorted), and must not be zero. + * + * If 'table_should_be_complete' is VERIFY_TABLE_IS_COMPLETE, then the file size should + * be (total_chains_in_table * CHAIN_SIZE). Otherwise, the file size is not checked, + * other than ensuring it is some multiple of CHAIN_SIZE. This is ignored when + * 'table_type' is VERIFY_TABLE_TYPE_LOOKUP. + * + * If 'truncate_at_error' is set, then the file is truncated to just before the first + * error found, if any. This is ignored when 'table_type' is VERIFY_TABLE_TYPE_LOOKUP. + * + * The num_chains_to_verify parameter specifies how many random chains should be + * verified with CPU code. When set to -1, the default of 100 is checked. + * + * Returns 1 on success, or 0 on failure. 
*/ +int verify_rainbowtable_file(char *filename, unsigned int table_type, unsigned int table_should_be_complete, unsigned int truncate_at_error, int num_chains_to_verify) { + rt_parameters rt_params = {0}; + uint64_t plaintext_space_up_to_index[MAX_PLAINTEXT_LEN] = {0}; + + rc_file f = NULL; + uint64_t *rainbow_table = NULL; + char *charset = NULL; + + unsigned int file_size = 0, actual_num_chains = 0, error_chain_num = 0, is_compressed = 0; + uint64_t expected_start = 0, plaintext_space_total = 0; + + + is_compressed = str_ends_with(filename, ".rtc"); + + /* Parse the RT parameters from the filename. */ + parse_rt_params(&rt_params, filename); + if (rt_params.parsed == 0) { + fprintf(stderr, "Error: failed to parse filename: %s\n", filename); + return 0; + } + + charset = validate_charset(rt_params.charset_name); + if (charset == NULL) { + fprintf(stderr, "Character set is invalid: %s\n", rt_params.charset_name); + return 0; + } + + plaintext_space_total = fill_plaintext_space_table(strlen(charset), rt_params.plaintext_len_min, rt_params.plaintext_len_max, plaintext_space_up_to_index); + + expected_start = (uint64_t)rt_params.num_chains * (uint64_t)rt_params.table_part; + + /* Open the file and obtain a lock on it. */ + f = rc_fopen(filename, 0); + if (f == NULL) { + fprintf(stderr, "Failed to open rainbow table file: %s\n", filename); + return 0; + } + + if (rc_flock(f) != 0) { + rc_fclose(f); + fprintf(stderr, "Failed to lock rainbow table file: %s\n", filename); + return 0; + } + + /* Get the file size. */ + rc_fseek(f, 0, RCSEEK_END); + file_size = rc_ftell(f); + rc_fseek(f, 0, RCSEEK_SET); + + /* An empty file is always an error. */ + if (file_size == 0) { + rc_fclose(f); + fprintf(stderr, "Error: file is empty!\n"); + return 0; + /* If the table should be complete, then ensure its file size is what we'd expect. Skip compressed files. 
*/ + } else if ((table_should_be_complete == VERIFY_TABLE_IS_COMPLETE) && (file_size != (rt_params.num_chains * CHAIN_SIZE)) && !is_compressed) { + rc_fclose(f); + fprintf(stderr, "Error: table is expected to be complete, but file size does not match expected value. Expected: %u; actual: %u\n", rt_params.num_chains * CHAIN_SIZE, file_size); + return 0; + /* If the table is incomplete, ensure that the file size is a multiple of CHAIN_SIZE. */ + } else if (((file_size % CHAIN_SIZE) != 0) && !is_compressed) { + rc_fclose(f); + fprintf(stderr, "Error: file size is not aligned to %u bytes: %u\n", CHAIN_SIZE, file_size); + return 0; + } + + if (is_compressed) { + int ret = -1; + + + rc_fclose(f); + ret = rtc_decompress(filename, &rainbow_table, &actual_num_chains); + if (ret < 0) { + fprintf(stderr, "Error while decompressing RTC file: %d\n", ret); + return 0; + } + + if (table_type == VERIFY_TABLE_TYPE_GENERATED) + printf("\n!! WARNING: table is compressed, yet is supposedly unsorted! Only sorted tables should be compressed...\n\n"); + + } else { + actual_num_chains = file_size / CHAIN_SIZE; + rainbow_table = calloc(actual_num_chains * 2, sizeof(uint64_t)); + if (rainbow_table == NULL) { + fprintf(stderr, "Error while creating buffer to read file.\n"); + return 0; + } + + if (rc_fread(rainbow_table, sizeof(uint64_t), actual_num_chains * 2, f) != (actual_num_chains * 2)) { + fprintf(stderr, "Error while reading file: %s (%d)\n", strerror(errno), errno); + return 0; + } + rc_fclose(f); + } + + if (!verify_rainbowtable(rainbow_table, actual_num_chains, table_type, expected_start, plaintext_space_total, &error_chain_num)) { + if ((table_type == VERIFY_TABLE_TYPE_GENERATED) && (truncate_at_error == VERIFY_TRUNCATE_ON_ERROR)) { + f = rc_fopen(filename, 0); + if (f == NULL) + fprintf(stderr, "Error while opening file for truncation: %s (%d)\n", strerror(errno), errno); + else { + rc_ftruncate(f, (error_chain_num * CHAIN_SIZE)); + rc_fclose(f); + } + } + goto err; + } + + /* By 
default, check 100 chains. */ + if (num_chains_to_verify < 0) + num_chains_to_verify = 100; + + if (num_chains_to_verify > 0) { + uint64_t start = 0, computed_end = 0, actual_end = 0, random_chain = 0; + char plaintext[MAX_PLAINTEXT_LEN] = {0}; + unsigned char hash[MAX_HASH_OUTPUT_LEN] = {0}; + unsigned int i = 0, plaintext_len = sizeof(plaintext), hash_len = sizeof(hash); + + + if (rt_params.hash_type == HASH_NTLM) { + for (i = 0; i < num_chains_to_verify; i++) { + random_chain = get_random(actual_num_chains); + /*printf(" Verifying chain #%"PRIu64"...\n", random_chain);*/ + + start = rainbow_table[random_chain * 2]; + actual_end = rainbow_table[(random_chain * 2) + 1]; + + computed_end = generate_rainbow_chain(rt_params.hash_type, charset, strlen(charset), rt_params.plaintext_len_min, rt_params.plaintext_len_max, rt_params.reduction_offset, rt_params.chain_len, start, plaintext_space_up_to_index, plaintext_space_total, plaintext, &plaintext_len, hash, &hash_len); + + if (actual_end != computed_end) { + fprintf(stderr, "Error: chain #%"PRIu64" is invalid!\n Start index: %"PRIu64"\n Actual chain end: %"PRIu64"\n Computed chain end: %"PRIu64"\n\n", random_chain, start, actual_end, computed_end); + goto err; + } + } + } else { + printf("Note: skipping CPU chain verification since hash type is not NTLM.\n"); fflush(stdout); + } + } + + FREE(rainbow_table); + return 1; + + err: + FREE(rainbow_table); + return 0; +} diff --git a/verify.h b/verify.h new file mode 100644 index 0000000..ae333a0 --- /dev/null +++ b/verify.h @@ -0,0 +1,22 @@ +#ifndef _VERIFY_H +#define _VERIFY_H + +#include + + +#define VERIFY_TABLE_TYPE_LOOKUP 1 +#define VERIFY_TABLE_TYPE_GENERATED 0 + +#define VERIFY_TABLE_IS_COMPLETE 1 +#define VERIFY_TABLE_MAY_BE_INCOMPLETE 0 + +#define VERIFY_TRUNCATE_ON_ERROR 1 +#define VERIFY_DONT_TRUNCATE 0 + + +int verify_rainbowtable(uint64_t *rainbowtable, unsigned int num_chains, unsigned int table_type, uint64_t expected_start, uint64_t plaintext_space_total, 
unsigned int *error_chain_num); + +int verify_rainbowtable_file(char *filename, unsigned int table_type, unsigned int table_should_be_complete, unsigned int truncate_at_error, int num_chains_to_verify); + + +#endif diff --git a/version.h b/version.h new file mode 100644 index 0000000..083ac82 --- /dev/null +++ b/version.h @@ -0,0 +1,10 @@ +#ifndef _VERSION_H +#define _VERSION_H + +#include "terminal_color.h" + +#define VERSION "v1.0" + +#define PRINT_PROJECT_HEADER() printf("\n%sRainbow Crackalack %s%s\nCopyright 2018-2019 Positron Security LLC \n%sMake Rainbow Tables Great Again%s\n\n\n", WHITEB, VERSION, CLR, ITALICIZE, CLR); + +#endif