From fcd0d142d0e3df6e926b4c96666fc249e055cd46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20=C5=A0ebela?= <514509@mail.muni.cz> Date: Fri, 7 Jun 2024 11:20:25 +0200 Subject: [PATCH 1/2] pixfmt_conv.c R12LtoRGB implementation in SIMD - added the -msse4.1 flag to tools/Makefile - 9 ms -> 5 ms on my machine --- src/pixfmt_conv.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++ tools/Makefile | 2 +- 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/src/pixfmt_conv.c b/src/pixfmt_conv.c index e10f62f386..07b8d4dab6 100644 --- a/src/pixfmt_conv.c +++ b/src/pixfmt_conv.c @@ -410,6 +410,126 @@ vc_copylineR12LtoRGB(unsigned char * __restrict dst, const unsigned char * __res } } +static void +vc_copylineR12L(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift); +static void +vc_copylineR12LtoRGB_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift); + +// #ifdef __SSSE3__ +#if 1 +#include +#include +static void +vc_copylineR12LtoRGB_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift) +{ + // assert(false); // verify that the function is executed + + // Map 3 bytes of input to 2 bytes of output + // byte whose high nibble I need -> low nibble of 1st byte + // byte whose low nibble I need -> high nibble of 1st byte + // whole 3rd byte -> whole 2nd byte + + #define Z 0x80 // clear dst byte in shuffle + #define F 0xff + + __m128i leftmask = _mm_setr_epi8(F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F); + __m128i centermask = _mm_setr_epi8(0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0); + __m128i rightmask = _mm_setr_epi8(0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0); + + __m128i unpack_hilo_to_odd0 = _mm_setr_epi8(0, Z, 3, Z, 6, Z, 9, Z, 12, Z, 15, Z, 2, Z, 5, Z); + __m128i unpack_hilo_to_odd1 = _mm_setr_epi8(8, Z, 11, Z, 14, Z, 1, Z, 4, Z, 7, Z, 10, Z, 13, Z); + __m128i unpack_lohi_to_odd0 = _mm_setr_epi8(1, Z, 4, Z, 7, Z, 10, Z, 13, Z, 0, Z, 3, Z, 6, Z); + __m128i unpack_lohi_to_odd1 = _mm_setr_epi8(9, Z, 12, Z, 15, Z, 2, Z, 5, Z, 8, Z, 11, Z, 14, Z); + __m128i unpack_sameb_to_even0 = _mm_setr_epi8(Z, 2, Z, 5, Z, 8, Z, 11, Z, 14, Z, 1, Z, 4, Z, 7); + __m128i unpack_sameb_to_even1 = _mm_setr_epi8(Z, 10, Z, 13, Z, 0, Z, 3, Z, 6, Z, 9, Z, 12, Z, 15); + + #undef Z + #undef F + + int x; + OPTIMIZED_FOR (x = 0; x <= dstlen - 32; x += 32) { + __m128i chunk0, chunk1, chunk2; + chunk0 = _mm_lddqu_si128((__m128i const*)(const void *) src); + chunk1 = _mm_lddqu_si128((__m128i const*)(const void *) (src + 16)); + chunk2 = _mm_lddqu_si128((__m128i const*)(const void *) (src + 32)); + + #ifdef WORDS_BIGENDIAN + __m128i shuffle_BEtoLE = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + chunk0 = _mm_shuffle_epi8(chunk0, shuffle_BEtoLE); + chunk1 = _mm_shuffle_epi8(chunk1, shuffle_BEtoLE); + chunk2 = _mm_shuffle_epi8(chunk2, shuffle_BEtoLE); + #endif + + __m128i hitolo; // positions ≡ 0 (mod 3) + { + __m128i hitolo0, hitolo1, hitolo2; + hitolo0 = _mm_and_si128(chunk0, leftmask); // 0 = 0*3 = 0 + 0 + hitolo1 = _mm_and_si128(chunk1, rightmask); // 18 = 6*3 = 16 + 2 + hitolo2 = _mm_and_si128(chunk2, centermask); // 33 = 11*3 = 32 + 1 + hitolo = _mm_or_si128(hitolo0, hitolo1); + hitolo = _mm_or_si128(hitolo, hitolo2); + } + + __m128i lotohi; // positions ≡ 1 (mod 3) + { + __m128i lotohi0, lotohi1, lotohi2; + lotohi0 = _mm_and_si128(chunk0, centermask); // 1 = 0*3 + 1 = 0 + 1 + lotohi1 = _mm_and_si128(chunk1, leftmask); // 16 = 5*3 + 1 = 16 + 0 + lotohi2 = _mm_and_si128(chunk2, rightmask); // 34 = 11*3 + 1 = 32 + 2 + lotohi = _mm_or_si128(lotohi0, lotohi1); + lotohi = _mm_or_si128(lotohi, lotohi2); + } + + __m128i copybyte; // positions ≡ 2 (mod 3) + { + __m128i copybyte0, copybyte1, copybyte2; + copybyte0 = _mm_and_si128(chunk0, rightmask); // 2 = 0 * 3 + 2 = 0 + 2 + copybyte1 = _mm_and_si128(chunk1, centermask); // 17 = 5 * 3 + 2 = 16 + 1 + copybyte2 = _mm_and_si128(chunk2, leftmask); // 32 = 10 * 3 + 2 = 32 + 0 + copybyte = _mm_or_si128(copybyte0, copybyte1); + copybyte = _mm_or_si128(copybyte, copybyte2); + } + + // uninterleave, moving to correct locations + __m128i lotohi_unp0, lotohi_unp1, hitolo_unp0, hitolo_unp1, + copybyte_unp0, copybyte_unp1; + hitolo_unp0 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_odd0); + hitolo_unp1 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_odd1); + lotohi_unp0 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_odd0); + lotohi_unp1 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_odd1); + copybyte_unp0 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_even0); + copybyte_unp1 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_even1); + + // actually bitshift low -> high and high -> low + lotohi_unp0 = _mm_slli_epi16(lotohi_unp0, 4); + lotohi_unp1 = _mm_slli_epi16(lotohi_unp1, 4); + hitolo_unp0 = _mm_srli_epi16(hitolo_unp0, 4); + hitolo_unp1 = _mm_srli_epi16(hitolo_unp1, 4); + + // assemble + __m128i res0, res1; + res0 = _mm_or_si128(copybyte_unp0, lotohi_unp0); + res0 = _mm_or_si128(res0, hitolo_unp0); + res1 = _mm_or_si128(copybyte_unp1, lotohi_unp1); + res1 = _mm_or_si128(res1, hitolo_unp1); + + + // store + _mm_storeu_si128((__m128i_u *) dst, res0); + _mm_storeu_si128((__m128i_u *) (dst + 16), res1); + + src += 48; + dst += 32; + } + // copy leftover bytes + dstlen -= x; + + vc_copylineR12LtoRGB(dst, src, dstlen, rshift, gshift, bshift); +} + +#endif // __SSSE3__ + + /** * @brief Converts from R12L to RGBA * @@ -2732,6 +2852,9 @@ static const struct decoder_item decoders[] = { { vc_copyliner10ktoY416, R10k, Y416 }, { vc_copyliner10ktoRGB, R10k, RGB }, { vc_copylineR12L, R12L, RGBA }, +#if 1 + { vc_copylineR12LtoRGB_SSE, R12L, RGB }, +#endif { vc_copylineR12LtoRGB, R12L, RGB }, { vc_copylineR12LtoRG48, R12L, RG48 }, { vc_copylineR12LtoR10k, R12L, R10k }, diff --git a/tools/Makefile b/tools/Makefile index 0de0407155..4445457de1 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,4 +1,4 @@ -FLAGS ?= -O2 -g +FLAGS ?= -O2 -g -msse4.1 SRCDIR ?= .. COMMON_FLAGS = $(FLAGS) -D_GNU_SOURCE -I$(SRCDIR)/src/ MKDIR_P = mkdir -p From b62ef724cde411b7203dc9d6ab81d46d1225a95d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20=C5=A0ebela?= <514509@mail.muni.cz> Date: Fri, 7 Jun 2024 11:20:26 +0200 Subject: [PATCH 2/2] pixfmt_conv.c R12LtoRGBA implementation in SIMD - 11 ms -> 6 ms - need to specifically comment out R12LtoRGB to run R12LtoRGBA -- for some reason, UltraGrid prefers the longer R12L -> RGB -> RGBA path --- src/pixfmt_conv.c | 122 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 4 deletions(-) diff --git a/src/pixfmt_conv.c b/src/pixfmt_conv.c index 07b8d4dab6..aaf27ea543 100644 --- a/src/pixfmt_conv.c +++ b/src/pixfmt_conv.c @@ -415,8 +415,8 @@ vc_copylineR12L(unsigned char *dst, const unsigned char *src, int dstlen, int rs static void vc_copylineR12LtoRGB_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift); +#if 0 // Functional, but commented out for stability // #ifdef __SSSE3__ -#if 1 #include #include static void @@ -521,13 +521,122 @@ vc_copylineR12LtoRGB_SSE(unsigned char *dst, const unsigned char *src, int dstle src += 48; dst += 32; } + // copy leftover bytes dstlen -= x; vc_copylineR12LtoRGB(dst, src, dstlen, rshift, gshift, bshift); } -#endif // __SSSE3__ +static void +vc_copylineR12LtoRGBA_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift) +{ + // assert(false); // verify that the function is executed + + // Same as R12LtoRGB, except only 32 bytes are fetched and the last 4 bytes are processed separately + // UNTESTED; this is probably wrong, don't know how to test it -- UltraGrid always prefers the long R12L -> RGB -> RGBA route + + #define Z 0x80 // clear dst byte in shuffle + #define F 0xff + + __m128i leftmask = _mm_setr_epi8(F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F); + __m128i centermask = _mm_setr_epi8(0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0); + __m128i rightmask = _mm_setr_epi8(0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0); + + // r g b a r g b a r g b a r g b a + __m128i unpack_hilo_to_one_grp_0 = _mm_setr_epi8(0, Z, 3, Z, Z, 6, Z, Z, 9, Z, 12, Z, Z, 15, Z, Z); + __m128i unpack_hilo_to_one_grp_1 = _mm_setr_epi8(2, Z, 5, Z, Z, 8, Z, Z, 11, Z, 14, Z, Z, Z, Z, Z); + __m128i unpack_lohi_to_one_grp_0 = _mm_setr_epi8(1, Z, 4, Z, Z, 7, Z, Z, 10, Z, 13, Z, Z, 0, Z, Z); + __m128i unpack_lohi_to_one_grp_1 = _mm_setr_epi8(3, Z, 6, Z, Z, 9, Z, Z, 12, Z, 15, Z, Z, Z, Z, Z); + __m128i unpack_sameb_to_another_grp_0 = _mm_setr_epi8(Z, 2, Z, Z, 5, Z, 8, Z, Z, 11, Z, Z, 14, Z, 1, Z); + __m128i unpack_sameb_to_another_grp_1 = _mm_setr_epi8(Z, 4, Z, Z, 7, Z, 10, Z, Z, 13, Z, Z, Z, Z, Z, Z); + + #undef Z + + int x; + OPTIMIZED_FOR (x = 0; x <= dstlen - 32; x += 32) { + __m128i chunk0, chunk1; + chunk0 = _mm_lddqu_si128((__m128i const*)(const void *) src); + chunk1 = _mm_lddqu_si128((__m128i const*)(const void *) (src + 16)); + + #ifdef WORDS_BIGENDIAN + __m128i shuffle_BEtoLE = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + chunk0 = _mm_shuffle_epi8(chunk0, shuffle_BEtoLE); + chunk1 = _mm_shuffle_epi8(chunk1, shuffle_BEtoLE); + #endif + + __m128i hitolo; // positions ≡ 0 (mod 3) + { + __m128i hitolo0, hitolo1; + hitolo0 = _mm_and_si128(chunk0, leftmask); + hitolo1 = _mm_and_si128(chunk1, rightmask); + hitolo = _mm_or_si128(hitolo0, hitolo1); + } + + __m128i lotohi; // positions ≡ 1 (mod 3) + { + __m128i lotohi0, lotohi1; + lotohi0 = _mm_and_si128(chunk0, centermask); + lotohi1 = _mm_and_si128(chunk1, leftmask); + lotohi = _mm_or_si128(lotohi0, lotohi1); + } + + __m128i copybyte; // positions ≡ 2 (mod 3) + { + __m128i copybyte0, copybyte1; + copybyte0 = _mm_and_si128(chunk0, rightmask); + copybyte1 = _mm_and_si128(chunk1, centermask); + copybyte = _mm_or_si128(copybyte0, copybyte1); + } + + // uninterleave, moving to correct locations + __m128i lotohi_unp0, lotohi_unp1, hitolo_unp0, hitolo_unp1, + copybyte_unp0, copybyte_unp1; + hitolo_unp0 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_one_grp_0); + hitolo_unp1 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_one_grp_1); + lotohi_unp0 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_one_grp_0); + lotohi_unp1 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_one_grp_1); + copybyte_unp0 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_another_grp_0); + copybyte_unp1 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_another_grp_1); + + // actually bitshift low -> high and high -> low + lotohi_unp0 = _mm_slli_epi16(lotohi_unp0, 4); + lotohi_unp1 = _mm_slli_epi16(lotohi_unp1, 4); + hitolo_unp0 = _mm_srli_epi16(hitolo_unp0, 4); + hitolo_unp1 = _mm_srli_epi16(hitolo_unp1, 4); + + // add alpha == 255 and assemble + __m128i res0 = _mm_setr_epi8(0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F); + __m128i res1 = _mm_setr_epi8(0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F); + res0 = _mm_or_si128(res0, copybyte_unp0); + res0 = _mm_or_si128(res0, lotohi_unp0); + res0 = _mm_or_si128(res0, hitolo_unp0); + res1 = _mm_or_si128(res1, copybyte_unp1); + res1 = _mm_or_si128(res1, lotohi_unp1); + res1 = _mm_or_si128(res1, hitolo_unp1); + + _mm_storeu_si128((__m128i_u *) dst, res0); + _mm_storeu_si128((__m128i_u *) (dst + 16), res1); + + // At this point, we still have 4 bytes left over that we didn't fetch yet + // These are copybyte R, hitolo G, lotohi G and copybyte B + + dst[28] = src[32]; + dst[29] = (src[33] >> 4) | (src[34] << 4); + dst[30] = src[35]; + dst[31] = F; + + src += 36; + dst += 32; + } + #undef F + + // copy leftover bytes + dstlen -= x; + vc_copylineR12L(dst, src, dstlen, rshift, gshift, bshift); +} + +#endif /** @@ -2852,9 +2961,14 @@ static const struct decoder_item decoders[] = { { vc_copyliner10ktoY416, R10k, Y416 }, { vc_copyliner10ktoRGB, R10k, RGB }, { vc_copylineR12L, R12L, RGBA }, -#if 1 - { vc_copylineR12LtoRGB_SSE, R12L, RGB }, + +#if 0 // Functional, but commented out for stability + // The R12LtoRGBA_SSE function only runs when R12LtoRGB + ~_SSE are commented out + // UltraGrid preferentially picks the longer R12L -> RGB -> RGBA path for some reason + { vc_copylineR12LtoRGBA_SSE, R12L, RGBA }, + { vc_copylineR12LtoRGB_SSE, R12L, RGB }, #endif + { vc_copylineR12L, R12L, RGBA }, { vc_copylineR12LtoRGB, R12L, RGB }, { vc_copylineR12LtoRG48, R12L, RG48 }, { vc_copylineR12LtoR10k, R12L, R10k },