Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pixfmt_conv.c: R12L->{RGB, RGBA} SIMD implementations #392

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 237 additions & 0 deletions src/pixfmt_conv.c
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,235 @@ vc_copylineR12LtoRGB(unsigned char * __restrict dst, const unsigned char * __res
}
}

static void
vc_copylineR12L(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift);
static void
vc_copylineR12LtoRGB_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift);

#if 0 // Functional, but commented out for stability
// #ifdef __SSSE3__
#include <immintrin.h>
#include <assert.h>
static void
vc_copylineR12LtoRGB_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift)
{
// assert(false); // verify that the function is executed

// Map 3 bytes of input to 2 bytes of output
// byte whose high nibble I need -> low nibble of 1st byte
// byte whose low nibble I need -> high nibble of 1st byte
// whole 3rd byte -> whole 2nd byte

#define Z 0x80 // clear dst byte in shuffle
#define F 0xff

__m128i leftmask = _mm_setr_epi8(F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F);
__m128i centermask = _mm_setr_epi8(0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0);
__m128i rightmask = _mm_setr_epi8(0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0);

__m128i unpack_hilo_to_odd0 = _mm_setr_epi8(0, Z, 3, Z, 6, Z, 9, Z, 12, Z, 15, Z, 2, Z, 5, Z);
__m128i unpack_hilo_to_odd1 = _mm_setr_epi8(8, Z, 11, Z, 14, Z, 1, Z, 4, Z, 7, Z, 10, Z, 13, Z);
__m128i unpack_lohi_to_odd0 = _mm_setr_epi8(1, Z, 4, Z, 7, Z, 10, Z, 13, Z, 0, Z, 3, Z, 6, Z);
__m128i unpack_lohi_to_odd1 = _mm_setr_epi8(9, Z, 12, Z, 15, Z, 2, Z, 5, Z, 8, Z, 11, Z, 14, Z);
__m128i unpack_sameb_to_even0 = _mm_setr_epi8(Z, 2, Z, 5, Z, 8, Z, 11, Z, 14, Z, 1, Z, 4, Z, 7);
__m128i unpack_sameb_to_even1 = _mm_setr_epi8(Z, 10, Z, 13, Z, 0, Z, 3, Z, 6, Z, 9, Z, 12, Z, 15);

#undef Z
#undef F

int x;
OPTIMIZED_FOR (x = 0; x <= dstlen - 32; x += 32) {
__m128i chunk0, chunk1, chunk2;
chunk0 = _mm_lddqu_si128((__m128i const*)(const void *) src);
chunk1 = _mm_lddqu_si128((__m128i const*)(const void *) (src + 16));
chunk2 = _mm_lddqu_si128((__m128i const*)(const void *) (src + 32));

#ifdef WORDS_BIGENDIAN
__m128i shuffle_BEtoLE = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
chunk0 = _mm_shuffle_epi8(chunk0, shuffle_BEtoLE);
chunk1 = _mm_shuffle_epi8(chunk1, shuffle_BEtoLE);
chunk2 = _mm_shuffle_epi8(chunk2, shuffle_BEtoLE);
#endif

__m128i hitolo; // positions ≡ 0 (mod 3)
{
__m128i hitolo0, hitolo1, hitolo2;
hitolo0 = _mm_and_si128(chunk0, leftmask); // 0 = 0*3 = 0 + 0
hitolo1 = _mm_and_si128(chunk1, rightmask); // 18 = 6*3 = 16 + 2
hitolo2 = _mm_and_si128(chunk2, centermask); // 33 = 11*3 = 32 + 1
hitolo = _mm_or_si128(hitolo0, hitolo1);
hitolo = _mm_or_si128(hitolo, hitolo2);
}

__m128i lotohi; // positions ≡ 1 (mod 3)
{
__m128i lotohi0, lotohi1, lotohi2;
lotohi0 = _mm_and_si128(chunk0, centermask); // 1 = 0*3 + 1 = 0 + 1
lotohi1 = _mm_and_si128(chunk1, leftmask); // 16 = 5*3 + 1 = 16 + 0
lotohi2 = _mm_and_si128(chunk2, rightmask); // 34 = 11*3 + 1 = 32 + 2
lotohi = _mm_or_si128(lotohi0, lotohi1);
lotohi = _mm_or_si128(lotohi, lotohi2);
}

__m128i copybyte; // positions ≡ 2 (mod 3)
{
__m128i copybyte0, copybyte1, copybyte2;
copybyte0 = _mm_and_si128(chunk0, rightmask); // 2 = 0 * 3 + 2 = 0 + 2
copybyte1 = _mm_and_si128(chunk1, centermask); // 17 = 5 * 3 + 2 = 16 + 1
copybyte2 = _mm_and_si128(chunk2, leftmask); // 32 = 10 * 3 + 2 = 32 + 0
copybyte = _mm_or_si128(copybyte0, copybyte1);
copybyte = _mm_or_si128(copybyte, copybyte2);
}

// uninterleave, moving to correct locations
__m128i lotohi_unp0, lotohi_unp1, hitolo_unp0, hitolo_unp1,
copybyte_unp0, copybyte_unp1;
hitolo_unp0 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_odd0);
hitolo_unp1 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_odd1);
lotohi_unp0 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_odd0);
lotohi_unp1 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_odd1);
copybyte_unp0 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_even0);
copybyte_unp1 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_even1);

// actually bitshift low -> high and high -> low
lotohi_unp0 = _mm_slli_epi16(lotohi_unp0, 4);
lotohi_unp1 = _mm_slli_epi16(lotohi_unp1, 4);
hitolo_unp0 = _mm_srli_epi16(hitolo_unp0, 4);
hitolo_unp1 = _mm_srli_epi16(hitolo_unp1, 4);

// assemble
__m128i res0, res1;
res0 = _mm_or_si128(copybyte_unp0, lotohi_unp0);
res0 = _mm_or_si128(res0, hitolo_unp0);
res1 = _mm_or_si128(copybyte_unp1, lotohi_unp1);
res1 = _mm_or_si128(res1, hitolo_unp1);


// store
_mm_storeu_si128((__m128i_u *) dst, res0);
_mm_storeu_si128((__m128i_u *) (dst + 16), res1);

src += 48;
dst += 32;
}

// copy leftover bytes
dstlen -= x;

vc_copylineR12LtoRGB(dst, src, dstlen, rshift, gshift, bshift);
}

static void
vc_copylineR12LtoRGBA_SSE(unsigned char *dst, const unsigned char *src, int dstlen, int rshift, int gshift, int bshift)
{
// assert(false); // verify that the function is executed

// Same as R12LtoRGB, except only 32 bytes are fetched and the last 4 bytes are processed separately
// UNTESTED; this is probably wrong, don't know how to test it -- UltraGrid always prefers the long R12L -> RGB -> RGBA route

#define Z 0x80 // clear dst byte in shuffle
#define F 0xff

__m128i leftmask = _mm_setr_epi8(F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F);
__m128i centermask = _mm_setr_epi8(0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0);
__m128i rightmask = _mm_setr_epi8(0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0, 0, F, 0);

// r g b a r g b a r g b a r g b a
__m128i unpack_hilo_to_one_grp_0 = _mm_setr_epi8(0, Z, 3, Z, Z, 6, Z, Z, 9, Z, 12, Z, Z, 15, Z, Z);
__m128i unpack_hilo_to_one_grp_1 = _mm_setr_epi8(2, Z, 5, Z, Z, 8, Z, Z, 11, Z, 14, Z, Z, Z, Z, Z);
__m128i unpack_lohi_to_one_grp_0 = _mm_setr_epi8(1, Z, 4, Z, Z, 7, Z, Z, 10, Z, 13, Z, Z, 0, Z, Z);
__m128i unpack_lohi_to_one_grp_1 = _mm_setr_epi8(3, Z, 6, Z, Z, 9, Z, Z, 12, Z, 15, Z, Z, Z, Z, Z);
__m128i unpack_sameb_to_another_grp_0 = _mm_setr_epi8(Z, 2, Z, Z, 5, Z, 8, Z, Z, 11, Z, Z, 14, Z, 1, Z);
__m128i unpack_sameb_to_another_grp_1 = _mm_setr_epi8(Z, 4, Z, Z, 7, Z, 10, Z, Z, 13, Z, Z, Z, Z, Z, Z);

#undef Z

int x;
OPTIMIZED_FOR (x = 0; x <= dstlen - 32; x += 32) {
__m128i chunk0, chunk1;
chunk0 = _mm_lddqu_si128((__m128i const*)(const void *) src);
chunk1 = _mm_lddqu_si128((__m128i const*)(const void *) (src + 16));

#ifdef WORDS_BIGENDIAN
__m128i shuffle_BEtoLE = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
chunk0 = _mm_shuffle_epi8(chunk0, shuffle_BEtoLE);
chunk1 = _mm_shuffle_epi8(chunk1, shuffle_BEtoLE);
#endif

__m128i hitolo; // positions ≡ 0 (mod 3)
{
__m128i hitolo0, hitolo1;
hitolo0 = _mm_and_si128(chunk0, leftmask);
hitolo1 = _mm_and_si128(chunk1, rightmask);
hitolo = _mm_or_si128(hitolo0, hitolo1);
}

__m128i lotohi; // positions ≡ 1 (mod 3)
{
__m128i lotohi0, lotohi1;
lotohi0 = _mm_and_si128(chunk0, centermask);
lotohi1 = _mm_and_si128(chunk1, leftmask);
lotohi = _mm_or_si128(lotohi0, lotohi1);
}

__m128i copybyte; // positions ≡ 2 (mod 3)
{
__m128i copybyte0, copybyte1;
copybyte0 = _mm_and_si128(chunk0, rightmask);
copybyte1 = _mm_and_si128(chunk1, centermask);
copybyte = _mm_or_si128(copybyte0, copybyte1);
}

// uninterleave, moving to correct locations
__m128i lotohi_unp0, lotohi_unp1, hitolo_unp0, hitolo_unp1,
copybyte_unp0, copybyte_unp1;
hitolo_unp0 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_one_grp_0);
hitolo_unp1 = _mm_shuffle_epi8(hitolo, unpack_hilo_to_one_grp_1);
lotohi_unp0 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_one_grp_0);
lotohi_unp1 = _mm_shuffle_epi8(lotohi, unpack_lohi_to_one_grp_1);
copybyte_unp0 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_another_grp_0);
copybyte_unp1 = _mm_shuffle_epi8(copybyte, unpack_sameb_to_another_grp_1);

// actually bitshift low -> high and high -> low
lotohi_unp0 = _mm_slli_epi16(lotohi_unp0, 4);
lotohi_unp1 = _mm_slli_epi16(lotohi_unp1, 4);
hitolo_unp0 = _mm_srli_epi16(hitolo_unp0, 4);
hitolo_unp1 = _mm_srli_epi16(hitolo_unp1, 4);

// add alpha == 255 and assemble
__m128i res0 = _mm_setr_epi8(0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F);
__m128i res1 = _mm_setr_epi8(0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F, 0, 0, 0, F);
res0 = _mm_or_si128(res0, copybyte_unp0);
res0 = _mm_or_si128(res0, lotohi_unp0);
res0 = _mm_or_si128(res0, hitolo_unp0);
res1 = _mm_or_si128(res1, copybyte_unp1);
res1 = _mm_or_si128(res1, lotohi_unp1);
res1 = _mm_or_si128(res1, hitolo_unp1);

_mm_storeu_si128((__m128i_u *) dst, res0);
_mm_storeu_si128((__m128i_u *) (dst + 16), res1);

// At this point, we still have 4 bytes left over that we didn't fetch yet
// These are copybyte R, hitolo G, lotohi G and copybyte B

dst[28] = src[32];
dst[29] = (src[33] >> 4) | (src[34] << 4);
dst[30] = src[35];
dst[31] = F;

src += 36;
dst += 32;
}
#undef F

// copy leftover bytes
dstlen -= x;
vc_copylineR12L(dst, src, dstlen, rshift, gshift, bshift);
}

#endif


/**
* @brief Converts from R12L to RGBA
*
Expand Down Expand Up @@ -2732,6 +2961,14 @@ static const struct decoder_item decoders[] = {
{ vc_copyliner10ktoY416, R10k, Y416 },
{ vc_copyliner10ktoRGB, R10k, RGB },
{ vc_copylineR12L, R12L, RGBA },

#if 0 // Functional, but commented out for stability
// The R12LtoRGBA_SSE function only runs when R12LtoRGB + ~_SSE are commented out
// UltraGrid preferentially picks the longer R12L -> RGB -> RGBA path for some reason
{ vc_copylineR12LtoRGBA_SSE, R12L, RGBA },
{ vc_copylineR12LtoRGB_SSE, R12L, RGB },
#endif
{ vc_copylineR12L, R12L, RGBA },
{ vc_copylineR12LtoRGB, R12L, RGB },
{ vc_copylineR12LtoRG48, R12L, RG48 },
{ vc_copylineR12LtoR10k, R12L, R10k },
Expand Down
2 changes: 1 addition & 1 deletion tools/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FLAGS ?= -O2 -g
FLAGS ?= -O2 -g -msse4.1
SRCDIR ?= ..
COMMON_FLAGS = $(FLAGS) -D_GNU_SOURCE -I$(SRCDIR)/src/
MKDIR_P = mkdir -p
Expand Down